From 6beb0a0c159abdc0a81441cfdfe980a2f6a23eb9 Mon Sep 17 00:00:00 2001 From: "Node.js GitHub Bot" Date: Sat, 21 Oct 2023 12:52:09 +0000 Subject: [PATCH] deps: update simdutf to 4.0.0 --- deps/simdutf/simdutf.cpp | 17823 ++++++++++------ deps/simdutf/simdutf.h | 1009 +- .../maintaining/maintaining-dependencies.md | 6 +- 3 files changed, 12070 insertions(+), 6768 deletions(-) diff --git a/deps/simdutf/simdutf.cpp b/deps/simdutf/simdutf.cpp index 70b461ab550b46..eb387059ce00e2 100644 --- a/deps/simdutf/simdutf.cpp +++ b/deps/simdutf/simdutf.cpp @@ -1,8 +1,6 @@ -/* auto-generated on 2023-10-08 13:48:09 -0400. Do not edit! */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf.cpp +/* auto-generated on 2023-10-20 19:53:58 -0400. Do not edit! */ /* begin file src/simdutf.cpp */ #include "simdutf.h" -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=implementation.cpp /* begin file src/implementation.cpp */ #include #include @@ -26,7 +24,6 @@ std::string toBinaryString(T b) { // Implementations // The best choice should always come first! 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/arm64.h /* begin file src/simdutf/arm64.h */ #ifndef SIMDUTF_ARM64_H #define SIMDUTF_ARM64_H @@ -53,7 +50,6 @@ namespace arm64 { } // namespace arm64 } // namespace simdutf -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/arm64/implementation.h /* begin file src/simdutf/arm64/implementation.h */ #ifndef SIMDUTF_ARM64_IMPLEMENTATION_H #define SIMDUTF_ARM64_IMPLEMENTATION_H @@ -80,6 +76,13 @@ class implementation final : public simdutf::implementation { simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept final; simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) const noexcept final; simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept final; + simdutf_warn_unused size_t convert_latin1_to_utf8(const char * buf, size_t len, char* utf8_output) const noexcept final; + simdutf_warn_unused size_t convert_latin1_to_utf16le(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_latin1_to_utf16be(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_latin1_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final; + simdutf_warn_unused size_t convert_utf8_to_latin1(const char * buf, size_t len, char* latin1_output) const noexcept final; + simdutf_warn_unused result convert_utf8_to_latin1_with_errors(const char * buf, size_t len, char* latin1_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf8_to_latin1(const char * buf, size_t len, char* latin1_output) const noexcept final; simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final; simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * 
buf, size_t len, char16_t* utf16_output) const noexcept final; simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final; @@ -89,12 +92,21 @@ class implementation final : public simdutf::implementation { simdutf_warn_unused size_t convert_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final; simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * buf, size_t len, char32_t* utf32_output) const noexcept final; simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf16le_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf16be_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final; simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * buf, size_t len, char* 
utf8_buffer) const noexcept final; simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf32_to_latin1(const char32_t * buf, size_t len, char* latin1_output) const noexcept final; + simdutf_warn_unused result convert_utf32_to_latin1_with_errors(const char32_t * buf, size_t len, char* latin1_output) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf32_to_latin1(const char32_t * buf, size_t len, char* latin1_output) const noexcept final; simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final; simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final; simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final; @@ -122,6 +134,13 @@ class implementation final : public simdutf::implementation { simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept; simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept; simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept; + simdutf_warn_unused size_t latin1_length_from_utf8(const char * input, size_t length) const noexcept; + simdutf_warn_unused size_t latin1_length_from_utf16(size_t length) const noexcept; + simdutf_warn_unused size_t latin1_length_from_utf32(size_t length) const noexcept; + simdutf_warn_unused size_t utf32_length_from_latin1(size_t length) const noexcept; + simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) const noexcept; + simdutf_warn_unused size_t utf8_length_from_latin1(const char 
* input, size_t length) const noexcept; + }; } // namespace arm64 @@ -130,14 +149,12 @@ class implementation final : public simdutf::implementation { #endif // SIMDUTF_ARM64_IMPLEMENTATION_H /* end file src/simdutf/arm64/implementation.h */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/arm64/begin.h /* begin file src/simdutf/arm64/begin.h */ // redefining SIMDUTF_IMPLEMENTATION to "arm64" // #define SIMDUTF_IMPLEMENTATION arm64 /* end file src/simdutf/arm64/begin.h */ // Declarations -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/arm64/intrinsics.h /* begin file src/simdutf/arm64/intrinsics.h */ #ifndef SIMDUTF_ARM64_INTRINSICS_H #define SIMDUTF_ARM64_INTRINSICS_H @@ -149,7 +166,6 @@ class implementation final : public simdutf::implementation { #endif // SIMDUTF_ARM64_INTRINSICS_H /* end file src/simdutf/arm64/intrinsics.h */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/arm64/bitmanipulation.h /* begin file src/simdutf/arm64/bitmanipulation.h */ #ifndef SIMDUTF_ARM64_BITMANIPULATION_H #define SIMDUTF_ARM64_BITMANIPULATION_H @@ -169,7 +185,6 @@ simdutf_really_inline int count_ones(uint64_t input_num) { #endif // SIMDUTF_ARM64_BITMANIPULATION_H /* end file src/simdutf/arm64/bitmanipulation.h */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/arm64/simd.h /* begin file src/simdutf/arm64/simd.h */ #ifndef SIMDUTF_ARM64_SIMD_H #define SIMDUTF_ARM64_SIMD_H @@ -505,32 +520,68 @@ simdutf_really_inline int16x8_t make_int16x8_t(int16_t x1, int16_t x2, int16_t static simdutf_really_inline simd8 splat(int8_t _value) { return vmovq_n_s8(_value); } static simdutf_really_inline simd8 zero() { return vdupq_n_s8(0); } static simdutf_really_inline simd8 load(const int8_t values[16]) { return vld1q_s8(values); } + + // Use ST2 instead of UXTL+UXTL2 to interleave zeroes. 
UXTL is actually a USHLL #0, + // and shifting in NEON is actually quite slow. + // + // While this needs the registers to be in a specific order, bigger cores can interleave + // these with no overhead, and it still performs decently on little cores. + // movi v1.3d, #0 + // mov v0.16b, value[0] + // st2 {v0.16b, v1.16b}, [ptr], #32 + // mov v0.16b, value[1] + // st2 {v0.16b, v1.16b}, [ptr], #32 + // ... template simdutf_really_inline void store_ascii_as_utf16(char16_t * p) const { - uint16x8_t first = vmovl_u8(vget_low_u8 (vreinterpretq_u8_s8(this->value))); - uint16x8_t second = vmovl_high_u8(vreinterpretq_u8_s8(this->value)); - if (!match_system(big_endian)) { - #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO - const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); - #else - const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; - #endif - first = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(first), swap)); - second = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(second), swap)); - } - vst1q_u16(reinterpret_cast(p), first); - vst1q_u16(reinterpret_cast(p + 8), second); - } + int8x16x2_t pair = match_system(big_endian) + ? int8x16x2_t{{this->value, vmovq_n_s8(0)}} + : int8x16x2_t{{vmovq_n_s8(0), this->value}}; + vst2q_s8(reinterpret_cast(p), pair); + } + + // currently unused + // Technically this could be done with ST4 like in store_ascii_as_utf16, but it is + // very much not worth it, as explicitly mentioned in the ARM Cortex-X1 Core Software + // Optimization Guide: + // 4.18 Complex ASIMD instructions + // The bandwidth of [ST4 with element size less than 64b] is limited by decode + // constraints and it is advisable to avoid them when high performing code is desired. + // Instead, it is better to use ZIP1+ZIP2 and two ST2. 
simdutf_really_inline void store_ascii_as_utf32(char32_t * p) const { - vst1q_u32(reinterpret_cast(p), vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8 (vreinterpretq_u8_s8(this->value)))))); - vst1q_u32(reinterpret_cast(p + 4), vmovl_high_u16(vmovl_u8(vget_low_u8 (vreinterpretq_u8_s8(this->value))))); - vst1q_u32(reinterpret_cast(p + 8), vmovl_u16(vget_low_u16(vmovl_high_u8(vreinterpretq_u8_s8(this->value))))); - vst1q_u32(reinterpret_cast(p + 12), vmovl_high_u16(vmovl_high_u8(vreinterpretq_u8_s8(this->value)))); + const uint16x8_t low = vreinterpretq_u16_s8(vzip1q_s8(this->value, vmovq_n_s8(0))); + const uint16x8_t high = vreinterpretq_u16_s8(vzip2q_s8(this->value, vmovq_n_s8(0))); + const uint16x8x2_t low_pair{{ low, vmovq_n_u16(0) }}; + vst2q_u16(reinterpret_cast(p), low_pair); + const uint16x8x2_t high_pair{{ high, vmovq_n_u16(0) }}; + vst2q_u16(reinterpret_cast(p + 8), high_pair); + } + + // In places where the table can be reused, which is most uses in simdutf, it is worth it to do + // 4 table lookups, as there is no direct zero extension from u8 to u32. 
+ simdutf_really_inline void store_ascii_as_utf32_tbl(char32_t * p) const { + const simd8 tb1{ 0,255,255,255, 1,255,255,255, 2,255,255,255, 3,255,255,255 }; + const simd8 tb2{ 4,255,255,255, 5,255,255,255, 6,255,255,255, 7,255,255,255 }; + const simd8 tb3{ 8,255,255,255, 9,255,255,255, 10,255,255,255, 11,255,255,255 }; + const simd8 tb4{ 12,255,255,255, 13,255,255,255, 14,255,255,255, 15,255,255,255 }; + + // encourage store pairing and interleaving + const auto shuf1 = this->apply_lookup_16_to(tb1); + const auto shuf2 = this->apply_lookup_16_to(tb2); + shuf1.store(reinterpret_cast(p)); + shuf2.store(reinterpret_cast(p + 4)); + + const auto shuf3 = this->apply_lookup_16_to(tb3); + const auto shuf4 = this->apply_lookup_16_to(tb4); + shuf3.store(reinterpret_cast(p + 8)); + shuf4.store(reinterpret_cast(p + 12)); } // Conversion from/to SIMD register simdutf_really_inline simd8(const int8x16_t _value) : value{_value} {} simdutf_really_inline operator const int8x16_t&() const { return this->value; } +#ifndef SIMDUTF_REGULAR_VISUAL_STUDIO simdutf_really_inline operator const uint8x16_t() const { return vreinterpretq_u8_s8(this->value); } +#endif simdutf_really_inline operator int8x16_t&() { return this->value; } // Zero constructor @@ -627,7 +678,7 @@ simdutf_really_inline int16x8_t make_int16x8_t(int16_t x1, int16_t x2, int16_t } template - simdutf_really_inline simd8 apply_lookup_16_to(const simd8 original) { + simdutf_really_inline simd8 apply_lookup_16_to(const simd8 original) const { return vqtbl1q_s8(*this, simd8(original)); } }; @@ -678,10 +729,10 @@ simdutf_really_inline int16x8_t make_int16x8_t(int16_t x1, int16_t x2, int16_t } simdutf_really_inline void store_ascii_as_utf32(char32_t * ptr) const { - this->chunks[0].store_ascii_as_utf32(ptr+sizeof(simd8)*0); - this->chunks[1].store_ascii_as_utf32(ptr+sizeof(simd8)*1); - this->chunks[2].store_ascii_as_utf32(ptr+sizeof(simd8)*2); - this->chunks[3].store_ascii_as_utf32(ptr+sizeof(simd8)*3); + 
this->chunks[0].store_ascii_as_utf32_tbl(ptr+sizeof(simd8)*0); + this->chunks[1].store_ascii_as_utf32_tbl(ptr+sizeof(simd8)*1); + this->chunks[2].store_ascii_as_utf32_tbl(ptr+sizeof(simd8)*2); + this->chunks[3].store_ascii_as_utf32_tbl(ptr+sizeof(simd8)*3); } simdutf_really_inline uint64_t to_bitmask() const { @@ -782,7 +833,6 @@ simdutf_really_inline int16x8_t make_int16x8_t(int16_t x1, int16_t x2, int16_t ).to_bitmask(); } }; // struct simd8x64 -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/arm64/simd16-inl.h /* begin file src/simdutf/arm64/simd16-inl.h */ template struct simd16; @@ -869,7 +919,7 @@ struct base16_numeric: base16 { simdutf_really_inline simd16& operator-=(const simd16 other) { *this = *this - other; return *static_cast*>(this); } }; -// Signed words +// Signed code units template<> struct simd16 : base16_numeric { simdutf_really_inline simd16() : base16_numeric() {} @@ -899,7 +949,7 @@ struct simd16 : base16_numeric { -// Unsigned words +// Unsigned code units template<> struct simd16: base16_numeric { simdutf_really_inline simd16() : base16_numeric() {} @@ -942,19 +992,14 @@ struct simd16: base16_numeric { simdutf_really_inline simd16 operator&(const simd16 other) const { return vandq_u16(*this, other); } simdutf_really_inline simd16 operator^(const simd16 other) const { return veorq_u16(*this, other); } - // Pack with the unsigned saturation two uint16_t words into single uint8_t vector + // Pack with the unsigned saturation two uint16_t code units into single uint8_t vector static simdutf_really_inline simd8 pack(const simd16& v0, const simd16& v1) { return vqmovn_high_u16(vqmovn_u16(v0), v1); } // Change the endianness simdutf_really_inline simd16 swap_bytes() const { - #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO - const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); - #else - const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; - #endif - 
return vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(*this), swap)); + return vreinterpretq_u16_u8(vrev16q_u8((*this))); } }; simdutf_really_inline simd16::operator simd16() const { return this->value; } @@ -1095,7 +1140,6 @@ simdutf_really_inline simd16::operator simd16() const { retur #endif // SIMDUTF_ARM64_SIMD_H /* end file src/simdutf/arm64/simd.h */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/arm64/end.h /* begin file src/simdutf/arm64/end.h */ /* end file src/simdutf/arm64/end.h */ @@ -1103,7 +1147,6 @@ simdutf_really_inline simd16::operator simd16() const { retur #endif // SIMDUTF_ARM64_H /* end file src/simdutf/arm64.h */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/icelake.h /* begin file src/simdutf/icelake.h */ #ifndef SIMDUTF_ICELAKE_H #define SIMDUTF_ICELAKE_H @@ -1145,7 +1188,7 @@ simdutf_really_inline simd16::operator simd16() const { retur #if SIMDUTF_CAN_ALWAYS_RUN_ICELAKE #define SIMDUTF_TARGET_ICELAKE #else -#define SIMDUTF_TARGET_ICELAKE SIMDUTF_TARGET_REGION("avx512f,avx512dq,avx512cd,avx512bw,avx512vbmi,avx512vbmi2,avx512vl,avx2,bmi,bmi2,pclmul,lzcnt,popcnt") +#define SIMDUTF_TARGET_ICELAKE SIMDUTF_TARGET_REGION("avx512f,avx512dq,avx512cd,avx512bw,avx512vbmi,avx512vbmi2,avx512vl,avx2,bmi,bmi2,pclmul,lzcnt,popcnt,avx512vpopcntdq") #endif namespace simdutf { @@ -1158,7 +1201,6 @@ namespace icelake { // // These two need to be included outside SIMDUTF_TARGET_REGION // -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/icelake/intrinsics.h /* begin file src/simdutf/icelake/intrinsics.h */ #ifndef SIMDUTF_ICELAKE_INTRINSICS_H #define SIMDUTF_ICELAKE_INTRINSICS_H @@ -1227,6 +1269,8 @@ SIMDUTF_POP_DISABLE_WARNINGS #include #include #include +#include +#include // unfortunately, we may not get _blsr_u64, but, thankfully, clang // has it as a macro. 
#ifndef _blsr_u64 @@ -1268,7 +1312,6 @@ inline __m512i _mm512_set_epi8(uint8_t a0, uint8_t a1, uint8_t a2, uint8_t a3, u #endif // SIMDUTF_HASWELL_INTRINSICS_H /* end file src/simdutf/icelake/intrinsics.h */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/icelake/implementation.h /* begin file src/simdutf/icelake/implementation.h */ #ifndef SIMDUTF_ICELAKE_IMPLEMENTATION_H #define SIMDUTF_ICELAKE_IMPLEMENTATION_H @@ -1286,7 +1329,7 @@ class implementation final : public simdutf::implementation { simdutf_really_inline implementation() : simdutf::implementation( "icelake", "Intel AVX512 (AVX-512BW, AVX-512CD, AVX-512VL, AVX-512VBMI2 extensions)", - internal::instruction_set::AVX2 | internal::instruction_set::BMI1 | internal::instruction_set::BMI2 | internal::instruction_set::AVX512BW | internal::instruction_set::AVX512CD | internal::instruction_set::AVX512VL | internal::instruction_set::AVX512VBMI2 ) {} + internal::instruction_set::AVX2 | internal::instruction_set::BMI1 | internal::instruction_set::BMI2 | internal::instruction_set::AVX512BW | internal::instruction_set::AVX512CD | internal::instruction_set::AVX512VL | internal::instruction_set::AVX512VBMI2 | internal::instruction_set::AVX512VPOPCNTDQ ) {} simdutf_warn_unused int detect_encodings(const char * input, size_t length) const noexcept final; simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final; simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) const noexcept final; @@ -1298,6 +1341,13 @@ class implementation final : public simdutf::implementation { simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept final; simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) const noexcept final; simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept final; + simdutf_warn_unused size_t 
convert_latin1_to_utf8(const char * buf, size_t len, char* utf8_output) const noexcept final; + simdutf_warn_unused size_t convert_latin1_to_utf16le(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_latin1_to_utf16be(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_latin1_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final; + simdutf_warn_unused size_t convert_utf8_to_latin1(const char * buf, size_t len, char* latin1_output) const noexcept final; + simdutf_warn_unused result convert_utf8_to_latin1_with_errors(const char * buf, size_t len, char* latin1_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf8_to_latin1(const char * buf, size_t len, char* latin1_output) const noexcept final; simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final; simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final; simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final; @@ -1307,6 +1357,12 @@ class implementation final : public simdutf::implementation { simdutf_warn_unused size_t convert_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final; simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * buf, size_t len, char32_t* utf32_output) const noexcept final; simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf16le_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf16be_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const 
noexcept final; + simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final; simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; @@ -1316,6 +1372,9 @@ class implementation final : public simdutf::implementation { simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final; simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final; simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf32_to_latin1(const char32_t * buf, size_t len, char* latin1_output) const noexcept final; + simdutf_warn_unused result convert_utf32_to_latin1_with_errors(const char32_t * buf, size_t len, char* latin1_output) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf32_to_latin1(const char32_t * buf, size_t len, char* latin1_output) const noexcept final; simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; 
simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; @@ -1340,6 +1399,12 @@ class implementation final : public simdutf::implementation { simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept; simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept; simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept; + simdutf_warn_unused size_t latin1_length_from_utf8(const char * input, size_t length) const noexcept; + simdutf_warn_unused size_t latin1_length_from_utf16(size_t length) const noexcept; + simdutf_warn_unused size_t latin1_length_from_utf32(size_t length) const noexcept; + simdutf_warn_unused size_t utf32_length_from_latin1(size_t length) const noexcept; + simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) const noexcept; + simdutf_warn_unused size_t utf8_length_from_latin1(const char * input, size_t length) const noexcept; }; } // namespace icelake @@ -1351,7 +1416,6 @@ class implementation final : public simdutf::implementation { // // The rest need to be inside the region // -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/icelake/begin.h /* begin file src/simdutf/icelake/begin.h */ // redefining SIMDUTF_IMPLEMENTATION to "icelake" // #define SIMDUTF_IMPLEMENTATION icelake @@ -1367,7 +1431,6 @@ SIMDUTF_DISABLE_GCC_WARNING(-Wmaybe-uninitialized) #endif // end of workaround /* end file src/simdutf/icelake/begin.h */ // Declarations -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/icelake/bitmanipulation.h /* begin file src/simdutf/icelake/bitmanipulation.h */ #ifndef SIMDUTF_ICELAKE_BITMANIPULATION_H #define 
SIMDUTF_ICELAKE_BITMANIPULATION_H @@ -1393,7 +1456,6 @@ simdutf_really_inline long long int count_ones(uint64_t input_num) { #endif // SIMDUTF_ICELAKE_BITMANIPULATION_H /* end file src/simdutf/icelake/bitmanipulation.h */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/icelake/end.h /* begin file src/simdutf/icelake/end.h */ #if SIMDUTF_CAN_ALWAYS_RUN_ICELAKE // nothing needed. @@ -1412,7 +1474,6 @@ SIMDUTF_POP_DISABLE_WARNINGS #endif // SIMDUTF_IMPLEMENTATION_ICELAKE #endif // SIMDUTF_ICELAKE_H /* end file src/simdutf/icelake.h */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/haswell.h /* begin file src/simdutf/haswell.h */ #ifndef SIMDUTF_HASWELL_H #define SIMDUTF_HASWELL_H @@ -1458,7 +1519,6 @@ namespace haswell { // // These two need to be included outside SIMDUTF_TARGET_REGION // -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/haswell/implementation.h /* begin file src/simdutf/haswell/implementation.h */ #ifndef SIMDUTF_HASWELL_IMPLEMENTATION_H #define SIMDUTF_HASWELL_IMPLEMENTATION_H @@ -1488,6 +1548,13 @@ class implementation final : public simdutf::implementation { simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept final; simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) const noexcept final; simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept final; + simdutf_warn_unused size_t convert_latin1_to_utf8(const char * buf, size_t len, char* utf8_output) const noexcept final; + simdutf_warn_unused size_t convert_latin1_to_utf16le(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_latin1_to_utf16be(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_latin1_to_utf32(const char * buf, size_t len, 
char32_t* utf32_output) const noexcept final; + simdutf_warn_unused size_t convert_utf8_to_latin1(const char * buf, size_t len, char* latin1_output) const noexcept final; + simdutf_warn_unused result convert_utf8_to_latin1_with_errors(const char * buf, size_t len, char* latin1_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf8_to_latin1(const char * buf, size_t len, char* latin1_output) const noexcept final; simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final; simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final; simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final; @@ -1497,6 +1564,12 @@ class implementation final : public simdutf::implementation { simdutf_warn_unused size_t convert_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final; simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * buf, size_t len, char32_t* utf32_output) const noexcept final; simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf16le_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf16be_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final; 
+ simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final; simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; @@ -1506,6 +1579,9 @@ class implementation final : public simdutf::implementation { simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final; simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final; simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf32_to_latin1(const char32_t * buf, size_t len, char* latin1_output) const noexcept final; + simdutf_warn_unused result convert_utf32_to_latin1_with_errors(const char32_t * buf, size_t len, char* latin1_output) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf32_to_latin1(const char32_t * buf, size_t len, char* latin1_output) const noexcept final; simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; @@ -1530,6 +1606,12 @@ class implementation final : public simdutf::implementation { simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) const 
noexcept; simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept; simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept; + simdutf_warn_unused size_t latin1_length_from_utf8(const char * input, size_t length) const noexcept; + simdutf_warn_unused size_t latin1_length_from_utf16(size_t length) const noexcept; + simdutf_warn_unused size_t latin1_length_from_utf32(size_t length) const noexcept; + simdutf_warn_unused size_t utf32_length_from_latin1(size_t length) const noexcept; + simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) const noexcept; + simdutf_warn_unused size_t utf8_length_from_latin1(const char * input, size_t length) const noexcept; }; } // namespace haswell @@ -1537,7 +1619,6 @@ class implementation final : public simdutf::implementation { #endif // SIMDUTF_HASWELL_IMPLEMENTATION_H /* end file src/simdutf/haswell/implementation.h */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/haswell/intrinsics.h /* begin file src/simdutf/haswell/intrinsics.h */ #ifndef SIMDUTF_HASWELL_INTRINSICS_H #define SIMDUTF_HASWELL_INTRINSICS_H @@ -1606,7 +1687,6 @@ SIMDUTF_POP_DISABLE_WARNINGS // // The rest need to be inside the region // -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/haswell/begin.h /* begin file src/simdutf/haswell/begin.h */ // redefining SIMDUTF_IMPLEMENTATION to "haswell" // #define SIMDUTF_IMPLEMENTATION haswell @@ -1622,7 +1702,6 @@ SIMDUTF_DISABLE_GCC_WARNING(-Wmaybe-uninitialized) #endif // end of workaround /* end file src/simdutf/haswell/begin.h */ // Declarations -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/haswell/bitmanipulation.h /* begin file src/simdutf/haswell/bitmanipulation.h */ #ifndef SIMDUTF_HASWELL_BITMANIPULATION_H #define SIMDUTF_HASWELL_BITMANIPULATION_H @@ -1648,7 +1727,6 @@ 
simdutf_really_inline long long int count_ones(uint64_t input_num) { #endif // SIMDUTF_HASWELL_BITMANIPULATION_H /* end file src/simdutf/haswell/bitmanipulation.h */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/haswell/simd.h /* begin file src/simdutf/haswell/simd.h */ #ifndef SIMDUTF_HASWELL_SIMD_H #define SIMDUTF_HASWELL_SIMD_H @@ -2044,7 +2122,6 @@ namespace simd { } }; // struct simd8x64 -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/haswell/simd16-inl.h /* begin file src/simdutf/haswell/simd16-inl.h */ #ifdef __GNUC__ #if __GNUC__ < 8 @@ -2117,7 +2194,7 @@ struct base16_numeric: base16 { simdutf_really_inline simd16& operator-=(const simd16 other) { *this = *this - other; return *static_cast*>(this); } }; -// Signed words +// Signed code units template<> struct simd16 : base16_numeric { simdutf_really_inline simd16() : base16_numeric() {} @@ -2134,7 +2211,7 @@ struct simd16 : base16_numeric { simdutf_really_inline simd16 operator<(const simd16 other) const { return _mm256_cmpgt_epi16(other, *this); } }; -// Unsigned words +// Unsigned code units template<> struct simd16: base16_numeric { simdutf_really_inline simd16() : base16_numeric() {} @@ -2188,7 +2265,7 @@ struct simd16: base16_numeric { return _mm256_shuffle_epi8(*this, swap); } - // Pack with the unsigned saturation two uint16_t words into single uint8_t vector + // Pack with the unsigned saturation two uint16_t code units into single uint8_t vector static simdutf_really_inline simd8 pack(const simd16& v0, const simd16& v1) { // Note: the AVX2 variant of pack operates on 128-bit lanes, thus // we have to shuffle lanes in order to produce bytes in the @@ -2206,7 +2283,7 @@ struct simd16: base16_numeric { const __m256i t0 = _mm256_set_m128i(lo_1, lo_0); const __m256i t1 = _mm256_set_m128i(hi_1, hi_0); - // pack words in linear order from v0 and v1 + // pack code units in linear order from v0 and v1 return 
_mm256_packus_epi16(t0, t1); } }; @@ -2323,7 +2400,6 @@ struct simd16: base16_numeric { #endif // SIMDUTF_HASWELL_SIMD_H /* end file src/simdutf/haswell/simd.h */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/haswell/end.h /* begin file src/simdutf/haswell/end.h */ #if SIMDUTF_CAN_ALWAYS_RUN_HASWELL // nothing needed. @@ -2340,7 +2416,6 @@ SIMDUTF_POP_DISABLE_WARNINGS #endif // SIMDUTF_IMPLEMENTATION_HASWELL #endif // SIMDUTF_HASWELL_COMMON_H /* end file src/simdutf/haswell.h */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/westmere.h /* begin file src/simdutf/westmere.h */ #ifndef SIMDUTF_WESTMERE_H #define SIMDUTF_WESTMERE_H @@ -2381,7 +2456,6 @@ namespace westmere { // // These two need to be included outside SIMDUTF_TARGET_REGION // -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/westmere/implementation.h /* begin file src/simdutf/westmere/implementation.h */ #ifndef SIMDUTF_WESTMERE_IMPLEMENTATION_H #define SIMDUTF_WESTMERE_IMPLEMENTATION_H @@ -2409,6 +2483,13 @@ class implementation final : public simdutf::implementation { simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept final; simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) const noexcept final; simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept final; + simdutf_warn_unused size_t convert_latin1_to_utf8(const char * buf, size_t len, char* utf8_output) const noexcept final; + simdutf_warn_unused size_t convert_latin1_to_utf16le(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_latin1_to_utf16be(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_latin1_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept 
final; + simdutf_warn_unused size_t convert_utf8_to_latin1(const char * buf, size_t len, char* latin1_output) const noexcept final; + simdutf_warn_unused result convert_utf8_to_latin1_with_errors(const char * buf, size_t len, char* latin1_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf8_to_latin1(const char * buf, size_t len, char* latin1_output) const noexcept final; simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final; simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final; simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final; @@ -2418,6 +2499,12 @@ class implementation final : public simdutf::implementation { simdutf_warn_unused size_t convert_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final; simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * buf, size_t len, char32_t* utf32_output) const noexcept final; simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf16le_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf16be_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final; + simdutf_warn_unused size_t 
convert_valid_utf16be_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final; simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; @@ -2427,6 +2514,9 @@ class implementation final : public simdutf::implementation { simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final; simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final; simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf32_to_latin1(const char32_t * buf, size_t len, char* latin1_output) const noexcept final; + simdutf_warn_unused result convert_utf32_to_latin1_with_errors(const char32_t * buf, size_t len, char* latin1_output) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf32_to_latin1(const char32_t * buf, size_t len, char* latin1_output) const noexcept final; simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; @@ -2451,6 +2541,12 @@ class implementation final : public simdutf::implementation { simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept; simdutf_warn_unused 
size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept; simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept; + simdutf_warn_unused size_t latin1_length_from_utf8(const char * input, size_t length) const noexcept; + simdutf_warn_unused size_t latin1_length_from_utf16(size_t length) const noexcept; + simdutf_warn_unused size_t latin1_length_from_utf32(size_t length) const noexcept; + simdutf_warn_unused size_t utf32_length_from_latin1(size_t length) const noexcept; + simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) const noexcept; + simdutf_warn_unused size_t utf8_length_from_latin1(const char * input, size_t length) const noexcept; }; } // namespace westmere @@ -2458,7 +2554,6 @@ class implementation final : public simdutf::implementation { #endif // SIMDUTF_WESTMERE_IMPLEMENTATION_H /* end file src/simdutf/westmere/implementation.h */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/westmere/intrinsics.h /* begin file src/simdutf/westmere/intrinsics.h */ #ifndef SIMDUTF_WESTMERE_INTRINSICS_H #define SIMDUTF_WESTMERE_INTRINSICS_H @@ -2507,7 +2602,6 @@ SIMDUTF_POP_DISABLE_WARNINGS // // The rest need to be inside the region // -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/westmere/begin.h /* begin file src/simdutf/westmere/begin.h */ // redefining SIMDUTF_IMPLEMENTATION to "westmere" // #define SIMDUTF_IMPLEMENTATION westmere @@ -2520,7 +2614,6 @@ SIMDUTF_TARGET_WESTMERE /* end file src/simdutf/westmere/begin.h */ // Declarations -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/westmere/bitmanipulation.h /* begin file src/simdutf/westmere/bitmanipulation.h */ #ifndef SIMDUTF_WESTMERE_BITMANIPULATION_H #define SIMDUTF_WESTMERE_BITMANIPULATION_H @@ -2546,7 +2639,6 @@ simdutf_really_inline long long int count_ones(uint64_t input_num) { #endif // 
SIMDUTF_WESTMERE_BITMANIPULATION_H /* end file src/simdutf/westmere/bitmanipulation.h */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/westmere/simd.h /* begin file src/simdutf/westmere/simd.h */ #ifndef SIMDUTF_WESTMERE_SIMD_H #define SIMDUTF_WESTMERE_SIMD_H @@ -2894,10 +2986,10 @@ namespace simd { } simdutf_really_inline uint64_t to_bitmask() const { - uint64_t r0 = uint32_t(this->chunks[0].to_bitmask() ); - uint64_t r1 = this->chunks[1].to_bitmask() ; - uint64_t r2 = this->chunks[2].to_bitmask() ; - uint64_t r3 = this->chunks[3].to_bitmask() ; + uint64_t r0 = uint32_t(this->chunks[0].to_bitmask()); + uint64_t r1 = this->chunks[1].to_bitmask(); + uint64_t r2 = this->chunks[2].to_bitmask(); + uint64_t r3 = this->chunks[3].to_bitmask(); return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48); } @@ -2990,7 +3082,6 @@ namespace simd { } }; // struct simd8x64 -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/westmere/simd16-inl.h /* begin file src/simdutf/westmere/simd16-inl.h */ template struct simd16; @@ -3054,7 +3145,7 @@ struct base16_numeric: base16 { simdutf_really_inline simd16& operator-=(const simd16 other) { *this = *this - other; return *static_cast*>(this); } }; -// Signed words +// Signed code units template<> struct simd16 : base16_numeric { simdutf_really_inline simd16() : base16_numeric() {} @@ -3077,7 +3168,7 @@ struct simd16 : base16_numeric { simdutf_really_inline simd16 operator<(const simd16 other) const { return _mm_cmpgt_epi16(other, *this); } }; -// Unsigned words +// Unsigned code units template<> struct simd16: base16_numeric { simdutf_really_inline simd16() : base16_numeric() {} @@ -3140,7 +3231,7 @@ struct simd16: base16_numeric { return _mm_shuffle_epi8(*this, swap); } - // Pack with the unsigned saturation two uint16_t words into single uint8_t vector + // Pack with the unsigned saturation two uint16_t code units into single uint8_t vector static 
simdutf_really_inline simd8 pack(const simd16& v0, const simd16& v1) { return _mm_packus_epi16(v0, v1); } @@ -3183,10 +3274,10 @@ template } simdutf_really_inline uint64_t to_bitmask() const { - uint64_t r0 = uint32_t(this->chunks[0].to_bitmask() ); - uint64_t r1 = this->chunks[1].to_bitmask() ; - uint64_t r2 = this->chunks[2].to_bitmask() ; - uint64_t r3 = this->chunks[3].to_bitmask() ; + uint64_t r0 = uint32_t(this->chunks[0].to_bitmask()); + uint64_t r1 = this->chunks[1].to_bitmask(); + uint64_t r2 = this->chunks[2].to_bitmask(); + uint64_t r3 = this->chunks[3].to_bitmask(); return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48); } @@ -3267,7 +3358,6 @@ template #endif // SIMDUTF_WESTMERE_SIMD_INPUT_H /* end file src/simdutf/westmere/simd.h */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/westmere/end.h /* begin file src/simdutf/westmere/end.h */ #if SIMDUTF_CAN_ALWAYS_RUN_WESTMERE // nothing needed. @@ -3280,7 +3370,6 @@ SIMDUTF_UNTARGET_REGION #endif // SIMDUTF_IMPLEMENTATION_WESTMERE #endif // SIMDUTF_WESTMERE_COMMON_H /* end file src/simdutf/westmere.h */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/ppc64.h /* begin file src/simdutf/ppc64.h */ #ifndef SIMDUTF_PPC64_H #define SIMDUTF_PPC64_H @@ -3307,7 +3396,6 @@ namespace ppc64 { } // namespace ppc64 } // namespace simdutf -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/ppc64/implementation.h /* begin file src/simdutf/ppc64/implementation.h */ #ifndef SIMDUTF_PPC64_IMPLEMENTATION_H #define SIMDUTF_PPC64_IMPLEMENTATION_H @@ -3386,14 +3474,12 @@ class implementation final : public simdutf::implementation { #endif // SIMDUTF_PPC64_IMPLEMENTATION_H /* end file src/simdutf/ppc64/implementation.h */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/ppc64/begin.h /* begin file src/simdutf/ppc64/begin.h */ // redefining SIMDUTF_IMPLEMENTATION to "ppc64" // 
#define SIMDUTF_IMPLEMENTATION ppc64 /* end file src/simdutf/ppc64/begin.h */ // Declarations -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/ppc64/intrinsics.h /* begin file src/simdutf/ppc64/intrinsics.h */ #ifndef SIMDUTF_PPC64_INTRINSICS_H #define SIMDUTF_PPC64_INTRINSICS_H @@ -3414,7 +3500,6 @@ class implementation final : public simdutf::implementation { #endif // SIMDUTF_PPC64_INTRINSICS_H /* end file src/simdutf/ppc64/intrinsics.h */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/ppc64/bitmanipulation.h /* begin file src/simdutf/ppc64/bitmanipulation.h */ #ifndef SIMDUTF_PPC64_BITMANIPULATION_H #define SIMDUTF_PPC64_BITMANIPULATION_H @@ -3440,7 +3525,6 @@ simdutf_really_inline int count_ones(uint64_t input_num) { #endif // SIMDUTF_PPC64_BITMANIPULATION_H /* end file src/simdutf/ppc64/bitmanipulation.h */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/ppc64/simd.h /* begin file src/simdutf/ppc64/simd.h */ #ifndef SIMDUTF_PPC64_SIMD_H #define SIMDUTF_PPC64_SIMD_H @@ -3932,7 +4016,6 @@ template struct simd8x64 { #endif // SIMDUTF_PPC64_SIMD_INPUT_H /* end file src/simdutf/ppc64/simd.h */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/ppc64/end.h /* begin file src/simdutf/ppc64/end.h */ /* end file src/simdutf/ppc64/end.h */ @@ -3940,7 +4023,6 @@ template struct simd8x64 { #endif // SIMDUTF_PPC64_H /* end file src/simdutf/ppc64.h */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/fallback.h /* begin file src/simdutf/fallback.h */ #ifndef SIMDUTF_FALLBACK_H #define SIMDUTF_FALLBACK_H @@ -3969,7 +4051,6 @@ namespace fallback { } // namespace fallback } // namespace simdutf -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/fallback/implementation.h /* begin file src/simdutf/fallback/implementation.h */ #ifndef 
SIMDUTF_FALLBACK_IMPLEMENTATION_H #define SIMDUTF_FALLBACK_IMPLEMENTATION_H @@ -4000,6 +4081,13 @@ class implementation final : public simdutf::implementation { simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept final; simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) const noexcept final; simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept final; + simdutf_warn_unused size_t convert_latin1_to_utf8(const char * buf, size_t len, char* utf8_output) const noexcept final; + simdutf_warn_unused size_t convert_latin1_to_utf16le(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_latin1_to_utf16be(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_latin1_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final; + simdutf_warn_unused size_t convert_utf8_to_latin1(const char * buf, size_t len, char* latin1_output) const noexcept final; + simdutf_warn_unused result convert_utf8_to_latin1_with_errors(const char * buf, size_t len, char* latin1_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf8_to_latin1(const char * buf, size_t len, char* latin1_output) const noexcept final; simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final; simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final; simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final; @@ -4009,6 +4097,12 @@ class implementation final : public simdutf::implementation { simdutf_warn_unused size_t convert_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final; 
simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * buf, size_t len, char32_t* utf32_output) const noexcept final; simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf16le_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf16be_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final; simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; @@ -4018,6 +4112,9 @@ class implementation final : public simdutf::implementation { simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final; simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final; simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused size_t 
convert_utf32_to_latin1(const char32_t * buf, size_t len, char* latin1_output) const noexcept final; + simdutf_warn_unused result convert_utf32_to_latin1_with_errors(const char32_t * buf, size_t len, char* latin1_output) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf32_to_latin1(const char32_t * buf, size_t len, char* latin1_output) const noexcept final; simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; @@ -4042,7 +4139,12 @@ class implementation final : public simdutf::implementation { simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept; simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept; simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept; -}; + simdutf_warn_unused size_t latin1_length_from_utf8(const char * input, size_t length) const noexcept; + simdutf_warn_unused size_t latin1_length_from_utf16(size_t length) const noexcept; + simdutf_warn_unused size_t latin1_length_from_utf32(size_t length) const noexcept; + simdutf_warn_unused size_t utf32_length_from_latin1(size_t length) const noexcept; + simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) const noexcept; + simdutf_warn_unused size_t utf8_length_from_latin1(const char * input, size_t length) const noexcept;}; } // namespace fallback } // namespace simdutf @@ -4050,14 +4152,12 @@ class implementation final : public simdutf::implementation { #endif // SIMDUTF_FALLBACK_IMPLEMENTATION_H /* end file src/simdutf/fallback/implementation.h */ -// dofile: invoked with 
prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/fallback/begin.h /* begin file src/simdutf/fallback/begin.h */ // redefining SIMDUTF_IMPLEMENTATION to "fallback" // #define SIMDUTF_IMPLEMENTATION fallback /* end file src/simdutf/fallback/begin.h */ // Declarations -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/fallback/bitmanipulation.h /* begin file src/simdutf/fallback/bitmanipulation.h */ #ifndef SIMDUTF_FALLBACK_BITMANIPULATION_H #define SIMDUTF_FALLBACK_BITMANIPULATION_H @@ -4068,23 +4168,6 @@ namespace simdutf { namespace fallback { namespace { -#if defined(_MSC_VER) && !defined(_M_ARM64) && !defined(_M_X64) -static inline unsigned char _BitScanForward64(unsigned long* ret, uint64_t x) { - unsigned long x0 = (unsigned long)x, top, bottom; - _BitScanForward(&top, (unsigned long)(x >> 32)); - _BitScanForward(&bottom, x0); - *ret = x0 ? bottom : 32 + top; - return x != 0; -} -static unsigned char _BitScanReverse64(unsigned long* ret, uint64_t x) { - unsigned long x1 = (unsigned long)(x >> 32), top, bottom; - _BitScanReverse(&top, x1); - _BitScanReverse(&bottom, (unsigned long)x); - *ret = x1 ? 
top + 32 : bottom; - return x != 0; -} -#endif - } // unnamed namespace } // namespace fallback } // namespace simdutf @@ -4092,7 +4175,6 @@ static unsigned char _BitScanReverse64(unsigned long* ret, uint64_t x) { #endif // SIMDUTF_FALLBACK_BITMANIPULATION_H /* end file src/simdutf/fallback/bitmanipulation.h */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/fallback/end.h /* begin file src/simdutf/fallback/end.h */ /* end file src/simdutf/fallback/end.h */ @@ -4138,22 +4220,40 @@ namespace internal { #if SIMDUTF_IMPLEMENTATION_ICELAKE -const icelake::implementation icelake_singleton{}; +static const icelake::implementation* get_icelake_singleton() { + static const icelake::implementation icelake_singleton{}; + return &icelake_singleton; +} #endif #if SIMDUTF_IMPLEMENTATION_HASWELL -const haswell::implementation haswell_singleton{}; +static const haswell::implementation* get_haswell_singleton() { + static const haswell::implementation haswell_singleton{}; + return &haswell_singleton; +} #endif #if SIMDUTF_IMPLEMENTATION_WESTMERE -const westmere::implementation westmere_singleton{}; +static const westmere::implementation* get_westmere_singleton() { + static const westmere::implementation westmere_singleton{}; + return &westmere_singleton; +} #endif #if SIMDUTF_IMPLEMENTATION_ARM64 -const arm64::implementation arm64_singleton{}; +static const arm64::implementation* get_arm64_singleton() { + static const arm64::implementation arm64_singleton{}; + return &arm64_singleton; +} #endif #if SIMDUTF_IMPLEMENTATION_PPC64 -const ppc64::implementation ppc64_singleton{}; +static const ppc64::implementation* get_ppc64_singleton() { + static const ppc64::implementation ppc64_singleton{}; + return &ppc64_singleton; +} #endif #if SIMDUTF_IMPLEMENTATION_FALLBACK -const fallback::implementation fallback_singleton{}; +static const fallback::implementation* get_fallback_singleton() { + static const fallback::implementation fallback_singleton{}; + 
return &fallback_singleton; +} #endif /** @@ -4209,6 +4309,34 @@ class detect_best_supported_implementation_on_first_use final : public implement return set_best()->validate_utf32_with_errors(buf, len); } + simdutf_warn_unused size_t convert_latin1_to_utf8(const char * buf, size_t len, char* utf8_output) const noexcept final override { + return set_best()->convert_latin1_to_utf8(buf, len,utf8_output); + } + + simdutf_warn_unused size_t convert_latin1_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final override { + return set_best()->convert_latin1_to_utf16le(buf, len, utf16_output); + } + + simdutf_warn_unused size_t convert_latin1_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final override { + return set_best()->convert_latin1_to_utf16be(buf, len, utf16_output); + } + + simdutf_warn_unused size_t convert_latin1_to_utf32(const char * buf, size_t len, char32_t * latin1_output) const noexcept final override { + return set_best()->convert_latin1_to_utf32(buf, len,latin1_output); + } + + simdutf_warn_unused size_t convert_utf8_to_latin1(const char * buf, size_t len, char* latin1_output) const noexcept final override { + return set_best()->convert_utf8_to_latin1(buf, len,latin1_output); + } + + simdutf_warn_unused result convert_utf8_to_latin1_with_errors(const char* buf, size_t len, char* latin1_output) const noexcept final override { + return set_best()->convert_utf8_to_latin1_with_errors(buf, len, latin1_output); + } + + simdutf_warn_unused size_t convert_valid_utf8_to_latin1(const char * buf, size_t len, char* latin1_output) const noexcept final override { + return set_best()->convert_valid_utf8_to_latin1(buf, len,latin1_output); + } + simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final override { return set_best()->convert_utf8_to_utf16le(buf, len, utf16_output); } @@ -4245,6 +4373,30 @@ class 
detect_best_supported_implementation_on_first_use final : public implement return set_best()->convert_valid_utf8_to_utf32(buf, len, utf32_output); } + simdutf_warn_unused size_t convert_utf16le_to_latin1(const char16_t * buf, size_t len, char* latin1_output) const noexcept final override { + return set_best()->convert_utf16le_to_latin1(buf, len, latin1_output); + } + + simdutf_warn_unused size_t convert_utf16be_to_latin1(const char16_t * buf, size_t len, char* latin1_output) const noexcept final override { + return set_best()->convert_utf16be_to_latin1(buf, len, latin1_output); + } + + simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(const char16_t * buf, size_t len, char* latin1_output) const noexcept final override { + return set_best()->convert_utf16le_to_latin1_with_errors(buf, len, latin1_output); + } + + simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(const char16_t * buf, size_t len, char* latin1_output) const noexcept final override { + return set_best()->convert_utf16be_to_latin1_with_errors(buf, len, latin1_output); + } + + simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(const char16_t * buf, size_t len, char* latin1_output) const noexcept final override { + return set_best()->convert_valid_utf16le_to_latin1(buf, len, latin1_output); + } + + simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(const char16_t * buf, size_t len, char* latin1_output) const noexcept final override { + return set_best()->convert_valid_utf16be_to_latin1(buf, len, latin1_output); + } + simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_output) const noexcept final override { return set_best()->convert_utf16le_to_utf8(buf, len, utf8_output); } @@ -4269,6 +4421,18 @@ class detect_best_supported_implementation_on_first_use final : public implement return set_best()->convert_valid_utf16be_to_utf8(buf, len, utf8_output); } + simdutf_warn_unused size_t convert_utf32_to_latin1(const char32_t 
* buf, size_t len, char* latin1_output) const noexcept final override { + return set_best()->convert_utf32_to_latin1(buf, len,latin1_output); + } + + simdutf_warn_unused result convert_utf32_to_latin1_with_errors(const char32_t * buf, size_t len, char* latin1_output) const noexcept final override { + return set_best()->convert_utf32_to_latin1_with_errors(buf, len,latin1_output); + } + + simdutf_warn_unused size_t convert_valid_utf32_to_latin1(const char32_t * buf, size_t len, char* latin1_output) const noexcept final override { + return set_best()->convert_utf32_to_latin1(buf, len,latin1_output); + } + simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_output) const noexcept final override { return set_best()->convert_utf32_to_utf8(buf, len, utf8_output); } @@ -4345,6 +4509,22 @@ class detect_best_supported_implementation_on_first_use final : public implement return set_best()->count_utf8(buf, len); } + simdutf_warn_unused size_t latin1_length_from_utf8(const char * buf, size_t len) const noexcept override { + return set_best()->latin1_length_from_utf8(buf, len); + } + + simdutf_warn_unused size_t latin1_length_from_utf16(size_t len) const noexcept override { + return set_best()->latin1_length_from_utf16(len); + } + + simdutf_warn_unused size_t latin1_length_from_utf32(size_t len) const noexcept override { + return set_best()->latin1_length_from_utf32(len); + } + + simdutf_warn_unused size_t utf8_length_from_latin1(const char * buf, size_t len) const noexcept override { + return set_best()->utf8_length_from_latin1(buf, len); + } + simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * buf, size_t len) const noexcept override { return set_best()->utf8_length_from_utf16le(buf, len); } @@ -4353,6 +4533,14 @@ class detect_best_supported_implementation_on_first_use final : public implement return set_best()->utf8_length_from_utf16be(buf, len); } + simdutf_warn_unused size_t utf16_length_from_latin1(size_t len) 
const noexcept override { + return set_best()->utf16_length_from_latin1(len); + } + + simdutf_warn_unused size_t utf32_length_from_latin1(size_t len) const noexcept override { + return set_best()->utf32_length_from_latin1(len); + } + simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * buf, size_t len) const noexcept override { return set_best()->utf32_length_from_utf16le(buf, len); } @@ -4383,27 +4571,29 @@ class detect_best_supported_implementation_on_first_use final : public implement const implementation *set_best() const noexcept; }; - -const std::initializer_list available_implementation_pointers { +static const std::initializer_list& get_available_implementation_pointers() { + static const std::initializer_list available_implementation_pointers { #if SIMDUTF_IMPLEMENTATION_ICELAKE - &icelake_singleton, + get_icelake_singleton(), #endif #if SIMDUTF_IMPLEMENTATION_HASWELL - &haswell_singleton, + get_haswell_singleton(), #endif #if SIMDUTF_IMPLEMENTATION_WESTMERE - &westmere_singleton, + get_westmere_singleton(), #endif #if SIMDUTF_IMPLEMENTATION_ARM64 - &arm64_singleton, + get_arm64_singleton(), #endif #if SIMDUTF_IMPLEMENTATION_PPC64 - &ppc64_singleton, + get_ppc64_singleton(), #endif #if SIMDUTF_IMPLEMENTATION_FALLBACK - &fallback_singleton, + get_fallback_singleton(), #endif -}; // available_implementation_pointers + }; // available_implementation_pointers + return available_implementation_pointers; +} // So we can return UNSUPPORTED_ARCHITECTURE from the parser when there is no support class unsupported_implementation final : public implementation { @@ -4459,6 +4649,34 @@ class unsupported_implementation final : public implementation { return result(error_code::OTHER, 0); } + simdutf_warn_unused size_t convert_latin1_to_utf8(const char*, size_t, char*) const noexcept final override { + return 0; + } + + simdutf_warn_unused size_t convert_latin1_to_utf16le(const char*, size_t, char16_t*) const noexcept final override { + return 0; + } + + 
simdutf_warn_unused size_t convert_latin1_to_utf16be(const char*, size_t, char16_t*) const noexcept final override { + return 0; + } + + simdutf_warn_unused size_t convert_latin1_to_utf32(const char*, size_t, char32_t*) const noexcept final override { + return 0; + } + + simdutf_warn_unused size_t convert_utf8_to_latin1(const char*, size_t, char*) const noexcept final override { + return 0; + } + + simdutf_warn_unused result convert_utf8_to_latin1_with_errors(const char*, size_t, char*) const noexcept final override { + return result(error_code::OTHER, 0); + } + + simdutf_warn_unused size_t convert_valid_utf8_to_latin1(const char*, size_t, char*) const noexcept final override { + return 0; + } + simdutf_warn_unused size_t convert_utf8_to_utf16le(const char*, size_t, char16_t*) const noexcept final override { return 0; } @@ -4495,6 +4713,30 @@ class unsupported_implementation final : public implementation { return 0; } + simdutf_warn_unused size_t convert_utf16le_to_latin1(const char16_t*, size_t, char*) const noexcept final override { + return 0; + } + + simdutf_warn_unused size_t convert_utf16be_to_latin1(const char16_t*, size_t, char*) const noexcept final override { + return 0; + } + + simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(const char16_t*, size_t, char*) const noexcept final override { + return result(error_code::OTHER, 0); + } + + simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(const char16_t*, size_t, char*) const noexcept final override { + return result(error_code::OTHER, 0); + } + + simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(const char16_t*, size_t, char*) const noexcept final override { + return 0; + } + + simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(const char16_t*, size_t, char*) const noexcept final override { + return 0; + } + simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t*, size_t, char*) const noexcept final override { return 0; } @@ -4519,6 +4761,18 @@ class 
unsupported_implementation final : public implementation { return 0; } + simdutf_warn_unused size_t convert_utf32_to_latin1(const char32_t *, size_t, char* ) const noexcept final override { + return 0; + } + + simdutf_warn_unused result convert_utf32_to_latin1_with_errors(const char32_t *, size_t, char* ) const noexcept final override { + return result(error_code::OTHER, 0); + } + + simdutf_warn_unused size_t convert_valid_utf32_to_latin1(const char32_t *, size_t, char* ) const noexcept final override { + return 0; + } + simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t*, size_t, char*) const noexcept final override { return 0; } @@ -4595,6 +4849,21 @@ class unsupported_implementation final : public implementation { return 0; } + simdutf_warn_unused size_t latin1_length_from_utf8(const char *, size_t) const noexcept override { + return 0; + } + + simdutf_warn_unused size_t latin1_length_from_utf16(size_t) const noexcept override { + return 0; + } + + simdutf_warn_unused size_t latin1_length_from_utf32(size_t) const noexcept override { + return 0; + } + simdutf_warn_unused size_t utf8_length_from_latin1(const char *, size_t) const noexcept override { + return 0; + } + simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t *, size_t) const noexcept override { return 0; } @@ -4611,10 +4880,16 @@ class unsupported_implementation final : public implementation { return 0; } - simdutf_warn_unused size_t utf16_length_from_utf8(const char *, size_t) const noexcept override { + simdutf_warn_unused size_t utf32_length_from_latin1(size_t) const noexcept override { return 0; } + simdutf_warn_unused size_t utf16_length_from_utf8(const char *, size_t) const noexcept override { + return 0; + } + simdutf_warn_unused size_t utf16_length_from_latin1(size_t) const noexcept override { + return 0; + } simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t *, size_t) const noexcept override { return 0; } @@ -4633,18 +4908,18 @@ class 
unsupported_implementation final : public implementation { const unsupported_implementation unsupported_singleton{}; size_t available_implementation_list::size() const noexcept { - return internal::available_implementation_pointers.size(); + return internal::get_available_implementation_pointers().size(); } const implementation * const *available_implementation_list::begin() const noexcept { - return internal::available_implementation_pointers.begin(); + return internal::get_available_implementation_pointers().begin(); } const implementation * const *available_implementation_list::end() const noexcept { - return internal::available_implementation_pointers.end(); + return internal::get_available_implementation_pointers().end(); } const implementation *available_implementation_list::detect_best_supported() const noexcept { // They are prelisted in priority order, so we just go down the list uint32_t supported_instruction_sets = internal::detect_supported_architectures(); - for (const implementation *impl : internal::available_implementation_pointers) { + for (const implementation *impl : internal::get_available_implementation_pointers()) { uint32_t required_instruction_sets = impl->required_instruction_sets(); if ((supported_instruction_sets & required_instruction_sets) == required_instruction_sets) { return impl; } } @@ -4709,6 +4984,27 @@ simdutf_warn_unused size_t convert_utf8_to_utf16(const char * input, size_t leng return convert_utf8_to_utf16le(input, length, utf16_output); #endif } +simdutf_warn_unused size_t convert_latin1_to_utf8(const char * buf, size_t len, char* utf8_output) noexcept { + return get_active_implementation()->convert_latin1_to_utf8(buf, len,utf8_output); +} +simdutf_warn_unused size_t convert_latin1_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) noexcept { + return get_active_implementation()->convert_latin1_to_utf16le(buf, len, utf16_output); +} +simdutf_warn_unused size_t convert_latin1_to_utf16be(const char * buf, size_t 
len, char16_t* utf16_output) noexcept{ + return get_active_implementation()->convert_latin1_to_utf16be(buf, len, utf16_output); +} +simdutf_warn_unused size_t convert_latin1_to_utf32(const char * buf, size_t len, char32_t * latin1_output) noexcept { + return get_active_implementation()->convert_latin1_to_utf32(buf, len,latin1_output); +} +simdutf_warn_unused size_t convert_utf8_to_latin1(const char * buf, size_t len, char* latin1_output) noexcept { + return get_active_implementation()->convert_utf8_to_latin1(buf, len,latin1_output); +} +simdutf_warn_unused result convert_utf8_to_latin1_with_errors(const char* buf, size_t len, char* latin1_output) noexcept { + return get_active_implementation()->convert_utf8_to_latin1_with_errors(buf, len, latin1_output); +} +simdutf_warn_unused size_t convert_valid_utf8_to_latin1(const char * buf, size_t len, char* latin1_output) noexcept { + return get_active_implementation()->convert_valid_utf8_to_latin1(buf, len,latin1_output); +} simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * input, size_t length, char16_t* utf16_output) noexcept { return get_active_implementation()->convert_utf8_to_utf16le(input, length, utf16_output); } @@ -4789,6 +5085,38 @@ simdutf_warn_unused size_t convert_utf16_to_utf8(const char16_t * buf, size_t le return convert_utf16le_to_utf8(buf, len, utf8_buffer); #endif } +simdutf_warn_unused size_t convert_utf16_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) noexcept { + #if SIMDUTF_IS_BIG_ENDIAN + return convert_utf16be_to_latin1(buf, len, latin1_buffer); + #else + return convert_utf16le_to_latin1(buf, len, latin1_buffer); + #endif +} +simdutf_warn_unused size_t convert_latin1_to_utf16(const char * buf, size_t len, char16_t* utf16_output) noexcept { + #if SIMDUTF_IS_BIG_ENDIAN + return convert_latin1_to_utf16be(buf, len, utf16_output); + #else + return convert_latin1_to_utf16le(buf, len, utf16_output); + #endif +} +simdutf_warn_unused size_t convert_utf16be_to_latin1(const 
char16_t * buf, size_t len, char* latin1_buffer) noexcept { + return get_active_implementation()->convert_utf16be_to_latin1(buf, len, latin1_buffer); +} +simdutf_warn_unused size_t convert_utf16le_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) noexcept { + return get_active_implementation()->convert_utf16le_to_latin1(buf, len, latin1_buffer); +} +simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) noexcept { + return get_active_implementation()->convert_valid_utf16be_to_latin1(buf, len, latin1_buffer); +} +simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) noexcept { + return get_active_implementation()->convert_valid_utf16le_to_latin1(buf, len, latin1_buffer); +} +simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(const char16_t * buf, size_t len, char* latin1_buffer) noexcept { + return get_active_implementation()->convert_utf16le_to_latin1_with_errors(buf, len, latin1_buffer); +} +simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(const char16_t * buf, size_t len, char* latin1_buffer) noexcept { + return get_active_implementation()->convert_utf16be_to_latin1_with_errors(buf, len, latin1_buffer); +} simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) noexcept { return get_active_implementation()->convert_utf16le_to_utf8(buf, len, utf8_buffer); } @@ -4802,6 +5130,13 @@ simdutf_warn_unused result convert_utf16_to_utf8_with_errors(const char16_t * bu return convert_utf16le_to_utf8_with_errors(buf, len, utf8_buffer); #endif } +simdutf_warn_unused result convert_utf16_to_latin1_with_errors(const char16_t * buf, size_t len, char* latin1_buffer) noexcept { + #if SIMDUTF_IS_BIG_ENDIAN + return convert_utf16be_to_latin1_with_errors(buf, len, latin1_buffer); + #else + return convert_utf16le_to_latin1_with_errors(buf, len, latin1_buffer); + #endif +} 
simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) noexcept { return get_active_implementation()->convert_utf16le_to_utf8_with_errors(buf, len, utf8_buffer); } @@ -4815,6 +5150,13 @@ simdutf_warn_unused size_t convert_valid_utf16_to_utf8(const char16_t * buf, siz return convert_valid_utf16le_to_utf8(buf, len, utf8_buffer); #endif } +simdutf_warn_unused size_t convert_valid_utf16_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) noexcept { + #if SIMDUTF_IS_BIG_ENDIAN + return convert_valid_utf16be_to_latin1(buf, len, latin1_buffer); + #else + return convert_valid_utf16le_to_latin1(buf, len, latin1_buffer); + #endif +} simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) noexcept { return get_active_implementation()->convert_valid_utf16le_to_utf8(buf, len, utf8_buffer); } @@ -4837,6 +5179,9 @@ simdutf_warn_unused size_t convert_utf32_to_utf16(const char32_t * buf, size_t l return convert_utf32_to_utf16le(buf, len, utf16_buffer); #endif } +simdutf_warn_unused size_t convert_utf32_to_latin1(const char32_t * input, size_t length, char* latin1_output) noexcept { + return get_active_implementation()->convert_utf32_to_latin1(input, length, latin1_output); +} simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept { return get_active_implementation()->convert_utf32_to_utf16le(buf, len, utf16_buffer); } @@ -4927,6 +5272,18 @@ simdutf_warn_unused size_t count_utf16be(const char16_t * input, size_t length) simdutf_warn_unused size_t count_utf8(const char * input, size_t length) noexcept { return get_active_implementation()->count_utf8(input, length); } +simdutf_warn_unused size_t latin1_length_from_utf8(const char * buf, size_t len) noexcept { + return get_active_implementation()->latin1_length_from_utf8(buf, len); +} +simdutf_warn_unused size_t latin1_length_from_utf16(size_t 
len) noexcept { + return get_active_implementation()->latin1_length_from_utf16(len); +} +simdutf_warn_unused size_t latin1_length_from_utf32(size_t len) noexcept { + return get_active_implementation()->latin1_length_from_utf32(len); +} +simdutf_warn_unused size_t utf8_length_from_latin1(const char * buf, size_t len) noexcept { + return get_active_implementation()->utf8_length_from_latin1(buf, len); +} simdutf_warn_unused size_t utf8_length_from_utf16(const char16_t * input, size_t length) noexcept { #if SIMDUTF_IS_BIG_ENDIAN return utf8_length_from_utf16be(input, length); @@ -4956,6 +5313,9 @@ simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, siz simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) noexcept { return get_active_implementation()->utf16_length_from_utf8(input, length); } +simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) noexcept { + return get_active_implementation()->utf16_length_from_latin1(length); +} simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) noexcept { return get_active_implementation()->utf8_length_from_utf32(input, length); } @@ -4971,7 +5331,6 @@ simdutf_warn_unused simdutf::encoding_type autodetect_encoding(const char * buf, simdutf_warn_unused int detect_encodings(const char * buf, size_t length) noexcept { return get_active_implementation()->detect_encodings(buf, length); } - const implementation * builtin_implementation() { static const implementation * builtin_impl = get_available_implementations()[SIMDUTF_STRINGIFY(SIMDUTF_BUILTIN_IMPLEMENTATION)]; return builtin_impl; @@ -4981,7 +5340,6 @@ const implementation * builtin_implementation() { } // namespace simdutf /* end file src/implementation.cpp */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=encoding_types.cpp /* begin file src/encoding_types.cpp */ namespace simdutf { @@ -5043,7 +5401,6 @@ encoding_type check_bom(const char* 
byte, size_t length) { } } /* end file src/encoding_types.cpp */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=error.cpp /* begin file src/error.cpp */ namespace simdutf { @@ -5055,7 +5412,6 @@ namespace simdutf { /* end file src/error.cpp */ // The large tables should be included once and they // should not depend on a kernel. -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=tables/utf8_to_utf16_tables.h /* begin file src/tables/utf8_to_utf16_tables.h */ #ifndef SIMDUTF_UTF8_TO_UTF16_TABLES_H #define SIMDUTF_UTF8_TO_UTF16_TABLES_H @@ -9394,7 +9750,6 @@ const uint8_t utf8bigindex[4096][2] = #endif // SIMDUTF_UTF8_TO_UTF16_TABLES_H /* end file src/tables/utf8_to_utf16_tables.h */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=tables/utf16_to_utf8_tables.h /* begin file src/tables/utf16_to_utf8_tables.h */ // file generated by scripts/sse_convert_utf16_to_utf8.py #ifndef SIMDUTF_UTF16_TO_UTF8_TABLES_H @@ -9935,7 +10290,6 @@ namespace utf16_to_utf8 { // End of tables. // The scalar routines should be included once. 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/ascii.h /* begin file src/scalar/ascii.h */ #ifndef SIMDUTF_ASCII_H #define SIMDUTF_ASCII_H @@ -9950,7 +10304,7 @@ inline simdutf_warn_unused bool validate(const char *buf, size_t len) noexcept { const uint8_t *data = reinterpret_cast(buf); uint64_t pos = 0; // process in blocks of 16 bytes when possible - for (;pos + 16 < len; pos += 16) { + for (;pos + 16 <= len; pos += 16) { uint64_t v1; std::memcpy(&v1, data + pos, sizeof(uint64_t)); uint64_t v2; @@ -9970,7 +10324,7 @@ inline simdutf_warn_unused result validate_with_errors(const char *buf, size_t l const uint8_t *data = reinterpret_cast(buf); size_t pos = 0; // process in blocks of 16 bytes when possible - for (;pos + 16 < len; pos += 16) { + for (;pos + 16 <= len; pos += 16) { uint64_t v1; std::memcpy(&v1, data + pos, sizeof(uint64_t)); uint64_t v2; @@ -9996,7 +10350,6 @@ inline simdutf_warn_unused result validate_with_errors(const char *buf, size_t l #endif /* end file src/scalar/ascii.h */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/utf8.h /* begin file src/scalar/utf8.h */ #ifndef SIMDUTF_UTF8_H #define SIMDUTF_UTF8_H @@ -10013,9 +10366,9 @@ inline simdutf_warn_unused bool validate(const char *buf, size_t len) noexcept { uint64_t pos = 0; uint32_t code_point = 0; while (pos < len) { - // check of the next 8 bytes are ascii. + // check of the next 16 bytes are ascii. uint64_t next_pos = pos + 16; - if (next_pos <= len) { // if it is safe to read 8 more bytes, check that they are ascii + if (next_pos <= len) { // if it is safe to read 16 more bytes, check that they are ascii uint64_t v1; std::memcpy(&v1, data + pos, sizeof(uint64_t)); uint64_t v2; @@ -10079,9 +10432,9 @@ inline simdutf_warn_unused result validate_with_errors(const char *buf, size_t l size_t pos = 0; uint32_t code_point = 0; while (pos < len) { - // check of the next 8 bytes are ascii. 
+ // check of the next 16 bytes are ascii. size_t next_pos = pos + 16; - if (next_pos <= len) { // if it is safe to read 8 more bytes, check that they are ascii + if (next_pos <= len) { // if it is safe to read 16 more bytes, check that they are ascii uint64_t v1; std::memcpy(&v1, data + pos, sizeof(uint64_t)); uint64_t v2; @@ -10139,10 +10492,12 @@ inline simdutf_warn_unused result validate_with_errors(const char *buf, size_t l return result(error_code::SUCCESS, len); } -// Finds the previous leading byte and validates with errors from there +// Finds the previous leading byte starting backward from buf and validates with errors from there // Used to pinpoint the location of an error when an invalid chunk is detected +// We assume that the stream starts with a leading byte, and to check that it is the case, we +// ask that you pass a pointer to the start of the stream (start). inline simdutf_warn_unused result rewind_and_validate_with_errors(const char *start, const char *buf, size_t len) noexcept { - // First check that we start with a leading byte + // First check that we start with a leading byte if ((*start & 0b11000000) == 0b10000000) { return result(error_code::TOO_LONG, 0); } @@ -10183,6 +10538,16 @@ inline size_t utf16_length_from_utf8(const char* buf, size_t len) { return counter; } +inline size_t latin1_length_from_utf8(const char *buf, size_t len) { + const uint8_t * c = reinterpret_cast(buf); + + size_t answer = len; + for(size_t i = 0; i < len; i++) { + if((c[i] & 0b11100000) == 0b11000000) { answer--; } // if we have a two-byte UTF8 character + } + return answer; +} + } // utf8 namespace } // unnamed namespace } // namespace scalar @@ -10190,7 +10555,6 @@ inline size_t utf16_length_from_utf8(const char* buf, size_t len) { #endif /* end file src/scalar/utf8.h */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/utf16.h /* begin file src/scalar/utf16.h */ #ifndef SIMDUTF_UTF16_H #define SIMDUTF_UTF16_H @@ -10265,14 
+10629,9 @@ inline size_t utf8_length_from_utf16(const char16_t* buf, size_t len) { size_t counter{0}; for(size_t i = 0; i < len; i++) { uint16_t word = !match_system(big_endian) ? swap_bytes(p[i]) : p[i]; - /** ASCII **/ - if(word <= 0x7F) { counter++; } - /** two-byte **/ - else if (word <= 0x7FF) { counter += 2; } - /** three-byte **/ - else if((word <= 0xD7FF) || (word >= 0xE000)) { counter += 3; } - /** surrogates -- 4 bytes **/ - else { counter += 2; } + counter++; // ASCII + counter += static_cast(word > 0x7F); // non-ASCII is at least 2 bytes, surrogates are 2*2 == 4 bytes + counter += static_cast((word > 0x7FF && word <= 0xD7FF) || (word >= 0xE000)); // three-byte } return counter; } @@ -10289,6 +10648,11 @@ inline size_t utf32_length_from_utf16(const char16_t* buf, size_t len) { return counter; } + +inline size_t latin1_length_from_utf16(size_t len) { + return len; +} + simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t size, char16_t* out) { const uint16_t * input = reinterpret_cast(in); uint16_t * output = reinterpret_cast(out); @@ -10304,7 +10668,6 @@ simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t si #endif /* end file src/scalar/utf16.h */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/utf32.h /* begin file src/scalar/utf32.h */ #ifndef SIMDUTF_UTF32_H #define SIMDUTF_UTF32_H @@ -10346,14 +10709,11 @@ inline size_t utf8_length_from_utf32(const char32_t* buf, size_t len) { const uint32_t * p = reinterpret_cast(buf); size_t counter{0}; for(size_t i = 0; i < len; i++) { - /** ASCII **/ - if(p[i] <= 0x7F) { counter++; } - /** two-byte **/ - else if(p[i] <= 0x7FF) { counter += 2; } - /** three-byte **/ - else if(p[i] <= 0xFFFF) { counter += 3; } - /** four-bytes **/ - else { counter += 4; } + // credit: @ttsugriy for the vectorizable approach + counter++; // ASCII + counter += static_cast(p[i] > 0x7F); // two-byte + counter += static_cast(p[i] > 0x7FF); // 
three-byte + counter += static_cast(p[i] > 0xFFFF); // four-bytes } return counter; } @@ -10363,14 +10723,19 @@ inline size_t utf16_length_from_utf32(const char32_t* buf, size_t len) { const uint32_t * p = reinterpret_cast(buf); size_t counter{0}; for(size_t i = 0; i < len; i++) { - /** non-surrogate word **/ - if(p[i] <= 0xFFFF) { counter++; } - /** surrogate pair **/ - else { counter += 2; } + counter++; // non-surrogate word + counter += static_cast(p[i] > 0xFFFF); // surrogate pair } return counter; } +inline size_t latin1_length_from_utf32(size_t len) { + // We are not BOM aware. + return len; // a utf32 codepoint will always represent 1 latin1 character +} + + + } // utf32 namespace } // unnamed namespace } // namespace scalar @@ -10378,8 +10743,41 @@ inline size_t utf16_length_from_utf32(const char32_t* buf, size_t len) { #endif /* end file src/scalar/utf32.h */ +/* begin file src/scalar/latin1.h */ +#ifndef SIMDUTF_LATIN1_H +#define SIMDUTF_LATIN1_H + +namespace simdutf { +namespace scalar { +namespace { +namespace latin1 { + +inline size_t utf32_length_from_latin1(size_t len) { + // We are not BOM aware. 
+ return len; // a utf32 unit will always represent 1 latin1 character +} + +inline size_t utf8_length_from_latin1(const char *buf, size_t len) { + const uint8_t * c = reinterpret_cast<const uint8_t *>(buf); + size_t answer = 0; + for(size_t i = 0; i<len; i++) { + if((c[i]>>7)) { answer++; } + } + return answer + len; +} + +inline size_t utf16_length_from_latin1(size_t len) { + return len; +} + +} // latin1 namespace +} // unnamed namespace +} // namespace scalar +} // namespace simdutf + +#endif +/* end file src/scalar/latin1.h */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/utf32_to_utf8/valid_utf32_to_utf8.h /* begin file src/scalar/utf32_to_utf8/valid_utf32_to_utf8.h */ #ifndef SIMDUTF_VALID_UTF32_TO_UTF8_H #define SIMDUTF_VALID_UTF32_TO_UTF8_H @@ -10446,7 +10844,6 @@ inline size_t convert_valid(const char32_t* buf, size_t len, char* utf8_output) #endif /* end file src/scalar/utf32_to_utf8/valid_utf32_to_utf8.h */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/utf32_to_utf8/utf32_to_utf8.h /* begin file src/scalar/utf32_to_utf8/utf32_to_utf8.h */ #ifndef SIMDUTF_UTF32_TO_UTF8_H #define SIMDUTF_UTF32_TO_UTF8_H @@ -10562,7 +10959,6 @@ inline result convert_with_errors(const char32_t* buf, size_t len, char* utf8_ou #endif /* end file src/scalar/utf32_to_utf8/utf32_to_utf8.h */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/utf32_to_utf16/valid_utf32_to_utf16.h /* begin file src/scalar/utf32_to_utf16/valid_utf32_to_utf16.h */ #ifndef SIMDUTF_VALID_UTF32_TO_UTF16_H #define SIMDUTF_VALID_UTF32_TO_UTF16_H @@ -10607,7 +11003,6 @@ inline size_t convert_valid(const char32_t* buf, size_t len, char16_t* utf16_out #endif /* end file src/scalar/utf32_to_utf16/valid_utf32_to_utf16.h */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/utf32_to_utf16/utf32_to_utf16.h /* begin file src/scalar/utf32_to_utf16/utf32_to_utf16.h */ #ifndef SIMDUTF_UTF32_TO_UTF16_H
#define SIMDUTF_UTF32_TO_UTF16_H @@ -10683,7 +11078,6 @@ inline result convert_with_errors(const char32_t* buf, size_t len, char16_t* utf #endif /* end file src/scalar/utf32_to_utf16/utf32_to_utf16.h */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/utf16_to_utf8/valid_utf16_to_utf8.h /* begin file src/scalar/utf16_to_utf8/valid_utf16_to_utf8.h */ #ifndef SIMDUTF_VALID_UTF16_TO_UTF8_H #define SIMDUTF_VALID_UTF16_TO_UTF8_H @@ -10703,7 +11097,7 @@ inline size_t convert_valid(const char16_t* buf, size_t len, char* utf8_output) if (pos + 4 <= len) { // if it is safe to read 8 more bytes, check that they are ascii uint64_t v; ::memcpy(&v, data + pos, sizeof(uint64_t)); - if (!match_system(big_endian)) v = (v >> 8) | (v << (64 - 8)); + if (!match_system(big_endian)) { v = (v >> 8) | (v << (64 - 8)); } if ((v & 0xFF80FF80FF80FF80) == 0) { size_t final_pos = pos + 4; while(pos < final_pos) { @@ -10758,7 +11152,6 @@ inline size_t convert_valid(const char16_t* buf, size_t len, char* utf8_output) #endif /* end file src/scalar/utf16_to_utf8/valid_utf16_to_utf8.h */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/utf16_to_utf8/utf16_to_utf8.h /* begin file src/scalar/utf16_to_utf8/utf16_to_utf8.h */ #ifndef SIMDUTF_UTF16_TO_UTF8_H #define SIMDUTF_UTF16_TO_UTF8_H @@ -10774,11 +11167,11 @@ inline size_t convert(const char16_t* buf, size_t len, char* utf8_output) { size_t pos = 0; char* start{utf8_output}; while (pos < len) { - // try to convert the next block of 8 ASCII characters + // try to convert the next block of 8 bytes if (pos + 4 <= len) { // if it is safe to read 8 more bytes, check that they are ascii uint64_t v; ::memcpy(&v, data + pos, sizeof(uint64_t)); - if (!match_system(big_endian)) v = (v >> 8) | (v << (64 - 8)); + if (!match_system(big_endian)) { v = (v >> 8) | (v << (64 - 8)); } if ((v & 0xFF80FF80FF80FF80) == 0) { size_t final_pos = pos + 4; while(pos < final_pos) { @@ -10833,7 
+11226,7 @@ inline result convert_with_errors(const char16_t* buf, size_t len, char* utf8_ou size_t pos = 0; char* start{utf8_output}; while (pos < len) { - // try to convert the next block of 8 ASCII characters + // try to convert the next block of 8 bytes if (pos + 4 <= len) { // if it is safe to read 8 more bytes, check that they are ascii uint64_t v; ::memcpy(&v, data + pos, sizeof(uint64_t)); @@ -10894,7 +11287,6 @@ inline result convert_with_errors(const char16_t* buf, size_t len, char* utf8_ou #endif /* end file src/scalar/utf16_to_utf8/utf16_to_utf8.h */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/utf16_to_utf32/valid_utf16_to_utf32.h /* begin file src/scalar/utf16_to_utf32/valid_utf16_to_utf32.h */ #ifndef SIMDUTF_VALID_UTF16_TO_UTF32_H #define SIMDUTF_VALID_UTF16_TO_UTF32_H @@ -10936,7 +11328,6 @@ inline size_t convert_valid(const char16_t* buf, size_t len, char32_t* utf32_out #endif /* end file src/scalar/utf16_to_utf32/valid_utf16_to_utf32.h */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/utf16_to_utf32/utf16_to_utf32.h /* begin file src/scalar/utf16_to_utf32/utf16_to_utf32.h */ #ifndef SIMDUTF_UTF16_TO_UTF32_H #define SIMDUTF_UTF16_TO_UTF32_H @@ -11008,7 +11399,6 @@ inline result convert_with_errors(const char16_t* buf, size_t len, char32_t* utf #endif /* end file src/scalar/utf16_to_utf32/utf16_to_utf32.h */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/utf8_to_utf16/valid_utf8_to_utf16.h /* begin file src/scalar/utf8_to_utf16/valid_utf8_to_utf16.h */ #ifndef SIMDUTF_VALID_UTF8_TO_UTF16_H #define SIMDUTF_VALID_UTF8_TO_UTF16_H @@ -11093,7 +11483,6 @@ inline size_t convert_valid(const char* buf, size_t len, char16_t* utf16_output) #endif /* end file src/scalar/utf8_to_utf16/valid_utf8_to_utf16.h */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/utf8_to_utf16/utf8_to_utf16.h /* begin 
file src/scalar/utf8_to_utf16/utf8_to_utf16.h */ #ifndef SIMDUTF_UTF8_TO_UTF16_H #define SIMDUTF_UTF8_TO_UTF16_H @@ -11343,7 +11732,6 @@ inline result rewind_and_convert_with_errors(size_t prior_bytes, const char* buf #endif /* end file src/scalar/utf8_to_utf16/utf8_to_utf16.h */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/utf8_to_utf32/valid_utf8_to_utf32.h /* begin file src/scalar/utf8_to_utf32/valid_utf8_to_utf32.h */ #ifndef SIMDUTF_VALID_UTF8_TO_UTF32_H #define SIMDUTF_VALID_UTF8_TO_UTF32_H @@ -11409,7 +11797,6 @@ inline size_t convert_valid(const char* buf, size_t len, char32_t* utf32_output) #endif /* end file src/scalar/utf8_to_utf32/valid_utf8_to_utf32.h */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/utf8_to_utf32/utf8_to_utf32.h /* begin file src/scalar/utf8_to_utf32/utf8_to_utf32.h */ #ifndef SIMDUTF_UTF8_TO_UTF32_H #define SIMDUTF_UTF8_TO_UTF32_H @@ -11621,4366 +12008,6018 @@ inline result rewind_and_convert_with_errors(size_t prior_bytes, const char* buf #endif /* end file src/scalar/utf8_to_utf32/utf8_to_utf32.h */ -// - - -SIMDUTF_PUSH_DISABLE_WARNINGS -SIMDUTF_DISABLE_UNDESIRED_WARNINGS +/* begin file src/scalar/latin1_to_utf8/latin1_to_utf8.h */ +#ifndef SIMDUTF_LATIN1_TO_UTF8_H +#define SIMDUTF_LATIN1_TO_UTF8_H -#if SIMDUTF_IMPLEMENTATION_ARM64 -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=arm64/implementation.cpp -/* begin file src/arm64/implementation.cpp */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/arm64/begin.h -/* begin file src/simdutf/arm64/begin.h */ -// redefining SIMDUTF_IMPLEMENTATION to "arm64" -// #define SIMDUTF_IMPLEMENTATION arm64 -/* end file src/simdutf/arm64/begin.h */ namespace simdutf { -namespace arm64 { +namespace scalar { namespace { -#ifndef SIMDUTF_ARM64_H -#error "arm64.h must be included" -#endif -using namespace simd; +namespace latin1_to_utf8 { 
-simdutf_really_inline bool is_ascii(const simd8x64& input) { - simd8 bits = input.reduce_or(); - return bits.max_val() < 0b10000000u; +inline size_t convert(const char* buf, size_t len, char* utf8_output) { + const unsigned char *data = reinterpret_cast(buf); + size_t pos = 0; + char* start{utf8_output}; + while (pos < len) { + // try to convert the next block of 16 ASCII bytes + if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that they are ascii + uint64_t v1; + ::memcpy(&v1, data + pos, sizeof(uint64_t)); + uint64_t v2; + ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); + uint64_t v{v1 | v2}; // We are only interested in these bits: 1000 1000 1000 1000, so it makes sense to concatenate everything + if ((v & 0x8080808080808080) == 0) { // if NONE of these are set, e.g. all of them are zero, then everything is ASCII + size_t final_pos = pos + 16; + while(pos < final_pos) { + *utf8_output++ = char(buf[pos]); + pos++; + } + continue; + } + } + + unsigned char byte = data[pos]; + if((byte & 0x80) == 0) { // if ASCII + // will generate one UTF-8 bytes + *utf8_output++ = char(byte); + pos++; + } else { + // will generate two UTF-8 bytes + *utf8_output++ = char((byte>>6) | 0b11000000); + *utf8_output++ = char((byte & 0b111111) | 0b10000000); + pos++; + } + } + return utf8_output - start; } -simdutf_unused simdutf_really_inline simd8 must_be_continuation(const simd8 prev1, const simd8 prev2, const simd8 prev3) { - simd8 is_second_byte = prev1 >= uint8_t(0b11000000u); - simd8 is_third_byte = prev2 >= uint8_t(0b11100000u); - simd8 is_fourth_byte = prev3 >= uint8_t(0b11110000u); - // Use ^ instead of | for is_*_byte, because ^ is commutative, and the caller is using ^ as well. - // This will work fine because we only have to report errors for cases with 0-1 lead bytes. 
- // Multiple lead bytes implies 2 overlapping multibyte characters, and if that happens, there is - // guaranteed to be at least *one* lead byte that is part of only 1 other multibyte character. - // The error will be detected there. - return is_second_byte ^ is_third_byte ^ is_fourth_byte; -} +} // latin1_to_utf8 namespace +} // unnamed namespace +} // namespace scalar +} // namespace simdutf -simdutf_really_inline simd8 must_be_2_3_continuation(const simd8 prev2, const simd8 prev3) { - simd8 is_third_byte = prev2 >= uint8_t(0b11100000u); - simd8 is_fourth_byte = prev3 >= uint8_t(0b11110000u); - return is_third_byte ^ is_fourth_byte; -} +#endif +/* end file src/scalar/latin1_to_utf8/latin1_to_utf8.h */ +/* begin file src/scalar/latin1_to_utf16/latin1_to_utf16.h */ +#ifndef SIMDUTF_LATIN1_TO_UTF16_H +#define SIMDUTF_LATIN1_TO_UTF16_H -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=arm64/arm_detect_encodings.cpp -/* begin file src/arm64/arm_detect_encodings.cpp */ -template -// len is known to be a multiple of 2 when this is called -int arm_detect_encodings(const char * buf, size_t len) { - const char* start = buf; - const char* end = buf + len; +namespace simdutf { +namespace scalar { +namespace { +namespace latin1_to_utf16 { - bool is_utf8 = true; - bool is_utf16 = true; - bool is_utf32 = true; +template +inline size_t convert(const char* buf, size_t len, char16_t* utf16_output) { + const uint8_t* data = reinterpret_cast(buf); + size_t pos = 0; + char16_t* start{ utf16_output }; - int out = 0; + while (pos < len) { + uint16_t word = uint16_t(data[pos]); // extend Latin-1 char to 16-bit Unicode code point + *utf16_output++ = char16_t(match_system(big_endian) ? 
word : utf16::swap_bytes(word)); + pos++; + } - const auto v_d8 = simd8::splat(0xd8); - const auto v_f8 = simd8::splat(0xf8); + return utf16_output - start; +} - uint32x4_t currentmax = vmovq_n_u32(0x0); +template +inline result convert_with_errors(const char* buf, size_t len, char16_t* utf16_output) { + const uint8_t* data = reinterpret_cast(buf); + size_t pos = 0; + char16_t* start{ utf16_output }; - checker check{}; + while (pos < len) { + uint16_t word = uint16_t(data[pos]); // extend Latin-1 char to 16-bit Unicode code point + *utf16_output++ = char16_t(match_system(big_endian) ? word : utf16::swap_bytes(word)); + pos++; + } - while(buf + 64 <= end) { - uint16x8_t in = vld1q_u16(reinterpret_cast(buf)); - uint16x8_t secondin = vld1q_u16(reinterpret_cast(buf) + simd16::SIZE / sizeof(char16_t)); - uint16x8_t thirdin = vld1q_u16(reinterpret_cast(buf) + 2*simd16::SIZE / sizeof(char16_t)); - uint16x8_t fourthin = vld1q_u16(reinterpret_cast(buf) + 3*simd16::SIZE / sizeof(char16_t)); + return result(error_code::SUCCESS, utf16_output - start); +} - const auto u0 = simd16(in); - const auto u1 = simd16(secondin); - const auto u2 = simd16(thirdin); - const auto u3 = simd16(fourthin); +} // latin1_to_utf16 namespace +} // unnamed namespace +} // namespace scalar +} // namespace simdutf - const auto v0 = u0.shr<8>(); - const auto v1 = u1.shr<8>(); - const auto v2 = u2.shr<8>(); - const auto v3 = u3.shr<8>(); +#endif +/* end file src/scalar/latin1_to_utf16/latin1_to_utf16.h */ +/* begin file src/scalar/latin1_to_utf32/latin1_to_utf32.h */ +#ifndef SIMDUTF_LATIN1_TO_UTF32_H +#define SIMDUTF_LATIN1_TO_UTF32_H - const auto in16 = simd16::pack(v0, v1); - const auto nextin16 = simd16::pack(v2, v3); +namespace simdutf { +namespace scalar { +namespace { +namespace latin1_to_utf32 { - const uint64_t surrogates_wordmask0 = ((in16 & v_f8) == v_d8).to_bitmask64(); - const uint64_t surrogates_wordmask1 = ((nextin16 & v_f8) == v_d8).to_bitmask64(); - // Check for surrogates - if 
(surrogates_wordmask0 != 0 || surrogates_wordmask1 != 0) { - // Cannot be UTF8 - is_utf8 = false; - // Can still be either UTF-16LE or UTF-32 depending on the positions of the surrogates - // To be valid UTF-32, a surrogate cannot be in the two most significant bytes of any 32-bit word. - // On the other hand, to be valid UTF-16LE, at least one surrogate must be in the two most significant - // bytes of a 32-bit word since they always come in pairs in UTF-16LE. - // Note that we always proceed in multiple of 4 before this point so there is no offset in 32-bit words. +inline size_t convert(const char *buf, size_t len, char32_t *utf32_output) { + const unsigned char *data = reinterpret_cast(buf); + char32_t* start{utf32_output}; + for (size_t i = 0; i < len; i++) { + *utf32_output++ = (char32_t)data[i]; + } + return utf32_output - start; +} - if (((surrogates_wordmask0 | surrogates_wordmask1) & 0xf0f0f0f0f0f0f0f0) != 0) { - is_utf32 = false; - // Code from arm_validate_utf16le.cpp - // Not efficient, we do not process surrogates_wordmask1 - const char16_t * input = reinterpret_cast(buf); - const char16_t* end16 = reinterpret_cast(start) + len/2; +inline result convert_with_errors(const char32_t *buf, size_t len, char32_t *utf32_output) { + const uint32_t *data = reinterpret_cast(buf); + char32_t* start{utf32_output}; + for (size_t i = 0; i < len; i++) { + *utf32_output++ = (char32_t)data[i]; + } + return result(error_code::SUCCESS, utf32_output - start); +} - const auto v_fc = simd8::splat(0xfc); - const auto v_dc = simd8::splat(0xdc); +} // latin1_to_utf32 namespace +} // unnamed namespace +} // namespace scalar +} // namespace simdutf - const uint64_t V0 = ~surrogates_wordmask0; +#endif +/* end file src/scalar/latin1_to_utf32/latin1_to_utf32.h */ - const auto vH0 = ((in16 & v_fc) == v_dc); - const uint64_t H0 = vH0.to_bitmask64(); +/* begin file src/scalar/utf8_to_latin1/utf8_to_latin1.h */ +#ifndef SIMDUTF_UTF8_TO_LATIN1_H +#define SIMDUTF_UTF8_TO_LATIN1_H 
+#include - const uint64_t L0 = ~H0 & surrogates_wordmask0; +namespace simdutf { +namespace scalar { +namespace { +namespace utf8_to_latin1 { - const uint64_t a0 = L0 & (H0 >> 4); +inline size_t convert(const char* buf, size_t len, char* latin_output) { + const uint8_t *data = reinterpret_cast(buf); + size_t pos = 0; + char* start{latin_output}; - const uint64_t b0 = a0 << 4; + while (pos < len) { + // try to convert the next block of 16 ASCII bytes + if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that they are ascii + uint64_t v1; + ::memcpy(&v1, data + pos, sizeof(uint64_t)); + uint64_t v2; + ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); + uint64_t v{v1 | v2}; // We are only interested in these bits: 1000 1000 1000 1000 .... etc + if ((v & 0x8080808080808080) == 0) { // if NONE of these are set, e.g. all of them are zero, then everything is ASCII + size_t final_pos = pos + 16; + while(pos < final_pos) { + *latin_output++ = char(buf[pos]); + pos++; + } + continue; + } + } - const uint64_t c0 = V0 | a0 | b0; - if (c0 == ~0ull) { - input += 16; - } else if (c0 == 0xfffffffffffffffull) { - input += 15; - } else { - is_utf16 = false; - break; - } + // suppose it is not an all ASCII byte sequence + uint8_t leading_byte = data[pos]; // leading byte + if (leading_byte < 0b10000000) { + // converting one ASCII byte !!! + *latin_output++ = char(leading_byte); + pos++; + } else if ((leading_byte & 0b11100000) == 0b11000000) { // the first three bits indicate: + // We have a two-byte UTF-8 + if(pos + 1 >= len) { + return 0; + } // minimal bound checking + if ((data[pos + 1] & 0b11000000) != 0b10000000) { return 0; } // checks if the next byte is a valid continuation byte in UTF-8. A valid continuation byte starts with 10. + // range check - + uint32_t code_point = (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111); // assembles the Unicode code point from the two bytes. 
It does this by discarding the leading 110 and 10 bits from the two bytes, shifting the remaining bits of the first byte, and then combining the results with a bitwise OR operation. + if (code_point < 0x80 || 0xFF < code_point) { + return 0; // We only care about the range 129-255 which is Non-ASCII latin1 characters. A code_point beneath 0x80 is invalid as it's already covered by bytes whose leading bit is zero. + } + *latin_output++ = char(code_point); + pos += 2; + } else { + return 0; + } + } + return latin_output - start; +} - while (input + 16 < end16) { - const auto in0 = simd16(input); - const auto in1 = simd16(input + simd16::SIZE / sizeof(char16_t)); - const auto t0 = in0.shr<8>(); - const auto t1 = in1.shr<8>(); - const simd8 in_16 = simd16::pack(t0, t1); +inline result convert_with_errors(const char* buf, size_t len, char* latin_output) { + const uint8_t *data = reinterpret_cast(buf); + size_t pos = 0; + char* start{latin_output}; - const uint64_t surrogates_wordmask = ((in_16 & v_f8) == v_d8).to_bitmask64(); - if(surrogates_wordmask == 0) { - input += 16; - } else { - const uint64_t V = ~surrogates_wordmask; + while (pos < len) { + // try to convert the next block of 16 ASCII bytes + if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that they are ascii + uint64_t v1; + ::memcpy(&v1, data + pos, sizeof(uint64_t)); + uint64_t v2; + ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); + uint64_t v{v1 | v2}; // We are only interested in these bits: 1000 1000 1000 1000...etc + if ((v & 0x8080808080808080) == 0) { // if NONE of these are set, e.g. all of them are zero, then everything is ASCII + size_t final_pos = pos + 16; + while(pos < final_pos) { + *latin_output++ = char(buf[pos]); + pos++; + } + continue; + } + } + // suppose it is not an all ASCII byte sequence + uint8_t leading_byte = data[pos]; // leading byte + if (leading_byte < 0b10000000) { + // converting one ASCII byte !!! 
+ *latin_output++ = char(leading_byte); + pos++; + } else if ((leading_byte & 0b11100000) == 0b11000000) { // the first three bits indicate: + // We have a two-byte UTF-8 + if(pos + 1 >= len) { + return result(error_code::TOO_SHORT, pos); } // minimal bound checking + if ((data[pos + 1] & 0b11000000) != 0b10000000) { + return result(error_code::TOO_SHORT, pos); } // checks if the next byte is a valid continuation byte in UTF-8. A valid continuation byte starts with 10. + // range check - + uint32_t code_point = (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111); // assembles the Unicode code point from the two bytes. It does this by discarding the leading 110 and 10 bits from the two bytes, shifting the remaining bits of the first byte, and then combining the results with a bitwise OR operation. + if (code_point < 0x80) { + return result(error_code::OVERLONG, pos); + } + if (0xFF < code_point) { + return result(error_code::TOO_LARGE, pos); + } // We only care about the range 129-255 which is Non-ASCII latin1 characters + *latin_output++ = char(code_point); + pos += 2; + } else if ((leading_byte & 0b11110000) == 0b11100000) { + // We have a three-byte UTF-8 + return result(error_code::TOO_LARGE, pos); + } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000 + // we have a 4-byte UTF-8 word. 
+ return result(error_code::TOO_LARGE, pos); + } else { + // we either have too many continuation bytes or an invalid leading byte + if ((leading_byte & 0b11000000) == 0b10000000) { + return result(error_code::TOO_LONG, pos); + } - const auto vH = ((in_16 & v_fc) == v_dc); - const uint64_t H = vH.to_bitmask64(); + return result(error_code::HEADER_BITS, pos); - const uint64_t L = ~H & surrogates_wordmask; + } + } + return result(error_code::SUCCESS, latin_output - start); +} - const uint64_t a = L & (H >> 4); - const uint64_t b = a << 4; +inline result rewind_and_convert_with_errors(size_t prior_bytes, const char* buf, size_t len, char* latin1_output) { + size_t extra_len{0}; + // We potentially need to go back in time and find a leading byte. + // In theory '3' would be sufficient, but sometimes the error can go back quite far. + size_t how_far_back = prior_bytes; + // size_t how_far_back = 3; // 3 bytes in the past + current position + // if(how_far_back >= prior_bytes) { how_far_back = prior_bytes; } + bool found_leading_bytes{false}; + // important: it is i <= how_far_back and not 'i < how_far_back'. + for(size_t i = 0; i <= how_far_back; i++) { + unsigned char byte = buf[0-i]; + found_leading_bytes = ((byte & 0b11000000) != 0b10000000); + if(found_leading_bytes) { + buf -= i; + extra_len = i; + break; + } + } + // + // It is possible for this function to return a negative count in its result. + // C++ Standard Section 18.1 defines size_t is in which is described in C Standard as . + // C Standard Section 4.1.5 defines size_t as an unsigned integral type of the result of the sizeof operator + // + // An unsigned type will simply wrap round arithmetically (well defined). + // + if(!found_leading_bytes) { + // If how_far_back == 3, we may have four consecutive continuation bytes!!! + // [....] [continuation] [continuation] [continuation] | [buf is continuation] + // Or we possibly have a stream that does not start with a leading byte. 
+ return result(error_code::TOO_LONG, 0-how_far_back); + } + result res = convert_with_errors(buf, len + extra_len, latin1_output); + if (res.error) { + res.count -= extra_len; + } + return res; +} - const uint64_t c = V | a | b; - if (c == ~0ull) { - input += 16; - } else if (c == 0xfffffffffffffffull) { - input += 15; - } else { - is_utf16 = false; - break; - } - } - } - } else { - is_utf16 = false; - // Check for UTF-32 - if (len % 4 == 0) { - const char32_t * input = reinterpret_cast(buf); - const char32_t* end32 = reinterpret_cast(start) + len/4; - // Must start checking for surrogates - uint32x4_t currentoffsetmax = vmovq_n_u32(0x0); - const uint32x4_t offset = vmovq_n_u32(0xffff2000); - const uint32x4_t standardoffsetmax = vmovq_n_u32(0xfffff7ff); +} // utf8_to_latin1 namespace +} // unnamed namespace +} // namespace scalar +} // namespace simdutf - const uint32x4_t in32 = vreinterpretq_u32_u16(in); - const uint32x4_t secondin32 = vreinterpretq_u32_u16(secondin); - const uint32x4_t thirdin32 = vreinterpretq_u32_u16(thirdin); - const uint32x4_t fourthin32 = vreinterpretq_u32_u16(fourthin); +#endif +/* end file src/scalar/utf8_to_latin1/utf8_to_latin1.h */ +/* begin file src/scalar/utf16_to_latin1/utf16_to_latin1.h */ +#ifndef SIMDUTF_UTF16_TO_LATIN1_H +#define SIMDUTF_UTF16_TO_LATIN1_H - currentmax = vmaxq_u32(in32,currentmax); - currentmax = vmaxq_u32(secondin32,currentmax); - currentmax = vmaxq_u32(thirdin32,currentmax); - currentmax = vmaxq_u32(fourthin32,currentmax); +namespace simdutf { +namespace scalar { +namespace { +namespace utf16_to_latin1 { - currentoffsetmax = vmaxq_u32(vaddq_u32(in32, offset), currentoffsetmax); - currentoffsetmax = vmaxq_u32(vaddq_u32(secondin32, offset), currentoffsetmax); - currentoffsetmax = vmaxq_u32(vaddq_u32(thirdin32, offset), currentoffsetmax); - currentoffsetmax = vmaxq_u32(vaddq_u32(fourthin32, offset), currentoffsetmax); +#include // for std::memcpy - while (input + 4 < end32) { - const uint32x4_t in_32 = 
vld1q_u32(reinterpret_cast(input)); - currentmax = vmaxq_u32(in_32,currentmax); - currentoffsetmax = vmaxq_u32(vaddq_u32(in_32, offset), currentoffsetmax); - input += 4; - } +template +inline size_t convert(const char16_t* buf, size_t len, char* latin_output) { + const uint16_t *data = reinterpret_cast(buf); + size_t pos = 0; + std::vector temp_output(len); + char* current_write = temp_output.data(); + uint16_t word = 0; + uint16_t too_large = 0; - uint32x4_t forbidden_words = veorq_u32(vmaxq_u32(currentoffsetmax, standardoffsetmax), standardoffsetmax); - if(vmaxvq_u32(forbidden_words) != 0) { - is_utf32 = false; - } - } else { - is_utf32 = false; - } - } - break; - } - // If no surrogate, validate under other encodings as well + while (pos < len) { + word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos]; + too_large |= word; + *current_write++ = char(word & 0xFF); + pos++; + } + if((too_large & 0xFF00) != 0) { return 0; } - // UTF-32 validation - currentmax = vmaxq_u32(vreinterpretq_u32_u16(in),currentmax); - currentmax = vmaxq_u32(vreinterpretq_u32_u16(secondin),currentmax); - currentmax = vmaxq_u32(vreinterpretq_u32_u16(thirdin),currentmax); - currentmax = vmaxq_u32(vreinterpretq_u32_u16(fourthin),currentmax); + // Only copy to latin_output if there were no errors + std::memcpy(latin_output, temp_output.data(), len); + + return current_write - temp_output.data(); +} - // UTF-8 validation - // Relies on ../generic/utf8_validation/utf8_lookup4_algorithm.h - simd::simd8x64 in8(vreinterpretq_u8_u16(in), vreinterpretq_u8_u16(secondin), vreinterpretq_u8_u16(thirdin), vreinterpretq_u8_u16(fourthin)); - check.check_next_input(in8); +template +inline result convert_with_errors(const char16_t* buf, size_t len, char* latin_output) { + const uint16_t *data = reinterpret_cast(buf); + size_t pos = 0; + char* start{latin_output}; + uint16_t word; - buf += 64; - } + while (pos < len) { + if (pos + 16 <= len) { // if it is safe to read 32 more bytes, check 
that they are Latin1 + uint64_t v1, v2, v3, v4; + ::memcpy(&v1, data + pos, sizeof(uint64_t)); + ::memcpy(&v2, data + pos + 4, sizeof(uint64_t)); + ::memcpy(&v3, data + pos + 8, sizeof(uint64_t)); + ::memcpy(&v4, data + pos + 12, sizeof(uint64_t)); - // Check which encodings are possible + if (!match_system(big_endian)) { v1 = (v1 >> 8) | (v1 << (64 - 8)); } + if (!match_system(big_endian)) { v2 = (v2 >> 8) | (v2 << (64 - 8)); } + if (!match_system(big_endian)) { v3 = (v3 >> 8) | (v3 << (64 - 8)); } + if (!match_system(big_endian)) { v4 = (v1 >> 8) | (v4 << (64 - 8)); } - if (is_utf8) { - if (static_cast(buf - start) != len) { - uint8_t block[64]{}; - std::memset(block, 0x20, 64); - std::memcpy(block, buf, len - (buf - start)); - simd::simd8x64 in(block); - check.check_next_input(in); - } - if (!check.errors()) { - out |= simdutf::encoding_type::UTF8; + if (((v1 | v2 | v3 | v4) & 0xFF00FF00FF00FF00) == 0) { + size_t final_pos = pos + 16; + while(pos < final_pos) { + *latin_output++ = !match_system(big_endian) ? char(utf16::swap_bytes(data[pos])) : char(data[pos]); + pos++; } + continue; + } } + word = !match_system(big_endian) ? 
utf16::swap_bytes(data[pos]) : data[pos]; + if((word & 0xFF00 ) == 0) { + *latin_output++ = char(word & 0xFF); + pos++; + } else { return result(error_code::TOO_LARGE, pos); } + } + return result(error_code::SUCCESS,latin_output - start); +} - if (is_utf16 && scalar::utf16::validate(reinterpret_cast(buf), (len - (buf - start))/2)) { - out |= simdutf::encoding_type::UTF16_LE; - } - - if (is_utf32 && (len % 4 == 0)) { - const uint32x4_t standardmax = vmovq_n_u32(0x10ffff); - uint32x4_t is_zero = veorq_u32(vmaxq_u32(currentmax, standardmax), standardmax); - if (vmaxvq_u32(is_zero) == 0 && scalar::utf32::validate(reinterpret_cast(buf), (len - (buf - start))/4)) { - out |= simdutf::encoding_type::UTF32_LE; - } - } - return out; -} -/* end file src/arm64/arm_detect_encodings.cpp */ +} // utf16_to_latin1 namespace +} // unnamed namespace +} // namespace scalar +} // namespace simdutf -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=arm64/arm_validate_utf16.cpp -/* begin file src/arm64/arm_validate_utf16.cpp */ -template -const char16_t* arm_validate_utf16(const char16_t* input, size_t size) { - const char16_t* end = input + size; - const auto v_d8 = simd8::splat(0xd8); - const auto v_f8 = simd8::splat(0xf8); - const auto v_fc = simd8::splat(0xfc); - const auto v_dc = simd8::splat(0xdc); - while (input + 16 < end) { - // 0. Load data: since the validation takes into account only higher - // byte of each word, we compress the two vectors into one which - // consists only the higher bytes. 
- auto in0 = simd16(input); - auto in1 = simd16(input + simd16::SIZE / sizeof(char16_t)); - if (!match_system(big_endian)) { - #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO - const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); - #else - const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; - #endif - in0 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in0), swap)); - in1 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in1), swap)); - } - const auto t0 = in0.shr<8>(); - const auto t1 = in1.shr<8>(); - const simd8 in = simd16::pack(t0, t1); - // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy). - const uint64_t surrogates_wordmask = ((in & v_f8) == v_d8).to_bitmask64(); - if(surrogates_wordmask == 0) { - input += 16; - } else { - // 2. We have some surrogates that have to be distinguished: - // - low surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF) - // - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF) - // - // Fact: high surrogate has 11th bit set (3rd bit in the higher word) +#endif +/* end file src/scalar/utf16_to_latin1/utf16_to_latin1.h */ +/* begin file src/scalar/utf32_to_latin1/utf32_to_latin1.h */ +#ifndef SIMDUTF_UTF32_TO_LATIN1_H +#define SIMDUTF_UTF32_TO_LATIN1_H - // V - non-surrogate words - // V = not surrogates_wordmask - const uint64_t V = ~surrogates_wordmask; +namespace simdutf { +namespace scalar { +namespace { +namespace utf32_to_latin1 { - // H - word-mask for high surrogates: the six highest bits are 0b1101'11 - const auto vH = ((in & v_fc) == v_dc); - const uint64_t H = vH.to_bitmask64(); +inline size_t convert(const char32_t *buf, size_t len, char *latin1_output) { + const uint32_t *data = reinterpret_cast(buf); + char* start = latin1_output; + uint32_t utf32_char; + size_t pos = 0; + uint32_t too_large = 0; - // L - word mask for low surrogates - // L = not H and surrogates_wordmask - const uint64_t L = ~H & surrogates_wordmask; + while 
(pos < len) { + utf32_char = (uint32_t)data[pos]; + too_large |= utf32_char; + *latin1_output++ = (char)(utf32_char & 0xFF); + pos++; + } + if((too_large & 0xFFFFFF00) != 0) { return 0; } + return latin1_output - start; +} - const uint64_t a = L & (H >> 4); // A low surrogate must be followed by high one. - // (A low surrogate placed in the 7th register's word - // is an exception we handle.) - const uint64_t b = a << 4; // Just mark that the opposite fact is hold, - // thanks to that we have only two masks for valid case. - const uint64_t c = V | a | b; // Combine all the masks into the final one. - if (c == ~0ull) { - // The whole input register contains valid UTF-16, i.e., - // either single words or proper surrogate pairs. - input += 16; - } else if (c == 0xfffffffffffffffull) { - // The 15 lower words of the input register contains valid UTF-16. - // The 15th word may be either a low or high surrogate. It the next - // iteration we 1) check if the low surrogate is followed by a high - // one, 2) reject sole high surrogate. 
- input += 15; - } else { - return nullptr; - } - } +inline result convert_with_errors(const char32_t *buf, size_t len, char *latin1_output) { + const uint32_t *data = reinterpret_cast(buf); + char* start{latin1_output}; + size_t pos = 0; + while (pos < len) { + if (pos + 2 <= len) { // if it is safe to read 8 more bytes, check that they are Latin1 + uint64_t v; + ::memcpy(&v, data + pos, sizeof(uint64_t)); + if ((v & 0xFFFFFF00FFFFFF00) == 0) { + *latin1_output++ = char(buf[pos]); + *latin1_output++ = char(buf[pos+1]); + pos += 2; + continue; + } } - return input; + uint32_t utf32_char = data[pos]; + if ((utf32_char & 0xFFFFFF00) == 0) { // Check if the character can be represented in Latin-1 + *latin1_output++ = (char)(utf32_char & 0xFF); + pos++; + } else { return result(error_code::TOO_LARGE, pos); }; + } + return result(error_code::SUCCESS, latin1_output - start); } +} // utf32_to_latin1 namespace +} // unnamed namespace +} // namespace scalar +} // namespace simdutf -template -const result arm_validate_utf16_with_errors(const char16_t* input, size_t size) { - const char16_t* start = input; - const char16_t* end = input + size; - - const auto v_d8 = simd8::splat(0xd8); - const auto v_f8 = simd8::splat(0xf8); - const auto v_fc = simd8::splat(0xfc); - const auto v_dc = simd8::splat(0xdc); - while (input + 16 < end) { - // 0. Load data: since the validation takes into account only higher - // byte of each word, we compress the two vectors into one which - // consists only the higher bytes. 
- auto in0 = simd16(input); - auto in1 = simd16(input + simd16::SIZE / sizeof(char16_t)); +#endif +/* end file src/scalar/utf32_to_latin1/utf32_to_latin1.h */ - if (!match_system(big_endian)) { - #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO - const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); - #else - const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; - #endif - in0 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in0), swap)); - in1 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in1), swap)); - } - const auto t0 = in0.shr<8>(); - const auto t1 = in1.shr<8>(); - const simd8 in = simd16::pack(t0, t1); - // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy). - const uint64_t surrogates_wordmask = ((in & v_f8) == v_d8).to_bitmask64(); - if(surrogates_wordmask == 0) { - input += 16; - } else { - // 2. We have some surrogates that have to be distinguished: - // - low surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF) - // - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF) - // - // Fact: high surrogate has 11th bit set (3rd bit in the higher word) +/* begin file src/scalar/utf8_to_latin1/valid_utf8_to_latin1.h */ +#ifndef SIMDUTF_VALID_UTF8_TO_LATIN1_H +#define SIMDUTF_VALID_UTF8_TO_LATIN1_H - // V - non-surrogate words - // V = not surrogates_wordmask - const uint64_t V = ~surrogates_wordmask; +namespace simdutf { +namespace scalar { +namespace { +namespace utf8_to_latin1 { - // H - word-mask for high surrogates: the six highest bits are 0b1101'11 - const auto vH = ((in & v_fc) == v_dc); - const uint64_t H = vH.to_bitmask64(); +inline size_t convert_valid(const char* buf, size_t len, char* latin_output) { + const uint8_t *data = reinterpret_cast(buf); - // L - word mask for low surrogates - // L = not H and surrogates_wordmask - const uint64_t L = ~H & surrogates_wordmask; + size_t pos = 0; + char* start{latin_output}; - const uint64_t a = L & (H >> 4); // A low 
surrogate must be followed by high one. - // (A low surrogate placed in the 7th register's word - // is an exception we handle.) - const uint64_t b = a << 4; // Just mark that the opposite fact is hold, - // thanks to that we have only two masks for valid case. - const uint64_t c = V | a | b; // Combine all the masks into the final one. - if (c == ~0ull) { - // The whole input register contains valid UTF-16, i.e., - // either single words or proper surrogate pairs. - input += 16; - } else if (c == 0xfffffffffffffffull) { - // The 15 lower words of the input register contains valid UTF-16. - // The 15th word may be either a low or high surrogate. It the next - // iteration we 1) check if the low surrogate is followed by a high - // one, 2) reject sole high surrogate. - input += 15; - } else { - return result(error_code::SURROGATE, input - start); - } + while (pos < len) { + // try to convert the next block of 16 ASCII bytes + if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that they are ascii + uint64_t v1; + ::memcpy(&v1, data + pos, sizeof(uint64_t)); + uint64_t v2; + ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); + uint64_t v{v1 | v2}; // We are only interested in these bits: 1000 1000 1000 1000, so it makes sense to concatenate everything + if ((v & 0x8080808080808080) == 0) { // if NONE of these are set, e.g. 
all of them are zero, then everything is ASCII + size_t final_pos = pos + 16; + while(pos < final_pos) { + *latin_output++ = char(buf[pos]); + pos++; } + continue; + } } - return result(error_code::SUCCESS, input - start); -} -/* end file src/arm64/arm_validate_utf16.cpp */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=arm64/arm_validate_utf32le.cpp -/* begin file src/arm64/arm_validate_utf32le.cpp */ - -const char32_t* arm_validate_utf32le(const char32_t* input, size_t size) { - const char32_t* end = input + size; - - const uint32x4_t standardmax = vmovq_n_u32(0x10ffff); - const uint32x4_t offset = vmovq_n_u32(0xffff2000); - const uint32x4_t standardoffsetmax = vmovq_n_u32(0xfffff7ff); - uint32x4_t currentmax = vmovq_n_u32(0x0); - uint32x4_t currentoffsetmax = vmovq_n_u32(0x0); - while (input + 4 < end) { - const uint32x4_t in = vld1q_u32(reinterpret_cast(input)); - currentmax = vmaxq_u32(in,currentmax); - currentoffsetmax = vmaxq_u32(vaddq_u32(in, offset), currentoffsetmax); - input += 4; + // suppose it is not an all ASCII byte sequence + uint8_t leading_byte = data[pos]; // leading byte + if (leading_byte < 0b10000000) { + // converting one ASCII byte !!! + *latin_output++ = char(leading_byte); + pos++; + } else if ((leading_byte & 0b11100000) == 0b11000000) { // the first three bits indicate: + // We have a two-byte UTF-8 + if(pos + 1 >= len) { break; } // minimal bound checking + if ((data[pos + 1] & 0b11000000) != 0b10000000) { return 0; } // checks if the next byte is a valid continuation byte in UTF-8. A valid continuation byte starts with 10. + // range check - + uint32_t code_point = (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111); // assembles the Unicode code point from the two bytes. It does this by discarding the leading 110 and 10 bits from the two bytes, shifting the remaining bits of the first byte, and then combining the results with a bitwise OR operation. 
+ *latin_output++ = char(code_point); + pos += 2; + } else { + // we may have a continuation but we do not do error checking + return 0; } + } + return latin_output - start; +} - uint32x4_t is_zero = veorq_u32(vmaxq_u32(currentmax, standardmax), standardmax); - if(vmaxvq_u32(is_zero) != 0) { - return nullptr; - } +} // utf8_to_latin1 namespace +} // unnamed namespace +} // namespace scalar +} // namespace simdutf - is_zero = veorq_u32(vmaxq_u32(currentoffsetmax, standardoffsetmax), standardoffsetmax); - if(vmaxvq_u32(is_zero) != 0) { - return nullptr; - } +#endif +/* end file src/scalar/utf8_to_latin1/valid_utf8_to_latin1.h */ +/* begin file src/scalar/utf16_to_latin1/valid_utf16_to_latin1.h */ +#ifndef SIMDUTF_VALID_UTF16_TO_LATIN1_H +#define SIMDUTF_VALID_UTF16_TO_LATIN1_H - return input; -} +namespace simdutf { +namespace scalar { +namespace { +namespace utf16_to_latin1 { +template +inline size_t convert_valid(const char16_t* buf, size_t len, char* latin_output) { + const uint16_t *data = reinterpret_cast(buf); + size_t pos = 0; + char* start{latin_output}; + uint16_t word = 0; -const result arm_validate_utf32le_with_errors(const char32_t* input, size_t size) { - const char32_t* start = input; - const char32_t* end = input + size; + while (pos < len) { + word = !match_system(big_endian) ? 
utf16::swap_bytes(data[pos]) : data[pos]; + *latin_output++ = char(word); + pos++; + } - const uint32x4_t standardmax = vmovq_n_u32(0x10ffff); - const uint32x4_t offset = vmovq_n_u32(0xffff2000); - const uint32x4_t standardoffsetmax = vmovq_n_u32(0xfffff7ff); - uint32x4_t currentmax = vmovq_n_u32(0x0); - uint32x4_t currentoffsetmax = vmovq_n_u32(0x0); + return latin_output - start; +} - while (input + 4 < end) { - const uint32x4_t in = vld1q_u32(reinterpret_cast(input)); - currentmax = vmaxq_u32(in,currentmax); - currentoffsetmax = vmaxq_u32(vaddq_u32(in, offset), currentoffsetmax); +} // utf16_to_latin1 namespace +} // unnamed namespace +} // namespace scalar +} // namespace simdutf - uint32x4_t is_zero = veorq_u32(vmaxq_u32(currentmax, standardmax), standardmax); - if(vmaxvq_u32(is_zero) != 0) { - return result(error_code::TOO_LARGE, input - start); - } +#endif +/* end file src/scalar/utf16_to_latin1/valid_utf16_to_latin1.h */ +/* begin file src/scalar/utf32_to_latin1/valid_utf32_to_latin1.h */ +#ifndef SIMDUTF_VALID_UTF32_TO_LATIN1_H +#define SIMDUTF_VALID_UTF32_TO_LATIN1_H - is_zero = veorq_u32(vmaxq_u32(currentoffsetmax, standardoffsetmax), standardoffsetmax); - if(vmaxvq_u32(is_zero) != 0) { - return result(error_code::SURROGATE, input - start); - } +namespace simdutf { +namespace scalar { +namespace { +namespace utf32_to_latin1 { - input += 4; - } +inline size_t convert_valid(const char32_t *buf, size_t len, char *latin1_output) { + const uint32_t *data = reinterpret_cast(buf); + char* start = latin1_output; + uint32_t utf32_char; + size_t pos = 0; - return result(error_code::SUCCESS, input - start); -} -/* end file src/arm64/arm_validate_utf32le.cpp */ + while (pos < len) { + utf32_char = (uint32_t)data[pos]; -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=arm64/arm_convert_utf8_to_utf16.cpp -/* begin file src/arm64/arm_convert_utf8_to_utf16.cpp */ -// Convert up to 12 bytes from utf8 to utf16 using a mask indicating the -// 
end of the code points. Only the least significant 12 bits of the mask -// are accessed. -// It returns how many bytes were consumed (up to 12). -template -size_t convert_masked_utf8_to_utf16(const char *input, - uint64_t utf8_end_of_code_point_mask, - char16_t *&utf16_output) { - // we use an approach where we try to process up to 12 input bytes. - // Why 12 input bytes and not 16? Because we are concerned with the size of - // the lookup tables. Also 12 is nicely divisible by two and three. - // - #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO - const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); - #else - const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; - #endif - uint8x16_t in = vld1q_u8(reinterpret_cast(input)); - const uint16_t input_utf8_end_of_code_point_mask = - utf8_end_of_code_point_mask & 0xfff; - // - // Optimization note: our main path below is load-latency dependent. Thus it is maybe - // beneficial to have fast paths that depend on branch prediction but have less latency. - // This results in more instructions but, potentially, also higher speeds. - // - // We first try a few fast paths. 
- if((utf8_end_of_code_point_mask & 0xffff) == 0xffff) { - // We process in chunks of 16 bytes - uint16x8_t ascii_first = vmovl_u8(vget_low_u8 (in)); - uint16x8_t ascii_second = vmovl_high_u8(in); - if (!match_system(big_endian)) { - ascii_first = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(ascii_first), swap)); - ascii_second = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(ascii_second), swap)); + if (pos + 2 <= len) { // if it is safe to read 8 more bytes, check that they are Latin1 + uint64_t v; + ::memcpy(&v, data + pos, sizeof(uint64_t)); + if ((v & 0xFFFFFF00FFFFFF00) == 0) { + *latin1_output++ = char(buf[pos]); + *latin1_output++ = char(buf[pos+1]); + pos += 2; + continue; } - vst1q_u16(reinterpret_cast(utf16_output), ascii_first); - vst1q_u16(reinterpret_cast(utf16_output) + 8, ascii_second); - utf16_output += 16; // We wrote 16 16-bit characters. - return 16; // We consumed 16 bytes. - } - if((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa) { - // We want to take 8 2-byte UTF-8 words and turn them into 8 2-byte UTF-16 words. - // There is probably a more efficient sequence, but the following might do. - uint8x16_t perm = vqtbl1q_u8(in, swap); - uint8x16_t ascii = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x7f))); - uint8x16_t highbyte = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x1f00))); - uint8x16_t composed = vorrq_u8(ascii, vreinterpretq_u8_u16(vshrq_n_u16(vreinterpretq_u16_u8(highbyte), 2))); - if (!match_system(big_endian)) composed = vqtbl1q_u8(composed, swap); - vst1q_u8(reinterpret_cast(utf16_output), composed); - utf16_output += 8; // We wrote 16 bytes, 8 code points. - return 16; - } - if(input_utf8_end_of_code_point_mask == 0x924) { - // We want to take 4 3-byte UTF-8 words and turn them into 4 2-byte UTF-16 words. - // There is probably a more efficient sequence, but the following might do. 
-#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO - const uint8x16_t sh = make_uint8x16_t(2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 11, 10, 9, 255); -#else - const uint8x16_t sh = {2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 11, 10, 9, 255}; -#endif - uint8x16_t perm = vqtbl1q_u8(in, sh); - uint8x16_t ascii = - vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x7f))); // 7 or 6 bits - uint8x16_t middlebyte = - vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f00))); // 5 or 6 bits - uint8x16_t middlebyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlebyte), 2)); - uint32x4_t highbyte = - vreinterpretq_u32_u8(vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x0f0000)))); // 4 bits - uint32x4_t highbyte_shifted = vshrq_n_u32(highbyte, 4); - uint32x4_t composed = - vorrq_u32(vorrq_u32(vreinterpretq_u32_u8(ascii), vreinterpretq_u32_u8(middlebyte_shifted)), highbyte_shifted); - uint16x8_t composed_repacked = vmovn_high_u32(vmovn_u32(composed), composed); - if (!match_system(big_endian)) composed_repacked = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(composed_repacked), swap)); - vst1q_u16(reinterpret_cast(utf16_output), composed_repacked); - utf16_output += 4; - return 12; } - /// We do not have a fast path available, so we fallback. - - const uint8_t idx = - simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0]; - const uint8_t consumed = - simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1]; + *latin1_output++ = (char)(utf32_char & 0xFF); + pos++; - - if (idx < 64) { - // SIX (6) input code-words - // this is a relatively easy scenario - // we process SIX (6) input code-words. The max length in bytes of six code - // words spanning between 1 and 2 bytes each is 12 bytes. 
- uint8x16_t sh = vld1q_u8(reinterpret_cast(simdutf::tables::utf8_to_utf16::shufutf8[idx])); - uint8x16_t perm = vqtbl1q_u8(in, sh); - uint8x16_t ascii = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x7f))); - uint8x16_t highbyte = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x1f00))); - uint8x16_t composed = vorrq_u8(ascii, vreinterpretq_u8_u16(vshrq_n_u16(vreinterpretq_u16_u8(highbyte), 2))); - if (!match_system(big_endian)) composed = vqtbl1q_u8(composed, swap); - vst1q_u8(reinterpret_cast(utf16_output), composed); - utf16_output += 6; // We wrote 12 bytes, 6 code points. - } else if (idx < 145) { - // FOUR (4) input code-words - uint8x16_t sh = vld1q_u8(reinterpret_cast(simdutf::tables::utf8_to_utf16::shufutf8[idx])); - uint8x16_t perm = vqtbl1q_u8(in, sh); - uint8x16_t ascii = - vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x7f))); // 7 or 6 bits - uint8x16_t middlebyte = - vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f00))); // 5 or 6 bits - uint8x16_t middlebyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlebyte), 2)); - uint32x4_t highbyte = - vreinterpretq_u32_u8(vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x0f0000)))); // 4 bits - uint32x4_t highbyte_shifted = vshrq_n_u32(highbyte, 4); - uint32x4_t composed = - vorrq_u32(vorrq_u32(vreinterpretq_u32_u8(ascii), vreinterpretq_u32_u8(middlebyte_shifted)), highbyte_shifted); - uint16x8_t composed_repacked = vmovn_high_u32(vmovn_u32(composed), composed); - if (!match_system(big_endian)) composed_repacked = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(composed_repacked), swap)); - vst1q_u16(reinterpret_cast(utf16_output), composed_repacked); - utf16_output += 4; - } else if (idx < 209) { - // TWO (2) input code-words - ////////////// - // There might be garbage inputs where a leading byte mascarades as a four-byte - // leading byte (by being followed by 3 continuation byte), but is not greater than - // 0xf0. 
This could trigger a buffer overflow if we only counted leading - // bytes of the form 0xf0 as generating surrogate pairs, without further UTF-8 validation. - // Thus we must be careful to ensure that only leading bytes at least as large as 0xf0 generate surrogate pairs. - // We do as at the cost of an extra mask. - ///////////// - uint8x16_t sh = vld1q_u8(reinterpret_cast(simdutf::tables::utf8_to_utf16::shufutf8[idx])); - uint8x16_t perm = vqtbl1q_u8(in, sh); - uint8x16_t ascii = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x7f))); - uint8x16_t middlebyte = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f00))); - uint8x16_t middlebyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlebyte), 2)); - uint8x16_t middlehighbyte = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f0000))); - // correct for spurious high bit - uint8x16_t correct = - vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x400000)))), 1)); - middlehighbyte = veorq_u8(correct, middlehighbyte); - uint8x16_t middlehighbyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlehighbyte), 4)); - // We deliberately carry the leading four bits if they are present, we remove - // them later when computing hightenbits. - uint8x16_t highbyte = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0xff000000))); - uint8x16_t highbyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(highbyte), 6)); - // When we need to generate a surrogate pair (leading byte > 0xF0), then - // the corresponding 32-bit value in 'composed' will be greater than - // > (0xff00000>>6) or > 0x3c00000. This can be used later to identify the - // location of the surrogate pairs. 
- uint8x16_t composed = - vorrq_u8(vorrq_u8(ascii, middlebyte_shifted), - vorrq_u8(highbyte_shifted, middlehighbyte_shifted)); - uint32x4_t composedminus = - vsubq_u32(vreinterpretq_u32_u8(composed), vmovq_n_u32(0x10000)); - uint32x4_t lowtenbits = - vandq_u32(composedminus, vmovq_n_u32(0x3ff)); - // Notice the 0x3ff mask: - uint32x4_t hightenbits = vandq_u32(vshrq_n_u32(composedminus, 10), vmovq_n_u32(0x3ff)); - uint32x4_t lowtenbitsadd = - vaddq_u32(lowtenbits, vmovq_n_u32(0xDC00)); - uint32x4_t hightenbitsadd = - vaddq_u32(hightenbits, vmovq_n_u32(0xD800)); - uint32x4_t lowtenbitsaddshifted = vshlq_n_u32(lowtenbitsadd, 16); - uint32x4_t surrogates = - vorrq_u32(hightenbitsadd, lowtenbitsaddshifted); - uint32_t basic_buffer[4]; - uint32_t basic_buffer_swap[4]; - if (!match_system(big_endian)) { - vst1q_u32(basic_buffer_swap, vreinterpretq_u32_u8(vqtbl1q_u8(composed, swap))); - surrogates = vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(surrogates), swap)); - } - vst1q_u32(basic_buffer, vreinterpretq_u32_u8(composed)); - uint32_t surrogate_buffer[4]; - vst1q_u32(surrogate_buffer, surrogates); - for (size_t i = 0; i < 3; i++) { - if(basic_buffer[i] > 0x3c00000) { - utf16_output[0] = uint16_t(surrogate_buffer[i] & 0xffff); - utf16_output[1] = uint16_t(surrogate_buffer[i] >> 16); - utf16_output += 2; - } else { - utf16_output[0] = !match_system(big_endian) ? uint16_t(basic_buffer_swap[i]) : uint16_t(basic_buffer[i]); - utf16_output++; - } - } - } else { - // here we know that there is an error but we do not handle errors } - return consumed; + return latin1_output - start; } -/* end file src/arm64/arm_convert_utf8_to_utf16.cpp */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=arm64/arm_convert_utf8_to_utf32.cpp -/* begin file src/arm64/arm_convert_utf8_to_utf32.cpp */ -// Convert up to 12 bytes from utf8 to utf32 using a mask indicating the -// end of the code points. 
Only the least significant 12 bits of the mask -// are accessed. -// It returns how many bytes were consumed (up to 12). -size_t convert_masked_utf8_to_utf32(const char *input, - uint64_t utf8_end_of_code_point_mask, - char32_t *&utf32_out) { - // we use an approach where we try to process up to 12 input bytes. - // Why 12 input bytes and not 16? Because we are concerned with the size of - // the lookup tables. Also 12 is nicely divisible by two and three. - // - uint32_t*& utf32_output = reinterpret_cast(utf32_out); - uint8x16_t in = vld1q_u8(reinterpret_cast(input)); - const uint16_t input_utf8_end_of_code_point_mask = - utf8_end_of_code_point_mask & 0xFFF; - // - // Optimization note: our main path below is load-latency dependent. Thus it is maybe - // beneficial to have fast paths that depend on branch prediction but have less latency. - // This results in more instructions but, potentially, also higher speeds. - // - // We first try a few fast paths. - if((utf8_end_of_code_point_mask & 0xffff) == 0xffff) { - // We process in chunks of 16 bytes - vst1q_u32(utf32_output, vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8 (in))))); - vst1q_u32(utf32_output + 4, vmovl_high_u16(vmovl_u8(vget_low_u8 (in)))); - vst1q_u32(utf32_output + 8, vmovl_u16(vget_low_u16(vmovl_high_u8(in)))); - vst1q_u32(utf32_output + 12, vmovl_high_u16(vmovl_high_u8(in))); - utf32_output += 16; // We wrote 16 16-bit characters. - return 16; // We consumed 16 bytes. - } - if((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa) { - // We want to take 8 2-byte UTF-8 words and turn them into 8 4-byte UTF-32 words. - // There is probably a more efficient sequence, but the following might do. 
-#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO - const uint8x16_t sh = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); -#else - //const uint8x16_t sh = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; - const uint8x16_t sh = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; -#endif - uint8x16_t perm = vqtbl1q_u8(in, sh); - uint8x16_t ascii = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x7f))); - uint8x16_t highbyte = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x1f00))); - uint8x16_t composed = vorrq_u8(ascii, vreinterpretq_u8_u16(vshrq_n_u16(vreinterpretq_u16_u8(highbyte), 2))); - vst1q_u32(utf32_output, vmovl_u16(vget_low_u16(vreinterpretq_u16_u8(composed)))); - vst1q_u32(utf32_output+4, vmovl_high_u16(vreinterpretq_u16_u8(composed))); - utf32_output += 8; // We wrote 32 bytes, 8 code points. - return 16; - } - if(input_utf8_end_of_code_point_mask == 0x924) { - // We want to take 4 3-byte UTF-8 words and turn them into 4 4-byte UTF-32 words. - // There is probably a more efficient sequence, but the following might do. 
-#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO - const uint8x16_t sh = make_uint8x16_t(2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 11, 10, 9, 255); -#else - const uint8x16_t sh = {2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 11, 10, 9, 255}; -#endif - uint8x16_t perm = vqtbl1q_u8(in, sh); - uint8x16_t ascii = - vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x7f))); // 7 or 6 bits - uint8x16_t middlebyte = - vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f00))); // 5 or 6 bits - uint8x16_t middlebyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlebyte), 2)); - uint32x4_t highbyte = - vreinterpretq_u32_u8(vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x0f0000)))); // 4 bits - uint32x4_t highbyte_shifted = vshrq_n_u32(highbyte, 4); - uint32x4_t composed = - vorrq_u32(vorrq_u32(vreinterpretq_u32_u8(ascii), vreinterpretq_u32_u8(middlebyte_shifted)), highbyte_shifted); - vst1q_u32(utf32_output, composed); - utf32_output += 4; - return 12; - } - /// We do not have a fast path available, so we fallback. - const uint8_t idx = - simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0]; - const uint8_t consumed = - simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1]; +} // utf32_to_latin1 namespace +} // unnamed namespace +} // namespace scalar +} // namespace simdutf - if (idx < 64) { - // SIX (6) input code-words - // this is a relatively easy scenario - // we process SIX (6) input code-words. The max length in bytes of six code - // words spanning between 1 and 2 bytes each is 12 bytes. 
- uint8x16_t sh = vld1q_u8(reinterpret_cast(simdutf::tables::utf8_to_utf16::shufutf8[idx])); - uint8x16_t perm = vqtbl1q_u8(in, sh); - uint8x16_t ascii = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x7f))); - uint8x16_t highbyte = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x1f00))); - uint8x16_t composed = vorrq_u8(ascii, vreinterpretq_u8_u16(vshrq_n_u16(vreinterpretq_u16_u8(highbyte), 2))); - vst1q_u32(utf32_output, vmovl_u16(vget_low_u16(vreinterpretq_u16_u8(composed)))); - vst1q_u32(utf32_output+4, vmovl_high_u16(vreinterpretq_u16_u8(composed))); - utf32_output += 6; // We wrote 12 bytes, 6 code points. - } else if (idx < 145) { - // FOUR (4) input code-words - uint8x16_t sh = vld1q_u8(reinterpret_cast(simdutf::tables::utf8_to_utf16::shufutf8[idx])); - uint8x16_t perm = vqtbl1q_u8(in, sh); - uint8x16_t ascii = - vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x7f))); // 7 or 6 bits - uint8x16_t middlebyte = - vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f00))); // 5 or 6 bits - uint8x16_t middlebyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlebyte), 2)); - uint32x4_t highbyte = - vreinterpretq_u32_u8(vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x0f0000)))); // 4 bits - uint32x4_t highbyte_shifted = vshrq_n_u32(highbyte, 4); - uint32x4_t composed = - vorrq_u32(vorrq_u32(vreinterpretq_u32_u8(ascii), vreinterpretq_u32_u8(middlebyte_shifted)), highbyte_shifted); - vst1q_u32(utf32_output, composed); - utf32_output += 4; - } else if (idx < 209) { - // TWO (2) input code-words - uint8x16_t sh = vld1q_u8(reinterpret_cast(simdutf::tables::utf8_to_utf16::shufutf8[idx])); - uint8x16_t perm = vqtbl1q_u8(in, sh); - uint8x16_t ascii = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x7f))); - uint8x16_t middlebyte = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f00))); - uint8x16_t middlebyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlebyte), 2)); - uint8x16_t middlehighbyte = vandq_u8(perm, 
vreinterpretq_u8_u32(vmovq_n_u32(0x3f0000))); - // correct for spurious high bit - uint8x16_t correct = - vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x400000)))), 1)); - middlehighbyte = veorq_u8(correct, middlehighbyte); - uint8x16_t middlehighbyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlehighbyte), 4)); - uint8x16_t highbyte = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x07000000))); - uint8x16_t highbyte_shifted =vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(highbyte), 6)); - uint8x16_t composed = - vorrq_u8(vorrq_u8(ascii, middlebyte_shifted), - vorrq_u8(highbyte_shifted, middlehighbyte_shifted)); - vst1q_u32(utf32_output, vreinterpretq_u32_u8(composed)); - utf32_output += 3; - } else { - // here we know that there is an error but we do not handle errors - } - return consumed; -} -/* end file src/arm64/arm_convert_utf8_to_utf32.cpp */ +#endif +/* end file src/scalar/utf32_to_latin1/valid_utf32_to_latin1.h */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=arm64/arm_convert_utf16_to_utf8.cpp -/* begin file src/arm64/arm_convert_utf16_to_utf8.cpp */ -/* - The vectorized algorithm works on single SSE register i.e., it - loads eight 16-bit words. - We consider three cases: - 1. an input register contains no surrogates and each value - is in range 0x0000 .. 0x07ff. - 2. an input register contains no surrogates and values are - is in range 0x0000 .. 0xffff. - 3. an input register contains surrogates --- i.e. codepoints - can have 16 or 32 bits. - Ad 1. +SIMDUTF_PUSH_DISABLE_WARNINGS +SIMDUTF_DISABLE_UNDESIRED_WARNINGS - When values are less than 0x0800, it means that a 16-bit words - can be converted into: 1) single UTF8 byte (when it's an ASCII - char) or 2) two UTF8 bytes. - For this case we do only some shuffle to obtain these 2-byte - codes and finally compress the whole SSE register with a single - shuffle. 
+#if SIMDUTF_IMPLEMENTATION_ARM64 +/* begin file src/arm64/implementation.cpp */ +/* begin file src/simdutf/arm64/begin.h */ +// redefining SIMDUTF_IMPLEMENTATION to "arm64" +// #define SIMDUTF_IMPLEMENTATION arm64 +/* end file src/simdutf/arm64/begin.h */ +namespace simdutf { +namespace arm64 { +namespace { +#ifndef SIMDUTF_ARM64_H +#error "arm64.h must be included" +#endif +using namespace simd; - We need 256-entry lookup table to get a compression pattern - and the number of output bytes in the compressed vector register. - Each entry occupies 17 bytes. +simdutf_really_inline bool is_ascii(const simd8x64& input) { + simd8 bits = input.reduce_or(); + return bits.max_val() < 0b10000000u; +} - Ad 2. +simdutf_unused simdutf_really_inline simd8 must_be_continuation(const simd8 prev1, const simd8 prev2, const simd8 prev3) { + simd8 is_second_byte = prev1 >= uint8_t(0b11000000u); + simd8 is_third_byte = prev2 >= uint8_t(0b11100000u); + simd8 is_fourth_byte = prev3 >= uint8_t(0b11110000u); + // Use ^ instead of | for is_*_byte, because ^ is commutative, and the caller is using ^ as well. + // This will work fine because we only have to report errors for cases with 0-1 lead bytes. + // Multiple lead bytes implies 2 overlapping multibyte characters, and if that happens, there is + // guaranteed to be at least *one* lead byte that is part of only 1 other multibyte character. + // The error will be detected there. + return is_second_byte ^ is_third_byte ^ is_fourth_byte; +} - When values fit in 16-bit words, but are above 0x07ff, then - a single word may produce one, two or three UTF8 bytes. +simdutf_really_inline simd8 must_be_2_3_continuation(const simd8 prev2, const simd8 prev3) { + simd8 is_third_byte = prev2 >= uint8_t(0b11100000u); + simd8 is_fourth_byte = prev3 >= uint8_t(0b11110000u); + return is_third_byte ^ is_fourth_byte; +} - We prepare data for all these three cases in two registers. 
- The first register contains lower two UTF8 bytes (used in all - cases), while the second one contains just the third byte for - the three-UTF8-bytes case. +// common functions for utf8 conversions +simdutf_really_inline uint16x4_t convert_utf8_3_byte_to_utf16(uint8x16_t in) { + // Low half contains 10cccccc|1110aaaa + // High half contains 10bbbbbb|10bbbbbb +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint8x16_t sh = make_uint8x16_t(0, 2, 3, 5, 6, 8, 9, 11, 1, 1, 4, 4, 7, 7, 10, 10); +#else + const uint8x16_t sh = {0, 2, 3, 5, 6, 8, 9, 11, 1, 1, 4, 4, 7, 7, 10, 10}; +#endif + uint8x16_t perm = vqtbl1q_u8(in, sh); + // Split into half vectors. + // 10cccccc|1110aaaa + uint8x8_t perm_low = vget_low_u8(perm); // no-op + // 10bbbbbb|10bbbbbb + uint8x8_t perm_high = vget_high_u8(perm); + // xxxxxxxx 10bbbbbb + uint16x4_t mid = vreinterpret_u16_u8(perm_high); // no-op + // xxxxxxxx 1110aaaa + uint16x4_t high = vreinterpret_u16_u8(perm_low); // no-op + // Assemble with shift left insert. + // xxxxxxaa aabbbbbb + uint16x4_t mid_high = vsli_n_u16(mid, high, 6); + // (perm_low << 8) | (perm_low >> 8) + // xxxxxxxx 10cccccc + uint16x4_t low = vreinterpret_u16_u8(vrev16_u8(perm_low)); + // Shift left insert into the low bits + // aaaabbbb bbcccccc + uint16x4_t composed = vsli_n_u16(low, mid_high, 6); + return composed; +} + +simdutf_really_inline uint16x8_t convert_utf8_2_byte_to_utf16(uint8x16_t in) { + // Converts 6 2 byte UTF-8 characters to 6 UTF-16 characters. + // Technically this calculates 8, but 6 does better and happens more often + // (The languages which use these codepoints use ASCII spaces so 8 would need to be + // in the middle of a very long word). + + // 10bbbbbb 110aaaaa + uint16x8_t upper = vreinterpretq_u16_u8(in); + // (in << 8) | (in >> 8) + // 110aaaaa 10bbbbbb + uint16x8_t lower = vreinterpretq_u16_u8(vrev16q_u8(in)); + // 00000000 000aaaaa + uint16x8_t upper_masked = vandq_u16(upper, vmovq_n_u16(0x1F)); + // Assemble with shift left insert. 
+ // 00000aaa aabbbbbb + uint16x8_t composed = vsliq_n_u16(lower, upper_masked, 6); + return composed; +} + +simdutf_really_inline uint16x8_t convert_utf8_1_to_2_byte_to_utf16(uint8x16_t in, size_t shufutf8_idx) { + // Converts 6 1-2 byte UTF-8 characters to 6 UTF-16 characters. + // This is a relatively easy scenario + // we process SIX (6) input code-code units. The max length in bytes of six code + // code units spanning between 1 and 2 bytes each is 12 bytes. + uint8x16_t sh = vld1q_u8(reinterpret_cast(simdutf::tables::utf8_to_utf16::shufutf8[shufutf8_idx])); + // Shuffle + // 1 byte: 00000000 0bbbbbbb + // 2 byte: 110aaaaa 10bbbbbb + uint16x8_t perm = vreinterpretq_u16_u8(vqtbl1q_u8(in, sh)); + // Mask + // 1 byte: 00000000 0bbbbbbb + // 2 byte: 00000000 00bbbbbb + uint16x8_t ascii = vandq_u16(perm, vmovq_n_u16(0x7f)); // 6 or 7 bits + // 1 byte: 00000000 00000000 + // 2 byte: 000aaaaa 00000000 + uint16x8_t highbyte = vandq_u16(perm, vmovq_n_u16(0x1f00)); // 5 bits + // Combine with a shift right accumulate + // 1 byte: 00000000 0bbbbbbb + // 2 byte: 00000aaa aabbbbbb + uint16x8_t composed = vsraq_n_u16(ascii, highbyte, 2); + return composed; +} - Finally these two registers are interleaved forming eight-element - array of 32-bit values. The array spans two SSE registers. - The bytes from the registers are compressed using two shuffles. +/* begin file src/arm64/arm_detect_encodings.cpp */ +template +// len is known to be a multiple of 2 when this is called +int arm_detect_encodings(const char * buf, size_t len) { + const char* start = buf; + const char* end = buf + len; - We need 256-entry lookup table to get a compression pattern - and the number of output bytes in the compressed vector register. - Each entry occupies 17 bytes. + bool is_utf8 = true; + bool is_utf16 = true; + bool is_utf32 = true; + int out = 0; - To summarize: - - We need two 256-entry tables that have 8704 bytes in total. 
-*/ -/* - Returns a pair: the first unprocessed byte from buf and utf8_output - A scalar routing should carry on the conversion of the tail. -*/ -template -std::pair arm_convert_utf16_to_utf8(const char16_t* buf, size_t len, char* utf8_out) { - uint8_t * utf8_output = reinterpret_cast(utf8_out); - const char16_t* end = buf + len; + const auto v_d8 = simd8::splat(0xd8); + const auto v_f8 = simd8::splat(0xf8); - const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800); - const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800); - const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080); + uint32x4_t currentmax = vmovq_n_u32(0x0); - while (buf + 16 <= end) { - uint16x8_t in = vld1q_u16(reinterpret_cast(buf)); - if (!match_system(big_endian)) { - #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO - const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); - #else - const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; - #endif - in = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in), swap)); - } - if(vmaxvq_u16(in) <= 0x7F) { // ASCII fast path!!!! - // It is common enough that we have sequences of 16 consecutive ASCII characters. - uint16x8_t nextin = vld1q_u16(reinterpret_cast(buf) + 8); - if (!match_system(big_endian)) { - #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO - const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); - #else - const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; - #endif - nextin = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(nextin), swap)); - } - if(vmaxvq_u16(nextin) > 0x7F) { - // 1. pack the bytes - // obviously suboptimal. - uint8x8_t utf8_packed = vmovn_u16(in); - // 2. store (8 bytes) - vst1_u8(utf8_output, utf8_packed); - // 3. adjust pointers - buf += 8; - utf8_output += 8; - in = nextin; - } else { - // 1. pack the bytes - // obviously suboptimal. 
- uint8x16_t utf8_packed = vmovn_high_u16(vmovn_u16(in), nextin); - // 2. store (16 bytes) - vst1q_u8(utf8_output, utf8_packed); - // 3. adjust pointers - buf += 16; - utf8_output += 16; - continue; // we are done for this round! - } - } + checker check{}; - if (vmaxvq_u16(in) <= 0x7FF) { - // 1. prepare 2-byte values - // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 - // expected output : [110a|aaaa|10bb|bbbb] x 8 - const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00); - const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f); + while(buf + 64 <= end) { + uint16x8_t in = vld1q_u16(reinterpret_cast(buf)); + uint16x8_t secondin = vld1q_u16(reinterpret_cast(buf) + simd16::SIZE / sizeof(char16_t)); + uint16x8_t thirdin = vld1q_u16(reinterpret_cast(buf) + 2*simd16::SIZE / sizeof(char16_t)); + uint16x8_t fourthin = vld1q_u16(reinterpret_cast(buf) + 3*simd16::SIZE / sizeof(char16_t)); - // t0 = [000a|aaaa|bbbb|bb00] - const uint16x8_t t0 = vshlq_n_u16(in, 2); - // t1 = [000a|aaaa|0000|0000] - const uint16x8_t t1 = vandq_u16(t0, v_1f00); - // t2 = [0000|0000|00bb|bbbb] - const uint16x8_t t2 = vandq_u16(in, v_003f); - // t3 = [000a|aaaa|00bb|bbbb] - const uint16x8_t t3 = vorrq_u16(t1, t2); - // t4 = [110a|aaaa|10bb|bbbb] - const uint16x8_t t4 = vorrq_u16(t3, v_c080); - // 2. merge ASCII and 2-byte codewords - const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F); - const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f); - const uint8x16_t utf8_unpacked = vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, in, t4)); - // 3. prepare bitmask for 8-bit lookup -#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO - const uint16x8_t mask = make_uint16x8_t(0x0001, 0x0004, - 0x0010, 0x0040, - 0x0002, 0x0008, - 0x0020, 0x0080); -#else - const uint16x8_t mask = { 0x0001, 0x0004, - 0x0010, 0x0040, - 0x0002, 0x0008, - 0x0020, 0x0080 }; -#endif - uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask)); - // 4. 
pack the bytes - const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0]; - const uint8x16_t shuffle = vld1q_u8(row + 1); - const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle); + const auto u0 = simd16(in); + const auto u1 = simd16(secondin); + const auto u2 = simd16(thirdin); + const auto u3 = simd16(fourthin); - // 5. store bytes - vst1q_u8(utf8_output, utf8_packed); + const auto v0 = u0.shr<8>(); + const auto v1 = u1.shr<8>(); + const auto v2 = u2.shr<8>(); + const auto v3 = u3.shr<8>(); - // 6. adjust pointers - buf += 8; - utf8_output += row[0]; - continue; + const auto in16 = simd16::pack(v0, v1); + const auto nextin16 = simd16::pack(v2, v3); - } - const uint16x8_t surrogates_bytemask = vceqq_u16(vandq_u16(in, v_f800), v_d800); - // It might seem like checking for surrogates_bitmask == 0xc000 could help. However, - // it is likely an uncommon occurrence. - if (vmaxvq_u16(surrogates_bytemask) == 0) { - // case: words from register produce either 1, 2 or 3 UTF-8 bytes -#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO - const uint16x8_t dup_even = make_uint16x8_t(0x0000, 0x0202, 0x0404, 0x0606, - 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); -#else - const uint16x8_t dup_even = {0x0000, 0x0202, 0x0404, 0x0606, - 0x0808, 0x0a0a, 0x0c0c, 0x0e0e}; -#endif - /* In this branch we handle three cases: - 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - single UFT-8 byte - 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two UTF-8 bytes - 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes + const uint64_t surrogates_wordmask0 = ((in16 & v_f8) == v_d8).to_bitmask64(); + const uint64_t surrogates_wordmask1 = ((nextin16 & v_f8) == v_d8).to_bitmask64(); - We expand the input word (16-bit) into two words (32-bit), thus - we have room for four bytes. However, we need five distinct bit - layouts. Note that the last byte in cases #2 and #3 is the same. 
+ // Check for surrogates + if (surrogates_wordmask0 != 0 || surrogates_wordmask1 != 0) { + // Cannot be UTF8 + is_utf8 = false; + // Can still be either UTF-16LE or UTF-32 depending on the positions of the surrogates + // To be valid UTF-32, a surrogate cannot be in the two most significant bytes of any 32-bit word. + // On the other hand, to be valid UTF-16LE, at least one surrogate must be in the two most significant + // bytes of a 32-bit word since they always come in pairs in UTF-16LE. + // Note that we always proceed in multiple of 4 before this point so there is no offset in 32-bit code units. - We precompute byte 1 for case #1 and the common byte for cases #2 & #3 - in register t2. + if (((surrogates_wordmask0 | surrogates_wordmask1) & 0xf0f0f0f0f0f0f0f0) != 0) { + is_utf32 = false; + // Code from arm_validate_utf16le.cpp + // Not efficient, we do not process surrogates_wordmask1 + const char16_t * input = reinterpret_cast(buf); + const char16_t* end16 = reinterpret_cast(start) + len/2; - We precompute byte 1 for case #3 and -- **conditionally** -- precompute - either byte 1 for case #2 or byte 2 for case #3. Note that they - differ by exactly one bit. + const auto v_fc = simd8::splat(0xfc); + const auto v_dc = simd8::splat(0xdc); - Finally from these two words we build proper UTF-8 sequence, taking - into account the case (i.e, the number of bytes to write). 
- */ - /** - * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: - * t2 => [0ccc|cccc] [10cc|cccc] - * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) - */ -#define simdutf_vec(x) vmovq_n_u16(static_cast(x)) - // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] - const uint16x8_t t0 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in), vreinterpretq_u8_u16(dup_even))); - // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] - const uint16x8_t t1 = vandq_u16(t0, simdutf_vec(0b0011111101111111)); - // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] - const uint16x8_t t2 = vorrq_u16 (t1, simdutf_vec(0b1000000000000000)); + const uint64_t V0 = ~surrogates_wordmask0; - // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa] - const uint16x8_t s0 = vshrq_n_u16(in, 12); - // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000] - const uint16x8_t s1 = vandq_u16(in, simdutf_vec(0b0000111111000000)); - // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000] - const uint16x8_t s1s = vshlq_n_u16(s1, 2); - // [00bb|bbbb|0000|aaaa] - const uint16x8_t s2 = vorrq_u16(s0, s1s); - // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] - const uint16x8_t s3 = vorrq_u16(s2, simdutf_vec(0b1100000011100000)); - const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF); - const uint16x8_t one_or_two_bytes_bytemask = vcleq_u16(in, v_07ff); - const uint16x8_t m0 = vbicq_u16(simdutf_vec(0b0100000000000000), one_or_two_bytes_bytemask); - const uint16x8_t s4 = veorq_u16(s3, m0); -#undef simdutf_vec + const auto vH0 = ((in16 & v_fc) == v_dc); + const uint64_t H0 = vH0.to_bitmask64(); - // 4. expand words 16-bit => 32-bit - const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4)); - const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4)); + const uint64_t L0 = ~H0 & surrogates_wordmask0; - // 5. 
compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle - const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F); - const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f); -#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO - const uint16x8_t onemask = make_uint16x8_t(0x0001, 0x0004, - 0x0010, 0x0040, - 0x0100, 0x0400, - 0x1000, 0x4000 ); - const uint16x8_t twomask = make_uint16x8_t(0x0002, 0x0008, - 0x0020, 0x0080, - 0x0200, 0x0800, - 0x2000, 0x8000 ); -#else - const uint16x8_t onemask = { 0x0001, 0x0004, - 0x0010, 0x0040, - 0x0100, 0x0400, - 0x1000, 0x4000 }; - const uint16x8_t twomask = { 0x0002, 0x0008, - 0x0020, 0x0080, - 0x0200, 0x0800, - 0x2000, 0x8000 }; -#endif - const uint16x8_t combined = vorrq_u16(vandq_u16(one_byte_bytemask, onemask), vandq_u16(one_or_two_bytes_bytemask, twomask)); - const uint16_t mask = vaddvq_u16(combined); - // The following fast path may or may not be beneficial. - /*if(mask == 0) { - // We only have three-byte words. Use fast path. - const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0}; - const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle); - const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle); - vst1q_u8(utf8_output, utf8_0); - utf8_output += 12; - vst1q_u8(utf8_output, utf8_1); - utf8_output += 12; - buf += 8; - continue; - }*/ - const uint8_t mask0 = uint8_t(mask); - - const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; - const uint8x16_t shuffle0 = vld1q_u8(row0 + 1); - const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0); - - const uint8_t mask1 = static_cast(mask >> 8); - const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; - const uint8x16_t shuffle1 = vld1q_u8(row1 + 1); - const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1); - - vst1q_u8(utf8_output, utf8_0); - utf8_output += row0[0]; - vst1q_u8(utf8_output, utf8_1); - utf8_output += row1[0]; + const uint64_t a0 = L0 & (H0 >> 4); - buf += 8; - // surrogate pair(s) in a register - } else { - // 
Let us do a scalar fallback. - // It may seem wasteful to use scalar code, but being efficient with SIMD - // in the presence of surrogate pairs may require non-trivial tables. - size_t forward = 15; - size_t k = 0; - if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} - for(; k < forward; k++) { - uint16_t word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k]) : buf[k]; - if((word & 0xFF80)==0) { - *utf8_output++ = char(word); - } else if((word & 0xF800)==0) { - *utf8_output++ = char((word>>6) | 0b11000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - } else if((word &0xF800 ) != 0xD800) { - *utf8_output++ = char((word>>12) | 0b11100000); - *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - } else { - // must be a surrogate pair - uint16_t diff = uint16_t(word - 0xD800); - uint16_t next_word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1]; - k++; - uint16_t diff2 = uint16_t(next_word - 0xDC00); - if((diff | diff2) > 0x3FF) { return std::make_pair(nullptr, reinterpret_cast(utf8_output)); } - uint32_t value = (diff << 10) + diff2 + 0x10000; - *utf8_output++ = char((value>>18) | 0b11110000); - *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000); - *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000); - *utf8_output++ = char((value & 0b111111) | 0b10000000); - } - } - buf += k; - } - } // while + const uint64_t b0 = a0 << 4; - return std::make_pair(buf, reinterpret_cast(utf8_output)); -} + const uint64_t c0 = V0 | a0 | b0; + if (c0 == ~0ull) { + input += 16; + } else if (c0 == 0xfffffffffffffffull) { + input += 15; + } else { + is_utf16 = false; + break; + } + while (input + 16 < end16) { + const auto in0 = simd16(input); + const auto in1 = simd16(input + simd16::SIZE / sizeof(char16_t)); + const auto t0 = in0.shr<8>(); + const auto t1 = in1.shr<8>(); + const simd8 in_16 = simd16::pack(t0, t1); -/* - 
Returns a pair: a result struct and utf8_output. - If there is an error, the count field of the result is the position of the error. - Otherwise, it is the position of the first unprocessed byte in buf (even if finished). - A scalar routing should carry on the conversion of the tail if needed. -*/ -template -std::pair arm_convert_utf16_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_out) { - uint8_t * utf8_output = reinterpret_cast(utf8_out); - const char16_t* start = buf; - const char16_t* end = buf + len; + const uint64_t surrogates_wordmask = ((in_16 & v_f8) == v_d8).to_bitmask64(); + if(surrogates_wordmask == 0) { + input += 16; + } else { + const uint64_t V = ~surrogates_wordmask; - const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800); - const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800); - const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080); + const auto vH = ((in_16 & v_fc) == v_dc); + const uint64_t H = vH.to_bitmask64(); - while (buf + 16 <= end) { - uint16x8_t in = vld1q_u16(reinterpret_cast(buf)); - if (!match_system(big_endian)) { - #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO - const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); - #else - const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; - #endif - in = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in), swap)); - } - if(vmaxvq_u16(in) <= 0x7F) { // ASCII fast path!!!! - // It is common enough that we have sequences of 16 consecutive ASCII characters. - uint16x8_t nextin = vld1q_u16(reinterpret_cast(buf) + 8); - if (!match_system(big_endian)) { - #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO - const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); - #else - const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; - #endif - nextin = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(nextin), swap)); - } - if(vmaxvq_u16(nextin) > 0x7F) { - // 1. 
pack the bytes - // obviously suboptimal. - uint8x8_t utf8_packed = vmovn_u16(in); - // 2. store (8 bytes) - vst1_u8(utf8_output, utf8_packed); - // 3. adjust pointers - buf += 8; - utf8_output += 8; - in = nextin; - } else { - // 1. pack the bytes - // obviously suboptimal. - uint8x16_t utf8_packed = vmovn_high_u16(vmovn_u16(in), nextin); - // 2. store (16 bytes) - vst1q_u8(utf8_output, utf8_packed); - // 3. adjust pointers - buf += 16; - utf8_output += 16; - continue; // we are done for this round! - } - } + const uint64_t L = ~H & surrogates_wordmask; - if (vmaxvq_u16(in) <= 0x7FF) { - // 1. prepare 2-byte values - // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 - // expected output : [110a|aaaa|10bb|bbbb] x 8 - const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00); - const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f); + const uint64_t a = L & (H >> 4); - // t0 = [000a|aaaa|bbbb|bb00] - const uint16x8_t t0 = vshlq_n_u16(in, 2); - // t1 = [000a|aaaa|0000|0000] - const uint16x8_t t1 = vandq_u16(t0, v_1f00); - // t2 = [0000|0000|00bb|bbbb] - const uint16x8_t t2 = vandq_u16(in, v_003f); - // t3 = [000a|aaaa|00bb|bbbb] - const uint16x8_t t3 = vorrq_u16(t1, t2); - // t4 = [110a|aaaa|10bb|bbbb] - const uint16x8_t t4 = vorrq_u16(t3, v_c080); - // 2. merge ASCII and 2-byte codewords - const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F); - const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f); - const uint8x16_t utf8_unpacked = vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, in, t4)); - // 3. prepare bitmask for 8-bit lookup -#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO - const uint16x8_t mask = make_uint16x8_t(0x0001, 0x0004, - 0x0010, 0x0040, - 0x0002, 0x0008, - 0x0020, 0x0080); -#else - const uint16x8_t mask = { 0x0001, 0x0004, - 0x0010, 0x0040, - 0x0002, 0x0008, - 0x0020, 0x0080 }; -#endif - uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask)); - // 4. 
pack the bytes - const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0]; - const uint8x16_t shuffle = vld1q_u8(row + 1); - const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle); + const uint64_t b = a << 4; - // 5. store bytes - vst1q_u8(utf8_output, utf8_packed); + const uint64_t c = V | a | b; + if (c == ~0ull) { + input += 16; + } else if (c == 0xfffffffffffffffull) { + input += 15; + } else { + is_utf16 = false; + break; + } + } + } + } else { + is_utf16 = false; + // Check for UTF-32 + if (len % 4 == 0) { + const char32_t * input = reinterpret_cast(buf); + const char32_t* end32 = reinterpret_cast(start) + len/4; - // 6. adjust pointers - buf += 8; - utf8_output += row[0]; - continue; + // Must start checking for surrogates + uint32x4_t currentoffsetmax = vmovq_n_u32(0x0); + const uint32x4_t offset = vmovq_n_u32(0xffff2000); + const uint32x4_t standardoffsetmax = vmovq_n_u32(0xfffff7ff); - } - const uint16x8_t surrogates_bytemask = vceqq_u16(vandq_u16(in, v_f800), v_d800); - // It might seem like checking for surrogates_bitmask == 0xc000 could help. However, - // it is likely an uncommon occurrence. - if (vmaxvq_u16(surrogates_bytemask) == 0) { - // case: words from register produce either 1, 2 or 3 UTF-8 bytes -#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO - const uint16x8_t dup_even = make_uint16x8_t(0x0000, 0x0202, 0x0404, 0x0606, - 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); -#else - const uint16x8_t dup_even = {0x0000, 0x0202, 0x0404, 0x0606, - 0x0808, 0x0a0a, 0x0c0c, 0x0e0e}; -#endif - /* In this branch we handle three cases: - 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - single UFT-8 byte - 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two UTF-8 bytes - 3. 
[aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes + const uint32x4_t in32 = vreinterpretq_u32_u16(in); + const uint32x4_t secondin32 = vreinterpretq_u32_u16(secondin); + const uint32x4_t thirdin32 = vreinterpretq_u32_u16(thirdin); + const uint32x4_t fourthin32 = vreinterpretq_u32_u16(fourthin); - We expand the input word (16-bit) into two words (32-bit), thus - we have room for four bytes. However, we need five distinct bit - layouts. Note that the last byte in cases #2 and #3 is the same. + currentmax = vmaxq_u32(in32,currentmax); + currentmax = vmaxq_u32(secondin32,currentmax); + currentmax = vmaxq_u32(thirdin32,currentmax); + currentmax = vmaxq_u32(fourthin32,currentmax); - We precompute byte 1 for case #1 and the common byte for cases #2 & #3 - in register t2. + currentoffsetmax = vmaxq_u32(vaddq_u32(in32, offset), currentoffsetmax); + currentoffsetmax = vmaxq_u32(vaddq_u32(secondin32, offset), currentoffsetmax); + currentoffsetmax = vmaxq_u32(vaddq_u32(thirdin32, offset), currentoffsetmax); + currentoffsetmax = vmaxq_u32(vaddq_u32(fourthin32, offset), currentoffsetmax); - We precompute byte 1 for case #3 and -- **conditionally** -- precompute - either byte 1 for case #2 or byte 2 for case #3. Note that they - differ by exactly one bit. + while (input + 4 < end32) { + const uint32x4_t in_32 = vld1q_u32(reinterpret_cast(input)); + currentmax = vmaxq_u32(in_32,currentmax); + currentoffsetmax = vmaxq_u32(vaddq_u32(in_32, offset), currentoffsetmax); + input += 4; + } - Finally from these two words we build proper UTF-8 sequence, taking - into account the case (i.e, the number of bytes to write). 
- */ - /** - * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: - * t2 => [0ccc|cccc] [10cc|cccc] - * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) - */ -#define simdutf_vec(x) vmovq_n_u16(static_cast(x)) - // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] - const uint16x8_t t0 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in), vreinterpretq_u8_u16(dup_even))); - // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] - const uint16x8_t t1 = vandq_u16(t0, simdutf_vec(0b0011111101111111)); - // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] - const uint16x8_t t2 = vorrq_u16 (t1, simdutf_vec(0b1000000000000000)); + uint32x4_t forbidden_words = veorq_u32(vmaxq_u32(currentoffsetmax, standardoffsetmax), standardoffsetmax); + if(vmaxvq_u32(forbidden_words) != 0) { + is_utf32 = false; + } + } else { + is_utf32 = false; + } + } + break; + } + // If no surrogate, validate under other encodings as well - // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa] - const uint16x8_t s0 = vshrq_n_u16(in, 12); - // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000] - const uint16x8_t s1 = vandq_u16(in, simdutf_vec(0b0000111111000000)); - // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000] - const uint16x8_t s1s = vshlq_n_u16(s1, 2); - // [00bb|bbbb|0000|aaaa] - const uint16x8_t s2 = vorrq_u16(s0, s1s); - // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] - const uint16x8_t s3 = vorrq_u16(s2, simdutf_vec(0b1100000011100000)); - const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF); - const uint16x8_t one_or_two_bytes_bytemask = vcleq_u16(in, v_07ff); - const uint16x8_t m0 = vbicq_u16(simdutf_vec(0b0100000000000000), one_or_two_bytes_bytemask); - const uint16x8_t s4 = veorq_u16(s3, m0); -#undef simdutf_vec + // UTF-32 validation + currentmax = vmaxq_u32(vreinterpretq_u32_u16(in),currentmax); + currentmax = vmaxq_u32(vreinterpretq_u32_u16(secondin),currentmax); + currentmax = vmaxq_u32(vreinterpretq_u32_u16(thirdin),currentmax); + currentmax = 
vmaxq_u32(vreinterpretq_u32_u16(fourthin),currentmax); - // 4. expand words 16-bit => 32-bit - const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4)); - const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4)); + // UTF-8 validation + // Relies on ../generic/utf8_validation/utf8_lookup4_algorithm.h + simd::simd8x64 in8(vreinterpretq_u8_u16(in), vreinterpretq_u8_u16(secondin), vreinterpretq_u8_u16(thirdin), vreinterpretq_u8_u16(fourthin)); + check.check_next_input(in8); - // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle - const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F); - const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f); -#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO - const uint16x8_t onemask = make_uint16x8_t(0x0001, 0x0004, - 0x0010, 0x0040, - 0x0100, 0x0400, - 0x1000, 0x4000 ); - const uint16x8_t twomask = make_uint16x8_t(0x0002, 0x0008, - 0x0020, 0x0080, - 0x0200, 0x0800, - 0x2000, 0x8000 ); -#else - const uint16x8_t onemask = { 0x0001, 0x0004, - 0x0010, 0x0040, - 0x0100, 0x0400, - 0x1000, 0x4000 }; - const uint16x8_t twomask = { 0x0002, 0x0008, - 0x0020, 0x0080, - 0x0200, 0x0800, - 0x2000, 0x8000 }; -#endif - const uint16x8_t combined = vorrq_u16(vandq_u16(one_byte_bytemask, onemask), vandq_u16(one_or_two_bytes_bytemask, twomask)); - const uint16_t mask = vaddvq_u16(combined); - // The following fast path may or may not be beneficial. - /*if(mask == 0) { - // We only have three-byte words. Use fast path. 
- const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0}; - const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle); - const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle); - vst1q_u8(utf8_output, utf8_0); - utf8_output += 12; - vst1q_u8(utf8_output, utf8_1); - utf8_output += 12; - buf += 8; - continue; - }*/ - const uint8_t mask0 = uint8_t(mask); + buf += 64; + } - const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; - const uint8x16_t shuffle0 = vld1q_u8(row0 + 1); - const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0); + // Check which encodings are possible - const uint8_t mask1 = static_cast(mask >> 8); - const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; - const uint8x16_t shuffle1 = vld1q_u8(row1 + 1); - const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1); + if (is_utf8) { + if (static_cast(buf - start) != len) { + uint8_t block[64]{}; + std::memset(block, 0x20, 64); + std::memcpy(block, buf, len - (buf - start)); + simd::simd8x64 in(block); + check.check_next_input(in); + } + if (!check.errors()) { + out |= simdutf::encoding_type::UTF8; + } + } - vst1q_u8(utf8_output, utf8_0); - utf8_output += row0[0]; - vst1q_u8(utf8_output, utf8_1); - utf8_output += row1[0]; + if (is_utf16 && scalar::utf16::validate(reinterpret_cast(buf), (len - (buf - start))/2)) { + out |= simdutf::encoding_type::UTF16_LE; + } - buf += 8; - // surrogate pair(s) in a register - } else { - // Let us do a scalar fallback. - // It may seem wasteful to use scalar code, but being efficient with SIMD - // in the presence of surrogate pairs may require non-trivial tables. - size_t forward = 15; - size_t k = 0; - if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} - for(; k < forward; k++) { - uint16_t word = !match_system(big_endian) ? 
scalar::utf16::swap_bytes(buf[k]) : buf[k]; - if((word & 0xFF80)==0) { - *utf8_output++ = char(word); - } else if((word & 0xF800)==0) { - *utf8_output++ = char((word>>6) | 0b11000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - } else if((word &0xF800 ) != 0xD800) { - *utf8_output++ = char((word>>12) | 0b11100000); - *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - } else { - // must be a surrogate pair - uint16_t diff = uint16_t(word - 0xD800); - uint16_t next_word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1]; - k++; - uint16_t diff2 = uint16_t(next_word - 0xDC00); - if((diff | diff2) > 0x3FF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k - 1), reinterpret_cast(utf8_output)); } - uint32_t value = (diff << 10) + diff2 + 0x10000; - *utf8_output++ = char((value>>18) | 0b11110000); - *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000); - *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000); - *utf8_output++ = char((value & 0b111111) | 0b10000000); + if (is_utf32 && (len % 4 == 0)) { + const uint32x4_t standardmax = vmovq_n_u32(0x10ffff); + uint32x4_t is_zero = veorq_u32(vmaxq_u32(currentmax, standardmax), standardmax); + if (vmaxvq_u32(is_zero) == 0 && scalar::utf32::validate(reinterpret_cast(buf), (len - (buf - start))/4)) { + out |= simdutf::encoding_type::UTF32_LE; } - } - buf += k; } - } // while - return std::make_pair(result(error_code::SUCCESS, buf - start), reinterpret_cast(utf8_output)); + return out; } -/* end file src/arm64/arm_convert_utf16_to_utf8.cpp */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=arm64/arm_convert_utf16_to_utf32.cpp -/* begin file src/arm64/arm_convert_utf16_to_utf32.cpp */ -/* - The vectorized algorithm works on single SSE register i.e., it - loads eight 16-bit words. 
+/* end file src/arm64/arm_detect_encodings.cpp */ - We consider three cases: - 1. an input register contains no surrogates and each value - is in range 0x0000 .. 0x07ff. - 2. an input register contains no surrogates and values are - is in range 0x0000 .. 0xffff. - 3. an input register contains surrogates --- i.e. codepoints - can have 16 or 32 bits. +/* begin file src/arm64/arm_validate_utf16.cpp */ +template +const char16_t* arm_validate_utf16(const char16_t* input, size_t size) { + const char16_t* end = input + size; + const auto v_d8 = simd8::splat(0xd8); + const auto v_f8 = simd8::splat(0xf8); + const auto v_fc = simd8::splat(0xfc); + const auto v_dc = simd8::splat(0xdc); + while (input + 16 < end) { + // 0. Load data: since the validation takes into account only higher + // byte of each word, we compress the two vectors into one which + // consists only the higher bytes. + auto in0 = simd16(input); + auto in1 = simd16(input + simd16::SIZE / sizeof(char16_t)); + if (!match_system(big_endian)) { + in0 = vrev16q_u8(in0); + in1 = vrev16q_u8(in1); + } + const auto t0 = in0.shr<8>(); + const auto t1 = in1.shr<8>(); + const simd8 in = simd16::pack(t0, t1); + // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy). + const uint64_t surrogates_wordmask = ((in & v_f8) == v_d8).to_bitmask64(); + if(surrogates_wordmask == 0) { + input += 16; + } else { + // 2. We have some surrogates that have to be distinguished: + // - low surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF) + // - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF) + // + // Fact: high surrogate has 11th bit set (3rd bit in the higher word) - Ad 1. + // V - non-surrogate code units + // V = not surrogates_wordmask + const uint64_t V = ~surrogates_wordmask; - When values are less than 0x0800, it means that a 16-bit words - can be converted into: 1) single UTF8 byte (when it's an ASCII - char) or 2) two UTF8 bytes. 
+ // H - word-mask for high surrogates: the six highest bits are 0b1101'11 + const auto vH = ((in & v_fc) == v_dc); + const uint64_t H = vH.to_bitmask64(); - For this case we do only some shuffle to obtain these 2-byte - codes and finally compress the whole SSE register with a single - shuffle. + // L - word mask for low surrogates + // L = not H and surrogates_wordmask + const uint64_t L = ~H & surrogates_wordmask; - We need 256-entry lookup table to get a compression pattern - and the number of output bytes in the compressed vector register. - Each entry occupies 17 bytes. + const uint64_t a = L & (H >> 4); // A low surrogate must be followed by high one. + // (A low surrogate placed in the 7th register's word + // is an exception we handle.) + const uint64_t b = a << 4; // Just mark that the opposite fact is hold, + // thanks to that we have only two masks for valid case. + const uint64_t c = V | a | b; // Combine all the masks into the final one. + if (c == ~0ull) { + // The whole input register contains valid UTF-16, i.e., + // either single code units or proper surrogate pairs. + input += 16; + } else if (c == 0xfffffffffffffffull) { + // The 15 lower code units of the input register contains valid UTF-16. + // The 15th word may be either a low or high surrogate. It the next + // iteration we 1) check if the low surrogate is followed by a high + // one, 2) reject sole high surrogate. + input += 15; + } else { + return nullptr; + } + } + } + return input; +} - Ad 2. - When values fit in 16-bit words, but are above 0x07ff, then - a single word may produce one, two or three UTF8 bytes. +template +const result arm_validate_utf16_with_errors(const char16_t* input, size_t size) { + const char16_t* start = input; + const char16_t* end = input + size; - We prepare data for all these three cases in two registers. - The first register contains lower two UTF8 bytes (used in all - cases), while the second one contains just the third byte for - the three-UTF8-bytes case. 
- - Finally these two registers are interleaved forming eight-element - array of 32-bit values. The array spans two SSE registers. - The bytes from the registers are compressed using two shuffles. - - We need 256-entry lookup table to get a compression pattern - and the number of output bytes in the compressed vector register. - Each entry occupies 17 bytes. + const auto v_d8 = simd8::splat(0xd8); + const auto v_f8 = simd8::splat(0xf8); + const auto v_fc = simd8::splat(0xfc); + const auto v_dc = simd8::splat(0xdc); + while (input + 16 < end) { + // 0. Load data: since the validation takes into account only higher + // byte of each word, we compress the two vectors into one which + // consists only the higher bytes. + auto in0 = simd16(input); + auto in1 = simd16(input + simd16::SIZE / sizeof(char16_t)); + if (!match_system(big_endian)) { + in0 = vrev16q_u8(in0); + in1 = vrev16q_u8(in1); + } + const auto t0 = in0.shr<8>(); + const auto t1 = in1.shr<8>(); + const simd8 in = simd16::pack(t0, t1); + // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy). + const uint64_t surrogates_wordmask = ((in & v_f8) == v_d8).to_bitmask64(); + if(surrogates_wordmask == 0) { + input += 16; + } else { + // 2. We have some surrogates that have to be distinguished: + // - low surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF) + // - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF) + // + // Fact: high surrogate has 11th bit set (3rd bit in the higher word) - To summarize: - - We need two 256-entry tables that have 8704 bytes in total. -*/ -/* - Returns a pair: the first unprocessed byte from buf and utf8_output - A scalar routing should carry on the conversion of the tail. 
-*/ -template -std::pair arm_convert_utf16_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_out) { - uint32_t * utf32_output = reinterpret_cast(utf32_out); - const char16_t* end = buf + len; + // V - non-surrogate code units + // V = not surrogates_wordmask + const uint64_t V = ~surrogates_wordmask; - const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800); - const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800); + // H - word-mask for high surrogates: the six highest bits are 0b1101'11 + const auto vH = ((in & v_fc) == v_dc); + const uint64_t H = vH.to_bitmask64(); - while (buf + 16 <= end) { - uint16x8_t in = vld1q_u16(reinterpret_cast(buf)); - if (!match_system(big_endian)) { - #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO - const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); - #else - const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; - #endif - in = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in), swap)); - } + // L - word mask for low surrogates + // L = not H and surrogates_wordmask + const uint64_t L = ~H & surrogates_wordmask; - const uint16x8_t surrogates_bytemask = vceqq_u16(vandq_u16(in, v_f800), v_d800); - // It might seem like checking for surrogates_bitmask == 0xc000 could help. However, - // it is likely an uncommon occurrence. - if (vmaxvq_u16(surrogates_bytemask) == 0) { - // case: no surrogate pairs, extend all 16-bit words to 32-bit words - vst1q_u32(utf32_output, vmovl_u16(vget_low_u16(in))); - vst1q_u32(utf32_output+4, vmovl_high_u16(in)); - utf32_output += 8; - buf += 8; - // surrogate pair(s) in a register - } else { - // Let us do a scalar fallback. - // It may seem wasteful to use scalar code, but being efficient with SIMD - // in the presence of surrogate pairs may require non-trivial tables. 
- size_t forward = 15; - size_t k = 0; - if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} - for(; k < forward; k++) { - uint16_t word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k]) : buf[k]; - if((word &0xF800 ) != 0xD800) { - *utf32_output++ = char32_t(word); - } else { - // must be a surrogate pair - uint16_t diff = uint16_t(word - 0xD800); - uint16_t next_word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1]; - k++; - uint16_t diff2 = uint16_t(next_word - 0xDC00); - if((diff | diff2) > 0x3FF) { return std::make_pair(nullptr, reinterpret_cast(utf32_output)); } - uint32_t value = (diff << 10) + diff2 + 0x10000; - *utf32_output++ = char32_t(value); + const uint64_t a = L & (H >> 4); // A low surrogate must be followed by high one. + // (A low surrogate placed in the 7th register's word + // is an exception we handle.) + const uint64_t b = a << 4; // Just mark that the opposite fact is hold, + // thanks to that we have only two masks for valid case. + const uint64_t c = V | a | b; // Combine all the masks into the final one. + if (c == ~0ull) { + // The whole input register contains valid UTF-16, i.e., + // either single code units or proper surrogate pairs. + input += 16; + } else if (c == 0xfffffffffffffffull) { + // The 15 lower code units of the input register contains valid UTF-16. + // The 15th word may be either a low or high surrogate. It the next + // iteration we 1) check if the low surrogate is followed by a high + // one, 2) reject sole high surrogate. 
+ input += 15; + } else { + return result(error_code::SURROGATE, input - start); + } } - } - buf += k; } - } // while - return std::make_pair(buf, reinterpret_cast(utf32_output)); + return result(error_code::SUCCESS, input - start); } +/* end file src/arm64/arm_validate_utf16.cpp */ +/* begin file src/arm64/arm_validate_utf32le.cpp */ +const char32_t* arm_validate_utf32le(const char32_t* input, size_t size) { + const char32_t* end = input + size; -/* - Returns a pair: a result struct and utf8_output. - If there is an error, the count field of the result is the position of the error. - Otherwise, it is the position of the first unprocessed byte in buf (even if finished). - A scalar routing should carry on the conversion of the tail if needed. -*/ -template -std::pair arm_convert_utf16_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_out) { - uint32_t * utf32_output = reinterpret_cast(utf32_out); - const char16_t* start = buf; - const char16_t* end = buf + len; + const uint32x4_t standardmax = vmovq_n_u32(0x10ffff); + const uint32x4_t offset = vmovq_n_u32(0xffff2000); + const uint32x4_t standardoffsetmax = vmovq_n_u32(0xfffff7ff); + uint32x4_t currentmax = vmovq_n_u32(0x0); + uint32x4_t currentoffsetmax = vmovq_n_u32(0x0); - const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800); - const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800); + while (input + 4 < end) { + const uint32x4_t in = vld1q_u32(reinterpret_cast(input)); + currentmax = vmaxq_u32(in,currentmax); + currentoffsetmax = vmaxq_u32(vaddq_u32(in, offset), currentoffsetmax); + input += 4; + } - while (buf + 16 <= end) { - uint16x8_t in = vld1q_u16(reinterpret_cast(buf)); - if (!match_system(big_endian)) { - #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO - const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); - #else - const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; - #endif - in = 
vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in), swap)); + uint32x4_t is_zero = veorq_u32(vmaxq_u32(currentmax, standardmax), standardmax); + if(vmaxvq_u32(is_zero) != 0) { + return nullptr; } - const uint16x8_t surrogates_bytemask = vceqq_u16(vandq_u16(in, v_f800), v_d800); - // It might seem like checking for surrogates_bitmask == 0xc000 could help. However, - // it is likely an uncommon occurrence. - if (vmaxvq_u16(surrogates_bytemask) == 0) { - // case: no surrogate pairs, extend all 16-bit words to 32-bit words - vst1q_u32(utf32_output, vmovl_u16(vget_low_u16(in))); - vst1q_u32(utf32_output+4, vmovl_high_u16(in)); - utf32_output += 8; - buf += 8; - // surrogate pair(s) in a register - } else { - // Let us do a scalar fallback. - // It may seem wasteful to use scalar code, but being efficient with SIMD - // in the presence of surrogate pairs may require non-trivial tables. - size_t forward = 15; - size_t k = 0; - if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} - for(; k < forward; k++) { - uint16_t word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k]) : buf[k]; - if((word &0xF800 ) != 0xD800) { - *utf32_output++ = char32_t(word); - } else { - // must be a surrogate pair - uint16_t diff = uint16_t(word - 0xD800); - uint16_t next_word = !match_system(big_endian) ? 
scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1]; - k++; - uint16_t diff2 = uint16_t(next_word - 0xDC00); - if((diff | diff2) > 0x3FF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k - 1), reinterpret_cast(utf32_output)); } - uint32_t value = (diff << 10) + diff2 + 0x10000; - *utf32_output++ = char32_t(value); - } - } - buf += k; + is_zero = veorq_u32(vmaxq_u32(currentoffsetmax, standardoffsetmax), standardoffsetmax); + if(vmaxvq_u32(is_zero) != 0) { + return nullptr; } - } // while - return std::make_pair(result(error_code::SUCCESS, buf - start), reinterpret_cast(utf32_output)); + + return input; } -/* end file src/arm64/arm_convert_utf16_to_utf32.cpp */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=arm64/arm_convert_utf32_to_utf8.cpp -/* begin file src/arm64/arm_convert_utf32_to_utf8.cpp */ -std::pair arm_convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_out) { - uint8_t * utf8_output = reinterpret_cast(utf8_out); - const char32_t* end = buf + len; - const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080); +const result arm_validate_utf32le_with_errors(const char32_t* input, size_t size) { + const char32_t* start = input; + const char32_t* end = input + size; - uint16x8_t forbidden_bytemask = vmovq_n_u16(0x0); + const uint32x4_t standardmax = vmovq_n_u32(0x10ffff); + const uint32x4_t offset = vmovq_n_u32(0xffff2000); + const uint32x4_t standardoffsetmax = vmovq_n_u32(0xfffff7ff); + uint32x4_t currentmax = vmovq_n_u32(0x0); + uint32x4_t currentoffsetmax = vmovq_n_u32(0x0); - while (buf + 16 <= end) { - uint32x4_t in = vld1q_u32(reinterpret_cast(buf)); - uint32x4_t nextin = vld1q_u32(reinterpret_cast(buf+4)); + while (input + 4 < end) { + const uint32x4_t in = vld1q_u32(reinterpret_cast(input)); + currentmax = vmaxq_u32(in,currentmax); + currentoffsetmax = vmaxq_u32(vaddq_u32(in, offset), currentoffsetmax); - // Check if no bits set above 16th - if(vmaxvq_u32(vorrq_u32(in, nextin)) <= 
0xFFFF) { - // Pack UTF-32 to UTF-16 safely (without surrogate pairs) - // Apply UTF-16 => UTF-8 routine (arm_convert_utf16_to_utf8.cpp) - uint16x8_t utf16_packed = vcombine_u16(vmovn_u32(in), vmovn_u32(nextin)); - if(vmaxvq_u16(utf16_packed) <= 0x7F) { // ASCII fast path!!!! - // 1. pack the bytes - // obviously suboptimal. - uint8x8_t utf8_packed = vmovn_u16(utf16_packed); - // 2. store (8 bytes) - vst1_u8(utf8_output, utf8_packed); - // 3. adjust pointers - buf += 8; - utf8_output += 8; - continue; // we are done for this round! - } + uint32x4_t is_zero = veorq_u32(vmaxq_u32(currentmax, standardmax), standardmax); + if(vmaxvq_u32(is_zero) != 0) { + return result(error_code::TOO_LARGE, input - start); + } - if (vmaxvq_u16(utf16_packed) <= 0x7FF) { - // 1. prepare 2-byte values - // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 - // expected output : [110a|aaaa|10bb|bbbb] x 8 - const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00); - const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f); - - // t0 = [000a|aaaa|bbbb|bb00] - const uint16x8_t t0 = vshlq_n_u16(utf16_packed, 2); - // t1 = [000a|aaaa|0000|0000] - const uint16x8_t t1 = vandq_u16(t0, v_1f00); - // t2 = [0000|0000|00bb|bbbb] - const uint16x8_t t2 = vandq_u16(utf16_packed, v_003f); - // t3 = [000a|aaaa|00bb|bbbb] - const uint16x8_t t3 = vorrq_u16(t1, t2); - // t4 = [110a|aaaa|10bb|bbbb] - const uint16x8_t t4 = vorrq_u16(t3, v_c080); - // 2. merge ASCII and 2-byte codewords - const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F); - const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f); - const uint8x16_t utf8_unpacked = vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, utf16_packed, t4)); - // 3. 
prepare bitmask for 8-bit lookup - #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO - const uint16x8_t mask = make_uint16x8_t(0x0001, 0x0004, - 0x0010, 0x0040, - 0x0002, 0x0008, - 0x0020, 0x0080); - #else - const uint16x8_t mask = { 0x0001, 0x0004, - 0x0010, 0x0040, - 0x0002, 0x0008, - 0x0020, 0x0080 }; - #endif - uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask)); - // 4. pack the bytes - const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0]; - const uint8x16_t shuffle = vld1q_u8(row + 1); - const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle); + is_zero = veorq_u32(vmaxq_u32(currentoffsetmax, standardoffsetmax), standardoffsetmax); + if(vmaxvq_u32(is_zero) != 0) { + return result(error_code::SURROGATE, input - start); + } - // 5. store bytes - vst1q_u8(utf8_output, utf8_packed); + input += 4; + } - // 6. adjust pointers - buf += 8; - utf8_output += row[0]; - continue; + return result(error_code::SUCCESS, input - start); +} +/* end file src/arm64/arm_validate_utf32le.cpp */ - } else { - // case: words from register produce either 1, 2 or 3 UTF-8 bytes - const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800); - const uint16x8_t v_dfff = vmovq_n_u16((uint16_t)0xdfff); - forbidden_bytemask = vorrq_u16(vandq_u16(vcleq_u16(utf16_packed, v_dfff), vcgeq_u16(utf16_packed, v_d800)), forbidden_bytemask); +/* begin file src/arm64/arm_convert_latin1_to_utf8.cpp */ +/* + Returns a pair: the first unprocessed byte from buf and utf8_output + A scalar routing should carry on the conversion of the tail. +*/ +std::pair +arm_convert_latin1_to_utf8(const char *latin1_input, size_t len, + char *utf8_out) { + uint8_t *utf8_output = reinterpret_cast(utf8_out); + const char *end = latin1_input + len; + const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080); + while (latin1_input + 16 <= end) { + uint8x16_t in8 = vld1q_u8(reinterpret_cast(latin1_input)); + if (vmaxvq_u8(in8) <= 0x7F) { // ASCII fast path!!!! 
+ vst1q_u8(utf8_output, in8); + utf8_output += 16; + latin1_input += 16; + continue; + } - #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO - const uint16x8_t dup_even = make_uint16x8_t(0x0000, 0x0202, 0x0404, 0x0606, - 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); - #else - const uint16x8_t dup_even = {0x0000, 0x0202, 0x0404, 0x0606, - 0x0808, 0x0a0a, 0x0c0c, 0x0e0e}; - #endif - /* In this branch we handle three cases: - 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - single UFT-8 byte - 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two UTF-8 bytes - 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes + // We just fallback on UTF-16 code. This could be optimized/simplified + // further. + uint16x8_t in16 = vmovl_u8(vget_low_u8(in8)); + // 1. prepare 2-byte values + // input 8-bit word : [aabb|bbbb] x 8 + // expected output : [1100|00aa|10bb|bbbb] x 8 + const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00); + const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f); + + // t0 = [0000|00aa|bbbb|bb00] + const uint16x8_t t0 = vshlq_n_u16(in16, 2); + // t1 = [0000|00aa|0000|0000] + const uint16x8_t t1 = vandq_u16(t0, v_1f00); + // t2 = [0000|0000|00bb|bbbb] + const uint16x8_t t2 = vandq_u16(in16, v_003f); + // t3 = [0000|00aa|00bb|bbbb] + const uint16x8_t t3 = vorrq_u16(t1, t2); + // t4 = [1100|00aa|10bb|bbbb] + const uint16x8_t t4 = vorrq_u16(t3, v_c080); + // 2. merge ASCII and 2-byte codewords + const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F); + const uint16x8_t one_byte_bytemask = vcleq_u16(in16, v_007f); + const uint8x16_t utf8_unpacked = + vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, in16, t4)); + // 3. 
prepare bitmask for 8-bit lookup +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint16x8_t mask = make_uint16x8_t(0x0001, 0x0004, 0x0010, 0x0040, + 0x0002, 0x0008, 0x0020, 0x0080); +#else + const uint16x8_t mask = {0x0001, 0x0004, 0x0010, 0x0040, + 0x0002, 0x0008, 0x0020, 0x0080}; +#endif + uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask)); + // 4. pack the bytes + const uint8_t *row = + &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0]; + const uint8x16_t shuffle = vld1q_u8(row + 1); + const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle); + + // 5. store bytes + vst1q_u8(utf8_output, utf8_packed); + // 6. adjust pointers + latin1_input += 8; + utf8_output += row[0]; - We expand the input word (16-bit) into two words (32-bit), thus - we have room for four bytes. However, we need five distinct bit - layouts. Note that the last byte in cases #2 and #3 is the same. + } // while - We precompute byte 1 for case #1 and the common byte for cases #2 & #3 - in register t2. + return std::make_pair(latin1_input, reinterpret_cast(utf8_output)); +} +/* end file src/arm64/arm_convert_latin1_to_utf8.cpp */ +/* begin file src/arm64/arm_convert_latin1_to_utf16.cpp */ +template +std::pair arm_convert_latin1_to_utf16(const char* buf, size_t len, char16_t* utf16_output) { + const char* end = buf + len; - We precompute byte 1 for case #3 and -- **conditionally** -- precompute - either byte 1 for case #2 or byte 2 for case #3. Note that they - differ by exactly one bit. 
+ while (buf + 16 <= end) { + uint8x16_t in8 = vld1q_u8(reinterpret_cast(buf)); + uint16x8_t inlow = vmovl_u8(vget_low_u8(in8)); + if (!match_system(big_endian)) { inlow = vrev16q_u8(inlow); } + vst1q_u16(reinterpret_cast(utf16_output), inlow); + uint16x8_t inhigh = vmovl_u8(vget_high_u8(in8)); + if (!match_system(big_endian)) { inhigh = vrev16q_u8(inhigh); } + vst1q_u16(reinterpret_cast(utf16_output+8), inhigh); + utf16_output += 16; + buf += 16; + } - Finally from these two words we build proper UTF-8 sequence, taking - into account the case (i.e, the number of bytes to write). - */ - /** - * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: - * t2 => [0ccc|cccc] [10cc|cccc] - * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) - */ - #define simdutf_vec(x) vmovq_n_u16(static_cast(x)) - // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] - const uint16x8_t t0 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(utf16_packed), vreinterpretq_u8_u16(dup_even))); - // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] - const uint16x8_t t1 = vandq_u16(t0, simdutf_vec(0b0011111101111111)); - // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] - const uint16x8_t t2 = vorrq_u16 (t1, simdutf_vec(0b1000000000000000)); + return std::make_pair(buf, utf16_output); +} +/* end file src/arm64/arm_convert_latin1_to_utf16.cpp */ +/* begin file src/arm64/arm_convert_latin1_to_utf32.cpp */ +std::pair arm_convert_latin1_to_utf32(const char* buf, size_t len, char32_t* utf32_output) { + const char* end = buf + len; - // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa] - const uint16x8_t s0 = vshrq_n_u16(utf16_packed, 12); - // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000] - const uint16x8_t s1 = vandq_u16(utf16_packed, simdutf_vec(0b0000111111000000)); - // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000] - const uint16x8_t s1s = vshlq_n_u16(s1, 2); - // [00bb|bbbb|0000|aaaa] - const uint16x8_t s2 = vorrq_u16(s0, s1s); - // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] - const 
uint16x8_t s3 = vorrq_u16(s2, simdutf_vec(0b1100000011100000)); - const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF); - const uint16x8_t one_or_two_bytes_bytemask = vcleq_u16(utf16_packed, v_07ff); - const uint16x8_t m0 = vbicq_u16(simdutf_vec(0b0100000000000000), one_or_two_bytes_bytemask); - const uint16x8_t s4 = veorq_u16(s3, m0); - #undef simdutf_vec + while (buf + 16 <= end) { + uint8x16_t in8 = vld1q_u8(reinterpret_cast(buf)); + uint16x8_t in8low = vmovl_u8(vget_low_u8(in8)); + uint32x4_t in16lowlow = vmovl_u16(vget_low_u16(in8low)); + uint32x4_t in16lowhigh = vmovl_u16(vget_high_u16(in8low)); + uint16x8_t in8high = vmovl_u8(vget_high_u8(in8)); + uint32x4_t in8highlow = vmovl_u16(vget_low_u16(in8high)); + uint32x4_t in8highhigh = vmovl_u16(vget_high_u16(in8high)); + vst1q_u32(reinterpret_cast(utf32_output), in16lowlow); + vst1q_u32(reinterpret_cast(utf32_output+4), in16lowhigh); + vst1q_u32(reinterpret_cast(utf32_output+8), in8highlow); + vst1q_u32(reinterpret_cast(utf32_output+12), in8highhigh); - // 4. expand words 16-bit => 32-bit - const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4)); - const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4)); + utf32_output += 16; + buf += 16; + } - // 5. 
compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle - const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F); - const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f); - #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO - const uint16x8_t onemask = make_uint16x8_t(0x0001, 0x0004, - 0x0010, 0x0040, - 0x0100, 0x0400, - 0x1000, 0x4000 ); - const uint16x8_t twomask = make_uint16x8_t(0x0002, 0x0008, - 0x0020, 0x0080, - 0x0200, 0x0800, - 0x2000, 0x8000 ); - #else - const uint16x8_t onemask = { 0x0001, 0x0004, - 0x0010, 0x0040, - 0x0100, 0x0400, - 0x1000, 0x4000 }; - const uint16x8_t twomask = { 0x0002, 0x0008, - 0x0020, 0x0080, - 0x0200, 0x0800, - 0x2000, 0x8000 }; - #endif - const uint16x8_t combined = vorrq_u16(vandq_u16(one_byte_bytemask, onemask), vandq_u16(one_or_two_bytes_bytemask, twomask)); - const uint16_t mask = vaddvq_u16(combined); - // The following fast path may or may not be beneficial. - /*if(mask == 0) { - // We only have three-byte words. Use fast path. - const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0}; - const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle); - const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle); - vst1q_u8(utf8_output, utf8_0); - utf8_output += 12; - vst1q_u8(utf8_output, utf8_1); - utf8_output += 12; - buf += 8; - continue; - }*/ - const uint8_t mask0 = uint8_t(mask); - const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; - const uint8x16_t shuffle0 = vld1q_u8(row0 + 1); - const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0); + return std::make_pair(buf, utf32_output); +} +/* end file src/arm64/arm_convert_latin1_to_utf32.cpp */ - const uint8_t mask1 = static_cast(mask >> 8); - const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; - const uint8x16_t shuffle1 = vld1q_u8(row1 + 1); - const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1); +/* begin file src/arm64/arm_convert_utf8_to_utf16.cpp */ +// Convert up to 16 bytes from utf8 to utf16 using 
a mask indicating the +// end of the code points. Only the least significant 12 bits of the mask +// are accessed. +// It returns how many bytes were consumed (up to 16, usually 12). +template +size_t convert_masked_utf8_to_utf16(const char *input, + uint64_t utf8_end_of_code_point_mask, + char16_t *&utf16_output) { + // we use an approach where we try to process up to 12 input bytes. + // Why 12 input bytes and not 16? Because we are concerned with the size of + // the lookup tables. Also 12 is nicely divisible by two and three. + // + uint8x16_t in = vld1q_u8(reinterpret_cast(input)); + const uint16_t input_utf8_end_of_code_point_mask = + utf8_end_of_code_point_mask & 0xfff; + // + // Optimization note: our main path below is load-latency dependent. Thus it is maybe + // beneficial to have fast paths that depend on branch prediction but have less latency. + // This results in more instructions but, potentially, also higher speeds. - vst1q_u8(utf8_output, utf8_0); - utf8_output += row0[0]; - vst1q_u8(utf8_output, utf8_1); - utf8_output += row1[0]; + // We first try a few fast paths. + // The obvious first test is ASCII, which actually consumes the full 16. + if((utf8_end_of_code_point_mask & 0xFFFF) == 0xffff) { + // We process in chunks of 16 bytes + // The routine in simd.h is reused. + simd8 temp{vreinterpretq_s8_u8(in)}; + temp.store_ascii_as_utf16(utf16_output); + utf16_output += 16; // We wrote 16 16-bit characters. + return 16; // We consumed 16 bytes. + } - buf += 8; - } - // At least one 32-bit word will produce a surrogate pair in UTF-16 <=> will produce four UTF-8 bytes. - } else { - // Let us do a scalar fallback. - // It may seem wasteful to use scalar code, but being efficient with SIMD - // in the presence of surrogate pairs may require non-trivial tables. 
- size_t forward = 15; - size_t k = 0; - if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} - for(; k < forward; k++) { - uint32_t word = buf[k]; - if((word & 0xFFFFFF80)==0) { - *utf8_output++ = char(word); - } else if((word & 0xFFFFF800)==0) { - *utf8_output++ = char((word>>6) | 0b11000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - } else if((word & 0xFFFF0000)==0) { - if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, reinterpret_cast(utf8_output)); } - *utf8_output++ = char((word>>12) | 0b11100000); - *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - } else { - if (word > 0x10FFFF) { return std::make_pair(nullptr, reinterpret_cast(utf8_output)); } - *utf8_output++ = char((word>>18) | 0b11110000); - *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000); - *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - } - } - buf += k; + // 3 byte sequences are the next most common, as seen in CJK, which has long sequences + // of these. + if (input_utf8_end_of_code_point_mask == 0x924) { + // We want to take 4 3-byte UTF-8 code units and turn them into 4 2-byte UTF-16 code units. + uint16x4_t composed = convert_utf8_3_byte_to_utf16(in); + // Byte swap if necessary + if (!match_system(big_endian)) { + composed = vreinterpret_u16_u8(vrev16_u8(vreinterpret_u8_u16(composed))); } - } // while - - // check for invalid input - if (vmaxvq_u16(forbidden_bytemask) != 0) { - return std::make_pair(nullptr, reinterpret_cast(utf8_output)); + vst1_u16(reinterpret_cast(utf16_output), composed); + utf16_output += 4; // We wrote 4 16-bit characters. + return 12; // We consumed 12 bytes. } - return std::make_pair(buf, reinterpret_cast(utf8_output)); -} + // 2 byte sequences occur in short bursts in languages like Greek and Russian. 
+ if ((utf8_end_of_code_point_mask & 0xFFF) == 0xaaa) { + // We want to take 6 2-byte UTF-8 code units and turn them into 6 2-byte UTF-16 code units. + uint16x8_t composed = convert_utf8_2_byte_to_utf16(in); + // Byte swap if necessary + if (!match_system(big_endian)) { + composed = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(composed))); + } + vst1q_u16(reinterpret_cast(utf16_output), composed); -std::pair arm_convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_out) { - uint8_t * utf8_output = reinterpret_cast(utf8_out); - const char32_t* start = buf; - const char32_t* end = buf + len; + utf16_output += 6; // We wrote 6 16-bit characters. + return 12; // We consumed 12 bytes. + } - const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080); + /// We do not have a fast path available, or the fast path is unimportant, so we fallback. + const uint8_t idx = + simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0]; - while (buf + 16 <= end) { - uint32x4_t in = vld1q_u32(reinterpret_cast(buf)); - uint32x4_t nextin = vld1q_u32(reinterpret_cast(buf+4)); + const uint8_t consumed = + simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1]; - // Check if no bits set above 16th - if(vmaxvq_u32(vorrq_u32(in, nextin)) <= 0xFFFF) { - // Pack UTF-32 to UTF-16 safely (without surrogate pairs) - // Apply UTF-16 => UTF-8 routine (arm_convert_utf16_to_utf8.cpp) - uint16x8_t utf16_packed = vcombine_u16(vmovn_u32(in), vmovn_u32(nextin)); - if(vmaxvq_u16(utf16_packed) <= 0x7F) { // ASCII fast path!!!! - // 1. pack the bytes - // obviously suboptimal. - uint8x8_t utf8_packed = vmovn_u16(utf16_packed); - // 2. store (8 bytes) - vst1_u8(utf8_output, utf8_packed); - // 3. adjust pointers - buf += 8; - utf8_output += 8; - continue; // we are done for this round! 
+ if (idx < 64) { + // SIX (6) input code-code units + // Convert to UTF-16 + uint16x8_t composed = convert_utf8_1_to_2_byte_to_utf16(in, idx); + // Byte swap if necessary + if (!match_system(big_endian)) { + composed = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(composed))); + } + // Store + vst1q_u16(reinterpret_cast(utf16_output), composed); + utf16_output += 6; // We wrote 6 16-bit characters. + return consumed; + } else if (idx < 145) { + // FOUR (4) input code-code units + // UTF-16 and UTF-32 use similar algorithms, but UTF-32 skips the narrowing. + uint8x16_t sh = vld1q_u8(reinterpret_cast(simdutf::tables::utf8_to_utf16::shufutf8[idx])); + // XXX: depending on the system scalar instructions might be faster. + // 1 byte: 00000000 00000000 0ccccccc + // 2 byte: 00000000 110bbbbb 10cccccc + // 3 byte: 1110aaaa 10bbbbbb 10cccccc + uint32x4_t perm = vreinterpretq_u32_u8(vqtbl1q_u8(in, sh)); + // 1 byte: 00000000 0ccccccc + // 2 byte: xx0bbbbb x0cccccc + // 3 byte: xxbbbbbb x0cccccc + uint16x4_t lowperm = vmovn_u32(perm); + // Partially mask with bic (doesn't require a temporary register unlike and) + // The shift left insert below will clear the top bits. + // 1 byte: 00000000 00000000 + // 2 byte: xx0bbbbb 00000000 + // 3 byte: xxbbbbbb 00000000 + uint16x4_t middlebyte = vbic_u16(lowperm, vmov_n_u16(uint16_t(~0xFF00))); + // ASCII + // 1 byte: 00000000 0ccccccc + // 2+byte: 00000000 00cccccc + uint16x4_t ascii = vand_u16(lowperm, vmov_n_u16(0x7F)); + // Split into narrow vectors. 
+ // 2 byte: 00000000 00000000 + // 3 byte: 00000000 xxxxaaaa + uint16x4_t highperm = vshrn_n_u32(perm, 16); + // Shift right accumulate the middle byte + // 1 byte: 00000000 0ccccccc + // 2 byte: 00xx0bbb bbcccccc + // 3 byte: 00xxbbbb bbcccccc + uint16x4_t middlelow = vsra_n_u16(ascii, middlebyte, 2); + // Shift left and insert the top 4 bits, overwriting the garbage + // 1 byte: 00000000 0ccccccc + // 2 byte: 00000bbb bbcccccc + // 3 byte: aaaabbbb bbcccccc + uint16x4_t composed = vsli_n_u16(middlelow, highperm, 12); + // Byte swap if necessary + if (!match_system(big_endian)) { + composed = vreinterpret_u16_u8(vrev16_u8(vreinterpret_u8_u16(composed))); + } + vst1_u16(reinterpret_cast(utf16_output), composed); + + utf16_output += 4; // We wrote 4 16-bit codepoints + return consumed; + } else if (idx < 209) { + // THREE (3) input code-code units + if (input_utf8_end_of_code_point_mask == 0x888) { + // We want to take 3 4-byte UTF-8 code units and turn them into 3 4-byte UTF-16 pairs. + // Generating surrogate pairs is a little tricky though, but it is easier when we + // can assume they are all pairs. + // This version does not use the LUT, but 4 byte sequences are less common and the + // overhead of the extra memory access is less important than the early branch overhead + // in shorter sequences. + + // Swap byte pairs + // 10dddddd 10cccccc|10bbbbbb 11110aaa + // 10cccccc 10dddddd|11110aaa 10bbbbbb + uint8x16_t swap = vrev16q_u8(in); + // Shift left 2 bits + // cccccc00 dddddd00 xxxxxxxx bbbbbb00 + uint32x4_t shift = vreinterpretq_u32_u8(vshlq_n_u8(swap, 2)); + // Create a magic number containing the low 2 bits of the trail surrogate and all the + // corrections needed to create the pair. 
+ // UTF-8 4b prefix = -0x0000|0xF000 + // surrogate offset = -0x0000|0x0040 (0x10000 << 6) + // surrogate high = +0x0000|0xD800 + // surrogate low = +0xDC00|0x0000 + // ------------------------------- + // = +0xDC00|0xE7C0 + uint32x4_t magic = vmovq_n_u32(0xDC00E7C0); + // Generate unadjusted trail surrogate minus lowest 2 bits + // xxxxxxxx xxxxxxxx|11110aaa bbbbbb00 + uint32x4_t trail = vbslq_u32(vmovq_n_u32(0x0000FF00), vreinterpretq_u32_u8(swap), shift); + // Insert low 2 bits of trail surrogate to magic number for later + // 11011100 00000000 11100111 110000cc + uint16x8_t magic_with_low_2 = vreinterpretq_u16_u32(vsraq_n_u32(magic, shift, 30)); + // Generate lead surrogate + // xxxxcccc ccdddddd|xxxxxxxx xxxxxxxx + uint32x4_t lead = vreinterpretq_u32_u16(vsliq_n_u16(vreinterpretq_u16_u8(swap), vreinterpretq_u16_u8(in), 6)); + // Mask out lead + // 000000cc ccdddddd|xxxxxxxx xxxxxxxx + lead = vbicq_u32(lead, vmovq_n_u32(uint32_t(~0x03FFFFFF))); + // Blend pairs + // 000000cc ccdddddd|11110aaa bbbbbb00 + uint16x8_t blend = vreinterpretq_u16_u32(vbslq_u32(vmovq_n_u32(0x0000FFFF), trail, lead)); + // Add magic number to finish the result + // 110111CC CCDDDDDD|110110AA BBBBBBCC + uint16x8_t composed = vaddq_u16(blend, magic_with_low_2); + // Byte swap if necessary + if (!match_system(big_endian)) { + composed = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(composed))); } + vst1q_u16(reinterpret_cast(utf16_output), composed); + utf16_output += 6; // We 3 32-bit surrogate pairs. + return 12; // We consumed 12 bytes. + } + // 3 1-4 byte sequences + uint8x16_t sh = vld1q_u8(reinterpret_cast(simdutf::tables::utf8_to_utf16::shufutf8[idx])); - if (vmaxvq_u16(utf16_packed) <= 0x7FF) { - // 1. 
prepare 2-byte values - // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 - // expected output : [110a|aaaa|10bb|bbbb] x 8 - const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00); - const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f); - - // t0 = [000a|aaaa|bbbb|bb00] - const uint16x8_t t0 = vshlq_n_u16(utf16_packed, 2); - // t1 = [000a|aaaa|0000|0000] - const uint16x8_t t1 = vandq_u16(t0, v_1f00); - // t2 = [0000|0000|00bb|bbbb] - const uint16x8_t t2 = vandq_u16(utf16_packed, v_003f); - // t3 = [000a|aaaa|00bb|bbbb] - const uint16x8_t t3 = vorrq_u16(t1, t2); - // t4 = [110a|aaaa|10bb|bbbb] - const uint16x8_t t4 = vorrq_u16(t3, v_c080); - // 2. merge ASCII and 2-byte codewords - const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F); - const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f); - const uint8x16_t utf8_unpacked = vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, utf16_packed, t4)); - // 3. prepare bitmask for 8-bit lookup - #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO - const uint16x8_t mask = make_uint16x8_t(0x0001, 0x0004, - 0x0010, 0x0040, - 0x0002, 0x0008, - 0x0020, 0x0080); - #else - const uint16x8_t mask = { 0x0001, 0x0004, - 0x0010, 0x0040, - 0x0002, 0x0008, - 0x0020, 0x0080 }; - #endif - uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask)); - // 4. pack the bytes - const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0]; - const uint8x16_t shuffle = vld1q_u8(row + 1); - const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle); + // 1 byte: 00000000 00000000 00000000 0ddddddd + // 3 byte: 00000000 00000000 110ccccc 10dddddd + // 3 byte: 00000000 1110bbbb 10cccccc 10dddddd + // 4 byte: 11110aaa 10bbbbbb 10cccccc 10dddddd + uint32x4_t perm = vreinterpretq_u32_u8(vqtbl1q_u8(in, sh)); + // Mask the low and middle bytes + // 00000000 00000000 00000000 0ddddddd + uint32x4_t ascii = vandq_u32(perm, vmovq_n_u32(0x7f)); + // Because the surrogates need more work, the high surrogate is computed first. 
+ uint32x4_t middlehigh = vshlq_n_u32(perm, 2); + // 00000000 00000000 00cccccc 00000000 + uint32x4_t middlebyte = vandq_u32(perm, vmovq_n_u32(0x3F00)); + // Start assembling the sequence. Since the 4th byte is in the same position as it + // would be in a surrogate and there is no dependency, shift left instead of right. + // 3 byte: 00000000 10bbbbxx xxxxxxxx xxxxxxxx + // 4 byte: 11110aaa bbbbbbxx xxxxxxxx xxxxxxxx + uint32x4_t ab = vbslq_u32(vmovq_n_u32(0xFF000000), perm, middlehigh); + // Top 16 bits contains the high ten bits of the surrogate pair before correction + // 3 byte: 00000000 10bbbbcc|cccc0000 00000000 + // 4 byte: 11110aaa bbbbbbcc|cccc0000 00000000 - high 10 bits correct w/o correction + uint32x4_t abc = vbslq_u32(vmovq_n_u32(0xFFFC0000), ab, vshlq_n_u32(middlebyte, 4)); + // Combine the low 6 or 7 bits by a shift right accumulate + // 3 byte: 00000000 00000010|bbbbcccc ccdddddd - low 16 bits correct + // 4 byte: 00000011 110aaabb|bbbbcccc ccdddddd - low 10 bits correct w/o correction + uint32x4_t composed = vsraq_n_u32(ascii, abc, 6); + // After this is for surrogates + // Blend the low and high surrogates + // 4 byte: 11110aaa bbbbbbcc|bbbbcccc ccdddddd + uint32x4_t mixed = vbslq_u32(vmovq_n_u32(0xFFFF0000), abc, composed); + // Clear the upper 6 bits of the low surrogate. Don't clear the upper bits yet as + // 0x10000 was not subtracted from the codepoint yet. + // 4 byte: 11110aaa bbbbbbcc|000000cc ccdddddd + uint16x8_t masked_pair = + vreinterpretq_u16_u32(vbicq_u32(mixed, vmovq_n_u32(uint32_t(~0xFFFF03FF)))); + // Correct the remaining UTF-8 prefix, surrogate offset, and add the surrogate prefixes + // in one magic 16-bit addition. 
+ // similar magic number but without the continue byte adjust and halfword swapped + // UTF-8 4b prefix = -0xF000|0x0000 + // surrogate offset = -0x0040|0x0000 (0x10000 << 6) + // surrogate high = +0xD800|0x0000 + // surrogate low = +0x0000|0xDC00 + // ----------------------------------- + // = +0xE7C0|0xDC00 + uint16x8_t magic = vreinterpretq_u16_u32(vmovq_n_u32(0xE7C0DC00)); + // 4 byte: 110110AA BBBBBBCC|110111CC CCDDDDDD - surrogate pair complete + uint32x4_t surrogates = vreinterpretq_u32_u16(vaddq_u16(masked_pair, magic)); + // If the high bit is 1 (s32 less than zero), this needs a surrogate pair + uint32x4_t is_pair = vcltzq_s32(vreinterpretq_s32_u32(perm)); + + // Select either the 4 byte surrogate pair or the 2 byte solo codepoint + // 3 byte: 0xxxxxxx xxxxxxxx|bbbbcccc ccdddddd + // 4 byte: 110110AA BBBBBBCC|110111CC CCDDDDDD + uint32x4_t selected = vbslq_u32(is_pair, surrogates, composed); + // Byte swap if necessary + if (!match_system(big_endian)) { + selected = vreinterpretq_u32_u8(vrev16q_u8(vreinterpretq_u8_u32(selected))); + } + // Attempting to shuffle and store would be complex, just scalarize. + uint32_t buffer[4]; + vst1q_u32(buffer, selected); + // Test for the top bit of the surrogate mask. + const uint32_t SURROGATE_MASK = match_system(big_endian) ? 0x80000000 : 0x00800000; + for (size_t i = 0; i < 3; i++) { + // Surrogate + if (buffer[i] & SURROGATE_MASK) { + utf16_output[0] = uint16_t(buffer[i] >> 16); + utf16_output[1] = uint16_t(buffer[i] & 0xFFFF); + utf16_output += 2; + } else { + utf16_output[0] = uint16_t(buffer[i] & 0xFFFF); + utf16_output++; + } + } + return consumed; + } else { + // here we know that there is an error but we do not handle errors + return 12; + } +} - // 5. 
store bytes - vst1q_u8(utf8_output, utf8_packed); +/* end file src/arm64/arm_convert_utf8_to_utf16.cpp */ +/* begin file src/arm64/arm_convert_utf8_to_utf32.cpp */ +// Convert up to 12 bytes from utf8 to utf32 using a mask indicating the +// end of the code points. Only the least significant 12 bits of the mask +// are accessed. +// It returns how many bytes were consumed (up to 12). +size_t convert_masked_utf8_to_utf32(const char *input, + uint64_t utf8_end_of_code_point_mask, + char32_t *&utf32_out) { + // we use an approach where we try to process up to 12 input bytes. + // Why 12 input bytes and not 16? Because we are concerned with the size of + // the lookup tables. Also 12 is nicely divisible by two and three. + // + uint32_t*& utf32_output = reinterpret_cast(utf32_out); + uint8x16_t in = vld1q_u8(reinterpret_cast(input)); + const uint16_t input_utf8_end_of_code_point_mask = + utf8_end_of_code_point_mask & 0xFFF; + // + // Optimization note: our main path below is load-latency dependent. Thus it is maybe + // beneficial to have fast paths that depend on branch prediction but have less latency. + // This results in more instructions but, potentially, also higher speeds. + // + // We first try a few fast paths. + if((utf8_end_of_code_point_mask & 0xffff) == 0xffff) { + // We process in chunks of 16 bytes. + // use fast implementation in src/simdutf/arm64/simd.h + // Ideally the compiler can keep the tables in registers. + simd8 temp{vreinterpretq_s8_u8(in)}; + temp.store_ascii_as_utf32_tbl(utf32_out); + utf32_output += 16; // We wrote 16 32-bit characters. + return 16; // We consumed 16 bytes. + } + if(input_utf8_end_of_code_point_mask == 0x924) { + // We want to take 4 3-byte UTF-8 code units and turn them into 4 4-byte UTF-32 code units. + // Convert to UTF-16 + uint16x4_t composed_utf16 = convert_utf8_3_byte_to_utf16(in); + // Zero extend and store via ST2 with a zero. 
+ uint16x4x2_t interleaver = {{ composed_utf16, vmov_n_u16(0) }}; + vst2_u16(reinterpret_cast(utf32_output), interleaver); + utf32_output += 4; // We wrote 4 32-bit characters. + return 12; // We consumed 12 bytes. + } + + // 2 byte sequences occur in short bursts in languages like Greek and Russian. + if(input_utf8_end_of_code_point_mask == 0xaaa) { + // We want to take 6 2-byte UTF-8 code units and turn them into 6 4-byte UTF-32 code units. + // Convert to UTF-16 + uint16x8_t composed_utf16 = convert_utf8_2_byte_to_utf16(in); + // Zero extend and store via ST2 with a zero. + uint16x8x2_t interleaver = {{ composed_utf16, vmovq_n_u16(0) }}; + vst2q_u16(reinterpret_cast(utf32_output), interleaver); + utf32_output += 6; // We wrote 6 32-bit characters. + return 12; // We consumed 12 bytes. + } + /// Either no fast path or an unimportant fast path. - // 6. adjust pointers - buf += 8; - utf8_output += row[0]; - continue; + const uint8_t idx = + simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0]; + const uint8_t consumed = + simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1]; - } else { - // case: words from register produce either 1, 2 or 3 UTF-8 bytes - // check for invalid input - const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800); - const uint16x8_t v_dfff = vmovq_n_u16((uint16_t)0xdfff); - const uint16x8_t forbidden_bytemask = vandq_u16(vcleq_u16(utf16_packed, v_dfff), vcgeq_u16(utf16_packed, v_d800)); - if (vmaxvq_u16(forbidden_bytemask) != 0) { - return std::make_pair(result(error_code::SURROGATE, buf - start), reinterpret_cast(utf8_output)); - } - - #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO - const uint16x8_t dup_even = make_uint16x8_t(0x0000, 0x0202, 0x0404, 0x0606, - 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); - #else - const uint16x8_t dup_even = {0x0000, 0x0202, 0x0404, 0x0606, - 0x0808, 0x0a0a, 0x0c0c, 0x0e0e}; - #endif - /* In this branch we handle three cases: - 1. 
[0000|0000|0ccc|cccc] => [0ccc|cccc] - single UFT-8 byte - 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two UTF-8 bytes - 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes - - We expand the input word (16-bit) into two words (32-bit), thus - we have room for four bytes. However, we need five distinct bit - layouts. Note that the last byte in cases #2 and #3 is the same. - - We precompute byte 1 for case #1 and the common byte for cases #2 & #3 - in register t2. - - We precompute byte 1 for case #3 and -- **conditionally** -- precompute - either byte 1 for case #2 or byte 2 for case #3. Note that they - differ by exactly one bit. - - Finally from these two words we build proper UTF-8 sequence, taking - into account the case (i.e, the number of bytes to write). - */ - /** - * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: - * t2 => [0ccc|cccc] [10cc|cccc] - * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) - */ - #define simdutf_vec(x) vmovq_n_u16(static_cast(x)) - // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] - const uint16x8_t t0 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(utf16_packed), vreinterpretq_u8_u16(dup_even))); - // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] - const uint16x8_t t1 = vandq_u16(t0, simdutf_vec(0b0011111101111111)); - // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] - const uint16x8_t t2 = vorrq_u16 (t1, simdutf_vec(0b1000000000000000)); - - // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa] - const uint16x8_t s0 = vshrq_n_u16(utf16_packed, 12); - // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000] - const uint16x8_t s1 = vandq_u16(utf16_packed, simdutf_vec(0b0000111111000000)); - // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000] - const uint16x8_t s1s = vshlq_n_u16(s1, 2); - // [00bb|bbbb|0000|aaaa] - const uint16x8_t s2 = vorrq_u16(s0, s1s); - // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] - const uint16x8_t s3 = vorrq_u16(s2, simdutf_vec(0b1100000011100000)); 
- const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF); - const uint16x8_t one_or_two_bytes_bytemask = vcleq_u16(utf16_packed, v_07ff); - const uint16x8_t m0 = vbicq_u16(simdutf_vec(0b0100000000000000), one_or_two_bytes_bytemask); - const uint16x8_t s4 = veorq_u16(s3, m0); - #undef simdutf_vec + if (idx < 64) { + // SIX (6) input code-code units + // Convert to UTF-16 + uint16x8_t composed_utf16 = convert_utf8_1_to_2_byte_to_utf16(in, idx); + // Zero extend and store with ST2 and zero + uint16x8x2_t interleaver = {{ composed_utf16, vmovq_n_u16(0) }}; + vst2q_u16(reinterpret_cast(utf32_output), interleaver); + utf32_output += 6; // We wrote 6 32-bit characters. + return consumed; + } else if (idx < 145) { + // FOUR (4) input code-code units + // UTF-16 and UTF-32 use similar algorithms, but UTF-32 skips the narrowing. + uint8x16_t sh = vld1q_u8(reinterpret_cast(simdutf::tables::utf8_to_utf16::shufutf8[idx])); + // Shuffle + // 1 byte: 00000000 00000000 0ccccccc + // 2 byte: 00000000 110bbbbb 10cccccc + // 3 byte: 1110aaaa 10bbbbbb 10cccccc + uint32x4_t perm = vreinterpretq_u32_u8(vqtbl1q_u8(in, sh)); + // Split + // 00000000 00000000 0ccccccc + uint32x4_t ascii = vandq_u32(perm, vmovq_n_u32(0x7F)); // 6 or 7 bits + // Note: unmasked + // xxxxxxxx aaaaxxxx xxxxxxxx + uint32x4_t high = vshrq_n_u32(perm, 4); // 4 bits + // Use 16 bit bic instead of and. + // The top bits will be corrected later in the bsl + // 00000000 10bbbbbb 00000000 + uint32x4_t middle = + vreinterpretq_u32_u16(vbicq_u16(vreinterpretq_u16_u32(perm), vmovq_n_u16(uint16_t(~0xff00)))); // 5 or 6 bits + // Combine low and middle with shift right accumulate + // 00000000 00xxbbbb bbcccccc + uint32x4_t lowmid = vsraq_n_u32(ascii, middle, 2); + // Insert top 4 bits from high byte with bitwise select + // 00000000 aaaabbbb bbcccccc + uint32x4_t composed = vbslq_u32(vmovq_n_u32(0x0000F000), high, lowmid); + vst1q_u32(utf32_output, composed); + utf32_output += 4; // We wrote 4 32-bit characters. 
+ return consumed; + } else if (idx < 209) { + // THREE (3) input code-code units + if (input_utf8_end_of_code_point_mask == 0x888) { + // We want to take 3 4-byte UTF-8 code units and turn them into 3 4-byte UTF-32 code units. + // This uses the same method as the fixed 3 byte version, reversing and shift left insert. + // However, there is no need for a shuffle mask now, just rev16 and rev32. + // + // This version does not use the LUT, but 4 byte sequences are less common and the + // overhead of the extra memory access is less important than the early branch overhead + // in shorter sequences, so it comes last. + + // Swap pairs of bytes + // 10dddddd|10cccccc|10bbbbbb|11110aaa + // 10cccccc 10dddddd|11110aaa 10bbbbbb + uint16x8_t swap1 = vreinterpretq_u16_u8(vrev16q_u8(in)); + // Shift left and insert + // xxxxcccc ccdddddd|xxxxxxxa aabbbbbb + uint16x8_t merge1 = vsliq_n_u16(swap1, vreinterpretq_u16_u8(in), 6); + // Swap 16-bit lanes + // xxxxcccc ccdddddd xxxxxxxa aabbbbbb + // xxxxxxxa aabbbbbb xxxxcccc ccdddddd + uint32x4_t swap2 = vreinterpretq_u32_u16(vrev32q_u16(merge1)); + // Shift insert again + // xxxxxxxx xxxaaabb bbbbcccc ccdddddd + uint32x4_t merge2 = vsliq_n_u32(swap2, vreinterpretq_u32_u16(merge1), 12); + // Clear the garbage + // 00000000 000aaabb bbbbcccc ccdddddd + uint32x4_t composed = vandq_u32(merge2, vmovq_n_u32(0x1FFFFF)); + // Store + vst1q_u32(utf32_output, composed); + + utf32_output += 3; // We wrote 3 32-bit characters. + return 12; // We consumed 12 bytes. + } + // Unlike UTF-16, doing a fast codepath doesn't have nearly as much benefit due to + // surrogates no longer being involved. 
+ uint8x16_t sh = vld1q_u8(reinterpret_cast(simdutf::tables::utf8_to_utf16::shufutf8[idx])); + // 1 byte: 00000000 00000000 00000000 0ddddddd + // 2 byte: 00000000 00000000 110ccccc 10dddddd + // 3 byte: 00000000 1110bbbb 10cccccc 10dddddd + // 4 byte: 11110aaa 10bbbbbb 10cccccc 10dddddd + uint32x4_t perm = vreinterpretq_u32_u8(vqtbl1q_u8(in, sh)); + // Ascii + uint32x4_t ascii = vandq_u32(perm, vmovq_n_u32(0x7F)); + uint32x4_t middle = vandq_u32(perm, vmovq_n_u32(0x3f00)); + // When converting the way we do, the 3 byte prefix will be interpreted as the + // 18th bit being set, since the code would interpret the lead byte (0b1110bbbb) + // as a continuation byte (0b10bbbbbb). To fix this, we can either xor or do an + // 8 bit add of the 6th bit shifted right by 1. Since NEON has shift right accumulate, + // we use that. + // 4 byte 3 byte + // 10bbbbbb 1110bbbb + // 00000000 01000000 6th bit + // 00000000 00100000 shift right + // 10bbbbbb 0000bbbb add + // 00bbbbbb 0000bbbb mask + uint8x16_t correction = + vreinterpretq_u8_u32(vandq_u32(perm, vmovq_n_u32(0x00400000))); + uint32x4_t corrected = + vreinterpretq_u32_u8(vsraq_n_u8(vreinterpretq_u8_u32(perm), correction, 1)); + // 00000000 00000000 0000cccc ccdddddd + uint32x4_t cd = vsraq_n_u32(ascii, middle, 2); + // Insert twice + // xxxxxxxx xxxaaabb bbbbxxxx xxxxxxxx + uint32x4_t ab = vbslq_u32(vmovq_n_u32(0x01C0000), vshrq_n_u32(corrected, 6), vshrq_n_u32(corrected, 4)); + // 00000000 000aaabb bbbbcccc ccdddddd + uint32x4_t composed = vbslq_u32(vmovq_n_u32(0xFFE00FFF), cd, ab); + // Store + vst1q_u32(utf32_output, composed); + utf32_output += 3; // We wrote 3 32-bit characters. + return consumed; + } else { + // here we know that there is an error but we do not handle errors + return 12; + } +} +/* end file src/arm64/arm_convert_utf8_to_utf32.cpp */ +/* begin file src/arm64/arm_convert_utf8_to_latin1.cpp */ +// Convert up to 16 bytes from utf8 to latin1 using a mask indicating the +// end of the code points.
Only the least significant 12 bits of the mask +// are accessed. +// It returns how many bytes were consumed (up to 16, usually 12). +size_t convert_masked_utf8_to_latin1(const char *input, + uint64_t utf8_end_of_code_point_mask, + char *&latin1_output) { + // we use an approach where we try to process up to 12 input bytes. + // Why 12 input bytes and not 16? Because we are concerned with the size of + // the lookup tables. Also 12 is nicely divisible by two and three. + // + uint8x16_t in = vld1q_u8(reinterpret_cast(input)); + const uint16_t input_utf8_end_of_code_point_mask = + utf8_end_of_code_point_mask & 0xfff; + // + // Optimization note: our main path below is load-latency dependent. Thus it is maybe + // beneficial to have fast paths that depend on branch prediction but have less latency. + // This results in more instructions but, potentially, also higher speeds. - // 4. expand words 16-bit => 32-bit - const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4)); - const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4)); + // We first try a few fast paths. + // The obvious first test is ASCII, which actually consumes the full 16. + if((utf8_end_of_code_point_mask & 0xFFFF) == 0xffff) { + // We process in chunks of 16 bytes + vst1q_u8(reinterpret_cast(latin1_output), in); + latin1_output += 16; // We wrote 16 8-bit characters. + return 16; // We consumed 16 bytes. + } + /// We do not have a fast path available, or the fast path is unimportant, so we fallback. + const uint8_t idx = + simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0]; - // 5.
compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle - const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F); - const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f); - #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO - const uint16x8_t onemask = make_uint16x8_t(0x0001, 0x0004, - 0x0010, 0x0040, - 0x0100, 0x0400, - 0x1000, 0x4000 ); - const uint16x8_t twomask = make_uint16x8_t(0x0002, 0x0008, - 0x0020, 0x0080, - 0x0200, 0x0800, - 0x2000, 0x8000 ); - #else - const uint16x8_t onemask = { 0x0001, 0x0004, - 0x0010, 0x0040, - 0x0100, 0x0400, - 0x1000, 0x4000 }; - const uint16x8_t twomask = { 0x0002, 0x0008, - 0x0020, 0x0080, - 0x0200, 0x0800, - 0x2000, 0x8000 }; - #endif - const uint16x8_t combined = vorrq_u16(vandq_u16(one_byte_bytemask, onemask), vandq_u16(one_or_two_bytes_bytemask, twomask)); - const uint16_t mask = vaddvq_u16(combined); - // The following fast path may or may not be beneficial. - /*if(mask == 0) { - // We only have three-byte words. Use fast path. - const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0}; - const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle); - const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle); - vst1q_u8(utf8_output, utf8_0); - utf8_output += 12; - vst1q_u8(utf8_output, utf8_1); - utf8_output += 12; - buf += 8; - continue; - }*/ - const uint8_t mask0 = uint8_t(mask); + const uint8_t consumed = + simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1]; + // this indicates an invalid input: + if(idx >= 64) { return consumed; } + // Here we should have (idx < 64), if not, there is a bug in the validation or elsewhere. + // SIX (6) input code-code units + // this is a relatively easy scenario + // we process SIX (6) input code-code units. The max length in bytes of six code + // code units spanning between 1 and 2 bytes each is 12 bytes. + // Converts 6 1-2 byte UTF-8 characters to 6 UTF-16 characters. + // This is a relatively easy scenario + // we process SIX (6) input code-code units. 
The max length in bytes of six code + // code units spanning between 1 and 2 bytes each is 12 bytes. + uint8x16_t sh = vld1q_u8(reinterpret_cast(simdutf::tables::utf8_to_utf16::shufutf8[idx])); + // Shuffle + // 1 byte: 00000000 0bbbbbbb + // 2 byte: 110aaaaa 10bbbbbb + uint16x8_t perm = vreinterpretq_u16_u8(vqtbl1q_u8(in, sh)); + // Mask + // 1 byte: 00000000 0bbbbbbb + // 2 byte: 00000000 00bbbbbb + uint16x8_t ascii = vandq_u16(perm, vmovq_n_u16(0x7f)); // 6 or 7 bits + // 1 byte: 00000000 00000000 + // 2 byte: 000aaaaa 00000000 + uint16x8_t highbyte = vandq_u16(perm, vmovq_n_u16(0x1f00)); // 5 bits + // Combine with a shift right accumulate + // 1 byte: 00000000 0bbbbbbb + // 2 byte: 00000aaa aabbbbbb + uint16x8_t composed = vsraq_n_u16(ascii, highbyte, 2); + // writing 8 bytes even though we only care about the first 6 bytes. + uint8x8_t latin1_packed = vmovn_u16(composed); + vst1_u8(reinterpret_cast(latin1_output), latin1_packed); + latin1_output += 6; // We wrote 6 bytes. + return consumed; +} - const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; - const uint8x16_t shuffle0 = vld1q_u8(row0 + 1); - const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0); +/* end file src/arm64/arm_convert_utf8_to_latin1.cpp */ - const uint8_t mask1 = static_cast(mask >> 8); - const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; - const uint8x16_t shuffle1 = vld1q_u8(row1 + 1); - const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1); +/* begin file src/arm64/arm_convert_utf16_to_latin1.cpp */ - vst1q_u8(utf8_output, utf8_0); - utf8_output += row0[0]; - vst1q_u8(utf8_output, utf8_1); - utf8_output += row1[0]; +template +std::pair arm_convert_utf16_to_latin1(const char16_t* buf, size_t len, char* latin1_output) { + const char16_t* end = buf + len; + while (buf + 8 <= end) { + uint16x8_t in = vld1q_u16(reinterpret_cast(buf)); + if (!match_system(big_endian)) { in = vrev16q_u8(in); } + if (vmaxvq_u16(in) <= 0xff) { 
+ // 1. pack the bytes + uint8x8_t latin1_packed = vmovn_u16(in); + // 2. store (8 bytes) + vst1_u8(reinterpret_cast(latin1_output), latin1_packed); + // 3. adjust pointers + buf += 8; + latin1_output += 8; + } else { + return std::make_pair(nullptr, reinterpret_cast(latin1_output)); + } + } // while + return std::make_pair(buf, latin1_output); +} - buf += 8; - } - // At least one 32-bit word will produce a surrogate pair in UTF-16 <=> will produce four UTF-8 bytes. +template +std::pair arm_convert_utf16_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) { + const char16_t* start = buf; + const char16_t* end = buf + len; + while (buf + 8 <= end) { + uint16x8_t in = vld1q_u16(reinterpret_cast(buf)); + if (!match_system(big_endian)) { in = vrev16q_u8(in); } + if (vmaxvq_u16(in) <= 0xff) { + // 1. pack the bytes + uint8x8_t latin1_packed = vmovn_u16(in); + // 2. store (8 bytes) + vst1_u8(reinterpret_cast(latin1_output), latin1_packed); + // 3. adjust pointers + buf += 8; + latin1_output += 8; } else { // Let us do a scalar fallback. - // It may seem wasteful to use scalar code, but being efficient with SIMD - // in the presence of surrogate pairs may require non-trivial tables. 
- size_t forward = 15; - size_t k = 0; - if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} - for(; k < forward; k++) { - uint32_t word = buf[k]; - if((word & 0xFFFFFF80)==0) { - *utf8_output++ = char(word); - } else if((word & 0xFFFFF800)==0) { - *utf8_output++ = char((word>>6) | 0b11000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - } else if((word & 0xFFFF0000)==0) { - if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), reinterpret_cast(utf8_output)); } - *utf8_output++ = char((word>>12) | 0b11100000); - *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); + for(int k = 0; k < 8; k++) { + uint16_t word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k]) : buf[k]; + if(word <= 0xff) { + *latin1_output++ = char(word); } else { - if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), reinterpret_cast(utf8_output)); } - *utf8_output++ = char((word>>18) | 0b11110000); - *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000); - *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); + return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), latin1_output); } } - buf += k; } } // while - - return std::make_pair(result(error_code::SUCCESS, buf - start), reinterpret_cast(utf8_output)); + return std::make_pair(result(error_code::SUCCESS, buf - start), latin1_output); } -/* end file src/arm64/arm_convert_utf32_to_utf8.cpp */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=arm64/arm_convert_utf32_to_utf16.cpp -/* begin file src/arm64/arm_convert_utf32_to_utf16.cpp */ -template -std::pair arm_convert_utf32_to_utf16(const char32_t* buf, size_t len, char16_t* utf16_out) { - uint16_t * utf16_output = reinterpret_cast(utf16_out); - const char32_t* end = buf + 
len; - - uint16x4_t forbidden_bytemask = vmov_n_u16(0x0); +/* end file src/arm64/arm_convert_utf16_to_latin1.cpp */ +/* begin file src/arm64/arm_convert_utf16_to_utf8.cpp */ +/* + The vectorized algorithm works on single SSE register i.e., it + loads eight 16-bit code units. - while(buf + 4 <= end) { - uint32x4_t in = vld1q_u32(reinterpret_cast(buf)); + We consider three cases: + 1. an input register contains no surrogates and each value + is in range 0x0000 .. 0x07ff. + 2. an input register contains no surrogates and values are + is in range 0x0000 .. 0xffff. + 3. an input register contains surrogates --- i.e. codepoints + can have 16 or 32 bits. - // Check if no bits set above 16th - if(vmaxvq_u32(in) <= 0xFFFF) { - uint16x4_t utf16_packed = vmovn_u32(in); + Ad 1. - const uint16x4_t v_d800 = vmov_n_u16((uint16_t)0xd800); - const uint16x4_t v_dfff = vmov_n_u16((uint16_t)0xdfff); - forbidden_bytemask = vorr_u16(vand_u16(vcle_u16(utf16_packed, v_dfff), vcge_u16(utf16_packed, v_d800)), forbidden_bytemask); + When values are less than 0x0800, it means that a 16-bit code unit + can be converted into: 1) single UTF8 byte (when it's an ASCII + char) or 2) two UTF8 bytes. - if (!match_system(big_endian)) { - #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO - const uint8x8_t swap = make_uint8x8_t(1, 0, 3, 2, 5, 4, 7, 6); - #else - const uint8x8_t swap = {1, 0, 3, 2, 5, 4, 7, 6}; - #endif - utf16_packed = vreinterpret_u16_u8(vtbl1_u8(vreinterpret_u8_u16(utf16_packed), swap)); - } - vst1_u16(utf16_output, utf16_packed); - utf16_output += 4; - buf += 4; - } else { - size_t forward = 3; - size_t k = 0; - if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} - for(; k < forward; k++) { - uint32_t word = buf[k]; - if((word & 0xFFFF0000)==0) { - // will not generate a surrogate pair - if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, reinterpret_cast(utf16_output)); } - *utf16_output++ = !match_system(big_endian) ? 
char16_t(word >> 8 | word << 8) : char16_t(word); - } else { - // will generate a surrogate pair - if (word > 0x10FFFF) { return std::make_pair(nullptr, reinterpret_cast(utf16_output)); } - word -= 0x10000; - uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); - uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); - if (!match_system(big_endian)) { - high_surrogate = uint16_t(high_surrogate >> 8 | high_surrogate << 8); - low_surrogate = uint16_t(low_surrogate << 8 | low_surrogate >> 8); - } - *utf16_output++ = char16_t(high_surrogate); - *utf16_output++ = char16_t(low_surrogate); - } - } - buf += k; - } - } + For this case we do only some shuffle to obtain these 2-byte + codes and finally compress the whole SSE register with a single + shuffle. - // check for invalid input - if (vmaxv_u16(forbidden_bytemask) != 0) { - return std::make_pair(nullptr, reinterpret_cast(utf16_output)); - } + We need 256-entry lookup table to get a compression pattern + and the number of output bytes in the compressed vector register. + Each entry occupies 17 bytes. - return std::make_pair(buf, reinterpret_cast(utf16_output)); -} + Ad 2. + When values fit in 16-bit code units, but are above 0x07ff, then + a single word may produce one, two or three UTF8 bytes. -template -std::pair arm_convert_utf32_to_utf16_with_errors(const char32_t* buf, size_t len, char16_t* utf16_out) { - uint16_t * utf16_output = reinterpret_cast(utf16_out); - const char32_t* start = buf; - const char32_t* end = buf + len; + We prepare data for all these three cases in two registers. + The first register contains lower two UTF8 bytes (used in all + cases), while the second one contains just the third byte for + the three-UTF8-bytes case. - while(buf + 4 <= end) { - uint32x4_t in = vld1q_u32(reinterpret_cast(buf)); + Finally these two registers are interleaved forming eight-element + array of 32-bit values. The array spans two SSE registers. 
The bytes from the registers are compressed using two shuffles. - // Check if no bits set above 16th - if(vmaxvq_u32(in) <= 0xFFFF) { - uint16x4_t utf16_packed = vmovn_u32(in); + We need 256-entry lookup table to get a compression pattern + and the number of output bytes in the compressed vector register. + Each entry occupies 17 bytes. - const uint16x4_t v_d800 = vmov_n_u16((uint16_t)0xd800); - const uint16x4_t v_dfff = vmov_n_u16((uint16_t)0xdfff); - const uint16x4_t forbidden_bytemask = vand_u16(vcle_u16(utf16_packed, v_dfff), vcge_u16(utf16_packed, v_d800)); - if (vmaxv_u16(forbidden_bytemask) != 0) { - return std::make_pair(result(error_code::SURROGATE, buf - start), reinterpret_cast(utf16_output)); - } - if (!match_system(big_endian)) { - #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO - const uint8x8_t swap = make_uint8x8_t(1, 0, 3, 2, 5, 4, 7, 6); - #else - const uint8x8_t swap = {1, 0, 3, 2, 5, 4, 7, 6}; - #endif - utf16_packed = vreinterpret_u16_u8(vtbl1_u8(vreinterpret_u8_u16(utf16_packed), swap)); - } - vst1_u16(utf16_output, utf16_packed); - utf16_output += 4; - buf += 4; - } else { - size_t forward = 3; - size_t k = 0; - if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} - for(; k < forward; k++) { - uint32_t word = buf[k]; - if((word & 0xFFFF0000)==0) { - // will not generate a surrogate pair - if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), reinterpret_cast(utf16_output)); } - *utf16_output++ = !match_system(big_endian) ? char16_t(word >> 8 | word << 8) : char16_t(word); + To summarize: + - We need two 256-entry tables that have 8704 bytes in total. +*/ +/* + Returns a pair: the first unprocessed byte from buf and utf8_output + A scalar routine should carry on the conversion of the tail.
+*/ +template +std::pair arm_convert_utf16_to_utf8(const char16_t* buf, size_t len, char* utf8_out) { + uint8_t * utf8_output = reinterpret_cast(utf8_out); + const char16_t* end = buf + len; + + const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800); + const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800); + const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080); + + while (buf + 16 <= end) { + uint16x8_t in = vld1q_u16(reinterpret_cast(buf)); + if (!match_system(big_endian)) { in = vrev16q_u8(in); } + if(vmaxvq_u16(in) <= 0x7F) { // ASCII fast path!!!! + // It is common enough that we have sequences of 16 consecutive ASCII characters. + uint16x8_t nextin = vld1q_u16(reinterpret_cast(buf) + 8); + if (!match_system(big_endian)) { nextin = vrev16q_u8(nextin); } + if(vmaxvq_u16(nextin) > 0x7F) { + // 1. pack the bytes + // obviously suboptimal. + uint8x8_t utf8_packed = vmovn_u16(in); + // 2. store (8 bytes) + vst1_u8(utf8_output, utf8_packed); + // 3. adjust pointers + buf += 8; + utf8_output += 8; + in = nextin; } else { - // will generate a surrogate pair - if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), reinterpret_cast(utf16_output)); } - word -= 0x10000; - uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); - uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); - if (!match_system(big_endian)) { - high_surrogate = uint16_t(high_surrogate >> 8 | high_surrogate << 8); - low_surrogate = uint16_t(low_surrogate << 8 | low_surrogate >> 8); - } - *utf16_output++ = char16_t(high_surrogate); - *utf16_output++ = char16_t(low_surrogate); + // 1. pack the bytes + // obviously suboptimal. + uint8x16_t utf8_packed = vmovn_high_u16(vmovn_u16(in), nextin); + // 2. store (16 bytes) + vst1q_u8(utf8_output, utf8_packed); + // 3. adjust pointers + buf += 16; + utf8_output += 16; + continue; // we are done for this round! 
} - } - buf += k; } - } - - return std::make_pair(result(error_code::SUCCESS, buf - start), reinterpret_cast(utf16_output)); -} -/* end file src/arm64/arm_convert_utf32_to_utf16.cpp */ -} // unnamed namespace -} // namespace arm64 -} // namespace simdutf -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/buf_block_reader.h -/* begin file src/generic/buf_block_reader.h */ -namespace simdutf { -namespace arm64 { -namespace { - -// Walks through a buffer in block-sized increments, loading the last part with spaces -template -struct buf_block_reader { -public: - simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len); - simdutf_really_inline size_t block_index(); - simdutf_really_inline bool has_full_block() const; - simdutf_really_inline const uint8_t *full_block() const; - /** - * Get the last block, padded with spaces. - * - * There will always be a last block, with at least 1 byte, unless len == 0 (in which case this - * function fills the buffer with spaces and returns 0. In particular, if len == STEP_SIZE there - * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding. - * - * @return the number of effective characters in the last block. - */ - simdutf_really_inline size_t get_remainder(uint8_t *dst) const; - simdutf_really_inline void advance(); -private: - const uint8_t *buf; - const size_t len; - const size_t lenminusstep; - size_t idx; -}; -// Routines to print masks and text for debugging bitmask operations -simdutf_unused static char * format_input_text_64(const uint8_t *text) { - static char *buf = reinterpret_cast(malloc(sizeof(simd8x64) + 1)); - for (size_t i=0; i); i++) { - buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]); - } - buf[sizeof(simd8x64)] = '\0'; - return buf; -} + if (vmaxvq_u16(in) <= 0x7FF) { + // 1. 
prepare 2-byte values + // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 + // expected output : [110a|aaaa|10bb|bbbb] x 8 + const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00); + const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f); -// Routines to print masks and text for debugging bitmask operations -simdutf_unused static char * format_input_text(const simd8x64& in) { - static char *buf = reinterpret_cast(malloc(sizeof(simd8x64) + 1)); - in.store(reinterpret_cast(buf)); - for (size_t i=0; i); i++) { - if (buf[i] < ' ') { buf[i] = '_'; } - } - buf[sizeof(simd8x64)] = '\0'; - return buf; -} + // t0 = [000a|aaaa|bbbb|bb00] + const uint16x8_t t0 = vshlq_n_u16(in, 2); + // t1 = [000a|aaaa|0000|0000] + const uint16x8_t t1 = vandq_u16(t0, v_1f00); + // t2 = [0000|0000|00bb|bbbb] + const uint16x8_t t2 = vandq_u16(in, v_003f); + // t3 = [000a|aaaa|00bb|bbbb] + const uint16x8_t t3 = vorrq_u16(t1, t2); + // t4 = [110a|aaaa|10bb|bbbb] + const uint16x8_t t4 = vorrq_u16(t3, v_c080); + // 2. merge ASCII and 2-byte codewords + const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F); + const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f); + const uint8x16_t utf8_unpacked = vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, in, t4)); + // 3. prepare bitmask for 8-bit lookup +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint16x8_t mask = make_uint16x8_t(0x0001, 0x0004, + 0x0010, 0x0040, + 0x0002, 0x0008, + 0x0020, 0x0080); +#else + const uint16x8_t mask = { 0x0001, 0x0004, + 0x0010, 0x0040, + 0x0002, 0x0008, + 0x0020, 0x0080 }; +#endif + uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask)); + // 4. 
pack the bytes + const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0]; + const uint8x16_t shuffle = vld1q_u8(row + 1); + const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle); -simdutf_unused static char * format_mask(uint64_t mask) { - static char *buf = reinterpret_cast(malloc(64 + 1)); - for (size_t i=0; i<64; i++) { - buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' '; - } - buf[64] = '\0'; - return buf; -} + // 5. store bytes + vst1q_u8(utf8_output, utf8_packed); -template -simdutf_really_inline buf_block_reader::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {} + // 6. adjust pointers + buf += 8; + utf8_output += row[0]; + continue; -template -simdutf_really_inline size_t buf_block_reader::block_index() { return idx; } + } + const uint16x8_t surrogates_bytemask = vceqq_u16(vandq_u16(in, v_f800), v_d800); + // It might seem like checking for surrogates_bitmask == 0xc000 could help. However, + // it is likely an uncommon occurrence. + if (vmaxvq_u16(surrogates_bytemask) == 0) { + // case: code units from register produce either 1, 2 or 3 UTF-8 bytes +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint16x8_t dup_even = make_uint16x8_t(0x0000, 0x0202, 0x0404, 0x0606, + 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); +#else + const uint16x8_t dup_even = {0x0000, 0x0202, 0x0404, 0x0606, + 0x0808, 0x0a0a, 0x0c0c, 0x0e0e}; +#endif + /* In this branch we handle three cases: + 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - single UFT-8 byte + 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two UTF-8 bytes + 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes -template -simdutf_really_inline bool buf_block_reader::has_full_block() const { - return idx < lenminusstep; -} + We expand the input word (16-bit) into two code units (32-bit), thus + we have room for four bytes. However, we need five distinct bit + layouts. 
Note that the last byte in cases #2 and #3 is the same. -template -simdutf_really_inline const uint8_t *buf_block_reader::full_block() const { - return &buf[idx]; -} + We precompute byte 1 for case #1 and the common byte for cases #2 & #3 + in register t2. -template -simdutf_really_inline size_t buf_block_reader::get_remainder(uint8_t *dst) const { - if(len == idx) { return 0; } // memcpy(dst, null, 0) will trigger an error with some sanitizers - std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once. - std::memcpy(dst, buf + idx, len - idx); - return len - idx; -} + We precompute byte 1 for case #3 and -- **conditionally** -- precompute + either byte 1 for case #2 or byte 2 for case #3. Note that they + differ by exactly one bit. -template -simdutf_really_inline void buf_block_reader::advance() { - idx += STEP_SIZE; -} + Finally from these two code units we build proper UTF-8 sequence, taking + into account the case (i.e, the number of bytes to write). 
+ */ + /** + * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: + * t2 => [0ccc|cccc] [10cc|cccc] + * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) + */ +#define simdutf_vec(x) vmovq_n_u16(static_cast(x)) + // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] + const uint16x8_t t0 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in), vreinterpretq_u8_u16(dup_even))); + // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] + const uint16x8_t t1 = vandq_u16(t0, simdutf_vec(0b0011111101111111)); + // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] + const uint16x8_t t2 = vorrq_u16 (t1, simdutf_vec(0b1000000000000000)); -} // unnamed namespace -} // namespace arm64 -} // namespace simdutf -/* end file src/generic/buf_block_reader.h */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_validation/utf8_lookup4_algorithm.h -/* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */ -namespace simdutf { -namespace arm64 { -namespace { -namespace utf8_validation { + // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa] + const uint16x8_t s0 = vshrq_n_u16(in, 12); + // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000] + const uint16x8_t s1 = vandq_u16(in, simdutf_vec(0b0000111111000000)); + // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000] + const uint16x8_t s1s = vshlq_n_u16(s1, 2); + // [00bb|bbbb|0000|aaaa] + const uint16x8_t s2 = vorrq_u16(s0, s1s); + // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] + const uint16x8_t s3 = vorrq_u16(s2, simdutf_vec(0b1100000011100000)); + const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF); + const uint16x8_t one_or_two_bytes_bytemask = vcleq_u16(in, v_07ff); + const uint16x8_t m0 = vbicq_u16(simdutf_vec(0b0100000000000000), one_or_two_bytes_bytemask); + const uint16x8_t s4 = veorq_u16(s3, m0); +#undef simdutf_vec -using namespace simd; + // 4. 
expand code units 16-bit => 32-bit + const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4)); + const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4)); - simdutf_really_inline simd8 check_special_cases(const simd8 input, const simd8 prev1) { -// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) -// Bit 1 = Too Long (ASCII followed by continuation) -// Bit 2 = Overlong 3-byte -// Bit 4 = Surrogate -// Bit 5 = Overlong 2-byte -// Bit 7 = Two Continuations - constexpr const uint8_t TOO_SHORT = 1<<0; // 11______ 0_______ - // 11______ 11______ - constexpr const uint8_t TOO_LONG = 1<<1; // 0_______ 10______ - constexpr const uint8_t OVERLONG_3 = 1<<2; // 11100000 100_____ - constexpr const uint8_t SURROGATE = 1<<4; // 11101101 101_____ - constexpr const uint8_t OVERLONG_2 = 1<<5; // 1100000_ 10______ - constexpr const uint8_t TWO_CONTS = 1<<7; // 10______ 10______ - constexpr const uint8_t TOO_LARGE = 1<<3; // 11110100 1001____ - // 11110100 101_____ - // 11110101 1001____ - // 11110101 101_____ - // 1111011_ 1001____ - // 1111011_ 101_____ - // 11111___ 1001____ - // 11111___ 101_____ - constexpr const uint8_t TOO_LARGE_1000 = 1<<6; - // 11110101 1000____ - // 1111011_ 1000____ - // 11111___ 1000____ - constexpr const uint8_t OVERLONG_4 = 1<<6; // 11110000 1000____ + // 5. 
compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle + const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F); + const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f); +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint16x8_t onemask = make_uint16x8_t(0x0001, 0x0004, + 0x0010, 0x0040, + 0x0100, 0x0400, + 0x1000, 0x4000 ); + const uint16x8_t twomask = make_uint16x8_t(0x0002, 0x0008, + 0x0020, 0x0080, + 0x0200, 0x0800, + 0x2000, 0x8000 ); +#else + const uint16x8_t onemask = { 0x0001, 0x0004, + 0x0010, 0x0040, + 0x0100, 0x0400, + 0x1000, 0x4000 }; + const uint16x8_t twomask = { 0x0002, 0x0008, + 0x0020, 0x0080, + 0x0200, 0x0800, + 0x2000, 0x8000 }; +#endif + const uint16x8_t combined = vorrq_u16(vandq_u16(one_byte_bytemask, onemask), vandq_u16(one_or_two_bytes_bytemask, twomask)); + const uint16_t mask = vaddvq_u16(combined); + // The following fast path may or may not be beneficial. + /*if(mask == 0) { + // We only have three-byte code units. Use fast path. + const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0}; + const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle); + const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle); + vst1q_u8(utf8_output, utf8_0); + utf8_output += 12; + vst1q_u8(utf8_output, utf8_1); + utf8_output += 12; + buf += 8; + continue; + }*/ + const uint8_t mask0 = uint8_t(mask); - const simd8 byte_1_high = prev1.shr<4>().lookup_16( - // 0_______ ________ - TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, - TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, - // 10______ ________ - TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, - // 1100____ ________ - TOO_SHORT | OVERLONG_2, - // 1101____ ________ - TOO_SHORT, - // 1110____ ________ - TOO_SHORT | OVERLONG_3 | SURROGATE, - // 1111____ ________ - TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4 - ); - constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . 
- const simd8 byte_1_low = (prev1 & 0x0F).lookup_16( - // ____0000 ________ - CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, - // ____0001 ________ - CARRY | OVERLONG_2, - // ____001_ ________ - CARRY, - CARRY, + const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; + const uint8x16_t shuffle0 = vld1q_u8(row0 + 1); + const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0); - // ____0100 ________ - CARRY | TOO_LARGE, - // ____0101 ________ - CARRY | TOO_LARGE | TOO_LARGE_1000, - // ____011_ ________ - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, + const uint8_t mask1 = static_cast(mask >> 8); + const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; + const uint8x16_t shuffle1 = vld1q_u8(row1 + 1); + const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1); - // ____1___ ________ - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - // ____1101 ________ - CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000 - ); - const simd8 byte_2_high = input.shr<4>().lookup_16( - // ________ 0_______ - TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, - TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + vst1q_u8(utf8_output, utf8_0); + utf8_output += row0[0]; + vst1q_u8(utf8_output, utf8_1); + utf8_output += row1[0]; - // ________ 1000____ - TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4, - // ________ 1001____ - TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, - // ________ 101_____ - TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, - TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + buf += 8; + // surrogate pair(s) in a register + } else { + // Let us do a scalar fallback. 
+ // It may seem wasteful to use scalar code, but being efficient with SIMD + // in the presence of surrogate pairs may require non-trivial tables. + size_t forward = 15; + size_t k = 0; + if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} + for(; k < forward; k++) { + uint16_t word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k]) : buf[k]; + if((word & 0xFF80)==0) { + *utf8_output++ = char(word); + } else if((word & 0xF800)==0) { + *utf8_output++ = char((word>>6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else if((word &0xF800 ) != 0xD800) { + *utf8_output++ = char((word>>12) | 0b11100000); + *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else { + // must be a surrogate pair + uint16_t diff = uint16_t(word - 0xD800); + uint16_t next_word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1]; + k++; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if((diff | diff2) > 0x3FF) { return std::make_pair(nullptr, reinterpret_cast(utf8_output)); } + uint32_t value = (diff << 10) + diff2 + 0x10000; + *utf8_output++ = char((value>>18) | 0b11110000); + *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000); + *utf8_output++ = char((value & 0b111111) | 0b10000000); + } + } + buf += k; + } + } // while - // ________ 11______ - TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT - ); - return (byte_1_high & byte_1_low & byte_2_high); - } - simdutf_really_inline simd8 check_multibyte_lengths(const simd8 input, - const simd8 prev_input, const simd8 sc) { - simd8 prev2 = input.prev<2>(prev_input); - simd8 prev3 = input.prev<3>(prev_input); - simd8 must23 = simd8(must_be_2_3_continuation(prev2, prev3)); - simd8 must23_80 = must23 & uint8_t(0x80); - return must23_80 ^ sc; - } + return std::make_pair(buf, reinterpret_cast(utf8_output)); +} - // - // Return 
nonzero if there are incomplete multibyte characters at the end of the block: - // e.g. if there is a 4-byte character, but it's 3 bytes from the end. - // - simdutf_really_inline simd8 is_incomplete(const simd8 input) { - // If the previous input's last 3 bytes match this, they're too short (they ended at EOF): - // ... 1111____ 111_____ 11______ - static const uint8_t max_array[32] = { - 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 0b11110000u-1, 0b11100000u-1, 0b11000000u-1 - }; - const simd8 max_value(&max_array[sizeof(max_array)-sizeof(simd8)]); - return input.gt_bits(max_value); - } - struct utf8_checker { - // If this is nonzero, there has been a UTF-8 error. - simd8 error; - // The last input we received - simd8 prev_input_block; - // Whether the last input we received was incomplete (used for ASCII fast path) - simd8 prev_incomplete; +/* + Returns a pair: a result struct and utf8_output. + If there is an error, the count field of the result is the position of the error. + Otherwise, it is the position of the first unprocessed byte in buf (even if finished). + A scalar routing should carry on the conversion of the tail if needed. +*/ +template +std::pair arm_convert_utf16_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_out) { + uint8_t * utf8_output = reinterpret_cast(utf8_out); + const char16_t* start = buf; + const char16_t* end = buf + len; - // - // Check whether the current bytes are valid UTF-8. 
- // - simdutf_really_inline void check_utf8_bytes(const simd8 input, const simd8 prev_input) { - // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes - // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers) - simd8 prev1 = input.prev<1>(prev_input); - simd8 sc = check_special_cases(input, prev1); - this->error |= check_multibyte_lengths(input, prev_input, sc); - } - - // The only problem that can happen at EOF is that a multibyte character is too short - // or a byte value too large in the last bytes: check_special_cases only checks for bytes - // too large in the first of two bytes. - simdutf_really_inline void check_eof() { - // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't - // possibly finish them. - this->error |= this->prev_incomplete; - } + const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800); + const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800); + const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080); - simdutf_really_inline void check_next_input(const simd8x64& input) { - if(simdutf_likely(is_ascii(input))) { - this->error |= this->prev_incomplete; - } else { - // you might think that a for-loop would work, but under Visual Studio, it is not good enough. 
- static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), - "We support either two or four chunks per 64-byte block."); - if(simd8x64::NUM_CHUNKS == 2) { - this->check_utf8_bytes(input.chunks[0], this->prev_input_block); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - } else if(simd8x64::NUM_CHUNKS == 4) { - this->check_utf8_bytes(input.chunks[0], this->prev_input_block); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - this->check_utf8_bytes(input.chunks[2], input.chunks[1]); - this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + while (buf + 16 <= end) { + uint16x8_t in = vld1q_u16(reinterpret_cast(buf)); + if (!match_system(big_endian)) { in = vrev16q_u8(in); } + if(vmaxvq_u16(in) <= 0x7F) { // ASCII fast path!!!! + // It is common enough that we have sequences of 16 consecutive ASCII characters. + uint16x8_t nextin = vld1q_u16(reinterpret_cast(buf) + 8); + if (!match_system(big_endian)) { nextin = vrev16q_u8(nextin); } + if(vmaxvq_u16(nextin) > 0x7F) { + // 1. pack the bytes + // obviously suboptimal. + uint8x8_t utf8_packed = vmovn_u16(in); + // 2. store (8 bytes) + vst1_u8(utf8_output, utf8_packed); + // 3. adjust pointers + buf += 8; + utf8_output += 8; + in = nextin; + } else { + // 1. pack the bytes + // obviously suboptimal. + uint8x16_t utf8_packed = vmovn_high_u16(vmovn_u16(in), nextin); + // 2. store (16 bytes) + vst1q_u8(utf8_output, utf8_packed); + // 3. adjust pointers + buf += 16; + utf8_output += 16; + continue; // we are done for this round! } - this->prev_incomplete = is_incomplete(input.chunks[simd8x64::NUM_CHUNKS-1]); - this->prev_input_block = input.chunks[simd8x64::NUM_CHUNKS-1]; - - } } - // do not forget to call check_eof! - simdutf_really_inline bool errors() const { - return this->error.any_bits_set_anywhere(); - } + if (vmaxvq_u16(in) <= 0x7FF) { + // 1. 
prepare 2-byte values + // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 + // expected output : [110a|aaaa|10bb|bbbb] x 8 + const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00); + const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f); - }; // struct utf8_checker -} // namespace utf8_validation + // t0 = [000a|aaaa|bbbb|bb00] + const uint16x8_t t0 = vshlq_n_u16(in, 2); + // t1 = [000a|aaaa|0000|0000] + const uint16x8_t t1 = vandq_u16(t0, v_1f00); + // t2 = [0000|0000|00bb|bbbb] + const uint16x8_t t2 = vandq_u16(in, v_003f); + // t3 = [000a|aaaa|00bb|bbbb] + const uint16x8_t t3 = vorrq_u16(t1, t2); + // t4 = [110a|aaaa|10bb|bbbb] + const uint16x8_t t4 = vorrq_u16(t3, v_c080); + // 2. merge ASCII and 2-byte codewords + const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F); + const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f); + const uint8x16_t utf8_unpacked = vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, in, t4)); + // 3. prepare bitmask for 8-bit lookup +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint16x8_t mask = make_uint16x8_t(0x0001, 0x0004, + 0x0010, 0x0040, + 0x0002, 0x0008, + 0x0020, 0x0080); +#else + const uint16x8_t mask = { 0x0001, 0x0004, + 0x0010, 0x0040, + 0x0002, 0x0008, + 0x0020, 0x0080 }; +#endif + uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask)); + // 4. pack the bytes + const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0]; + const uint8x16_t shuffle = vld1q_u8(row + 1); + const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle); -using utf8_validation::utf8_checker; + // 5. 
store bytes + vst1q_u8(utf8_output, utf8_packed); -} // unnamed namespace -} // namespace arm64 -} // namespace simdutf -/* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_validation/utf8_validator.h -/* begin file src/generic/utf8_validation/utf8_validator.h */ -namespace simdutf { -namespace arm64 { -namespace { -namespace utf8_validation { + // 6. adjust pointers + buf += 8; + utf8_output += row[0]; + continue; -/** - * Validates that the string is actual UTF-8. - */ -template -bool generic_validate_utf8(const uint8_t * input, size_t length) { - checker c{}; - buf_block_reader<64> reader(input, length); - while (reader.has_full_block()) { - simd::simd8x64 in(reader.full_block()); - c.check_next_input(in); - reader.advance(); } - uint8_t block[64]{}; - reader.get_remainder(block); - simd::simd8x64 in(block); - c.check_next_input(in); - reader.advance(); - c.check_eof(); - return !c.errors(); -} + const uint16x8_t surrogates_bytemask = vceqq_u16(vandq_u16(in, v_f800), v_d800); + // It might seem like checking for surrogates_bitmask == 0xc000 could help. However, + // it is likely an uncommon occurrence. + if (vmaxvq_u16(surrogates_bytemask) == 0) { + // case: code units from register produce either 1, 2 or 3 UTF-8 bytes +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint16x8_t dup_even = make_uint16x8_t(0x0000, 0x0202, 0x0404, 0x0606, + 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); +#else + const uint16x8_t dup_even = {0x0000, 0x0202, 0x0404, 0x0606, + 0x0808, 0x0a0a, 0x0c0c, 0x0e0e}; +#endif + /* In this branch we handle three cases: + 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - single UFT-8 byte + 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two UTF-8 bytes + 3. 
[aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes -bool generic_validate_utf8(const char * input, size_t length) { - return generic_validate_utf8(reinterpret_cast(input),length); -} + We expand the input word (16-bit) into two code units (32-bit), thus + we have room for four bytes. However, we need five distinct bit + layouts. Note that the last byte in cases #2 and #3 is the same. -/** - * Validates that the string is actual UTF-8 and stops on errors. - */ -template -result generic_validate_utf8_with_errors(const uint8_t * input, size_t length) { - checker c{}; - buf_block_reader<64> reader(input, length); - size_t count{0}; - while (reader.has_full_block()) { - simd::simd8x64 in(reader.full_block()); - c.check_next_input(in); - if(c.errors()) { - if (count != 0) { count--; } // Sometimes the error is only detected in the next chunk - result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast(input), reinterpret_cast(input + count), length - count); - res.count += count; - return res; - } - reader.advance(); - count += 64; - } - uint8_t block[64]{}; - reader.get_remainder(block); - simd::simd8x64 in(block); - c.check_next_input(in); - reader.advance(); - c.check_eof(); - if (c.errors()) { - if (count != 0) { count--; } // Sometimes the error is only detected in the next chunk - result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast(input), reinterpret_cast(input) + count, length - count); - res.count += count; - return res; - } else { - return result(error_code::SUCCESS, length); - } -} + We precompute byte 1 for case #1 and the common byte for cases #2 & #3 + in register t2. -result generic_validate_utf8_with_errors(const char * input, size_t length) { - return generic_validate_utf8_with_errors(reinterpret_cast(input),length); -} + We precompute byte 1 for case #3 and -- **conditionally** -- precompute + either byte 1 for case #2 or byte 2 for case #3. 
Note that they + differ by exactly one bit. -template -bool generic_validate_ascii(const uint8_t * input, size_t length) { - buf_block_reader<64> reader(input, length); - uint8_t blocks[64]{}; - simd::simd8x64 running_or(blocks); - while (reader.has_full_block()) { - simd::simd8x64 in(reader.full_block()); - running_or |= in; - reader.advance(); - } - uint8_t block[64]{}; - reader.get_remainder(block); - simd::simd8x64 in(block); - running_or |= in; - return running_or.is_ascii(); -} + Finally from these two code units we build proper UTF-8 sequence, taking + into account the case (i.e, the number of bytes to write). + */ + /** + * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: + * t2 => [0ccc|cccc] [10cc|cccc] + * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) + */ +#define simdutf_vec(x) vmovq_n_u16(static_cast(x)) + // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] + const uint16x8_t t0 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in), vreinterpretq_u8_u16(dup_even))); + // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] + const uint16x8_t t1 = vandq_u16(t0, simdutf_vec(0b0011111101111111)); + // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] + const uint16x8_t t2 = vorrq_u16 (t1, simdutf_vec(0b1000000000000000)); -bool generic_validate_ascii(const char * input, size_t length) { - return generic_validate_ascii(reinterpret_cast(input),length); -} + // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa] + const uint16x8_t s0 = vshrq_n_u16(in, 12); + // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000] + const uint16x8_t s1 = vandq_u16(in, simdutf_vec(0b0000111111000000)); + // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000] + const uint16x8_t s1s = vshlq_n_u16(s1, 2); + // [00bb|bbbb|0000|aaaa] + const uint16x8_t s2 = vorrq_u16(s0, s1s); + // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] + const uint16x8_t s3 = vorrq_u16(s2, simdutf_vec(0b1100000011100000)); + const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF); + const uint16x8_t 
one_or_two_bytes_bytemask = vcleq_u16(in, v_07ff); + const uint16x8_t m0 = vbicq_u16(simdutf_vec(0b0100000000000000), one_or_two_bytes_bytemask); + const uint16x8_t s4 = veorq_u16(s3, m0); +#undef simdutf_vec -template -result generic_validate_ascii_with_errors(const uint8_t * input, size_t length) { - buf_block_reader<64> reader(input, length); - size_t count{0}; - while (reader.has_full_block()) { - simd::simd8x64 in(reader.full_block()); - if (!in.is_ascii()) { - result res = scalar::ascii::validate_with_errors(reinterpret_cast(input + count), length - count); - return result(res.error, count + res.count); - } - reader.advance(); - - count += 64; - } - uint8_t block[64]{}; - reader.get_remainder(block); - simd::simd8x64 in(block); - if (!in.is_ascii()) { - result res = scalar::ascii::validate_with_errors(reinterpret_cast(input + count), length - count); - return result(res.error, count + res.count); - } else { - return result(error_code::SUCCESS, length); - } -} - -result generic_validate_ascii_with_errors(const char * input, size_t length) { - return generic_validate_ascii_with_errors(reinterpret_cast(input),length); -} + // 4. expand code units 16-bit => 32-bit + const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4)); + const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4)); -} // namespace utf8_validation -} // unnamed namespace -} // namespace arm64 -} // namespace simdutf -/* end file src/generic/utf8_validation/utf8_validator.h */ -// transcoding from UTF-8 to UTF-16 -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf16/valid_utf8_to_utf16.h -/* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */ + // 5. 
compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle + const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F); + const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f); +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint16x8_t onemask = make_uint16x8_t(0x0001, 0x0004, + 0x0010, 0x0040, + 0x0100, 0x0400, + 0x1000, 0x4000 ); + const uint16x8_t twomask = make_uint16x8_t(0x0002, 0x0008, + 0x0020, 0x0080, + 0x0200, 0x0800, + 0x2000, 0x8000 ); +#else + const uint16x8_t onemask = { 0x0001, 0x0004, + 0x0010, 0x0040, + 0x0100, 0x0400, + 0x1000, 0x4000 }; + const uint16x8_t twomask = { 0x0002, 0x0008, + 0x0020, 0x0080, + 0x0200, 0x0800, + 0x2000, 0x8000 }; +#endif + const uint16x8_t combined = vorrq_u16(vandq_u16(one_byte_bytemask, onemask), vandq_u16(one_or_two_bytes_bytemask, twomask)); + const uint16_t mask = vaddvq_u16(combined); + // The following fast path may or may not be beneficial. + /*if(mask == 0) { + // We only have three-byte code units. Use fast path. + const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0}; + const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle); + const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle); + vst1q_u8(utf8_output, utf8_0); + utf8_output += 12; + vst1q_u8(utf8_output, utf8_1); + utf8_output += 12; + buf += 8; + continue; + }*/ + const uint8_t mask0 = uint8_t(mask); + const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; + const uint8x16_t shuffle0 = vld1q_u8(row0 + 1); + const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0); -namespace simdutf { -namespace arm64 { -namespace { -namespace utf8_to_utf16 { + const uint8_t mask1 = static_cast(mask >> 8); + const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; + const uint8x16_t shuffle1 = vld1q_u8(row1 + 1); + const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1); -using namespace simd; + vst1q_u8(utf8_output, utf8_0); + utf8_output += row0[0]; + vst1q_u8(utf8_output, utf8_1); + utf8_output += 
row1[0]; -template -simdutf_warn_unused size_t convert_valid(const char* input, size_t size, - char16_t* utf16_output) noexcept { - // The implementation is not specific to haswell and should be moved to the generic directory. - size_t pos = 0; - char16_t* start{utf16_output}; - const size_t safety_margin = 16; // to avoid overruns! - while(pos + 64 + safety_margin <= size) { - // this loop could be unrolled further. For example, we could process the mask - // far more than 64 bytes. - simd8x64 in(reinterpret_cast(input + pos)); - if(in.is_ascii()) { - in.store_ascii_as_utf16(utf16_output); - utf16_output += 64; - pos += 64; + buf += 8; + // surrogate pair(s) in a register } else { - // Slow path. We hope that the compiler will recognize that this is a slow path. - // Anything that is not a continuation mask is a 'leading byte', that is, the - // start of a new code point. - uint64_t utf8_continuation_mask = in.lt(-65 + 1); - // -65 is 0b10111111 in two-complement's, so largest possible continuation byte - uint64_t utf8_leading_mask = ~utf8_continuation_mask; - // The *start* of code points is not so useful, rather, we want the *end* of code points. - uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; - // We process in blocks of up to 12 bytes except possibly - // for fast paths which may process up to 16 bytes. For the - // slow path to work, we should have at least 12 input bytes left. - size_t max_starting_point = (pos + 64) - 12; - // Next loop is going to run at least five times when using solely - // the slow/regular path, and at least four times if there are fast paths. - while(pos < max_starting_point) { - // Performance note: our ability to compute 'consumed' and - // then shift and recompute is critical. If there is a - // latency of, say, 4 cycles on getting 'consumed', then - // the inner loop might have a total latency of about 6 cycles. 
- // Yet we process between 6 to 12 inputs bytes, thus we get - // a speed limit between 1 cycle/byte and 0.5 cycle/byte - // for this section of the code. Hence, there is a limit - // to how much we can further increase this latency before - // it seriously harms performance. - // - // Thus we may allow convert_masked_utf8_to_utf16 to process - // more bytes at a time under a fast-path mode where 16 bytes - // are consumed at once (e.g., when encountering ASCII). - size_t consumed = convert_masked_utf8_to_utf16(input + pos, - utf8_end_of_code_point_mask, utf16_output); - pos += consumed; - utf8_end_of_code_point_mask >>= consumed; + // Let us do a scalar fallback. + // It may seem wasteful to use scalar code, but being efficient with SIMD + // in the presence of surrogate pairs may require non-trivial tables. + size_t forward = 15; + size_t k = 0; + if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} + for(; k < forward; k++) { + uint16_t word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k]) : buf[k]; + if((word & 0xFF80)==0) { + *utf8_output++ = char(word); + } else if((word & 0xF800)==0) { + *utf8_output++ = char((word>>6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else if((word &0xF800 ) != 0xD800) { + *utf8_output++ = char((word>>12) | 0b11100000); + *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else { + // must be a surrogate pair + uint16_t diff = uint16_t(word - 0xD800); + uint16_t next_word = !match_system(big_endian) ? 
scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1]; + k++; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if((diff | diff2) > 0x3FF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k - 1), reinterpret_cast(utf8_output)); } + uint32_t value = (diff << 10) + diff2 + 0x10000; + *utf8_output++ = char((value>>18) | 0b11110000); + *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000); + *utf8_output++ = char((value & 0b111111) | 0b10000000); + } } - // At this point there may remain between 0 and 12 bytes in the - // 64-byte block. These bytes will be processed again. So we have an - // 80% efficiency (in the worst case). In practice we expect an - // 85% to 90% efficiency. + buf += k; } - } - utf16_output += scalar::utf8_to_utf16::convert_valid(input + pos, size - pos, utf16_output); - return utf16_output - start; + } // while + + return std::make_pair(result(error_code::SUCCESS, buf - start), reinterpret_cast(utf8_output)); } +/* end file src/arm64/arm_convert_utf16_to_utf8.cpp */ +/* begin file src/arm64/arm_convert_utf16_to_utf32.cpp */ +/* + The vectorized algorithm works on single SSE register i.e., it + loads eight 16-bit code units. -} // namespace utf8_to_utf16 -} // unnamed namespace -} // namespace arm64 -} // namespace simdutf -/* end file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf16/utf8_to_utf16.h -/* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */ + We consider three cases: + 1. an input register contains no surrogates and each value + is in range 0x0000 .. 0x07ff. + 2. an input register contains no surrogates and values are + is in range 0x0000 .. 0xffff. + 3. an input register contains surrogates --- i.e. codepoints + can have 16 or 32 bits. + Ad 1. 
-namespace simdutf { -namespace arm64 { -namespace { -namespace utf8_to_utf16 { -using namespace simd; + When values are less than 0x0800, it means that a 16-bit code unit + can be converted into: 1) single UTF8 byte (when it's an ASCII + char) or 2) two UTF8 bytes. + For this case we do only some shuffle to obtain these 2-byte + codes and finally compress the whole SSE register with a single + shuffle. - simdutf_really_inline simd8 check_special_cases(const simd8 input, const simd8 prev1) { -// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) -// Bit 1 = Too Long (ASCII followed by continuation) -// Bit 2 = Overlong 3-byte -// Bit 4 = Surrogate -// Bit 5 = Overlong 2-byte -// Bit 7 = Two Continuations - constexpr const uint8_t TOO_SHORT = 1<<0; // 11______ 0_______ - // 11______ 11______ - constexpr const uint8_t TOO_LONG = 1<<1; // 0_______ 10______ - constexpr const uint8_t OVERLONG_3 = 1<<2; // 11100000 100_____ - constexpr const uint8_t SURROGATE = 1<<4; // 11101101 101_____ - constexpr const uint8_t OVERLONG_2 = 1<<5; // 1100000_ 10______ - constexpr const uint8_t TWO_CONTS = 1<<7; // 10______ 10______ - constexpr const uint8_t TOO_LARGE = 1<<3; // 11110100 1001____ - // 11110100 101_____ - // 11110101 1001____ - // 11110101 101_____ - // 1111011_ 1001____ - // 1111011_ 101_____ - // 11111___ 1001____ - // 11111___ 101_____ - constexpr const uint8_t TOO_LARGE_1000 = 1<<6; - // 11110101 1000____ - // 1111011_ 1000____ - // 11111___ 1000____ - constexpr const uint8_t OVERLONG_4 = 1<<6; // 11110000 1000____ + We need 256-entry lookup table to get a compression pattern + and the number of output bytes in the compressed vector register. + Each entry occupies 17 bytes. 
- const simd8 byte_1_high = prev1.shr<4>().lookup_16( - // 0_______ ________ - TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, - TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, - // 10______ ________ - TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, - // 1100____ ________ - TOO_SHORT | OVERLONG_2, - // 1101____ ________ - TOO_SHORT, - // 1110____ ________ - TOO_SHORT | OVERLONG_3 | SURROGATE, - // 1111____ ________ - TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4 - ); - constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . - const simd8 byte_1_low = (prev1 & 0x0F).lookup_16( - // ____0000 ________ - CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, - // ____0001 ________ - CARRY | OVERLONG_2, - // ____001_ ________ - CARRY, - CARRY, + Ad 2. - // ____0100 ________ - CARRY | TOO_LARGE, - // ____0101 ________ - CARRY | TOO_LARGE | TOO_LARGE_1000, - // ____011_ ________ - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, + When values fit in 16-bit code units, but are above 0x07ff, then + a single word may produce one, two or three UTF8 bytes. - // ____1___ ________ - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - // ____1101 ________ - CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000 - ); - const simd8 byte_2_high = input.shr<4>().lookup_16( - // ________ 0_______ - TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, - TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + We prepare data for all these three cases in two registers. + The first register contains lower two UTF8 bytes (used in all + cases), while the second one contains just the third byte for + the three-UTF8-bytes case. 
- // ________ 1000____ - TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4, - // ________ 1001____ - TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, - // ________ 101_____ - TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, - TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + Finally these two registers are interleaved forming eight-element + array of 32-bit values. The array spans two SSE registers. + The bytes from the registers are compressed using two shuffles. - // ________ 11______ - TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT - ); - return (byte_1_high & byte_1_low & byte_2_high); - } - simdutf_really_inline simd8 check_multibyte_lengths(const simd8 input, - const simd8 prev_input, const simd8 sc) { - simd8 prev2 = input.prev<2>(prev_input); - simd8 prev3 = input.prev<3>(prev_input); - simd8 must23 = simd8(must_be_2_3_continuation(prev2, prev3)); - simd8 must23_80 = must23 & uint8_t(0x80); - return must23_80 ^ sc; - } + We need 256-entry lookup table to get a compression pattern + and the number of output bytes in the compressed vector register. + Each entry occupies 17 bytes. - struct validating_transcoder { - // If this is nonzero, there has been a UTF-8 error. - simd8 error; + To summarize: + - We need two 256-entry tables that have 8704 bytes in total. +*/ +/* + Returns a pair: the first unprocessed byte from buf and utf8_output + A scalar routing should carry on the conversion of the tail. +*/ +template +std::pair arm_convert_utf16_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_out) { + uint32_t * utf32_output = reinterpret_cast(utf32_out); + const char16_t* end = buf + len; - validating_transcoder() : error(uint8_t(0)) {} - // - // Check whether the current bytes are valid UTF-8. 
- // - simdutf_really_inline void check_utf8_bytes(const simd8 input, const simd8 prev_input) { - // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes - // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers) - simd8 prev1 = input.prev<1>(prev_input); - simd8 sc = check_special_cases(input, prev1); - this->error |= check_multibyte_lengths(input, prev_input, sc); - } + const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800); + const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800); + while (buf + 8 <= end) { + uint16x8_t in = vld1q_u16(reinterpret_cast(buf)); + if (!match_system(big_endian)) { in = vrev16q_u8(in); } - template - simdutf_really_inline size_t convert(const char* in, size_t size, char16_t* utf16_output) { - size_t pos = 0; - char16_t* start{utf16_output}; - // In the worst case, we have the haswell kernel which can cause an overflow of - // 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the last 16 bytes, - // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate - // much more than 8 bytes. However, you cannot generally assume that you have valid - // UTF-8 input, so we are going to go back from the end counting 8 leading bytes, - // to give us a good margin. - size_t leading_byte = 0; - size_t margin = size; - for(; margin > 0 && leading_byte < 8; margin--) { - leading_byte += (int8_t(in[margin-1]) > -65); - } - // If the input is long enough, then we have that margin-1 is the eight last leading byte. - const size_t safety_margin = size - margin + 1; // to avoid overruns! - while(pos + 64 + safety_margin <= size) { - simd8x64 input(reinterpret_cast(in + pos)); - if(input.is_ascii()) { - input.store_ascii_as_utf16(utf16_output); - utf16_output += 64; - pos += 64; + const uint16x8_t surrogates_bytemask = vceqq_u16(vandq_u16(in, v_f800), v_d800); + // It might seem like checking for surrogates_bitmask == 0xc000 could help. 
However, + // it is likely an uncommon occurrence. + if (vmaxvq_u16(surrogates_bytemask) == 0) { + // case: no surrogate pairs, extend all 16-bit code units to 32-bit code units + vst1q_u32(utf32_output, vmovl_u16(vget_low_u16(in))); + vst1q_u32(utf32_output+4, vmovl_high_u16(in)); + utf32_output += 8; + buf += 8; + // surrogate pair(s) in a register + } else { + // Let us do a scalar fallback. + // It may seem wasteful to use scalar code, but being efficient with SIMD + // in the presence of surrogate pairs may require non-trivial tables. + size_t forward = 15; + size_t k = 0; + if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} + for(; k < forward; k++) { + uint16_t word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k]) : buf[k]; + if((word &0xF800 ) != 0xD800) { + *utf32_output++ = char32_t(word); } else { - // you might think that a for-loop would work, but under Visual Studio, it is not good enough. - static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), - "We support either two or four chunks per 64-byte block."); - auto zero = simd8{uint8_t(0)}; - if(simd8x64::NUM_CHUNKS == 2) { - this->check_utf8_bytes(input.chunks[0], zero); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - } else if(simd8x64::NUM_CHUNKS == 4) { - this->check_utf8_bytes(input.chunks[0], zero); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - this->check_utf8_bytes(input.chunks[2], input.chunks[1]); - this->check_utf8_bytes(input.chunks[3], input.chunks[2]); - } - uint64_t utf8_continuation_mask = input.lt(-65 + 1); - uint64_t utf8_leading_mask = ~utf8_continuation_mask; - uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; - // We process in blocks of up to 12 bytes except possibly - // for fast paths which may process up to 16 bytes. For the - // slow path to work, we should have at least 12 input bytes left. 
- size_t max_starting_point = (pos + 64) - 12; - // Next loop is going to run at least five times. - while(pos < max_starting_point) { - // Performance note: our ability to compute 'consumed' and - // then shift and recompute is critical. If there is a - // latency of, say, 4 cycles on getting 'consumed', then - // the inner loop might have a total latency of about 6 cycles. - // Yet we process between 6 to 12 inputs bytes, thus we get - // a speed limit between 1 cycle/byte and 0.5 cycle/byte - // for this section of the code. Hence, there is a limit - // to how much we can further increase this latency before - // it seriously harms performance. - size_t consumed = convert_masked_utf8_to_utf16(in + pos, - utf8_end_of_code_point_mask, utf16_output); - pos += consumed; - utf8_end_of_code_point_mask >>= consumed; - } - // At this point there may remain between 0 and 12 bytes in the - // 64-byte block. These bytes will be processed again. So we have an - // 80% efficiency (in the worst case). In practice we expect an - // 85% to 90% efficiency. + // must be a surrogate pair + uint16_t diff = uint16_t(word - 0xD800); + uint16_t next_word = !match_system(big_endian) ? 
scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1]; + k++; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if((diff | diff2) > 0x3FF) { return std::make_pair(nullptr, reinterpret_cast(utf32_output)); } + uint32_t value = (diff << 10) + diff2 + 0x10000; + *utf32_output++ = char32_t(value); } } - if(errors()) { return 0; } - if(pos < size) { - size_t howmany = scalar::utf8_to_utf16::convert(in + pos, size - pos, utf16_output); - if(howmany == 0) { return 0; } - utf16_output += howmany; - } - return utf16_output - start; + buf += k; } + } // while + return std::make_pair(buf, reinterpret_cast(utf32_output)); +} - template - simdutf_really_inline result convert_with_errors(const char* in, size_t size, char16_t* utf16_output) { - size_t pos = 0; - char16_t* start{utf16_output}; - // In the worst case, we have the haswell kernel which can cause an overflow of - // 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the last 16 bytes, - // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate - // much more than 8 bytes. However, you cannot generally assume that you have valid - // UTF-8 input, so we are going to go back from the end counting 8 leading bytes, - // to give us a good margin. - size_t leading_byte = 0; - size_t margin = size; - for(; margin > 0 && leading_byte < 8; margin--) { - leading_byte += (int8_t(in[margin-1]) > -65); - } - // If the input is long enough, then we have that margin-1 is the eight last leading byte. - const size_t safety_margin = size - margin + 1; // to avoid overruns! - while(pos + 64 + safety_margin <= size) { - simd8x64 input(reinterpret_cast(in + pos)); - if(input.is_ascii()) { - input.store_ascii_as_utf16(utf16_output); - utf16_output += 64; - pos += 64; + +/* + Returns a pair: a result struct and utf8_output. + If there is an error, the count field of the result is the position of the error. + Otherwise, it is the position of the first unprocessed byte in buf (even if finished). 
+ A scalar routing should carry on the conversion of the tail if needed. +*/ +template +std::pair arm_convert_utf16_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_out) { + uint32_t * utf32_output = reinterpret_cast(utf32_out); + const char16_t* start = buf; + const char16_t* end = buf + len; + + const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800); + const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800); + + while (buf + 8 <= end) { + uint16x8_t in = vld1q_u16(reinterpret_cast(buf)); + if (!match_system(big_endian)) { in = vrev16q_u8(in); } + + const uint16x8_t surrogates_bytemask = vceqq_u16(vandq_u16(in, v_f800), v_d800); + // It might seem like checking for surrogates_bitmask == 0xc000 could help. However, + // it is likely an uncommon occurrence. + if (vmaxvq_u16(surrogates_bytemask) == 0) { + // case: no surrogate pairs, extend all 16-bit code units to 32-bit code units + vst1q_u32(utf32_output, vmovl_u16(vget_low_u16(in))); + vst1q_u32(utf32_output+4, vmovl_high_u16(in)); + utf32_output += 8; + buf += 8; + // surrogate pair(s) in a register + } else { + // Let us do a scalar fallback. + // It may seem wasteful to use scalar code, but being efficient with SIMD + // in the presence of surrogate pairs may require non-trivial tables. + size_t forward = 15; + size_t k = 0; + if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} + for(; k < forward; k++) { + uint16_t word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k]) : buf[k]; + if((word &0xF800 ) != 0xD800) { + *utf32_output++ = char32_t(word); } else { - // you might think that a for-loop would work, but under Visual Studio, it is not good enough. 
- static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), - "We support either two or four chunks per 64-byte block."); - auto zero = simd8{uint8_t(0)}; - if(simd8x64::NUM_CHUNKS == 2) { - this->check_utf8_bytes(input.chunks[0], zero); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - } else if(simd8x64::NUM_CHUNKS == 4) { - this->check_utf8_bytes(input.chunks[0], zero); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - this->check_utf8_bytes(input.chunks[2], input.chunks[1]); - this->check_utf8_bytes(input.chunks[3], input.chunks[2]); - } - if (errors()) { - // rewind_and_convert_with_errors will seek a potential error from in+pos onward, - // with the ability to go back up to pos bytes, and read size-pos bytes forward. - result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf16_output); - res.count += pos; - return res; - } - uint64_t utf8_continuation_mask = input.lt(-65 + 1); - uint64_t utf8_leading_mask = ~utf8_continuation_mask; - uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; - // We process in blocks of up to 12 bytes except possibly - // for fast paths which may process up to 16 bytes. For the - // slow path to work, we should have at least 12 input bytes left. - size_t max_starting_point = (pos + 64) - 12; - // Next loop is going to run at least five times. - while(pos < max_starting_point) { - // Performance note: our ability to compute 'consumed' and - // then shift and recompute is critical. If there is a - // latency of, say, 4 cycles on getting 'consumed', then - // the inner loop might have a total latency of about 6 cycles. - // Yet we process between 6 to 12 inputs bytes, thus we get - // a speed limit between 1 cycle/byte and 0.5 cycle/byte - // for this section of the code. Hence, there is a limit - // to how much we can further increase this latency before - // it seriously harms performance. 
- size_t consumed = convert_masked_utf8_to_utf16(in + pos, - utf8_end_of_code_point_mask, utf16_output); - pos += consumed; - utf8_end_of_code_point_mask >>= consumed; - } - // At this point there may remain between 0 and 12 bytes in the - // 64-byte block. These bytes will be processed again. So we have an - // 80% efficiency (in the worst case). In practice we expect an - // 85% to 90% efficiency. - } - } - if(errors()) { - // rewind_and_convert_with_errors will seek a potential error from in+pos onward, - // with the ability to go back up to pos bytes, and read size-pos bytes forward. - result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf16_output); - res.count += pos; - return res; - } - if(pos < size) { - // rewind_and_convert_with_errors will seek a potential error from in+pos onward, - // with the ability to go back up to pos bytes, and read size-pos bytes forward. - result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf16_output); - if (res.error) { // In case of error, we want the error position - res.count += pos; - return res; - } else { // In case of success, we want the number of word written - utf16_output += res.count; + // must be a surrogate pair + uint16_t diff = uint16_t(word - 0xD800); + uint16_t next_word = !match_system(big_endian) ? 
scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1]; + k++; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if((diff | diff2) > 0x3FF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k - 1), reinterpret_cast(utf32_output)); } + uint32_t value = (diff << 10) + diff2 + 0x10000; + *utf32_output++ = char32_t(value); } } - return result(error_code::SUCCESS, utf16_output - start); + buf += k; } + } // while + return std::make_pair(result(error_code::SUCCESS, buf - start), reinterpret_cast(utf32_output)); +} +/* end file src/arm64/arm_convert_utf16_to_utf32.cpp */ - simdutf_really_inline bool errors() const { - return this->error.any_bits_set_anywhere(); +/* begin file src/arm64/arm_convert_utf32_to_latin1.cpp */ +std::pair arm_convert_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) { + const char32_t* end = buf + len; + while (buf + 8 <= end) { + uint32x4_t in1 = vld1q_u32(reinterpret_cast(buf)); + uint32x4_t in2 = vld1q_u32(reinterpret_cast(buf+4)); + + uint16x8_t utf16_packed = vcombine_u16(vqmovn_u32(in1), vqmovn_u32(in2)); + if (vmaxvq_u16(utf16_packed) <= 0xff) { + // 1. pack the bytes + uint8x8_t latin1_packed = vmovn_u16(utf16_packed); + // 2. store (8 bytes) + vst1_u8(reinterpret_cast(latin1_output), latin1_packed); + // 3. 
adjust pointers + buf += 8; + latin1_output += 8; + } else { + return std::make_pair(nullptr, reinterpret_cast(latin1_output)); } + } // while + return std::make_pair(buf, latin1_output); +} - }; // struct utf8_checker -} // utf8_to_utf16 namespace -} // unnamed namespace -} // namespace arm64 -} // namespace simdutf -/* end file src/generic/utf8_to_utf16/utf8_to_utf16.h */ -// transcoding from UTF-8 to UTF-32 -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf32/valid_utf8_to_utf32.h -/* begin file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */ -namespace simdutf { -namespace arm64 { -namespace { -namespace utf8_to_utf32 { +std::pair arm_convert_utf32_to_latin1_with_errors(const char32_t* buf, size_t len, char* latin1_output) { + const char32_t* start = buf; + const char32_t* end = buf + len; -using namespace simd; + while (buf + 8 <= end) { + uint32x4_t in1 = vld1q_u32(reinterpret_cast(buf)); + uint32x4_t in2 = vld1q_u32(reinterpret_cast(buf+4)); + uint16x8_t utf16_packed = vcombine_u16(vqmovn_u32(in1), vqmovn_u32(in2)); -simdutf_warn_unused size_t convert_valid(const char* input, size_t size, - char32_t* utf32_output) noexcept { - size_t pos = 0; - char32_t* start{utf32_output}; - const size_t safety_margin = 16; // to avoid overruns! - while(pos + 64 + safety_margin <= size) { - simd8x64 in(reinterpret_cast(input + pos)); - if(in.is_ascii()) { - in.store_ascii_as_utf32(utf32_output); - utf32_output += 64; - pos += 64; + if (vmaxvq_u16(utf16_packed) <= 0xff) { + // 1. pack the bytes + uint8x8_t latin1_packed = vmovn_u16(utf16_packed); + // 2. store (8 bytes) + vst1_u8(reinterpret_cast(latin1_output), latin1_packed); + // 3. 
adjust pointers + buf += 8; + latin1_output += 8; } else { - // -65 is 0b10111111 in two-complement's, so largest possible continuation byte - uint64_t utf8_continuation_mask = in.lt(-65 + 1); - uint64_t utf8_leading_mask = ~utf8_continuation_mask; - uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; - size_t max_starting_point = (pos + 64) - 12; - while(pos < max_starting_point) { - size_t consumed = convert_masked_utf8_to_utf32(input + pos, - utf8_end_of_code_point_mask, utf32_output); - pos += consumed; - utf8_end_of_code_point_mask >>= consumed; + // Let us do a scalar fallback. + for(int k = 0; k < 8; k++) { + uint32_t word = buf[k]; + if(word <= 0xff) { + *latin1_output++ = char(word); + } else { + return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), latin1_output); + } } } - } - utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos, utf32_output); - return utf32_output - start; + } // while + return std::make_pair(result(error_code::SUCCESS, buf - start), latin1_output); } +/* end file src/arm64/arm_convert_utf32_to_latin1.cpp */ +/* begin file src/arm64/arm_convert_utf32_to_utf8.cpp */ +std::pair arm_convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_out) { + uint8_t * utf8_output = reinterpret_cast(utf8_out); + const char32_t* end = buf + len; + const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080); -} // namespace utf8_to_utf32 -} // unnamed namespace -} // namespace arm64 -} // namespace simdutf -/* end file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf32/utf8_to_utf32.h -/* begin file src/generic/utf8_to_utf32/utf8_to_utf32.h */ + uint16x8_t forbidden_bytemask = vmovq_n_u16(0x0); + while (buf + 8 < end) { + uint32x4_t in = vld1q_u32(reinterpret_cast(buf)); + uint32x4_t nextin = vld1q_u32(reinterpret_cast(buf+4)); -namespace simdutf { -namespace arm64 { -namespace { -namespace 
utf8_to_utf32 { -using namespace simd; + // Check if no bits set above 16th + if(vmaxvq_u32(vorrq_u32(in, nextin)) <= 0xFFFF) { + // Pack UTF-32 to UTF-16 safely (without surrogate pairs) + // Apply UTF-16 => UTF-8 routine (arm_convert_utf16_to_utf8.cpp) + uint16x8_t utf16_packed = vcombine_u16(vmovn_u32(in), vmovn_u32(nextin)); + if(vmaxvq_u16(utf16_packed) <= 0x7F) { // ASCII fast path!!!! + // 1. pack the bytes + // obviously suboptimal. + uint8x8_t utf8_packed = vmovn_u16(utf16_packed); + // 2. store (8 bytes) + vst1_u8(utf8_output, utf8_packed); + // 3. adjust pointers + buf += 8; + utf8_output += 8; + continue; // we are done for this round! + } + if (vmaxvq_u16(utf16_packed) <= 0x7FF) { + // 1. prepare 2-byte values + // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 + // expected output : [110a|aaaa|10bb|bbbb] x 8 + const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00); + const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f); + + // t0 = [000a|aaaa|bbbb|bb00] + const uint16x8_t t0 = vshlq_n_u16(utf16_packed, 2); + // t1 = [000a|aaaa|0000|0000] + const uint16x8_t t1 = vandq_u16(t0, v_1f00); + // t2 = [0000|0000|00bb|bbbb] + const uint16x8_t t2 = vandq_u16(utf16_packed, v_003f); + // t3 = [000a|aaaa|00bb|bbbb] + const uint16x8_t t3 = vorrq_u16(t1, t2); + // t4 = [110a|aaaa|10bb|bbbb] + const uint16x8_t t4 = vorrq_u16(t3, v_c080); + // 2. merge ASCII and 2-byte codewords + const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F); + const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f); + const uint8x16_t utf8_unpacked = vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, utf16_packed, t4)); + // 3. 
prepare bitmask for 8-bit lookup + #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint16x8_t mask = make_uint16x8_t(0x0001, 0x0004, + 0x0010, 0x0040, + 0x0002, 0x0008, + 0x0020, 0x0080); + #else + const uint16x8_t mask = { 0x0001, 0x0004, + 0x0010, 0x0040, + 0x0002, 0x0008, + 0x0020, 0x0080 }; + #endif + uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask)); + // 4. pack the bytes + const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0]; + const uint8x16_t shuffle = vld1q_u8(row + 1); + const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle); - simdutf_really_inline simd8 check_special_cases(const simd8 input, const simd8 prev1) { -// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) -// Bit 1 = Too Long (ASCII followed by continuation) -// Bit 2 = Overlong 3-byte -// Bit 4 = Surrogate -// Bit 5 = Overlong 2-byte -// Bit 7 = Two Continuations - constexpr const uint8_t TOO_SHORT = 1<<0; // 11______ 0_______ - // 11______ 11______ - constexpr const uint8_t TOO_LONG = 1<<1; // 0_______ 10______ - constexpr const uint8_t OVERLONG_3 = 1<<2; // 11100000 100_____ - constexpr const uint8_t SURROGATE = 1<<4; // 11101101 101_____ - constexpr const uint8_t OVERLONG_2 = 1<<5; // 1100000_ 10______ - constexpr const uint8_t TWO_CONTS = 1<<7; // 10______ 10______ - constexpr const uint8_t TOO_LARGE = 1<<3; // 11110100 1001____ - // 11110100 101_____ - // 11110101 1001____ - // 11110101 101_____ - // 1111011_ 1001____ - // 1111011_ 101_____ - // 11111___ 1001____ - // 11111___ 101_____ - constexpr const uint8_t TOO_LARGE_1000 = 1<<6; - // 11110101 1000____ - // 1111011_ 1000____ - // 11111___ 1000____ - constexpr const uint8_t OVERLONG_4 = 1<<6; // 11110000 1000____ + // 5. 
store bytes + vst1q_u8(utf8_output, utf8_packed); - const simd8 byte_1_high = prev1.shr<4>().lookup_16( - // 0_______ ________ - TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, - TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, - // 10______ ________ - TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, - // 1100____ ________ - TOO_SHORT | OVERLONG_2, - // 1101____ ________ - TOO_SHORT, - // 1110____ ________ - TOO_SHORT | OVERLONG_3 | SURROGATE, - // 1111____ ________ - TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4 - ); - constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . - const simd8 byte_1_low = (prev1 & 0x0F).lookup_16( - // ____0000 ________ - CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, - // ____0001 ________ - CARRY | OVERLONG_2, - // ____001_ ________ - CARRY, - CARRY, + // 6. adjust pointers + buf += 8; + utf8_output += row[0]; + continue; + } else { + // case: code units from register produce either 1, 2 or 3 UTF-8 bytes + const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800); + const uint16x8_t v_dfff = vmovq_n_u16((uint16_t)0xdfff); + forbidden_bytemask = vorrq_u16(vandq_u16(vcleq_u16(utf16_packed, v_dfff), vcgeq_u16(utf16_packed, v_d800)), forbidden_bytemask); - // ____0100 ________ - CARRY | TOO_LARGE, - // ____0101 ________ - CARRY | TOO_LARGE | TOO_LARGE_1000, - // ____011_ ________ - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, + #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint16x8_t dup_even = make_uint16x8_t(0x0000, 0x0202, 0x0404, 0x0606, + 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); + #else + const uint16x8_t dup_even = {0x0000, 0x0202, 0x0404, 0x0606, + 0x0808, 0x0a0a, 0x0c0c, 0x0e0e}; + #endif + /* In this branch we handle three cases: + 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - single UFT-8 byte + 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two UTF-8 bytes + 3. 
[aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes - // ____1___ ________ - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - // ____1101 ________ - CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000 - ); - const simd8 byte_2_high = input.shr<4>().lookup_16( - // ________ 0_______ - TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, - TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + We expand the input word (16-bit) into two code units (32-bit), thus + we have room for four bytes. However, we need five distinct bit + layouts. Note that the last byte in cases #2 and #3 is the same. - // ________ 1000____ - TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4, - // ________ 1001____ - TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, - // ________ 101_____ - TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, - TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + We precompute byte 1 for case #1 and the common byte for cases #2 & #3 + in register t2. - // ________ 11______ - TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT - ); - return (byte_1_high & byte_1_low & byte_2_high); - } - simdutf_really_inline simd8 check_multibyte_lengths(const simd8 input, - const simd8 prev_input, const simd8 sc) { - simd8 prev2 = input.prev<2>(prev_input); - simd8 prev3 = input.prev<3>(prev_input); - simd8 must23 = simd8(must_be_2_3_continuation(prev2, prev3)); - simd8 must23_80 = must23 & uint8_t(0x80); - return must23_80 ^ sc; - } + We precompute byte 1 for case #3 and -- **conditionally** -- precompute + either byte 1 for case #2 or byte 2 for case #3. Note that they + differ by exactly one bit. 
+ Finally from these two code units we build proper UTF-8 sequence, taking + into account the case (i.e, the number of bytes to write). + */ + /** + * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: + * t2 => [0ccc|cccc] [10cc|cccc] + * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) + */ + #define simdutf_vec(x) vmovq_n_u16(static_cast(x)) + // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] + const uint16x8_t t0 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(utf16_packed), vreinterpretq_u8_u16(dup_even))); + // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] + const uint16x8_t t1 = vandq_u16(t0, simdutf_vec(0b0011111101111111)); + // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] + const uint16x8_t t2 = vorrq_u16 (t1, simdutf_vec(0b1000000000000000)); - struct validating_transcoder { - // If this is nonzero, there has been a UTF-8 error. - simd8 error; + // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa] + const uint16x8_t s0 = vshrq_n_u16(utf16_packed, 12); + // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000] + const uint16x8_t s1 = vandq_u16(utf16_packed, simdutf_vec(0b0000111111000000)); + // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000] + const uint16x8_t s1s = vshlq_n_u16(s1, 2); + // [00bb|bbbb|0000|aaaa] + const uint16x8_t s2 = vorrq_u16(s0, s1s); + // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] + const uint16x8_t s3 = vorrq_u16(s2, simdutf_vec(0b1100000011100000)); + const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF); + const uint16x8_t one_or_two_bytes_bytemask = vcleq_u16(utf16_packed, v_07ff); + const uint16x8_t m0 = vbicq_u16(simdutf_vec(0b0100000000000000), one_or_two_bytes_bytemask); + const uint16x8_t s4 = veorq_u16(s3, m0); + #undef simdutf_vec - validating_transcoder() : error(uint8_t(0)) {} - // - // Check whether the current bytes are valid UTF-8. 
- // - simdutf_really_inline void check_utf8_bytes(const simd8 input, const simd8 prev_input) { - // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes - // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers) - simd8 prev1 = input.prev<1>(prev_input); - simd8 sc = check_special_cases(input, prev1); - this->error |= check_multibyte_lengths(input, prev_input, sc); - } + // 4. expand code units 16-bit => 32-bit + const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4)); + const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4)); + // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle + const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F); + const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f); + #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint16x8_t onemask = make_uint16x8_t(0x0001, 0x0004, + 0x0010, 0x0040, + 0x0100, 0x0400, + 0x1000, 0x4000 ); + const uint16x8_t twomask = make_uint16x8_t(0x0002, 0x0008, + 0x0020, 0x0080, + 0x0200, 0x0800, + 0x2000, 0x8000 ); + #else + const uint16x8_t onemask = { 0x0001, 0x0004, + 0x0010, 0x0040, + 0x0100, 0x0400, + 0x1000, 0x4000 }; + const uint16x8_t twomask = { 0x0002, 0x0008, + 0x0020, 0x0080, + 0x0200, 0x0800, + 0x2000, 0x8000 }; + #endif + const uint16x8_t combined = vorrq_u16(vandq_u16(one_byte_bytemask, onemask), vandq_u16(one_or_two_bytes_bytemask, twomask)); + const uint16_t mask = vaddvq_u16(combined); + // The following fast path may or may not be beneficial. + /*if(mask == 0) { + // We only have three-byte code units. Use fast path. 
+ const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0}; + const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle); + const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle); + vst1q_u8(utf8_output, utf8_0); + utf8_output += 12; + vst1q_u8(utf8_output, utf8_1); + utf8_output += 12; + buf += 8; + continue; + }*/ + const uint8_t mask0 = uint8_t(mask); + const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; + const uint8x16_t shuffle0 = vld1q_u8(row0 + 1); + const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0); + const uint8_t mask1 = static_cast(mask >> 8); + const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; + const uint8x16_t shuffle1 = vld1q_u8(row1 + 1); + const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1); - simdutf_really_inline size_t convert(const char* in, size_t size, char32_t* utf32_output) { - size_t pos = 0; - char32_t* start{utf32_output}; - // In the worst case, we have the haswell kernel which can cause an overflow of - // 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes, - // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate - // much more than 8 bytes. However, you cannot generally assume that you have valid - // UTF-8 input, so we are going to go back from the end counting 4 leading bytes, - // to give us a good margin. - size_t leading_byte = 0; - size_t margin = size; - for(; margin > 0 && leading_byte < 4; margin--) { - leading_byte += (int8_t(in[margin-1]) > -65); + vst1q_u8(utf8_output, utf8_0); + utf8_output += row0[0]; + vst1q_u8(utf8_output, utf8_1); + utf8_output += row1[0]; + + buf += 8; } - // If the input is long enough, then we have that margin-1 is the fourth last leading byte. - const size_t safety_margin = size - margin + 1; // to avoid overruns! 
- while(pos + 64 + safety_margin <= size) { - simd8x64 input(reinterpret_cast(in + pos)); - if(input.is_ascii()) { - input.store_ascii_as_utf32(utf32_output); - utf32_output += 64; - pos += 64; + // At least one 32-bit word will produce a surrogate pair in UTF-16 <=> will produce four UTF-8 bytes. + } else { + // Let us do a scalar fallback. + // It may seem wasteful to use scalar code, but being efficient with SIMD + // in the presence of surrogate pairs may require non-trivial tables. + size_t forward = 15; + size_t k = 0; + if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} + for(; k < forward; k++) { + uint32_t word = buf[k]; + if((word & 0xFFFFFF80)==0) { + *utf8_output++ = char(word); + } else if((word & 0xFFFFF800)==0) { + *utf8_output++ = char((word>>6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else if((word & 0xFFFF0000)==0) { + if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, reinterpret_cast(utf8_output)); } + *utf8_output++ = char((word>>12) | 0b11100000); + *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); } else { - // you might think that a for-loop would work, but under Visual Studio, it is not good enough. 
- static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), - "We support either two or four chunks per 64-byte block."); - auto zero = simd8{uint8_t(0)}; - if(simd8x64::NUM_CHUNKS == 2) { - this->check_utf8_bytes(input.chunks[0], zero); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - } else if(simd8x64::NUM_CHUNKS == 4) { - this->check_utf8_bytes(input.chunks[0], zero); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - this->check_utf8_bytes(input.chunks[2], input.chunks[1]); - this->check_utf8_bytes(input.chunks[3], input.chunks[2]); - } - uint64_t utf8_continuation_mask = input.lt(-65 + 1); - uint64_t utf8_leading_mask = ~utf8_continuation_mask; - uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; - // We process in blocks of up to 12 bytes except possibly - // for fast paths which may process up to 16 bytes. For the - // slow path to work, we should have at least 12 input bytes left. - size_t max_starting_point = (pos + 64) - 12; - // Next loop is going to run at least five times. - while(pos < max_starting_point) { - // Performance note: our ability to compute 'consumed' and - // then shift and recompute is critical. If there is a - // latency of, say, 4 cycles on getting 'consumed', then - // the inner loop might have a total latency of about 6 cycles. - // Yet we process between 6 to 12 inputs bytes, thus we get - // a speed limit between 1 cycle/byte and 0.5 cycle/byte - // for this section of the code. Hence, there is a limit - // to how much we can further increase this latency before - // it seriously harms performance. - size_t consumed = convert_masked_utf8_to_utf32(in + pos, - utf8_end_of_code_point_mask, utf32_output); - pos += consumed; - utf8_end_of_code_point_mask >>= consumed; - } - // At this point there may remain between 0 and 12 bytes in the - // 64-byte block. These bytes will be processed again. So we have an - // 80% efficiency (in the worst case). 
In practice we expect an - // 85% to 90% efficiency. + if (word > 0x10FFFF) { return std::make_pair(nullptr, reinterpret_cast(utf8_output)); } + *utf8_output++ = char((word>>18) | 0b11110000); + *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); } } - if(errors()) { return 0; } - if(pos < size) { - size_t howmany = scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output); - if(howmany == 0) { return 0; } - utf32_output += howmany; - } - return utf32_output - start; + buf += k; } + } // while - simdutf_really_inline result convert_with_errors(const char* in, size_t size, char32_t* utf32_output) { - size_t pos = 0; - char32_t* start{utf32_output}; - // In the worst case, we have the haswell kernel which can cause an overflow of - // 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes, - // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate - // much more than 8 bytes. However, you cannot generally assume that you have valid - // UTF-8 input, so we are going to go back from the end counting 4 leading bytes, - // to give us a good margin. - size_t leading_byte = 0; - size_t margin = size; - for(; margin > 0 && leading_byte < 4; margin--) { - leading_byte += (int8_t(in[margin-1]) > -65); - } - // If the input is long enough, then we have that margin-1 is the fourth last leading byte. - const size_t safety_margin = size - margin + 1; // to avoid overruns! - while(pos + 64 + safety_margin <= size) { - simd8x64 input(reinterpret_cast(in + pos)); - if(input.is_ascii()) { - input.store_ascii_as_utf32(utf32_output); - utf32_output += 64; - pos += 64; - } else { - // you might think that a for-loop would work, but under Visual Studio, it is not good enough. 
- static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), - "We support either two or four chunks per 64-byte block."); - auto zero = simd8{uint8_t(0)}; - if(simd8x64::NUM_CHUNKS == 2) { - this->check_utf8_bytes(input.chunks[0], zero); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - } else if(simd8x64::NUM_CHUNKS == 4) { - this->check_utf8_bytes(input.chunks[0], zero); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - this->check_utf8_bytes(input.chunks[2], input.chunks[1]); - this->check_utf8_bytes(input.chunks[3], input.chunks[2]); - } - if (errors()) { - result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output); - res.count += pos; - return res; - } - uint64_t utf8_continuation_mask = input.lt(-65 + 1); - uint64_t utf8_leading_mask = ~utf8_continuation_mask; - uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; - // We process in blocks of up to 12 bytes except possibly - // for fast paths which may process up to 16 bytes. For the - // slow path to work, we should have at least 12 input bytes left. - size_t max_starting_point = (pos + 64) - 12; - // Next loop is going to run at least five times. - while(pos < max_starting_point) { - // Performance note: our ability to compute 'consumed' and - // then shift and recompute is critical. If there is a - // latency of, say, 4 cycles on getting 'consumed', then - // the inner loop might have a total latency of about 6 cycles. - // Yet we process between 6 to 12 inputs bytes, thus we get - // a speed limit between 1 cycle/byte and 0.5 cycle/byte - // for this section of the code. Hence, there is a limit - // to how much we can further increase this latency before - // it seriously harms performance. 
- size_t consumed = convert_masked_utf8_to_utf32(in + pos, - utf8_end_of_code_point_mask, utf32_output); - pos += consumed; - utf8_end_of_code_point_mask >>= consumed; - } - // At this point there may remain between 0 and 12 bytes in the - // 64-byte block. These bytes will be processed again. So we have an - // 80% efficiency (in the worst case). In practice we expect an - // 85% to 90% efficiency. - } - } - if(errors()) { - result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output); - res.count += pos; - return res; - } - if(pos < size) { - result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output); - if (res.error) { // In case of error, we want the error position - res.count += pos; - return res; - } else { // In case of success, we want the number of word written - utf32_output += res.count; - } - } - return result(error_code::SUCCESS, utf32_output - start); - } - - simdutf_really_inline bool errors() const { - return this->error.any_bits_set_anywhere(); - } + // check for invalid input + if (vmaxvq_u16(forbidden_bytemask) != 0) { + return std::make_pair(nullptr, reinterpret_cast(utf8_output)); + } + return std::make_pair(buf, reinterpret_cast(utf8_output)); +} - }; // struct utf8_checker -} // utf8_to_utf32 namespace -} // unnamed namespace -} // namespace arm64 -} // namespace simdutf -/* end file src/generic/utf8_to_utf32/utf8_to_utf32.h */ -// other functions -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8.h -/* begin file src/generic/utf8.h */ -namespace simdutf { -namespace arm64 { -namespace { -namespace utf8 { +std::pair arm_convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_out) { + uint8_t * utf8_output = reinterpret_cast(utf8_out); + const char32_t* start = buf; + const char32_t* end = buf + len; -using namespace simd; + const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080); 
-simdutf_really_inline size_t count_code_points(const char* in, size_t size) { - size_t pos = 0; - size_t count = 0; - for(;pos + 64 <= size; pos += 64) { - simd8x64 input(reinterpret_cast(in + pos)); - uint64_t utf8_continuation_mask = input.lt(-65 + 1); - count += 64 - count_ones(utf8_continuation_mask); - } - return count + scalar::utf8::count_code_points(in + pos, size - pos); -} + while (buf + 8 < end) { + uint32x4_t in = vld1q_u32(reinterpret_cast(buf)); + uint32x4_t nextin = vld1q_u32(reinterpret_cast(buf+4)); + // Check if no bits set above 16th + if(vmaxvq_u32(vorrq_u32(in, nextin)) <= 0xFFFF) { + // Pack UTF-32 to UTF-16 safely (without surrogate pairs) + // Apply UTF-16 => UTF-8 routine (arm_convert_utf16_to_utf8.cpp) + uint16x8_t utf16_packed = vcombine_u16(vmovn_u32(in), vmovn_u32(nextin)); + if(vmaxvq_u16(utf16_packed) <= 0x7F) { // ASCII fast path!!!! + // 1. pack the bytes + // obviously suboptimal. + uint8x8_t utf8_packed = vmovn_u16(utf16_packed); + // 2. store (8 bytes) + vst1_u8(utf8_output, utf8_packed); + // 3. adjust pointers + buf += 8; + utf8_output += 8; + continue; // we are done for this round! + } -simdutf_really_inline size_t utf16_length_from_utf8(const char* in, size_t size) { - size_t pos = 0; - size_t count = 0; - // This algorithm could no doubt be improved! - for(;pos + 64 <= size; pos += 64) { - simd8x64 input(reinterpret_cast(in + pos)); - uint64_t utf8_continuation_mask = input.lt(-65 + 1); - // We count one word for anything that is not a continuation (so - // leading bytes). - count += 64 - count_ones(utf8_continuation_mask); - int64_t utf8_4byte = input.gteq_unsigned(240); - count += count_ones(utf8_4byte); - } - return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos); -} + if (vmaxvq_u16(utf16_packed) <= 0x7FF) { + // 1. 
prepare 2-byte values + // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 + // expected output : [110a|aaaa|10bb|bbbb] x 8 + const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00); + const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f); + + // t0 = [000a|aaaa|bbbb|bb00] + const uint16x8_t t0 = vshlq_n_u16(utf16_packed, 2); + // t1 = [000a|aaaa|0000|0000] + const uint16x8_t t1 = vandq_u16(t0, v_1f00); + // t2 = [0000|0000|00bb|bbbb] + const uint16x8_t t2 = vandq_u16(utf16_packed, v_003f); + // t3 = [000a|aaaa|00bb|bbbb] + const uint16x8_t t3 = vorrq_u16(t1, t2); + // t4 = [110a|aaaa|10bb|bbbb] + const uint16x8_t t4 = vorrq_u16(t3, v_c080); + // 2. merge ASCII and 2-byte codewords + const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F); + const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f); + const uint8x16_t utf8_unpacked = vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, utf16_packed, t4)); + // 3. prepare bitmask for 8-bit lookup + #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint16x8_t mask = make_uint16x8_t(0x0001, 0x0004, + 0x0010, 0x0040, + 0x0002, 0x0008, + 0x0020, 0x0080); + #else + const uint16x8_t mask = { 0x0001, 0x0004, + 0x0010, 0x0040, + 0x0002, 0x0008, + 0x0020, 0x0080 }; + #endif + uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask)); + // 4. pack the bytes + const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0]; + const uint8x16_t shuffle = vld1q_u8(row + 1); + const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle); + // 5. 
store bytes + vst1q_u8(utf8_output, utf8_packed); -simdutf_really_inline size_t utf32_length_from_utf8(const char* in, size_t size) { - return count_code_points(in, size); -} -} // utf8 namespace -} // unnamed namespace -} // namespace arm64 -} // namespace simdutf -/* end file src/generic/utf8.h */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf16.h -/* begin file src/generic/utf16.h */ -namespace simdutf { -namespace arm64 { -namespace { -namespace utf16 { + // 6. adjust pointers + buf += 8; + utf8_output += row[0]; + continue; + } else { + // case: code units from register produce either 1, 2 or 3 UTF-8 bytes -template -simdutf_really_inline size_t count_code_points(const char16_t* in, size_t size) { - size_t pos = 0; - size_t count = 0; - for(;pos + 32 <= size; pos += 32) { - simd16x32 input(reinterpret_cast(in + pos)); - if (!match_system(big_endian)) input.swap_bytes(); - uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF); - count += count_ones(not_pair) / 2; - } - return count + scalar::utf16::count_code_points(in + pos, size - pos); -} + // check for invalid input + const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800); + const uint16x8_t v_dfff = vmovq_n_u16((uint16_t)0xdfff); + const uint16x8_t forbidden_bytemask = vandq_u16(vcleq_u16(utf16_packed, v_dfff), vcgeq_u16(utf16_packed, v_d800)); + if (vmaxvq_u16(forbidden_bytemask) != 0) { + return std::make_pair(result(error_code::SURROGATE, buf - start), reinterpret_cast(utf8_output)); + } -template -simdutf_really_inline size_t utf8_length_from_utf16(const char16_t* in, size_t size) { - size_t pos = 0; - size_t count = 0; - // This algorithm could no doubt be improved! 
- for(;pos + 32 <= size; pos += 32) { - simd16x32 input(reinterpret_cast(in + pos)); - if (!match_system(big_endian)) input.swap_bytes(); - uint64_t ascii_mask = input.lteq(0x7F); - uint64_t twobyte_mask = input.lteq(0x7FF); - uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF); + #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint16x8_t dup_even = make_uint16x8_t(0x0000, 0x0202, 0x0404, 0x0606, + 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); + #else + const uint16x8_t dup_even = {0x0000, 0x0202, 0x0404, 0x0606, + 0x0808, 0x0a0a, 0x0c0c, 0x0e0e}; + #endif + /* In this branch we handle three cases: + 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - single UFT-8 byte + 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two UTF-8 bytes + 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes - size_t ascii_count = count_ones(ascii_mask) / 2; - size_t twobyte_count = count_ones(twobyte_mask & ~ ascii_mask) / 2; - size_t threebyte_count = count_ones(not_pair_mask & ~ twobyte_mask) / 2; - size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2; - count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count + ascii_count; - } - return count + scalar::utf16::utf8_length_from_utf16(in + pos, size - pos); -} + We expand the input word (16-bit) into two code units (32-bit), thus + we have room for four bytes. However, we need five distinct bit + layouts. Note that the last byte in cases #2 and #3 is the same. -template -simdutf_really_inline size_t utf32_length_from_utf16(const char16_t* in, size_t size) { - return count_code_points(in, size); -} + We precompute byte 1 for case #1 and the common byte for cases #2 & #3 + in register t2. -simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t size, char16_t* output) { - size_t pos = 0; + We precompute byte 1 for case #3 and -- **conditionally** -- precompute + either byte 1 for case #2 or byte 2 for case #3. Note that they + differ by exactly one bit. 
- while (pos + 32 <= size) { - simd16x32 input(reinterpret_cast(in + pos)); - input.swap_bytes(); - input.store(reinterpret_cast(output)); - pos += 32; - output += 32; - } + Finally from these two code units we build proper UTF-8 sequence, taking + into account the case (i.e, the number of bytes to write). + */ + /** + * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: + * t2 => [0ccc|cccc] [10cc|cccc] + * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) + */ + #define simdutf_vec(x) vmovq_n_u16(static_cast(x)) + // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] + const uint16x8_t t0 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(utf16_packed), vreinterpretq_u8_u16(dup_even))); + // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] + const uint16x8_t t1 = vandq_u16(t0, simdutf_vec(0b0011111101111111)); + // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] + const uint16x8_t t2 = vorrq_u16 (t1, simdutf_vec(0b1000000000000000)); - scalar::utf16::change_endianness_utf16(in + pos, size - pos, output); -} + // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa] + const uint16x8_t s0 = vshrq_n_u16(utf16_packed, 12); + // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000] + const uint16x8_t s1 = vandq_u16(utf16_packed, simdutf_vec(0b0000111111000000)); + // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000] + const uint16x8_t s1s = vshlq_n_u16(s1, 2); + // [00bb|bbbb|0000|aaaa] + const uint16x8_t s2 = vorrq_u16(s0, s1s); + // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] + const uint16x8_t s3 = vorrq_u16(s2, simdutf_vec(0b1100000011100000)); + const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF); + const uint16x8_t one_or_two_bytes_bytemask = vcleq_u16(utf16_packed, v_07ff); + const uint16x8_t m0 = vbicq_u16(simdutf_vec(0b0100000000000000), one_or_two_bytes_bytemask); + const uint16x8_t s4 = veorq_u16(s3, m0); + #undef simdutf_vec -} // utf16 -} // unnamed namespace -} // namespace arm64 -} // namespace simdutf -/* end file src/generic/utf16.h */ -// -// 
Implementation-specific overrides -// -namespace simdutf { -namespace arm64 { + // 4. expand code units 16-bit => 32-bit + const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4)); + const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4)); -simdutf_warn_unused int implementation::detect_encodings(const char * input, size_t length) const noexcept { - // If there is a BOM, then we trust it. - auto bom_encoding = simdutf::BOM::check_bom(input, length); - if(bom_encoding != encoding_type::unspecified) { return bom_encoding; } - if (length % 2 == 0) { - return arm_detect_encodings(input, length); - } else { - if (implementation::validate_utf8(input, length)) { - return simdutf::encoding_type::UTF8; - } else { - return simdutf::encoding_type::unspecified; - } - } -} + // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle + const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F); + const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f); + #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint16x8_t onemask = make_uint16x8_t(0x0001, 0x0004, + 0x0010, 0x0040, + 0x0100, 0x0400, + 0x1000, 0x4000 ); + const uint16x8_t twomask = make_uint16x8_t(0x0002, 0x0008, + 0x0020, 0x0080, + 0x0200, 0x0800, + 0x2000, 0x8000 ); + #else + const uint16x8_t onemask = { 0x0001, 0x0004, + 0x0010, 0x0040, + 0x0100, 0x0400, + 0x1000, 0x4000 }; + const uint16x8_t twomask = { 0x0002, 0x0008, + 0x0020, 0x0080, + 0x0200, 0x0800, + 0x2000, 0x8000 }; + #endif + const uint16x8_t combined = vorrq_u16(vandq_u16(one_byte_bytemask, onemask), vandq_u16(one_or_two_bytes_bytemask, twomask)); + const uint16_t mask = vaddvq_u16(combined); + // The following fast path may or may not be beneficial. + /*if(mask == 0) { + // We only have three-byte code units. Use fast path. 
+ const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0}; + const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle); + const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle); + vst1q_u8(utf8_output, utf8_0); + utf8_output += 12; + vst1q_u8(utf8_output, utf8_1); + utf8_output += 12; + buf += 8; + continue; + }*/ + const uint8_t mask0 = uint8_t(mask); -simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept { - return arm64::utf8_validation::generic_validate_utf8(buf,len); -} + const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; + const uint8x16_t shuffle0 = vld1q_u8(row0 + 1); + const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0); -simdutf_warn_unused result implementation::validate_utf8_with_errors(const char *buf, size_t len) const noexcept { - return arm64::utf8_validation::generic_validate_utf8_with_errors(buf,len); -} + const uint8_t mask1 = static_cast(mask >> 8); + const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; + const uint8x16_t shuffle1 = vld1q_u8(row1 + 1); + const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1); -simdutf_warn_unused bool implementation::validate_ascii(const char *buf, size_t len) const noexcept { - return arm64::utf8_validation::generic_validate_ascii(buf,len); -} + vst1q_u8(utf8_output, utf8_0); + utf8_output += row0[0]; + vst1q_u8(utf8_output, utf8_1); + utf8_output += row1[0]; -simdutf_warn_unused result implementation::validate_ascii_with_errors(const char *buf, size_t len) const noexcept { - return arm64::utf8_validation::generic_validate_ascii_with_errors(buf,len); -} + buf += 8; + } + // At least one 32-bit word will produce a surrogate pair in UTF-16 <=> will produce four UTF-8 bytes. + } else { + // Let us do a scalar fallback. + // It may seem wasteful to use scalar code, but being efficient with SIMD + // in the presence of surrogate pairs may require non-trivial tables. 
+ size_t forward = 15; + size_t k = 0; + if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} + for(; k < forward; k++) { + uint32_t word = buf[k]; + if((word & 0xFFFFFF80)==0) { + *utf8_output++ = char(word); + } else if((word & 0xFFFFF800)==0) { + *utf8_output++ = char((word>>6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else if((word & 0xFFFF0000)==0) { + if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), reinterpret_cast(utf8_output)); } + *utf8_output++ = char((word>>12) | 0b11100000); + *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else { + if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), reinterpret_cast(utf8_output)); } + *utf8_output++ = char((word>>18) | 0b11110000); + *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } + } + buf += k; + } + } // while -simdutf_warn_unused bool implementation::validate_utf16le(const char16_t *buf, size_t len) const noexcept { - const char16_t* tail = arm_validate_utf16(buf, len); - if (tail) { - return scalar::utf16::validate(tail, len - (tail - buf)); - } else { - return false; - } + return std::make_pair(result(error_code::SUCCESS, buf - start), reinterpret_cast(utf8_output)); } +/* end file src/arm64/arm_convert_utf32_to_utf8.cpp */ +/* begin file src/arm64/arm_convert_utf32_to_utf16.cpp */ +template +std::pair arm_convert_utf32_to_utf16(const char32_t* buf, size_t len, char16_t* utf16_out) { + uint16_t * utf16_output = reinterpret_cast(utf16_out); + const char32_t* end = buf + len; -simdutf_warn_unused bool implementation::validate_utf16be(const char16_t *buf, size_t len) const noexcept { - const char16_t* tail = arm_validate_utf16(buf, len); - if (tail) { - 
return scalar::utf16::validate(tail, len - (tail - buf)); - } else { - return false; - } -} + uint16x4_t forbidden_bytemask = vmov_n_u16(0x0); -simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept { - result res = arm_validate_utf16_with_errors(buf, len); - if (res.count != len) { - result scalar_res = scalar::utf16::validate_with_errors(buf + res.count, len - res.count); - return result(scalar_res.error, res.count + scalar_res.count); - } else { - return res; - } -} + while(buf + 4 <= end) { + uint32x4_t in = vld1q_u32(reinterpret_cast(buf)); -simdutf_warn_unused result implementation::validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept { - result res = arm_validate_utf16_with_errors(buf, len); - if (res.count != len) { - result scalar_res = scalar::utf16::validate_with_errors(buf + res.count, len - res.count); - return result(scalar_res.error, res.count + scalar_res.count); - } else { - return res; - } -} + // Check if no bits set above 16th + if(vmaxvq_u32(in) <= 0xFFFF) { + uint16x4_t utf16_packed = vmovn_u32(in); -simdutf_warn_unused bool implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept { - const char32_t* tail = arm_validate_utf32le(buf, len); - if (tail) { - return scalar::utf32::validate(tail, len - (tail - buf)); - } else { - return false; - } -} + const uint16x4_t v_d800 = vmov_n_u16((uint16_t)0xd800); + const uint16x4_t v_dfff = vmov_n_u16((uint16_t)0xdfff); + forbidden_bytemask = vorr_u16(vand_u16(vcle_u16(utf16_packed, v_dfff), vcge_u16(utf16_packed, v_d800)), forbidden_bytemask); -simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept { - result res = arm_validate_utf32le_with_errors(buf, len); - if (res.count != len) { - result scalar_res = scalar::utf32::validate_with_errors(buf + res.count, len - res.count); - return result(scalar_res.error, res.count + 
scalar_res.count); - } else { - return res; + if (!match_system(big_endian)) { utf16_packed = vrev16_u8(utf16_packed); } + vst1_u16(utf16_output, utf16_packed); + utf16_output += 4; + buf += 4; + } else { + size_t forward = 3; + size_t k = 0; + if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} + for(; k < forward; k++) { + uint32_t word = buf[k]; + if((word & 0xFFFF0000)==0) { + // will not generate a surrogate pair + if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, reinterpret_cast(utf16_output)); } + *utf16_output++ = !match_system(big_endian) ? char16_t(word >> 8 | word << 8) : char16_t(word); + } else { + // will generate a surrogate pair + if (word > 0x10FFFF) { return std::make_pair(nullptr, reinterpret_cast(utf16_output)); } + word -= 0x10000; + uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); + uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); + if (!match_system(big_endian)) { + high_surrogate = uint16_t(high_surrogate >> 8 | high_surrogate << 8); + low_surrogate = uint16_t(low_surrogate << 8 | low_surrogate >> 8); + } + *utf16_output++ = char16_t(high_surrogate); + *utf16_output++ = char16_t(low_surrogate); + } + } + buf += k; + } } -} - -simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept { - utf8_to_utf16::validating_transcoder converter; - return converter.convert(buf, len, utf16_output); -} - -simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept { - utf8_to_utf16::validating_transcoder converter; - return converter.convert(buf, len, utf16_output); -} -simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept { - utf8_to_utf16::validating_transcoder converter; - return converter.convert_with_errors(buf, len, utf16_output); -} + // check for 
invalid input + if (vmaxv_u16(forbidden_bytemask) != 0) { + return std::make_pair(nullptr, reinterpret_cast(utf16_output)); + } -simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept { - utf8_to_utf16::validating_transcoder converter; - return converter.convert_with_errors(buf, len, utf16_output); + return std::make_pair(buf, reinterpret_cast(utf16_output)); } -simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(const char* input, size_t size, - char16_t* utf16_output) const noexcept { - return utf8_to_utf16::convert_valid(input, size, utf16_output); -} -simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(const char* input, size_t size, - char16_t* utf16_output) const noexcept { - return utf8_to_utf16::convert_valid(input, size, utf16_output); -} +template +std::pair arm_convert_utf32_to_utf16_with_errors(const char32_t* buf, size_t len, char16_t* utf16_out) { + uint16_t * utf16_output = reinterpret_cast(utf16_out); + const char32_t* start = buf; + const char32_t* end = buf + len; -simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept { - utf8_to_utf32::validating_transcoder converter; - return converter.convert(buf, len, utf32_output); -} + while(buf + 4 <= end) { + uint32x4_t in = vld1q_u32(reinterpret_cast(buf)); -simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32_output) const noexcept { - utf8_to_utf32::validating_transcoder converter; - return converter.convert_with_errors(buf, len, utf32_output); -} + // Check if no bits set above 16th + if(vmaxvq_u32(in) <= 0xFFFF) { + uint16x4_t utf16_packed = vmovn_u32(in); -simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(const char* input, size_t size, - char32_t* utf32_output) const noexcept { - return 
utf8_to_utf32::convert_valid(input, size, utf32_output); -} + const uint16x4_t v_d800 = vmov_n_u16((uint16_t)0xd800); + const uint16x4_t v_dfff = vmov_n_u16((uint16_t)0xdfff); + const uint16x4_t forbidden_bytemask = vand_u16(vcle_u16(utf16_packed, v_dfff), vcge_u16(utf16_packed, v_d800)); + if (vmaxv_u16(forbidden_bytemask) != 0) { + return std::make_pair(result(error_code::SURROGATE, buf - start), reinterpret_cast(utf16_output)); + } -simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { - std::pair ret = arm_convert_utf16_to_utf8(buf, len, utf8_output); - if (ret.first == nullptr) { return 0; } - size_t saved_bytes = ret.second - utf8_output; - if (ret.first != buf + len) { - const size_t scalar_saved_bytes = scalar::utf16_to_utf8::convert( - ret.first, len - (ret.first - buf), ret.second); - if (scalar_saved_bytes == 0) { return 0; } - saved_bytes += scalar_saved_bytes; - } - return saved_bytes; -} - -simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { - std::pair ret = arm_convert_utf16_to_utf8(buf, len, utf8_output); - if (ret.first == nullptr) { return 0; } - size_t saved_bytes = ret.second - utf8_output; - if (ret.first != buf + len) { - const size_t scalar_saved_bytes = scalar::utf16_to_utf8::convert( - ret.first, len - (ret.first - buf), ret.second); - if (scalar_saved_bytes == 0) { return 0; } - saved_bytes += scalar_saved_bytes; - } - return saved_bytes; -} - -simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept { - // ret.first.count is always the position in the buffer, not the number of words written even if finished - std::pair ret = arm_convert_utf16_to_utf8_with_errors(buf, len, utf8_output); - if (ret.first.error) { return ret.first; } // Can return directly since scalar fallback already found 
correct ret.first.count - if (ret.first.count != len) { // All good so far, but not finished - result scalar_res = scalar::utf16_to_utf8::convert_with_errors( - buf + ret.first.count, len - ret.first.count, ret.second); - if (scalar_res.error) { - scalar_res.count += ret.first.count; - return scalar_res; - } else { - ret.second += scalar_res.count; - } - } - ret.first.count = ret.second - utf8_output; // Set count to the number of 8-bit words written - return ret.first; -} - -simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept { - // ret.first.count is always the position in the buffer, not the number of words written even if finished - std::pair ret = arm_convert_utf16_to_utf8_with_errors(buf, len, utf8_output); - if (ret.first.error) { return ret.first; } // Can return directly since scalar fallback already found correct ret.first.count - if (ret.first.count != len) { // All good so far, but not finished - result scalar_res = scalar::utf16_to_utf8::convert_with_errors( - buf + ret.first.count, len - ret.first.count, ret.second); - if (scalar_res.error) { - scalar_res.count += ret.first.count; - return scalar_res; + if (!match_system(big_endian)) { utf16_packed = vrev16_u8(utf16_packed); } + vst1_u16(utf16_output, utf16_packed); + utf16_output += 4; + buf += 4; } else { - ret.second += scalar_res.count; + size_t forward = 3; + size_t k = 0; + if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} + for(; k < forward; k++) { + uint32_t word = buf[k]; + if((word & 0xFFFF0000)==0) { + // will not generate a surrogate pair + if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), reinterpret_cast(utf16_output)); } + *utf16_output++ = !match_system(big_endian) ? 
char16_t(word >> 8 | word << 8) : char16_t(word); + } else { + // will generate a surrogate pair + if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), reinterpret_cast(utf16_output)); } + word -= 0x10000; + uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); + uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); + if (!match_system(big_endian)) { + high_surrogate = uint16_t(high_surrogate >> 8 | high_surrogate << 8); + low_surrogate = uint16_t(low_surrogate << 8 | low_surrogate >> 8); + } + *utf16_output++ = char16_t(high_surrogate); + *utf16_output++ = char16_t(low_surrogate); + } + } + buf += k; } } - ret.first.count = ret.second - utf8_output; // Set count to the number of 8-bit words written - return ret.first; -} -simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { - return convert_utf16le_to_utf8(buf, len, utf8_output); + return std::make_pair(result(error_code::SUCCESS, buf - start), reinterpret_cast(utf16_output)); } +/* end file src/arm64/arm_convert_utf32_to_utf16.cpp */ +} // unnamed namespace +} // namespace arm64 +} // namespace simdutf +/* begin file src/generic/buf_block_reader.h */ +namespace simdutf { +namespace arm64 { +namespace { -simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { - return convert_utf16be_to_utf8(buf, len, utf8_output); -} +// Walks through a buffer in block-sized increments, loading the last part with spaces +template +struct buf_block_reader { +public: + simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len); + simdutf_really_inline size_t block_index(); + simdutf_really_inline bool has_full_block() const; + simdutf_really_inline const uint8_t *full_block() const; + /** + * Get the last block, padded with spaces. 
+ * + * There will always be a last block, with at least 1 byte, unless len == 0 (in which case this + * function fills the buffer with spaces and returns 0. In particular, if len == STEP_SIZE there + * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding. + * + * @return the number of effective characters in the last block. + */ + simdutf_really_inline size_t get_remainder(uint8_t *dst) const; + simdutf_really_inline void advance(); +private: + const uint8_t *buf; + const size_t len; + const size_t lenminusstep; + size_t idx; +}; -simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept { - std::pair ret = arm_convert_utf32_to_utf8(buf, len, utf8_output); - if (ret.first == nullptr) { return 0; } - size_t saved_bytes = ret.second - utf8_output; - if (ret.first != buf + len) { - const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert( - ret.first, len - (ret.first - buf), ret.second); - if (scalar_saved_bytes == 0) { return 0; } - saved_bytes += scalar_saved_bytes; +// Routines to print masks and text for debugging bitmask operations +simdutf_unused static char * format_input_text_64(const uint8_t *text) { + static char *buf = reinterpret_cast(malloc(sizeof(simd8x64) + 1)); + for (size_t i=0; i); i++) { + buf[i] = int8_t(text[i]) < ' ' ? 
'_' : int8_t(text[i]); } - return saved_bytes; + buf[sizeof(simd8x64)] = '\0'; + return buf; } -simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept { - // ret.first.count is always the position in the buffer, not the number of words written even if finished - std::pair ret = arm_convert_utf32_to_utf8_with_errors(buf, len, utf8_output); - if (ret.first.count != len) { - result scalar_res = scalar::utf32_to_utf8::convert_with_errors( - buf + ret.first.count, len - ret.first.count, ret.second); - if (scalar_res.error) { - scalar_res.count += ret.first.count; - return scalar_res; - } else { - ret.second += scalar_res.count; - } +// Routines to print masks and text for debugging bitmask operations +simdutf_unused static char * format_input_text(const simd8x64& in) { + static char *buf = reinterpret_cast(malloc(sizeof(simd8x64) + 1)); + in.store(reinterpret_cast(buf)); + for (size_t i=0; i); i++) { + if (buf[i] < ' ') { buf[i] = '_'; } } - ret.first.count = ret.second - utf8_output; // Set count to the number of 8-bit words written - return ret.first; + buf[sizeof(simd8x64)] = '\0'; + return buf; } -simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { - std::pair ret = arm_convert_utf16_to_utf32(buf, len, utf32_output); - if (ret.first == nullptr) { return 0; } - size_t saved_bytes = ret.second - utf32_output; - if (ret.first != buf + len) { - const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert( - ret.first, len - (ret.first - buf), ret.second); - if (scalar_saved_bytes == 0) { return 0; } - saved_bytes += scalar_saved_bytes; +simdutf_unused static char * format_mask(uint64_t mask) { + static char *buf = reinterpret_cast(malloc(64 + 1)); + for (size_t i=0; i<64; i++) { + buf[i] = (mask & (size_t(1) << i)) ? 
'X' : ' '; } - return saved_bytes; + buf[64] = '\0'; + return buf; } -simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { - std::pair ret = arm_convert_utf16_to_utf32(buf, len, utf32_output); - if (ret.first == nullptr) { return 0; } - size_t saved_bytes = ret.second - utf32_output; - if (ret.first != buf + len) { - const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert( - ret.first, len - (ret.first - buf), ret.second); - if (scalar_saved_bytes == 0) { return 0; } - saved_bytes += scalar_saved_bytes; - } - return saved_bytes; -} +template +simdutf_really_inline buf_block_reader::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {} -simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { - // ret.first.count is always the position in the buffer, not the number of words written even if finished - std::pair ret = arm_convert_utf16_to_utf32_with_errors(buf, len, utf32_output); - if (ret.first.error) { return ret.first; } // Can return directly since scalar fallback already found correct ret.first.count - if (ret.first.count != len) { // All good so far, but not finished - result scalar_res = scalar::utf16_to_utf32::convert_with_errors( - buf + ret.first.count, len - ret.first.count, ret.second); - if (scalar_res.error) { - scalar_res.count += ret.first.count; - return scalar_res; - } else { - ret.second += scalar_res.count; - } - } - ret.first.count = ret.second - utf32_output; // Set count to the number of 8-bit words written - return ret.first; -} +template +simdutf_really_inline size_t buf_block_reader::block_index() { return idx; } -simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { - 
// ret.first.count is always the position in the buffer, not the number of words written even if finished - std::pair ret = arm_convert_utf16_to_utf32_with_errors(buf, len, utf32_output); - if (ret.first.error) { return ret.first; } // Can return directly since scalar fallback already found correct ret.first.count - if (ret.first.count != len) { // All good so far, but not finished - result scalar_res = scalar::utf16_to_utf32::convert_with_errors( - buf + ret.first.count, len - ret.first.count, ret.second); - if (scalar_res.error) { - scalar_res.count += ret.first.count; - return scalar_res; - } else { - ret.second += scalar_res.count; - } - } - ret.first.count = ret.second - utf32_output; // Set count to the number of 8-bit words written - return ret.first; +template +simdutf_really_inline bool buf_block_reader::has_full_block() const { + return idx < lenminusstep; } -simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept { - return convert_utf32_to_utf8(buf, len, utf8_output); +template +simdutf_really_inline const uint8_t *buf_block_reader::full_block() const { + return &buf[idx]; } -simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { - std::pair ret = arm_convert_utf32_to_utf16(buf, len, utf16_output); - if (ret.first == nullptr) { return 0; } - size_t saved_bytes = ret.second - utf16_output; - if (ret.first != buf + len) { - const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert( - ret.first, len - (ret.first - buf), ret.second); - if (scalar_saved_bytes == 0) { return 0; } - saved_bytes += scalar_saved_bytes; - } - return saved_bytes; +template +simdutf_really_inline size_t buf_block_reader::get_remainder(uint8_t *dst) const { + if(len == idx) { return 0; } // memcpy(dst, null, 0) will trigger an error with some sanitizers + std::memset(dst, 0x20, STEP_SIZE); // std::memset 
STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once. + std::memcpy(dst, buf + idx, len - idx); + return len - idx; } -simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { - std::pair ret = arm_convert_utf32_to_utf16(buf, len, utf16_output); - if (ret.first == nullptr) { return 0; } - size_t saved_bytes = ret.second - utf16_output; - if (ret.first != buf + len) { - const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert( - ret.first, len - (ret.first - buf), ret.second); - if (scalar_saved_bytes == 0) { return 0; } - saved_bytes += scalar_saved_bytes; - } - return saved_bytes; +template +simdutf_really_inline void buf_block_reader::advance() { + idx += STEP_SIZE; } -simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { - // ret.first.count is always the position in the buffer, not the number of words written even if finished - std::pair ret = arm_convert_utf32_to_utf16_with_errors(buf, len, utf16_output); - if (ret.first.count != len) { - result scalar_res = scalar::utf32_to_utf16::convert_with_errors( - buf + ret.first.count, len - ret.first.count, ret.second); - if (scalar_res.error) { - scalar_res.count += ret.first.count; - return scalar_res; - } else { - ret.second += scalar_res.count; - } - } - ret.first.count = ret.second - utf16_output; // Set count to the number of 8-bit words written - return ret.first; -} - -simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { - // ret.first.count is always the position in the buffer, not the number of words written even if finished - std::pair ret = arm_convert_utf32_to_utf16_with_errors(buf, len, utf16_output); - if (ret.first.count != len) { - result scalar_res = 
scalar::utf32_to_utf16::convert_with_errors( - buf + ret.first.count, len - ret.first.count, ret.second); - if (scalar_res.error) { - scalar_res.count += ret.first.count; - return scalar_res; - } else { - ret.second += scalar_res.count; - } - } - ret.first.count = ret.second - utf16_output; // Set count to the number of 8-bit words written - return ret.first; -} - -simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { - return convert_utf32_to_utf16le(buf, len, utf16_output); -} +} // unnamed namespace +} // namespace arm64 +} // namespace simdutf +/* end file src/generic/buf_block_reader.h */ +/* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */ +namespace simdutf { +namespace arm64 { +namespace { +namespace utf8_validation { -simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { - return convert_utf32_to_utf16be(buf, len, utf16_output); -} +using namespace simd; -simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { - return convert_utf16le_to_utf32(buf, len, utf32_output); -} + simdutf_really_inline simd8 check_special_cases(const simd8 input, const simd8 prev1) { +// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) +// Bit 1 = Too Long (ASCII followed by continuation) +// Bit 2 = Overlong 3-byte +// Bit 4 = Surrogate +// Bit 5 = Overlong 2-byte +// Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1<<0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1<<1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1<<2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1<<4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1<<5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 
1<<7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1<<3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1<<6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1<<6; // 11110000 1000____ -simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { - return convert_utf16be_to_utf32(buf, len, utf32_output); -} + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + TOO_SHORT, + // 1110____ ________ + TOO_SHORT | OVERLONG_3 | SURROGATE, + // 1111____ ________ + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4 + ); + constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . 
+ const simd8 byte_1_low = (prev1 & 0x0F).lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, + CARRY, -void implementation::change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) const noexcept { - utf16::change_endianness_utf16(input, length, output); -} + // ____0100 ________ + CARRY | TOO_LARGE, + // ____0101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____011_ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, -simdutf_warn_unused size_t implementation::count_utf16le(const char16_t * input, size_t length) const noexcept { - return utf16::count_code_points(input, length); -} + // ____1___ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____1101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000 + ); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, -simdutf_warn_unused size_t implementation::count_utf16be(const char16_t * input, size_t length) const noexcept { - return utf16::count_code_points(input, length); -} + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, -simdutf_warn_unused size_t implementation::count_utf8(const char * input, size_t length) const noexcept { - return utf8::count_code_points(input, length); -} + // ________ 11______ + TOO_SHORT, 
TOO_SHORT, TOO_SHORT, TOO_SHORT + ); + return (byte_1_high & byte_1_low & byte_2_high); + } + simdutf_really_inline simd8 check_multibyte_lengths(const simd8 input, + const simd8 prev_input, const simd8 sc) { + simd8 prev2 = input.prev<2>(prev_input); + simd8 prev3 = input.prev<3>(prev_input); + simd8 must23 = simd8(must_be_2_3_continuation(prev2, prev3)); + simd8 must23_80 = must23 & uint8_t(0x80); + return must23_80 ^ sc; + } -simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept { - return utf16::utf8_length_from_utf16(input, length); -} + // + // Return nonzero if there are incomplete multibyte characters at the end of the block: + // e.g. if there is a 4-byte character, but it's 3 bytes from the end. + // + simdutf_really_inline simd8 is_incomplete(const simd8 input) { + // If the previous input's last 3 bytes match this, they're too short (they ended at EOF): + // ... 1111____ 111_____ 11______ + static const uint8_t max_array[32] = { + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 0b11110000u-1, 0b11100000u-1, 0b11000000u-1 + }; + const simd8 max_value(&max_array[sizeof(max_array)-sizeof(simd8)]); + return input.gt_bits(max_value); + } -simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept { - return utf16::utf8_length_from_utf16(input, length); -} + struct utf8_checker { + // If this is nonzero, there has been a UTF-8 error. 
+ simd8 error; + // The last input we received + simd8 prev_input_block; + // Whether the last input we received was incomplete (used for ASCII fast path) + simd8 prev_incomplete; -simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept { - return utf16::utf32_length_from_utf16(input, length); -} + // + // Check whether the current bytes are valid UTF-8. + // + simdutf_really_inline void check_utf8_bytes(const simd8 input, const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes + // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + simd8 sc = check_special_cases(input, prev1); + this->error |= check_multibyte_lengths(input, prev_input, sc); + } -simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept { - return utf16::utf32_length_from_utf16(input, length); -} + // The only problem that can happen at EOF is that a multibyte character is too short + // or a byte value too large in the last bytes: check_special_cases only checks for bytes + // too large in the first of two bytes. + simdutf_really_inline void check_eof() { + // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't + // possibly finish them. + this->error |= this->prev_incomplete; + } -simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char * input, size_t length) const noexcept { - return utf8::utf16_length_from_utf8(input, length); -} + simdutf_really_inline void check_next_input(const simd8x64& input) { + if(simdutf_likely(is_ascii(input))) { + this->error |= this->prev_incomplete; + } else { + // you might think that a for-loop would work, but under Visual Studio, it is not good enough. 
+ static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + if(simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], this->prev_input_block); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if(simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], this->prev_input_block); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + this->prev_incomplete = is_incomplete(input.chunks[simd8x64::NUM_CHUNKS-1]); + this->prev_input_block = input.chunks[simd8x64::NUM_CHUNKS-1]; -simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept { - const uint32x4_t v_7f = vmovq_n_u32((uint32_t)0x7f); - const uint32x4_t v_7ff = vmovq_n_u32((uint32_t)0x7ff); - const uint32x4_t v_ffff = vmovq_n_u32((uint32_t)0xffff); - const uint32x4_t v_1 = vmovq_n_u32((uint32_t)0x1); - size_t pos = 0; - size_t count = 0; - for(;pos + 4 <= length; pos += 4) { - uint32x4_t in = vld1q_u32(reinterpret_cast(input + pos)); - const uint32x4_t ascii_bytes_bytemask = vcleq_u32(in, v_7f); - const uint32x4_t one_two_bytes_bytemask = vcleq_u32(in, v_7ff); - const uint32x4_t two_bytes_bytemask = veorq_u32(one_two_bytes_bytemask, ascii_bytes_bytemask); - const uint32x4_t three_bytes_bytemask = veorq_u32(vcleq_u32(in, v_ffff), one_two_bytes_bytemask); + } + } - const uint16x8_t reduced_ascii_bytes_bytemask = vreinterpretq_u16_u32(vandq_u32(ascii_bytes_bytemask, v_1)); - const uint16x8_t reduced_two_bytes_bytemask = vreinterpretq_u16_u32(vandq_u32(two_bytes_bytemask, v_1)); - const uint16x8_t reduced_three_bytes_bytemask = vreinterpretq_u16_u32(vandq_u32(three_bytes_bytemask, v_1)); + // do not forget to call check_eof! 
+ simdutf_really_inline bool errors() const { + return this->error.any_bits_set_anywhere(); + } - const uint16x8_t compressed_bytemask0 = vpaddq_u16(reduced_ascii_bytes_bytemask, reduced_two_bytes_bytemask); - const uint16x8_t compressed_bytemask1 = vpaddq_u16(reduced_three_bytes_bytemask, reduced_three_bytes_bytemask); + }; // struct utf8_checker +} // namespace utf8_validation - size_t ascii_count = count_ones(vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask0), 0)); - size_t two_bytes_count = count_ones(vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask0), 1)); - size_t three_bytes_count = count_ones(vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask1), 0)); - - count += 16 - 3*ascii_count - 2*two_bytes_count - three_bytes_count; - } - return count + scalar::utf32::utf8_length_from_utf32(input + pos, length - pos); -} - -simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept { - const uint32x4_t v_ffff = vmovq_n_u32((uint32_t)0xffff); - const uint32x4_t v_1 = vmovq_n_u32((uint32_t)0x1); - size_t pos = 0; - size_t count = 0; - for(;pos + 4 <= length; pos += 4) { - uint32x4_t in = vld1q_u32(reinterpret_cast(input + pos)); - const uint32x4_t surrogate_bytemask = vcgtq_u32(in, v_ffff); - const uint16x8_t reduced_bytemask = vreinterpretq_u16_u32(vandq_u32(surrogate_bytemask, v_1)); - const uint16x8_t compressed_bytemask = vpaddq_u16(reduced_bytemask, reduced_bytemask); - size_t surrogate_count = count_ones(vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask), 0)); - count += 4 + surrogate_count; - } - return count + scalar::utf32::utf16_length_from_utf32(input + pos, length - pos); -} - -simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char * input, size_t length) const noexcept { - return utf8::utf32_length_from_utf8(input, length); -} +using utf8_validation::utf8_checker; +} // unnamed namespace } // namespace arm64 } // namespace simdutf - -// dofile: 
invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/arm64/end.h -/* begin file src/simdutf/arm64/end.h */ -/* end file src/simdutf/arm64/end.h */ -/* end file src/arm64/implementation.cpp */ -#endif -#if SIMDUTF_IMPLEMENTATION_FALLBACK -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=fallback/implementation.cpp -/* begin file src/fallback/implementation.cpp */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/fallback/begin.h -/* begin file src/simdutf/fallback/begin.h */ -// redefining SIMDUTF_IMPLEMENTATION to "fallback" -// #define SIMDUTF_IMPLEMENTATION fallback -/* end file src/simdutf/fallback/begin.h */ - - - - - - - - +/* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */ +/* begin file src/generic/utf8_validation/utf8_validator.h */ namespace simdutf { -namespace fallback { - -simdutf_warn_unused int implementation::detect_encodings(const char * input, size_t length) const noexcept { - // If there is a BOM, then we trust it. - auto bom_encoding = simdutf::BOM::check_bom(input, length); - if(bom_encoding != encoding_type::unspecified) { return bom_encoding; } - int out = 0; - if(validate_utf8(input, length)) { out |= encoding_type::UTF8; } - if((length % 2) == 0) { - if(validate_utf16le(reinterpret_cast(input), length/2)) { out |= encoding_type::UTF16_LE; } - } - if((length % 4) == 0) { - if(validate_utf32(reinterpret_cast(input), length/4)) { out |= encoding_type::UTF32_LE; } - } +namespace arm64 { +namespace { +namespace utf8_validation { - return out; +/** + * Validates that the string is actual UTF-8. 
+ */ +template +bool generic_validate_utf8(const uint8_t * input, size_t length) { + checker c{}; + buf_block_reader<64> reader(input, length); + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + c.check_next_input(in); + reader.advance(); + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + c.check_next_input(in); + reader.advance(); + c.check_eof(); + return !c.errors(); } -simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept { - return scalar::utf8::validate(buf, len); +bool generic_validate_utf8(const char * input, size_t length) { + return generic_validate_utf8(reinterpret_cast(input),length); } -simdutf_warn_unused result implementation::validate_utf8_with_errors(const char *buf, size_t len) const noexcept { - return scalar::utf8::validate_with_errors(buf, len); +/** + * Validates that the string is actual UTF-8 and stops on errors. + */ +template +result generic_validate_utf8_with_errors(const uint8_t * input, size_t length) { + checker c{}; + buf_block_reader<64> reader(input, length); + size_t count{0}; + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + c.check_next_input(in); + if(c.errors()) { + if (count != 0) { count--; } // Sometimes the error is only detected in the next chunk + result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast(input), reinterpret_cast(input + count), length - count); + res.count += count; + return res; + } + reader.advance(); + count += 64; + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + c.check_next_input(in); + reader.advance(); + c.check_eof(); + if (c.errors()) { + if (count != 0) { count--; } // Sometimes the error is only detected in the next chunk + result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast(input), reinterpret_cast(input) + count, length - count); + res.count += count; + return res; + } else 
{ + return result(error_code::SUCCESS, length); + } } -simdutf_warn_unused bool implementation::validate_ascii(const char *buf, size_t len) const noexcept { - return scalar::ascii::validate(buf, len); +result generic_validate_utf8_with_errors(const char * input, size_t length) { + return generic_validate_utf8_with_errors(reinterpret_cast(input),length); } -simdutf_warn_unused result implementation::validate_ascii_with_errors(const char *buf, size_t len) const noexcept { - return scalar::ascii::validate_with_errors(buf, len); +template +bool generic_validate_ascii(const uint8_t * input, size_t length) { + buf_block_reader<64> reader(input, length); + uint8_t blocks[64]{}; + simd::simd8x64 running_or(blocks); + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + running_or |= in; + reader.advance(); + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + running_or |= in; + return running_or.is_ascii(); } -simdutf_warn_unused bool implementation::validate_utf16le(const char16_t *buf, size_t len) const noexcept { - return scalar::utf16::validate(buf, len); +bool generic_validate_ascii(const char * input, size_t length) { + return generic_validate_ascii(reinterpret_cast(input),length); } -simdutf_warn_unused bool implementation::validate_utf16be(const char16_t *buf, size_t len) const noexcept { - return scalar::utf16::validate(buf, len); -} +template +result generic_validate_ascii_with_errors(const uint8_t * input, size_t length) { + buf_block_reader<64> reader(input, length); + size_t count{0}; + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + if (!in.is_ascii()) { + result res = scalar::ascii::validate_with_errors(reinterpret_cast(input + count), length - count); + return result(res.error, count + res.count); + } + reader.advance(); -simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept { - return 
scalar::utf16::validate_with_errors(buf, len); + count += 64; + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + if (!in.is_ascii()) { + result res = scalar::ascii::validate_with_errors(reinterpret_cast(input + count), length - count); + return result(res.error, count + res.count); + } else { + return result(error_code::SUCCESS, length); + } } -simdutf_warn_unused result implementation::validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept { - return scalar::utf16::validate_with_errors(buf, len); +result generic_validate_ascii_with_errors(const char * input, size_t length) { + return generic_validate_ascii_with_errors(reinterpret_cast(input),length); } -simdutf_warn_unused bool implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept { - return scalar::utf32::validate(buf, len); -} +} // namespace utf8_validation +} // unnamed namespace +} // namespace arm64 +} // namespace simdutf +/* end file src/generic/utf8_validation/utf8_validator.h */ +// transcoding from UTF-8 to UTF-16 +/* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */ -simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept { - return scalar::utf32::validate_with_errors(buf, len); -} -simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept { - return scalar::utf8_to_utf16::convert(buf, len, utf16_output); -} +namespace simdutf { +namespace arm64 { +namespace { +namespace utf8_to_utf16 { -simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept { - return scalar::utf8_to_utf16::convert(buf, len, utf16_output); -} +using namespace simd; -simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept { - return 
scalar::utf8_to_utf16::convert_with_errors(buf, len, utf16_output); +template +simdutf_warn_unused size_t convert_valid(const char* input, size_t size, + char16_t* utf16_output) noexcept { + // The implementation is not specific to haswell and should be moved to the generic directory. + size_t pos = 0; + char16_t* start{utf16_output}; + const size_t safety_margin = 16; // to avoid overruns! + while(pos + 64 + safety_margin <= size) { + // this loop could be unrolled further. For example, we could process the mask + // far more than 64 bytes. + simd8x64 in(reinterpret_cast(input + pos)); + if(in.is_ascii()) { + in.store_ascii_as_utf16(utf16_output); + utf16_output += 64; + pos += 64; + } else { + // Slow path. We hope that the compiler will recognize that this is a slow path. + // Anything that is not a continuation mask is a 'leading byte', that is, the + // start of a new code point. + uint64_t utf8_continuation_mask = in.lt(-65 + 1); + // -65 is 0b10111111 in two-complement's, so largest possible continuation byte + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + // The *start* of code points is not so useful, rather, we want the *end* of code points. + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times when using solely + // the slow/regular path, and at least four times if there are fast paths. + while(pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. 
+ // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + // + // Thus we may allow convert_masked_utf8_to_utf16 to process + // more bytes at a time under a fast-path mode where 16 bytes + // are consumed at once (e.g., when encountering ASCII). + size_t consumed = convert_masked_utf8_to_utf16(input + pos, + utf8_end_of_code_point_mask, utf16_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + utf16_output += scalar::utf8_to_utf16::convert_valid(input + pos, size - pos, utf16_output); + return utf16_output - start; } -simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept { - return scalar::utf8_to_utf16::convert_with_errors(buf, len, utf16_output); -} +} // namespace utf8_to_utf16 +} // unnamed namespace +} // namespace arm64 +} // namespace simdutf +/* end file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */ +/* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */ -simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept { - return scalar::utf8_to_utf16::convert_valid(buf, len, utf16_output); -} - -simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept { - return scalar::utf8_to_utf16::convert_valid(buf, len, utf16_output); -} - -simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(const char* buf, size_t len, 
char32_t* utf32_output) const noexcept { - return scalar::utf8_to_utf32::convert(buf, len, utf32_output); -} - -simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32_output) const noexcept { - return scalar::utf8_to_utf32::convert_with_errors(buf, len, utf32_output); -} - -simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(const char* input, size_t size, - char32_t* utf32_output) const noexcept { - return scalar::utf8_to_utf32::convert_valid(input, size, utf32_output); -} - -simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { - return scalar::utf16_to_utf8::convert(buf, len, utf8_output); -} - -simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { - return scalar::utf16_to_utf8::convert(buf, len, utf8_output); -} - -simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept { - return scalar::utf16_to_utf8::convert_with_errors(buf, len, utf8_output); -} - -simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept { - return scalar::utf16_to_utf8::convert_with_errors(buf, len, utf8_output); -} -simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { - return scalar::utf16_to_utf8::convert_valid(buf, len, utf8_output); -} +namespace simdutf { +namespace arm64 { +namespace { +namespace utf8_to_utf16 { +using namespace simd; -simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { - return scalar::utf16_to_utf8::convert_valid(buf, len, utf8_output); -} -simdutf_warn_unused size_t 
implementation::convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept { - return scalar::utf32_to_utf8::convert(buf, len, utf8_output); -} + simdutf_really_inline simd8 check_special_cases(const simd8 input, const simd8 prev1) { +// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) +// Bit 1 = Too Long (ASCII followed by continuation) +// Bit 2 = Overlong 3-byte +// Bit 4 = Surrogate +// Bit 5 = Overlong 2-byte +// Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1<<0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1<<1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1<<2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1<<4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1<<5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1<<7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1<<3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1<<6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1<<6; // 11110000 1000____ -simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept { - return scalar::utf32_to_utf8::convert_with_errors(buf, len, utf8_output); -} + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + TOO_SHORT, + // 1110____ ________ + TOO_SHORT | OVERLONG_3 | SURROGATE, + // 1111____ ________ + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4 + 
); + constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . + const simd8 byte_1_low = (prev1 & 0x0F).lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, + CARRY, -simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept { - return scalar::utf32_to_utf8::convert_valid(buf, len, utf8_output); -} + // ____0100 ________ + CARRY | TOO_LARGE, + // ____0101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____011_ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, -simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { - return scalar::utf32_to_utf16::convert(buf, len, utf16_output); -} + // ____1___ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____1101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000 + ); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, -simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { - return scalar::utf32_to_utf16::convert(buf, len, utf16_output); -} + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | 
TWO_CONTS | SURROGATE | TOO_LARGE, -simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { - return scalar::utf32_to_utf16::convert_with_errors(buf, len, utf16_output); -} + // ________ 11______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT + ); + return (byte_1_high & byte_1_low & byte_2_high); + } + simdutf_really_inline simd8 check_multibyte_lengths(const simd8 input, + const simd8 prev_input, const simd8 sc) { + simd8 prev2 = input.prev<2>(prev_input); + simd8 prev3 = input.prev<3>(prev_input); + simd8 must23 = simd8(must_be_2_3_continuation(prev2, prev3)); + simd8 must23_80 = must23 & uint8_t(0x80); + return must23_80 ^ sc; + } -simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { - return scalar::utf32_to_utf16::convert_with_errors(buf, len, utf16_output); -} -simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { - return scalar::utf32_to_utf16::convert_valid(buf, len, utf16_output); -} + struct validating_transcoder { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; -simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { - return scalar::utf32_to_utf16::convert_valid(buf, len, utf16_output); -} + validating_transcoder() : error(uint8_t(0)) {} + // + // Check whether the current bytes are valid UTF-8. 
+ // + simdutf_really_inline void check_utf8_bytes(const simd8 input, const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes + // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + simd8 sc = check_special_cases(input, prev1); + this->error |= check_multibyte_lengths(input, prev_input, sc); + } -simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { - return scalar::utf16_to_utf32::convert(buf, len, utf32_output); -} -simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { - return scalar::utf16_to_utf32::convert(buf, len, utf32_output); -} + template + simdutf_really_inline size_t convert(const char* in, size_t size, char16_t* utf16_output) { + size_t pos = 0; + char16_t* start{utf16_output}; + // In the worst case, we have the haswell kernel which can cause an overflow of + // 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the last 16 bytes, + // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate + // much more than 8 bytes. However, you cannot generally assume that you have valid + // UTF-8 input, so we are going to go back from the end counting 8 leading bytes, + // to give us a good margin. + size_t leading_byte = 0; + size_t margin = size; + for(; margin > 0 && leading_byte < 8; margin--) { + leading_byte += (int8_t(in[margin-1]) > -65); + } + // If the input is long enough, then we have that margin-1 is the eight last leading byte. + const size_t safety_margin = size - margin + 1; // to avoid overruns! 
+ while(pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if(input.is_ascii()) { + input.store_ascii_as_utf16(utf16_output); + utf16_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, it is not good enough. + static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if(simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if(simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while(pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. 
+ size_t consumed = convert_masked_utf8_to_utf16(in + pos, + utf8_end_of_code_point_mask, utf16_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + if(errors()) { return 0; } + if(pos < size) { + size_t howmany = scalar::utf8_to_utf16::convert(in + pos, size - pos, utf16_output); + if(howmany == 0) { return 0; } + utf16_output += howmany; + } + return utf16_output - start; + } -simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { - return scalar::utf16_to_utf32::convert_with_errors(buf, len, utf32_output); -} - -simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { - return scalar::utf16_to_utf32::convert_with_errors(buf, len, utf32_output); -} - -simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { - return scalar::utf16_to_utf32::convert_valid(buf, len, utf32_output); -} - -simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { - return scalar::utf16_to_utf32::convert_valid(buf, len, utf32_output); -} - -void implementation::change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) const noexcept { - scalar::utf16::change_endianness_utf16(input, length, output); -} - -simdutf_warn_unused size_t implementation::count_utf16le(const char16_t * input, size_t length) const noexcept { - return scalar::utf16::count_code_points(input, length); -} - -simdutf_warn_unused size_t 
implementation::count_utf16be(const char16_t * input, size_t length) const noexcept { - return scalar::utf16::count_code_points(input, length); -} - -simdutf_warn_unused size_t implementation::count_utf8(const char * input, size_t length) const noexcept { - return scalar::utf8::count_code_points(input, length); -} - -simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept { - return scalar::utf16::utf8_length_from_utf16(input, length); -} + template + simdutf_really_inline result convert_with_errors(const char* in, size_t size, char16_t* utf16_output) { + size_t pos = 0; + char16_t* start{utf16_output}; + // In the worst case, we have the haswell kernel which can cause an overflow of + // 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the last 16 bytes, + // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate + // much more than 8 bytes. However, you cannot generally assume that you have valid + // UTF-8 input, so we are going to go back from the end counting 8 leading bytes, + // to give us a good margin. + size_t leading_byte = 0; + size_t margin = size; + for(; margin > 0 && leading_byte < 8; margin--) { + leading_byte += (int8_t(in[margin-1]) > -65); + } + // If the input is long enough, then we have that margin-1 is the eight last leading byte. + const size_t safety_margin = size - margin + 1; // to avoid overruns! + while(pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if(input.is_ascii()) { + input.store_ascii_as_utf16(utf16_output); + utf16_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, it is not good enough. 
+ static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if(simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if(simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + if (errors()) { + // rewind_and_convert_with_errors will seek a potential error from in+pos onward, + // with the ability to go back up to pos bytes, and read size-pos bytes forward. + result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf16_output); + res.count += pos; + return res; + } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while(pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. 
+ size_t consumed = convert_masked_utf8_to_utf16(in + pos, + utf8_end_of_code_point_mask, utf16_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + if(errors()) { + // rewind_and_convert_with_errors will seek a potential error from in+pos onward, + // with the ability to go back up to pos bytes, and read size-pos bytes forward. + result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf16_output); + res.count += pos; + return res; + } + if(pos < size) { + // rewind_and_convert_with_errors will seek a potential error from in+pos onward, + // with the ability to go back up to pos bytes, and read size-pos bytes forward. + result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf16_output); + if (res.error) { // In case of error, we want the error position + res.count += pos; + return res; + } else { // In case of success, we want the number of word written + utf16_output += res.count; + } + } + return result(error_code::SUCCESS, utf16_output - start); + } -simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept { - return scalar::utf16::utf8_length_from_utf16(input, length); -} + simdutf_really_inline bool errors() const { + return this->error.any_bits_set_anywhere(); + } -simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept { - return scalar::utf16::utf32_length_from_utf16(input, length); -} + }; // struct utf8_checker +} // utf8_to_utf16 namespace +} // unnamed namespace +} // namespace arm64 +} // namespace simdutf +/* end file src/generic/utf8_to_utf16/utf8_to_utf16.h */ +// transcoding from 
UTF-8 to UTF-32 +/* begin file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */ -simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept { - return scalar::utf16::utf32_length_from_utf16(input, length); -} +namespace simdutf { +namespace arm64 { +namespace { +namespace utf8_to_utf32 { -simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char * input, size_t length) const noexcept { - return scalar::utf8::utf16_length_from_utf8(input, length); -} +using namespace simd; -simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept { - return scalar::utf32::utf8_length_from_utf32(input, length); -} -simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept { - return scalar::utf32::utf16_length_from_utf32(input, length); +simdutf_warn_unused size_t convert_valid(const char* input, size_t size, + char32_t* utf32_output) noexcept { + size_t pos = 0; + char32_t* start{utf32_output}; + const size_t safety_margin = 16; // to avoid overruns! 
+ while(pos + 64 + safety_margin <= size) { + simd8x64 in(reinterpret_cast(input + pos)); + if(in.is_ascii()) { + in.store_ascii_as_utf32(utf32_output); + utf32_output += 64; + pos += 64; + } else { + // -65 is 0b10111111 in two-complement's, so largest possible continuation byte + uint64_t utf8_continuation_mask = in.lt(-65 + 1); + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; + size_t max_starting_point = (pos + 64) - 12; + while(pos < max_starting_point) { + size_t consumed = convert_masked_utf8_to_utf32(input + pos, + utf8_end_of_code_point_mask, utf32_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + } + } + utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos, utf32_output); + return utf32_output - start; } -simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char * input, size_t length) const noexcept { - return scalar::utf8::count_code_points(input, length); -} -} // namespace fallback +} // namespace utf8_to_utf32 +} // unnamed namespace +} // namespace arm64 } // namespace simdutf +/* end file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */ +/* begin file src/generic/utf8_to_utf32/utf8_to_utf32.h */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/fallback/end.h -/* begin file src/simdutf/fallback/end.h */ -/* end file src/simdutf/fallback/end.h */ -/* end file src/fallback/implementation.cpp */ -#endif -#if SIMDUTF_IMPLEMENTATION_ICELAKE -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=icelake/implementation.cpp -/* begin file src/icelake/implementation.cpp */ - - -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/icelake/begin.h -/* begin file src/simdutf/icelake/begin.h */ -// redefining SIMDUTF_IMPLEMENTATION to "icelake" -// #define SIMDUTF_IMPLEMENTATION icelake - -#if 
SIMDUTF_CAN_ALWAYS_RUN_ICELAKE -// nothing needed. -#else -SIMDUTF_TARGET_ICELAKE -#endif -#if SIMDUTF_GCC11ORMORE // workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593 -SIMDUTF_DISABLE_GCC_WARNING(-Wmaybe-uninitialized) -#endif // end of workaround -/* end file src/simdutf/icelake/begin.h */ namespace simdutf { -namespace icelake { +namespace arm64 { namespace { -#ifndef SIMDUTF_ICELAKE_H -#error "icelake.h must be included" -#endif -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=icelake/icelake_utf8_common.inl.cpp -/* begin file src/icelake/icelake_utf8_common.inl.cpp */ -// Common procedures for both validating and non-validating conversions from UTF-8. -enum block_processing_mode { SIMDUTF_FULL, SIMDUTF_TAIL}; - -using utf8_to_utf16_result = std::pair; -using utf8_to_utf32_result = std::pair; - -/* - process_block_utf8_to_utf16 converts up to 64 bytes from 'in' from UTF-8 - to UTF-16. When tail = SIMDUTF_FULL, then the full input buffer (64 bytes) - might be used. When tail = SIMDUTF_TAIL, we take into account 'gap' which - indicates how many input bytes are relevant. +namespace utf8_to_utf32 { +using namespace simd; - Returns true when the result is correct, otherwise it returns false. - The provided in and out pointers are advanced according to how many input - bytes have been processed, upon success. 
-*/ -template -simdutf_really_inline bool process_block_utf8_to_utf16(const char *&in, char16_t *&out, size_t gap) { - // constants - __m512i mask_identity = _mm512_set_epi8(63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - __m512i mask_c0c0c0c0 = _mm512_set1_epi32(0xc0c0c0c0); - __m512i mask_80808080 = _mm512_set1_epi32(0x80808080); - __m512i mask_f0f0f0f0 = _mm512_set1_epi32(0xf0f0f0f0); - __m512i mask_dfdfdfdf_tail = _mm512_set_epi64(0xffffdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf); - __m512i mask_c2c2c2c2 = _mm512_set1_epi32(0xc2c2c2c2); - __m512i mask_ffffffff = _mm512_set1_epi32(0xffffffff); - __m512i mask_d7c0d7c0 = _mm512_set1_epi32(0xd7c0d7c0); - __m512i mask_dc00dc00 = _mm512_set1_epi32(0xdc00dc00); - __m512i byteflip = _mm512_setr_epi64( - 0x0607040502030001, - 0x0e0f0c0d0a0b0809, - 0x0607040502030001, - 0x0e0f0c0d0a0b0809, - 0x0607040502030001, - 0x0e0f0c0d0a0b0809, - 0x0607040502030001, - 0x0e0f0c0d0a0b0809 - ); - // Note that 'tail' is a compile-time constant ! - __mmask64 b = (tail == SIMDUTF_FULL) ? 0xFFFFFFFFFFFFFFFF : (uint64_t(1) << gap) - 1; - __m512i input = (tail == SIMDUTF_FULL) ? _mm512_loadu_si512(in) : _mm512_maskz_loadu_epi8(b, in); - __mmask64 m1 = (tail == SIMDUTF_FULL) ? _mm512_cmplt_epu8_mask(input, mask_80808080) : _mm512_mask_cmplt_epu8_mask(b, input, mask_80808080); - if(_ktestc_mask64_u8(m1, b)) {// NOT(m1) AND b -- if all zeroes, then all ASCII - // alternatively, we could do 'if (m1 == b) { ' - if (tail == SIMDUTF_FULL) { - in += 64; // consumed 64 bytes - // we convert a full 64-byte block, writing 128 bytes. 
- __m512i input1 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(input)); - if(big_endian) { input1 = _mm512_shuffle_epi8(input1, byteflip); } - _mm512_storeu_si512(out, input1); - out += 32; - __m512i input2 = _mm512_cvtepu8_epi16(_mm512_extracti64x4_epi64(input, 1)); - if(big_endian) { input2 = _mm512_shuffle_epi8(input2, byteflip); } - _mm512_storeu_si512(out, input2); - out += 32; - return true; // we are done - } else { - in += gap; - if (gap <= 32) { - __m512i input1 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(input)); - if(big_endian) { input1 = _mm512_shuffle_epi8(input1, byteflip); } - _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << (gap)) - 1), input1); - out += gap; - } else { - __m512i input1 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(input)); - if(big_endian) { input1 = _mm512_shuffle_epi8(input1, byteflip); } - _mm512_storeu_si512(out, input1); - out += 32; - __m512i input2 = _mm512_cvtepu8_epi16(_mm512_extracti64x4_epi64(input, 1)); - if(big_endian) { input2 = _mm512_shuffle_epi8(input2, byteflip); } - _mm512_mask_storeu_epi16(out, __mmask32((uint32_t(1) << (gap - 32)) - 1), input2); - out += gap - 32; - } - return true; // we are done - } - } - // classify characters further - __mmask64 m234 = _mm512_cmp_epu8_mask(mask_c0c0c0c0, input, - _MM_CMPINT_LE); // 0xc0 <= input, 2, 3, or 4 leading byte - __mmask64 m34 = _mm512_cmp_epu8_mask(mask_dfdfdfdf_tail, input, - _MM_CMPINT_LT); // 0xdf < input, 3 or 4 leading byte - - __mmask64 milltwobytes = _mm512_mask_cmp_epu8_mask(m234, input, mask_c2c2c2c2, - _MM_CMPINT_LT); // 0xc0 <= input < 0xc2 (illegal two byte sequence) - // Overlong 2-byte sequence - if (_ktestz_mask64_u8(milltwobytes, milltwobytes) == 0) { - // Overlong 2-byte sequence - return false; - } - if (_ktestz_mask64_u8(m34, m34) == 0) { - // We have a 3-byte sequence and/or a 2-byte sequence, or possibly even a 4-byte sequence! 
- __mmask64 m4 = _mm512_cmp_epu8_mask(input, mask_f0f0f0f0, - _MM_CMPINT_NLT); // 0xf0 <= zmm0 (4 byte start bytes) - - __mmask64 mask_not_ascii = (tail == SIMDUTF_FULL) ? _knot_mask64(m1) : _kand_mask64(_knot_mask64(m1), b); - - __mmask64 mp1 = _kshiftli_mask64(m234, 1); - __mmask64 mp2 = _kshiftli_mask64(m34, 2); - // We could do it as follows... - // if (_kortestz_mask64_u8(m4,m4)) { // compute the bitwise OR of the 64-bit masks a and b and return 1 if all zeroes - // but GCC generates better code when we do: - if (m4 == 0) { // compute the bitwise OR of the 64-bit masks a and b and return 1 if all zeroes - // Fast path with 1,2,3 bytes - __mmask64 mc = _kor_mask64(mp1, mp2); // expected continuation bytes - __mmask64 m1234 = _kor_mask64(m1, m234); - // mismatched continuation bytes: - if (tail == SIMDUTF_FULL) { - __mmask64 xnormcm1234 = _kxnor_mask64(mc, m1234); // XNOR of mc and m1234 should be all zero if they differ - // the presence of a 1 bit indicates that they overlap. - // _kortestz_mask64_u8: compute the bitwise OR of 64-bit masksand return 1 if all zeroes. 
- if (!_kortestz_mask64_u8(xnormcm1234, xnormcm1234)) { return false; } - } else { - __mmask64 bxorm1234 = _kxor_mask64(b, m1234); - if (mc != bxorm1234) { return false; } - } - // mend: identifying the last bytes of each sequence to be decoded - __mmask64 mend = _kshiftri_mask64(m1234, 1); - if (tail != SIMDUTF_FULL) { - mend = _kor_mask64(mend, (uint64_t(1) << (gap - 1))); - } + simdutf_really_inline simd8 check_special_cases(const simd8 input, const simd8 prev1) { +// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) +// Bit 1 = Too Long (ASCII followed by continuation) +// Bit 2 = Overlong 3-byte +// Bit 4 = Surrogate +// Bit 5 = Overlong 2-byte +// Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1<<0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1<<1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1<<2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1<<4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1<<5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1<<7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1<<3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1<<6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1<<6; // 11110000 1000____ + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + TOO_SHORT, + // 1110____ ________ + TOO_SHORT | OVERLONG_3 | SURROGATE, + // 1111____ ________ + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4 + ); + constexpr 
const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . + const simd8 byte_1_low = (prev1 & 0x0F).lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, + CARRY, - __m512i last_and_third = _mm512_maskz_compress_epi8(mend, mask_identity); - __m512i last_and_thirdu16 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(last_and_third)); + // ____0100 ________ + CARRY | TOO_LARGE, + // ____0101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____011_ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, - __m512i nonasciitags = _mm512_maskz_mov_epi8(mask_not_ascii, mask_c0c0c0c0); // ASCII: 00000000 other: 11000000 - __m512i clearedbytes = _mm512_andnot_si512(nonasciitags, input); // high two bits cleared where not ASCII - __m512i lastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, last_and_thirdu16, - clearedbytes); // the last byte of each character + // ____1___ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____1101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000 + ); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, - __mmask64 mask_before_non_ascii = _kshiftri_mask64(mask_not_ascii, 1); // bytes that precede non-ASCII bytes - __m512i indexofsecondlastbytes = _mm512_add_epi16(mask_ffffffff, last_and_thirdu16); // indices of the second last bytes - __m512i beforeasciibytes = _mm512_maskz_mov_epi8(mask_before_non_ascii, clearedbytes); - __m512i secondlastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, indexofsecondlastbytes, - 
beforeasciibytes); // the second last bytes (of two, three byte seq, - // surrogates) - secondlastbytes = _mm512_slli_epi16(secondlastbytes, 6); // shifted into position + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, - __m512i indexofthirdlastbytes = _mm512_add_epi16(mask_ffffffff, - indexofsecondlastbytes); // indices of the second last bytes - __m512i thirdlastbyte = _mm512_maskz_mov_epi8(m34, - clearedbytes); // only those that are the third last byte of a sequece - __m512i thirdlastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, indexofthirdlastbytes, - thirdlastbyte); // the third last bytes (of three byte sequences, hi - // surrogate) - thirdlastbytes = _mm512_slli_epi16(thirdlastbytes, 12); // shifted into position - __m512i Wout = _mm512_ternarylogic_epi32(lastbytes, secondlastbytes, thirdlastbytes, 254); - // the elements of Wout excluding the last element if it happens to be a high surrogate: + // ________ 11______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT + ); + return (byte_1_high & byte_1_low & byte_2_high); + } + simdutf_really_inline simd8 check_multibyte_lengths(const simd8 input, + const simd8 prev_input, const simd8 sc) { + simd8 prev2 = input.prev<2>(prev_input); + simd8 prev3 = input.prev<3>(prev_input); + simd8 must23 = simd8(must_be_2_3_continuation(prev2, prev3)); + simd8 must23_80 = must23 & uint8_t(0x80); + return must23_80 ^ sc; + } - __mmask64 mprocessed = (tail == SIMDUTF_FULL) ? _pdep_u64(0xFFFFFFFF, mend) : _pdep_u64(0xFFFFFFFF, _kand_mask64(mend, b)); // we adjust mend at the end of the output. + struct validating_transcoder { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; - // Encodings out of range... 
- { - // the location of 3-byte sequence start bytes in the input - __mmask64 m3 = m34 & (b ^ m4); - // words in Wout corresponding to 3-byte sequences. - __mmask32 M3 = __mmask32(_pext_u64(m3 << 2, mend)); - __m512i mask_08000800 = _mm512_set1_epi32(0x08000800); - __mmask32 Msmall800 = _mm512_mask_cmplt_epu16_mask(M3, Wout, mask_08000800); - __m512i mask_d800d800 = _mm512_set1_epi32(0xd800d800); - __m512i Moutminusd800 = _mm512_sub_epi16(Wout, mask_d800d800); - __mmask32 M3s = _mm512_mask_cmplt_epu16_mask(M3, Moutminusd800, mask_08000800); - if (_kor_mask32(Msmall800, M3s)) { return false; } - } - int64_t nout = _mm_popcnt_u64(mprocessed); - in += 64 - _lzcnt_u64(mprocessed); - if(big_endian) { Wout = _mm512_shuffle_epi8(Wout, byteflip); } - _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), Wout); - out += nout; - return true; // ok - } + validating_transcoder() : error(uint8_t(0)) {} // - // We have a 4-byte sequence, this is the general case. - // Slow! - __mmask64 mp3 = _kshiftli_mask64(m4, 3); - __mmask64 mc = _kor_mask64(_kor_mask64(mp1, mp2), mp3); // expected continuation bytes - __mmask64 m1234 = _kor_mask64(m1, m234); - - // mend: identifying the last bytes of each sequence to be decoded - __mmask64 mend = _kor_mask64(_kshiftri_mask64(_kor_mask64(mp3, m1234), 1), mp3); - if (tail != SIMDUTF_FULL) { - mend = _kor_mask64(mend, __mmask64(uint64_t(1) << (gap - 1))); + // Check whether the current bytes are valid UTF-8. 
+ // + simdutf_really_inline void check_utf8_bytes(const simd8 input, const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes + // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + simd8 sc = check_special_cases(input, prev1); + this->error |= check_multibyte_lengths(input, prev_input, sc); } - __m512i last_and_third = _mm512_maskz_compress_epi8(mend, mask_identity); - __m512i last_and_thirdu16 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(last_and_third)); - __m512i nonasciitags = _mm512_maskz_mov_epi8(mask_not_ascii, mask_c0c0c0c0); // ASCII: 00000000 other: 11000000 - __m512i clearedbytes = _mm512_andnot_si512(nonasciitags, input); // high two bits cleared where not ASCII - __m512i lastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, last_and_thirdu16, - clearedbytes); // the last byte of each character - __mmask64 mask_before_non_ascii = _kshiftri_mask64(mask_not_ascii, 1); // bytes that precede non-ASCII bytes - __m512i indexofsecondlastbytes = _mm512_add_epi16(mask_ffffffff, last_and_thirdu16); // indices of the second last bytes - __m512i beforeasciibytes = _mm512_maskz_mov_epi8(mask_before_non_ascii, clearedbytes); + + simdutf_really_inline size_t convert(const char* in, size_t size, char32_t* utf32_output) { + size_t pos = 0; + char32_t* start{utf32_output}; + // In the worst case, we have the haswell kernel which can cause an overflow of + // 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes, + // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate + // much more than 8 bytes. However, you cannot generally assume that you have valid + // UTF-8 input, so we are going to go back from the end counting 4 leading bytes, + // to give us a good margin. 
+ size_t leading_byte = 0; + size_t margin = size; + for(; margin > 0 && leading_byte < 4; margin--) { + leading_byte += (int8_t(in[margin-1]) > -65); + } + // If the input is long enough, then we have that margin-1 is the fourth last leading byte. + const size_t safety_margin = size - margin + 1; // to avoid overruns! + while(pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if(input.is_ascii()) { + input.store_ascii_as_utf32(utf32_output); + utf32_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, it is not good enough. + static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if(simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if(simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while(pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. 
+ // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_utf32(in + pos, + utf8_end_of_code_point_mask, utf32_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + if(errors()) { return 0; } + if(pos < size) { + size_t howmany = scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output); + if(howmany == 0) { return 0; } + utf32_output += howmany; + } + return utf32_output - start; + } + + simdutf_really_inline result convert_with_errors(const char* in, size_t size, char32_t* utf32_output) { + size_t pos = 0; + char32_t* start{utf32_output}; + // In the worst case, we have the haswell kernel which can cause an overflow of + // 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes, + // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate + // much more than 8 bytes. However, you cannot generally assume that you have valid + // UTF-8 input, so we are going to go back from the end counting 4 leading bytes, + // to give us a good margin. + size_t leading_byte = 0; + size_t margin = size; + for(; margin > 0 && leading_byte < 4; margin--) { + leading_byte += (int8_t(in[margin-1]) > -65); + } + // If the input is long enough, then we have that margin-1 is the fourth last leading byte. + const size_t safety_margin = size - margin + 1; // to avoid overruns! 
+ while(pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if(input.is_ascii()) { + input.store_ascii_as_utf32(utf32_output); + utf32_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, it is not good enough. + static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if(simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if(simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + if (errors()) { + result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output); + res.count += pos; + return res; + } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while(pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. 
Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_utf32(in + pos, + utf8_end_of_code_point_mask, utf32_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + if(errors()) { + result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output); + res.count += pos; + return res; + } + if(pos < size) { + result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output); + if (res.error) { // In case of error, we want the error position + res.count += pos; + return res; + } else { // In case of success, we want the number of word written + utf32_output += res.count; + } + } + return result(error_code::SUCCESS, utf32_output - start); + } + + simdutf_really_inline bool errors() const { + return this->error.any_bits_set_anywhere(); + } + + }; // struct utf8_checker +} // utf8_to_utf32 namespace +} // unnamed namespace +} // namespace arm64 +} // namespace simdutf +/* end file src/generic/utf8_to_utf32/utf8_to_utf32.h */ +// other functions +/* begin file src/generic/utf8.h */ + +namespace simdutf { +namespace arm64 { +namespace { +namespace utf8 { + +using namespace simd; + +simdutf_really_inline size_t count_code_points(const char* in, size_t size) { + size_t pos = 0; + size_t count = 0; + for(;pos + 64 <= size; pos += 64) { + simd8x64 input(reinterpret_cast(in + pos)); + uint64_t utf8_continuation_mask = input.gt(-65); + count += count_ones(utf8_continuation_mask); + } + return count + scalar::utf8::count_code_points(in + pos, size - pos); +} + +simdutf_really_inline size_t utf16_length_from_utf8(const char* in, size_t 
size) { + size_t pos = 0; + size_t count = 0; + // This algorithm could no doubt be improved! + for(;pos + 64 <= size; pos += 64) { + simd8x64 input(reinterpret_cast(in + pos)); + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + // We count one word for anything that is not a continuation (so + // leading bytes). + count += 64 - count_ones(utf8_continuation_mask); + int64_t utf8_4byte = input.gteq_unsigned(240); + count += count_ones(utf8_4byte); + } + return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos); +} + + +simdutf_really_inline size_t utf32_length_from_utf8(const char* in, size_t size) { + return count_code_points(in, size); +} +} // utf8 namespace +} // unnamed namespace +} // namespace arm64 +} // namespace simdutf +/* end file src/generic/utf8.h */ +/* begin file src/generic/utf16.h */ +namespace simdutf { +namespace arm64 { +namespace { +namespace utf16 { + +template +simdutf_really_inline size_t count_code_points(const char16_t* in, size_t size) { + size_t pos = 0; + size_t count = 0; + for(;pos < size/32*32; pos += 32) { + simd16x32 input(reinterpret_cast(in + pos)); + if (!match_system(big_endian)) { input.swap_bytes(); } + uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF); + count += count_ones(not_pair) / 2; + } + return count + scalar::utf16::count_code_points(in + pos, size - pos); +} + +template +simdutf_really_inline size_t utf8_length_from_utf16(const char16_t* in, size_t size) { + size_t pos = 0; + size_t count = 0; + // This algorithm could no doubt be improved! 
+ for(;pos < size/32*32; pos += 32) { + simd16x32 input(reinterpret_cast(in + pos)); + if (!match_system(big_endian)) { input.swap_bytes(); } + uint64_t ascii_mask = input.lteq(0x7F); + uint64_t twobyte_mask = input.lteq(0x7FF); + uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF); + + size_t ascii_count = count_ones(ascii_mask) / 2; + size_t twobyte_count = count_ones(twobyte_mask & ~ ascii_mask) / 2; + size_t threebyte_count = count_ones(not_pair_mask & ~ twobyte_mask) / 2; + size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2; + count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count + ascii_count; + } + return count + scalar::utf16::utf8_length_from_utf16(in + pos, size - pos); +} + +template +simdutf_really_inline size_t utf32_length_from_utf16(const char16_t* in, size_t size) { + return count_code_points(in, size); +} + +simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t size, char16_t* output) { + size_t pos = 0; + + while (pos < size/32*32) { + simd16x32 input(reinterpret_cast(in + pos)); + input.swap_bytes(); + input.store(reinterpret_cast(output)); + pos += 32; + output += 32; + } + + scalar::utf16::change_endianness_utf16(in + pos, size - pos, output); +} + +} // utf16 +} // unnamed namespace +} // namespace arm64 +} // namespace simdutf +/* end file src/generic/utf16.h */ +// transcoding from UTF-8 to Latin 1 +/* begin file src/generic/utf8_to_latin1/utf8_to_latin1.h */ + + +namespace simdutf { +namespace arm64 { +namespace { +namespace utf8_to_latin1 { +using namespace simd; + + + simdutf_really_inline simd8 check_special_cases(const simd8 input, const simd8 prev1) { +// For UTF-8 to Latin 1, we can allow any ASCII character, and any continuation byte, +// but the non-ASCII leading bytes must be 0b11000011 or 0b11000010 and nothing else. 
+// +// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) +// Bit 1 = Too Long (ASCII followed by continuation) +// Bit 2 = Overlong 3-byte +// Bit 4 = Surrogate +// Bit 5 = Overlong 2-byte +// Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1<<0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1<<1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1<<2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1<<4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1<<5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1<<7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1<<3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1<<6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1<<6; // 11110000 1000____ + constexpr const uint8_t FORBIDDEN = 0xff; + + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + FORBIDDEN, + // 1110____ ________ + FORBIDDEN, + // 1111____ ________ + FORBIDDEN + ); + constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . 
+ const simd8 byte_1_low = (prev1 & 0x0F).lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, + CARRY, + + // ____0100 ________ + FORBIDDEN, + // ____0101 ________ + FORBIDDEN, + // ____011_ ________ + FORBIDDEN, + FORBIDDEN, + + // ____1___ ________ + FORBIDDEN, + FORBIDDEN, + FORBIDDEN, + FORBIDDEN, + FORBIDDEN, + // ____1101 ________ + FORBIDDEN, + FORBIDDEN, + FORBIDDEN + ); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + + // ________ 11______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT + ); + return (byte_1_high & byte_1_low & byte_2_high); + } + + struct validating_transcoder { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; + + validating_transcoder() : error(uint8_t(0)) {} + // + // Check whether the current bytes are valid UTF-8. + // + simdutf_really_inline void check_utf8_bytes(const simd8 input, const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes + // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + this->error |= check_special_cases(input, prev1); + } + + + simdutf_really_inline size_t convert(const char* in, size_t size, char* latin1_output) { + size_t pos = 0; + char* start{latin1_output}; + // In the worst case, we have the haswell kernel which can cause an overflow of + // 8 bytes when calling convert_masked_utf8_to_latin1. 
If you skip the last 16 bytes, + // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate + // much more than 8 bytes. However, you cannot generally assume that you have valid + // UTF-8 input, so we are going to go back from the end counting 8 leading bytes, + // to give us a good margin. + size_t leading_byte = 0; + size_t margin = size; + for(; margin > 0 && leading_byte < 8; margin--) { + leading_byte += (int8_t(in[margin-1]) > -65); //twos complement of -65 is 1011 1111 ... + } + // If the input is long enough, then we have that margin-1 is the eight last leading byte. + const size_t safety_margin = size - margin + 1; // to avoid overruns! + while(pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if(input.is_ascii()) { + input.store((int8_t*)latin1_output); + latin1_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, it is not good enough. + static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if(simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if(simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in this case, we also have ASCII to account for. + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. 
For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while(pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_latin1(in + pos, + utf8_end_of_code_point_mask, latin1_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + if(errors()) { return 0; } + if(pos < size) { + size_t howmany = scalar::utf8_to_latin1::convert(in + pos, size - pos, latin1_output); + if(howmany == 0) { return 0; } + latin1_output += howmany; + } + return latin1_output - start; + } + + simdutf_really_inline result convert_with_errors(const char* in, size_t size, char* latin1_output) { + size_t pos = 0; + char* start{latin1_output}; + // In the worst case, we have the haswell kernel which can cause an overflow of + // 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the last 16 bytes, + // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate + // much more than 8 bytes. However, you cannot generally assume that you have valid + // UTF-8 input, so we are going to go back from the end counting 8 leading bytes, + // to give us a good margin. 
+ size_t leading_byte = 0; + size_t margin = size; + for(; margin > 0 && leading_byte < 8; margin--) { + leading_byte += (int8_t(in[margin-1]) > -65); + } + // If the input is long enough, then we have that margin-1 is the eight last leading byte. + const size_t safety_margin = size - margin + 1; // to avoid overruns! + while(pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if(input.is_ascii()) { + input.store((int8_t*)latin1_output); + latin1_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, it is not good enough. + static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if(simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if(simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + if (errors()) { + // rewind_and_convert_with_errors will seek a potential error from in+pos onward, + // with the ability to go back up to pos bytes, and read size-pos bytes forward. + result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(pos, in + pos, size - pos, latin1_output); + res.count += pos; + return res; + } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. 
+ size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while(pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_latin1(in + pos, + utf8_end_of_code_point_mask, latin1_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + if(errors()) { + // rewind_and_convert_with_errors will seek a potential error from in+pos onward, + // with the ability to go back up to pos bytes, and read size-pos bytes forward. + result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(pos, in + pos, size - pos, latin1_output); + res.count += pos; + return res; + } + if(pos < size) { + // rewind_and_convert_with_errors will seek a potential error from in+pos onward, + // with the ability to go back up to pos bytes, and read size-pos bytes forward. 
+ result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(pos, in + pos, size - pos, latin1_output); + if (res.error) { // In case of error, we want the error position + res.count += pos; + return res; + } else { // In case of success, we want the number of word written + latin1_output += res.count; + } + } + return result(error_code::SUCCESS, latin1_output - start); + } + + simdutf_really_inline bool errors() const { + return this->error.any_bits_set_anywhere(); + } + + }; // struct utf8_checker +} // utf8_to_latin1 namespace +} // unnamed namespace +} // namespace arm64 +} // namespace simdutf +/* end file src/generic/utf8_to_latin1/utf8_to_latin1.h */ +/* begin file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */ + + +namespace simdutf { +namespace arm64 { +namespace { +namespace utf8_to_latin1 { +using namespace simd; + + + simdutf_really_inline size_t convert_valid(const char* in, size_t size, char* latin1_output) { + size_t pos = 0; + char* start{latin1_output}; + // In the worst case, we have the haswell kernel which can cause an overflow of + // 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the last 16 bytes, + // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate + // much more than 8 bytes. However, you cannot generally assume that you have valid + // UTF-8 input, so we are going to go back from the end counting 8 leading bytes, + // to give us a good margin. + size_t leading_byte = 0; + size_t margin = size; + for(; margin > 0 && leading_byte < 8; margin--) { + leading_byte += (int8_t(in[margin-1]) > -65); //twos complement of -65 is 1011 1111 ... + } + // If the input is long enough, then we have that margin-1 is the eight last leading byte. + const size_t safety_margin = size - margin + 1; // to avoid overruns! 
+ while(pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if(input.is_ascii()) { + input.store((int8_t*)latin1_output); + latin1_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, it is not good enough. + uint64_t utf8_continuation_mask = input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in this case, we also have ASCII to account for. + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while(pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_latin1(in + pos, + utf8_end_of_code_point_mask, latin1_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. 
+ } + } + if(pos < size) { + size_t howmany = scalar::utf8_to_latin1::convert(in + pos, size - pos, latin1_output); + if(howmany == 0) { return 0; } + latin1_output += howmany; + } + return latin1_output - start; + } + + }; +} // utf8_to_latin1 namespace +} // unnamed namespace +} // namespace arm64 + // namespace simdutf +/* end file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */ + +// placeholder scalars + +// +// Implementation-specific overrides +// +namespace simdutf { +namespace arm64 { + +simdutf_warn_unused int implementation::detect_encodings(const char * input, size_t length) const noexcept { + // If there is a BOM, then we trust it. + auto bom_encoding = simdutf::BOM::check_bom(input, length); + if(bom_encoding != encoding_type::unspecified) { return bom_encoding; } + if (length % 2 == 0) { + return arm_detect_encodings(input, length); + } else { + if (implementation::validate_utf8(input, length)) { + return simdutf::encoding_type::UTF8; + } else { + return simdutf::encoding_type::unspecified; + } + } +} + +simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept { + return arm64::utf8_validation::generic_validate_utf8(buf,len); +} + +simdutf_warn_unused result implementation::validate_utf8_with_errors(const char *buf, size_t len) const noexcept { + return arm64::utf8_validation::generic_validate_utf8_with_errors(buf,len); +} + +simdutf_warn_unused bool implementation::validate_ascii(const char *buf, size_t len) const noexcept { + return arm64::utf8_validation::generic_validate_ascii(buf,len); +} + +simdutf_warn_unused result implementation::validate_ascii_with_errors(const char *buf, size_t len) const noexcept { + return arm64::utf8_validation::generic_validate_ascii_with_errors(buf,len); +} + +simdutf_warn_unused bool implementation::validate_utf16le(const char16_t *buf, size_t len) const noexcept { + const char16_t* tail = arm_validate_utf16(buf, len); + if (tail) { + return scalar::utf16::validate(tail, 
len - (tail - buf)); + } else { + return false; + } +} + +simdutf_warn_unused bool implementation::validate_utf16be(const char16_t *buf, size_t len) const noexcept { + const char16_t* tail = arm_validate_utf16(buf, len); + if (tail) { + return scalar::utf16::validate(tail, len - (tail - buf)); + } else { + return false; + } +} + +simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept { + result res = arm_validate_utf16_with_errors(buf, len); + if (res.count != len) { + result scalar_res = scalar::utf16::validate_with_errors(buf + res.count, len - res.count); + return result(scalar_res.error, res.count + scalar_res.count); + } else { + return res; + } +} + +simdutf_warn_unused result implementation::validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept { + result res = arm_validate_utf16_with_errors(buf, len); + if (res.count != len) { + result scalar_res = scalar::utf16::validate_with_errors(buf + res.count, len - res.count); + return result(scalar_res.error, res.count + scalar_res.count); + } else { + return res; + } +} + +simdutf_warn_unused bool implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept { + const char32_t* tail = arm_validate_utf32le(buf, len); + if (tail) { + return scalar::utf32::validate(tail, len - (tail - buf)); + } else { + return false; + } +} + +simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept { + result res = arm_validate_utf32le_with_errors(buf, len); + if (res.count != len) { + result scalar_res = scalar::utf32::validate_with_errors(buf + res.count, len - res.count); + return result(scalar_res.error, res.count + scalar_res.count); + } else { + return res; + } +} + +simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(const char * buf, size_t len, char* utf8_output) const noexcept { + std::pair ret = arm_convert_latin1_to_utf8(buf, len, 
utf8_output); + size_t converted_chars = ret.second - utf8_output; + + if (ret.first != buf + len) { + const size_t scalar_converted_chars = scalar::latin1_to_utf8::convert( + ret.first, len - (ret.first - buf), ret.second); + converted_chars += scalar_converted_chars; + } + + return converted_chars; +} + +simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept { + std::pair ret = arm_convert_latin1_to_utf16(buf, len, utf16_output); + size_t converted_chars = ret.second - utf16_output; + if (ret.first != buf + len) { + const size_t scalar_converted_chars = scalar::latin1_to_utf16::convert( + ret.first, len - (ret.first - buf), ret.second); + converted_chars += scalar_converted_chars; + } + return converted_chars; +} + +simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept { + std::pair ret = arm_convert_latin1_to_utf16(buf, len, utf16_output); + size_t converted_chars = ret.second - utf16_output; + if (ret.first != buf + len) { + const size_t scalar_converted_chars = scalar::latin1_to_utf16::convert( + ret.first, len - (ret.first - buf), ret.second); + converted_chars += scalar_converted_chars; + } + return converted_chars; +} + +simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept { + std::pair ret = arm_convert_latin1_to_utf32(buf, len, utf32_output); + size_t converted_chars = ret.second - utf32_output; + if (ret.first != buf + len) { + const size_t scalar_converted_chars = scalar::latin1_to_utf32::convert( + ret.first, len - (ret.first - buf), ret.second); + converted_chars += scalar_converted_chars; + } + return converted_chars; +} + +simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept { + utf8_to_latin1::validating_transcoder converter; + 
return converter.convert(buf, len, latin1_output); +} + +simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(const char* buf, size_t len, char* latin1_output) const noexcept { + utf8_to_latin1::validating_transcoder converter; + return converter.convert_with_errors(buf, len, latin1_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept { + return arm64::utf8_to_latin1::convert_valid(buf,len,latin1_output); +} + +simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept { + utf8_to_utf16::validating_transcoder converter; + return converter.convert(buf, len, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept { + utf8_to_utf16::validating_transcoder converter; + return converter.convert(buf, len, utf16_output); +} + +simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept { + utf8_to_utf16::validating_transcoder converter; + return converter.convert_with_errors(buf, len, utf16_output); +} + +simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept { + utf8_to_utf16::validating_transcoder converter; + return converter.convert_with_errors(buf, len, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(const char* input, size_t size, + char16_t* utf16_output) const noexcept { + return utf8_to_utf16::convert_valid(input, size, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(const char* input, size_t size, + char16_t* utf16_output) const noexcept { + return utf8_to_utf16::convert_valid(input, size, 
utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept { + utf8_to_utf32::validating_transcoder converter; + return converter.convert(buf, len, utf32_output); +} + +simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32_output) const noexcept { + utf8_to_utf32::validating_transcoder converter; + return converter.convert_with_errors(buf, len, utf32_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(const char* input, size_t size, + char32_t* utf32_output) const noexcept { + return utf8_to_utf32::convert_valid(input, size, utf32_output); +} + +simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept { + std::pair ret = arm_convert_utf16_to_latin1(buf, len, latin1_output); + if (ret.first == nullptr) { return 0; } + size_t saved_bytes = ret.second - latin1_output; + + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = scalar::utf16_to_latin1::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { return 0; } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept { + std::pair ret = arm_convert_utf16_to_latin1(buf, len, latin1_output); + if (ret.first == nullptr) { return 0; } + size_t saved_bytes = ret.second - latin1_output; + + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = scalar::utf16_to_latin1::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { return 0; } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused result implementation::convert_utf16le_to_latin1_with_errors(const char16_t* buf, 
size_t len, char* latin1_output) const noexcept { + std::pair ret = arm_convert_utf16_to_latin1_with_errors(buf, len, latin1_output); + if (ret.first.error) { return ret.first; } // Can return directly since scalar fallback already found correct ret.first.count + if (ret.first.count != len) { // All good so far, but not finished + result scalar_res = scalar::utf16_to_latin1::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = ret.second - latin1_output; // Set count to the number of 8-bit code units written + return ret.first; +} + +simdutf_warn_unused result implementation::convert_utf16be_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) const noexcept { + std::pair ret = arm_convert_utf16_to_latin1_with_errors(buf, len, latin1_output); + if (ret.first.error) { return ret.first; } // Can return directly since scalar fallback already found correct ret.first.count + if (ret.first.count != len) { // All good so far, but not finished + result scalar_res = scalar::utf16_to_latin1::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = ret.second - latin1_output; // Set count to the number of 8-bit code units written + return ret.first; +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept { + // optimization opportunity: implement a custom function. 
+ return convert_utf16be_to_latin1(buf, len, latin1_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept { + // optimization opportunity: implement a custom function. + return convert_utf16le_to_latin1(buf, len, latin1_output); +} + +simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { + std::pair ret = arm_convert_utf16_to_utf8(buf, len, utf8_output); + if (ret.first == nullptr) { return 0; } + size_t saved_bytes = ret.second - utf8_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = scalar::utf16_to_utf8::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { return 0; } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { + std::pair ret = arm_convert_utf16_to_utf8(buf, len, utf8_output); + if (ret.first == nullptr) { return 0; } + size_t saved_bytes = ret.second - utf8_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = scalar::utf16_to_utf8::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { return 0; } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of code units written even if finished + std::pair ret = arm_convert_utf16_to_utf8_with_errors(buf, len, utf8_output); + if (ret.first.error) { return ret.first; } // Can return directly since scalar fallback already found correct ret.first.count + if (ret.first.count != len) { // All good so far, but not finished + 
result scalar_res = scalar::utf16_to_utf8::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = ret.second - utf8_output; // Set count to the number of 8-bit code units written + return ret.first; +} + +simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of code units written even if finished + std::pair ret = arm_convert_utf16_to_utf8_with_errors(buf, len, utf8_output); + if (ret.first.error) { return ret.first; } // Can return directly since scalar fallback already found correct ret.first.count + if (ret.first.count != len) { // All good so far, but not finished + result scalar_res = scalar::utf16_to_utf8::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = ret.second - utf8_output; // Set count to the number of 8-bit code units written + return ret.first; +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { + return convert_utf16le_to_utf8(buf, len, utf8_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { + return convert_utf16be_to_utf8(buf, len, utf8_output); +} + +simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept { + std::pair ret = arm_convert_utf32_to_utf8(buf, len, utf8_output); + if (ret.first == nullptr) { return 0; } + size_t saved_bytes = 
ret.second - utf8_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { return 0; } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of code units written even if finished + std::pair ret = arm_convert_utf32_to_utf8_with_errors(buf, len, utf8_output); + if (ret.first.count != len) { + result scalar_res = scalar::utf32_to_utf8::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = ret.second - utf8_output; // Set count to the number of 8-bit code units written + return ret.first; +} + +simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { + std::pair ret = arm_convert_utf16_to_utf32(buf, len, utf32_output); + if (ret.first == nullptr) { return 0; } + size_t saved_bytes = ret.second - utf32_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { return 0; } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { + std::pair ret = arm_convert_utf16_to_utf32(buf, len, utf32_output); + if (ret.first == nullptr) { return 0; } + size_t saved_bytes = ret.second - utf32_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = 
scalar::utf16_to_utf32::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { return 0; } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of code units written even if finished + std::pair ret = arm_convert_utf16_to_utf32_with_errors(buf, len, utf32_output); + if (ret.first.error) { return ret.first; } // Can return directly since scalar fallback already found correct ret.first.count + if (ret.first.count != len) { // All good so far, but not finished + result scalar_res = scalar::utf16_to_utf32::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = ret.second - utf32_output; // Set count to the number of 8-bit code units written + return ret.first; +} + +simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of code units written even if finished + std::pair ret = arm_convert_utf16_to_utf32_with_errors(buf, len, utf32_output); + if (ret.first.error) { return ret.first; } // Can return directly since scalar fallback already found correct ret.first.count + if (ret.first.count != len) { // All good so far, but not finished + result scalar_res = scalar::utf16_to_utf32::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = ret.second - utf32_output; 
// Set count to the number of 8-bit code units written + return ret.first; +} + +simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept { + std::pair ret = arm_convert_utf32_to_latin1(buf, len, latin1_output); + if (ret.first == nullptr) { return 0; } + size_t saved_bytes = ret.second - latin1_output; + + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = scalar::utf32_to_latin1::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { return 0; } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(const char32_t* buf, size_t len, char* latin1_output) const noexcept { + std::pair ret = arm_convert_utf32_to_latin1_with_errors(buf, len, latin1_output); + if (ret.first.error) { return ret.first; } // Can return directly since scalar fallback already found correct ret.first.count + if (ret.first.count != len) { // All good so far, but not finished + result scalar_res = scalar::utf32_to_latin1::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = ret.second - latin1_output; // Set count to the number of 8-bit code units written + return ret.first; +} + +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept { + // optimization opportunity: implement a custom function. + return convert_utf32_to_latin1(buf,len,latin1_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept { + // optimization opportunity: implement a custom function. 
+ return convert_utf32_to_utf8(buf, len, utf8_output); +} + +simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { + std::pair ret = arm_convert_utf32_to_utf16(buf, len, utf16_output); + if (ret.first == nullptr) { return 0; } + size_t saved_bytes = ret.second - utf16_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { return 0; } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { + std::pair ret = arm_convert_utf32_to_utf16(buf, len, utf16_output); + if (ret.first == nullptr) { return 0; } + size_t saved_bytes = ret.second - utf16_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { return 0; } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of code units written even if finished + std::pair ret = arm_convert_utf32_to_utf16_with_errors(buf, len, utf16_output); + if (ret.first.count != len) { + result scalar_res = scalar::utf32_to_utf16::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = ret.second - utf16_output; // Set count to the number of 8-bit code units written + return ret.first; +} + +simdutf_warn_unused 
result implementation::convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of code units written even if finished + std::pair ret = arm_convert_utf32_to_utf16_with_errors(buf, len, utf16_output); + if (ret.first.count != len) { + result scalar_res = scalar::utf32_to_utf16::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = ret.second - utf16_output; // Set count to the number of 8-bit code units written + return ret.first; +} + +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { + return convert_utf32_to_utf16le(buf, len, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { + return convert_utf32_to_utf16be(buf, len, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { + return convert_utf16le_to_utf32(buf, len, utf32_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { + return convert_utf16be_to_utf32(buf, len, utf32_output); +} + +void implementation::change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) const noexcept { + utf16::change_endianness_utf16(input, length, output); +} + +simdutf_warn_unused size_t implementation::count_utf16le(const char16_t * input, size_t length) const noexcept { + return utf16::count_code_points(input, length); +} + +simdutf_warn_unused size_t 
implementation::count_utf16be(const char16_t * input, size_t length) const noexcept { + return utf16::count_code_points(input, length); +} + +simdutf_warn_unused size_t implementation::count_utf8(const char * input, size_t length) const noexcept { + return utf8::count_code_points(input, length); +} + +simdutf_warn_unused size_t implementation::latin1_length_from_utf8(const char* buf, size_t len) const noexcept { + return count_utf8(buf,len); +} + +simdutf_warn_unused size_t implementation::latin1_length_from_utf16(size_t length) const noexcept { + return scalar::utf16::latin1_length_from_utf16(length); +} + +simdutf_warn_unused size_t implementation::latin1_length_from_utf32(size_t length) const noexcept { + return scalar::utf32::latin1_length_from_utf32(length); +} + +simdutf_warn_unused size_t implementation::utf8_length_from_latin1(const char * input, size_t length) const noexcept { + // See https://lemire.me/blog/2023/05/15/computing-the-utf-8-size-of-a-latin-1-string-quickly-arm-neon-edition/ + // credit to Pete Cawley + const uint8_t *data = reinterpret_cast(input); + uint64_t result = 0; + const int lanes = sizeof(uint8x16_t); + uint8_t rem = length % lanes; + const uint8_t *simd_end = data + (length / lanes) * lanes; + const uint8x16_t threshold = vdupq_n_u8(0x80); + for (; data < simd_end; data += lanes) { + // load 16 bytes + uint8x16_t input_vec = vld1q_u8(data); + // compare to threshold (0x80) + uint8x16_t withhighbit = vcgeq_u8(input_vec, threshold); + // vertical addition + result -= vaddvq_s8(vreinterpretq_s8_u8(withhighbit)); + } + // scalar tail + for (uint8_t j = 0; j < rem; j++) { + result += (simd_end[j] >> 7); + } + return result + length; +} + +simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept { + return utf16::utf8_length_from_utf16(input, length); +} + +simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t * input, size_t length) const 
noexcept { + return utf16::utf8_length_from_utf16(input, length); +} + + +simdutf_warn_unused size_t implementation::utf16_length_from_latin1(size_t length) const noexcept { + return scalar::latin1::utf16_length_from_latin1(length); +} + + +simdutf_warn_unused size_t implementation::utf32_length_from_latin1(size_t length) const noexcept { + return scalar::latin1::utf32_length_from_latin1(length); +} + + + +simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept { + return utf16::utf32_length_from_utf16(input, length); +} + +simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept { + return utf16::utf32_length_from_utf16(input, length); +} + +simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char * input, size_t length) const noexcept { + return utf8::utf16_length_from_utf8(input, length); +} + +simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept { + const uint32x4_t v_7f = vmovq_n_u32((uint32_t)0x7f); + const uint32x4_t v_7ff = vmovq_n_u32((uint32_t)0x7ff); + const uint32x4_t v_ffff = vmovq_n_u32((uint32_t)0xffff); + const uint32x4_t v_1 = vmovq_n_u32((uint32_t)0x1); + size_t pos = 0; + size_t count = 0; + for(;pos + 4 <= length; pos += 4) { + uint32x4_t in = vld1q_u32(reinterpret_cast(input + pos)); + const uint32x4_t ascii_bytes_bytemask = vcleq_u32(in, v_7f); + const uint32x4_t one_two_bytes_bytemask = vcleq_u32(in, v_7ff); + const uint32x4_t two_bytes_bytemask = veorq_u32(one_two_bytes_bytemask, ascii_bytes_bytemask); + const uint32x4_t three_bytes_bytemask = veorq_u32(vcleq_u32(in, v_ffff), one_two_bytes_bytemask); + + const uint16x8_t reduced_ascii_bytes_bytemask = vreinterpretq_u16_u32(vandq_u32(ascii_bytes_bytemask, v_1)); + const uint16x8_t reduced_two_bytes_bytemask = vreinterpretq_u16_u32(vandq_u32(two_bytes_bytemask, v_1)); + const 
uint16x8_t reduced_three_bytes_bytemask = vreinterpretq_u16_u32(vandq_u32(three_bytes_bytemask, v_1)); + + const uint16x8_t compressed_bytemask0 = vpaddq_u16(reduced_ascii_bytes_bytemask, reduced_two_bytes_bytemask); + const uint16x8_t compressed_bytemask1 = vpaddq_u16(reduced_three_bytes_bytemask, reduced_three_bytes_bytemask); + + size_t ascii_count = count_ones(vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask0), 0)); + size_t two_bytes_count = count_ones(vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask0), 1)); + size_t three_bytes_count = count_ones(vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask1), 0)); + + count += 16 - 3*ascii_count - 2*two_bytes_count - three_bytes_count; + } + return count + scalar::utf32::utf8_length_from_utf32(input + pos, length - pos); +} + +simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept { + const uint32x4_t v_ffff = vmovq_n_u32((uint32_t)0xffff); + const uint32x4_t v_1 = vmovq_n_u32((uint32_t)0x1); + size_t pos = 0; + size_t count = 0; + for(;pos + 4 <= length; pos += 4) { + uint32x4_t in = vld1q_u32(reinterpret_cast(input + pos)); + const uint32x4_t surrogate_bytemask = vcgtq_u32(in, v_ffff); + const uint16x8_t reduced_bytemask = vreinterpretq_u16_u32(vandq_u32(surrogate_bytemask, v_1)); + const uint16x8_t compressed_bytemask = vpaddq_u16(reduced_bytemask, reduced_bytemask); + size_t surrogate_count = count_ones(vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask), 0)); + count += 4 + surrogate_count; + } + return count + scalar::utf32::utf16_length_from_utf32(input + pos, length - pos); +} + +simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char * input, size_t length) const noexcept { + return utf8::utf32_length_from_utf8(input, length); +} + +} // namespace arm64 +} // namespace simdutf + +/* begin file src/simdutf/arm64/end.h */ +/* end file src/simdutf/arm64/end.h */ +/* end file 
src/arm64/implementation.cpp */ +#endif +#if SIMDUTF_IMPLEMENTATION_FALLBACK +/* begin file src/fallback/implementation.cpp */ +/* begin file src/simdutf/fallback/begin.h */ +// redefining SIMDUTF_IMPLEMENTATION to "fallback" +// #define SIMDUTF_IMPLEMENTATION fallback +/* end file src/simdutf/fallback/begin.h */ + + + + + + + + + +namespace simdutf { +namespace fallback { + +simdutf_warn_unused int implementation::detect_encodings(const char * input, size_t length) const noexcept { + // If there is a BOM, then we trust it. + auto bom_encoding = simdutf::BOM::check_bom(input, length); + if(bom_encoding != encoding_type::unspecified) { return bom_encoding; } + int out = 0; + if(validate_utf8(input, length)) { out |= encoding_type::UTF8; } + if((length % 2) == 0) { + if(validate_utf16le(reinterpret_cast(input), length/2)) { out |= encoding_type::UTF16_LE; } + } + if((length % 4) == 0) { + if(validate_utf32(reinterpret_cast(input), length/4)) { out |= encoding_type::UTF32_LE; } + } + + return out; +} + +simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept { + return scalar::utf8::validate(buf, len); +} + +simdutf_warn_unused result implementation::validate_utf8_with_errors(const char *buf, size_t len) const noexcept { + return scalar::utf8::validate_with_errors(buf, len); +} + +simdutf_warn_unused bool implementation::validate_ascii(const char *buf, size_t len) const noexcept { + return scalar::ascii::validate(buf, len); +} + +simdutf_warn_unused result implementation::validate_ascii_with_errors(const char *buf, size_t len) const noexcept { + return scalar::ascii::validate_with_errors(buf, len); +} + +simdutf_warn_unused bool implementation::validate_utf16le(const char16_t *buf, size_t len) const noexcept { + return scalar::utf16::validate(buf, len); +} + +simdutf_warn_unused bool implementation::validate_utf16be(const char16_t *buf, size_t len) const noexcept { + return scalar::utf16::validate(buf, len); +} + 
+simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept { + return scalar::utf16::validate_with_errors(buf, len); +} + +simdutf_warn_unused result implementation::validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept { + return scalar::utf16::validate_with_errors(buf, len); +} + +simdutf_warn_unused bool implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept { + return scalar::utf32::validate(buf, len); +} + +simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept { + return scalar::utf32::validate_with_errors(buf, len); +} + +simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(const char * buf, size_t len, char* utf8_output) const noexcept { + return scalar::latin1_to_utf8::convert(buf,len,utf8_output); +} + +simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept { + return scalar::latin1_to_utf16::convert(buf, len, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept { + return scalar::latin1_to_utf16::convert(buf, len, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept { + return scalar::latin1_to_utf32::convert(buf,len,utf32_output); +} + +simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept { + return scalar::utf8_to_latin1::convert(buf, len, latin1_output); +} + +simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(const char* buf, size_t len, char* latin1_output) const noexcept { + return scalar::utf8_to_latin1::convert_with_errors(buf, len, latin1_output); +} + +simdutf_warn_unused 
size_t implementation::convert_valid_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept { + return scalar::utf8_to_latin1::convert_valid(buf, len, latin1_output); +} + +simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept { + return scalar::utf8_to_utf16::convert(buf, len, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept { + return scalar::utf8_to_utf16::convert(buf, len, utf16_output); +} + +simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept { + return scalar::utf8_to_utf16::convert_with_errors(buf, len, utf16_output); +} + +simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept { + return scalar::utf8_to_utf16::convert_with_errors(buf, len, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept { + return scalar::utf8_to_utf16::convert_valid(buf, len, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept { + return scalar::utf8_to_utf16::convert_valid(buf, len, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept { + return scalar::utf8_to_utf32::convert(buf, len, utf32_output); +} + +simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32_output) const noexcept { + return scalar::utf8_to_utf32::convert_with_errors(buf, len, utf32_output); +} + +simdutf_warn_unused size_t 
implementation::convert_valid_utf8_to_utf32(const char* input, size_t size, + char32_t* utf32_output) const noexcept { + return scalar::utf8_to_utf32::convert_valid(input, size, utf32_output); +} + +simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept { + return scalar::utf16_to_latin1::convert(buf, len, latin1_output); +} + +simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept { + return scalar::utf16_to_latin1::convert(buf, len, latin1_output); +} + +simdutf_warn_unused result implementation::convert_utf16le_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) const noexcept { + return scalar::utf16_to_latin1::convert_with_errors(buf, len, latin1_output); +} + +simdutf_warn_unused result implementation::convert_utf16be_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) const noexcept { + return scalar::utf16_to_latin1::convert_with_errors(buf, len, latin1_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept { + return scalar::utf16_to_latin1::convert_valid(buf, len, latin1_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept { + return scalar::utf16_to_latin1::convert_valid(buf, len, latin1_output); +} + +simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { + return scalar::utf16_to_utf8::convert(buf, len, utf8_output); +} + +simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { + return scalar::utf16_to_utf8::convert(buf, len, utf8_output); +} + +simdutf_warn_unused result 
implementation::convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept { + return scalar::utf16_to_utf8::convert_with_errors(buf, len, utf8_output); +} + +simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept { + return scalar::utf16_to_utf8::convert_with_errors(buf, len, utf8_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { + return scalar::utf16_to_utf8::convert_valid(buf, len, utf8_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { + return scalar::utf16_to_utf8::convert_valid(buf, len, utf8_output); +} + +simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept { + return scalar::utf32_to_latin1::convert(buf, len, latin1_output); +} + +simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(const char32_t* buf, size_t len, char* latin1_output) const noexcept { + return scalar::utf32_to_latin1::convert_with_errors(buf, len, latin1_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept { + return scalar::utf32_to_latin1::convert_valid(buf, len, latin1_output); +} + +simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept { + return scalar::utf32_to_utf8::convert(buf, len, utf8_output); +} + +simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept { + return scalar::utf32_to_utf8::convert_with_errors(buf, len, utf8_output); +} + +simdutf_warn_unused size_t 
implementation::convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept { + return scalar::utf32_to_utf8::convert_valid(buf, len, utf8_output); +} + +simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { + return scalar::utf32_to_utf16::convert(buf, len, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { + return scalar::utf32_to_utf16::convert(buf, len, utf16_output); +} + +simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { + return scalar::utf32_to_utf16::convert_with_errors(buf, len, utf16_output); +} + +simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { + return scalar::utf32_to_utf16::convert_with_errors(buf, len, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { + return scalar::utf32_to_utf16::convert_valid(buf, len, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { + return scalar::utf32_to_utf16::convert_valid(buf, len, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { + return scalar::utf16_to_utf32::convert(buf, len, utf32_output); +} + +simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { + return scalar::utf16_to_utf32::convert(buf, len, utf32_output); +} + +simdutf_warn_unused result 
implementation::convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { + return scalar::utf16_to_utf32::convert_with_errors(buf, len, utf32_output); +} + +simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { + return scalar::utf16_to_utf32::convert_with_errors(buf, len, utf32_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { + return scalar::utf16_to_utf32::convert_valid(buf, len, utf32_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { + return scalar::utf16_to_utf32::convert_valid(buf, len, utf32_output); +} + +void implementation::change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) const noexcept { + scalar::utf16::change_endianness_utf16(input, length, output); +} + +simdutf_warn_unused size_t implementation::count_utf16le(const char16_t * input, size_t length) const noexcept { + return scalar::utf16::count_code_points(input, length); +} + +simdutf_warn_unused size_t implementation::count_utf16be(const char16_t * input, size_t length) const noexcept { + return scalar::utf16::count_code_points(input, length); +} + +simdutf_warn_unused size_t implementation::count_utf8(const char * input, size_t length) const noexcept { + return scalar::utf8::count_code_points(input, length); +} + +simdutf_warn_unused size_t implementation::latin1_length_from_utf8(const char* buf, size_t len) const noexcept { + return scalar::utf8::latin1_length_from_utf8(buf,len); +} + +simdutf_warn_unused size_t implementation::latin1_length_from_utf16(size_t length) const noexcept { + return scalar::utf16::latin1_length_from_utf16(length); +} + +simdutf_warn_unused size_t 
implementation::latin1_length_from_utf32(size_t length) const noexcept { + return length; +} + +simdutf_warn_unused size_t implementation::utf8_length_from_latin1(const char * input, size_t length) const noexcept { + return scalar::latin1::utf8_length_from_latin1(input,length); +} + +simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept { + return scalar::utf16::utf8_length_from_utf16(input, length); +} + +simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept { + return scalar::utf16::utf8_length_from_utf16(input, length); +} + +simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept { + return scalar::utf16::utf32_length_from_utf16(input, length); +} + +simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept { + return scalar::utf16::utf32_length_from_utf16(input, length); +} + +simdutf_warn_unused size_t implementation::utf16_length_from_latin1(size_t length) const noexcept { + return scalar::latin1::utf16_length_from_latin1(length); +} + +simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char * input, size_t length) const noexcept { + return scalar::utf8::utf16_length_from_utf8(input, length); +} + +simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept { + return scalar::utf32::utf8_length_from_utf32(input, length); +} + +simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept { + return scalar::utf32::utf16_length_from_utf32(input, length); +} + +simdutf_warn_unused size_t implementation::utf32_length_from_latin1(size_t length) const noexcept { + return scalar::latin1::utf32_length_from_latin1(length); +} + +simdutf_warn_unused size_t 
implementation::utf32_length_from_utf8(const char * input, size_t length) const noexcept { + return scalar::utf8::count_code_points(input, length); +} + +} // namespace fallback +} // namespace simdutf + +/* begin file src/simdutf/fallback/end.h */ +/* end file src/simdutf/fallback/end.h */ +/* end file src/fallback/implementation.cpp */ +#endif +#if SIMDUTF_IMPLEMENTATION_ICELAKE +/* begin file src/icelake/implementation.cpp */ + + +/* begin file src/simdutf/icelake/begin.h */ +// redefining SIMDUTF_IMPLEMENTATION to "icelake" +// #define SIMDUTF_IMPLEMENTATION icelake + +#if SIMDUTF_CAN_ALWAYS_RUN_ICELAKE +// nothing needed. +#else +SIMDUTF_TARGET_ICELAKE +#endif + +#if SIMDUTF_GCC11ORMORE // workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593 +SIMDUTF_DISABLE_GCC_WARNING(-Wmaybe-uninitialized) +#endif // end of workaround +/* end file src/simdutf/icelake/begin.h */ +namespace simdutf { +namespace icelake { +namespace { +#ifndef SIMDUTF_ICELAKE_H +#error "icelake.h must be included" +#endif +/* begin file src/icelake/icelake_utf8_common.inl.cpp */ +// Common procedures for both validating and non-validating conversions from UTF-8. +enum block_processing_mode { SIMDUTF_FULL, SIMDUTF_TAIL}; + +using utf8_to_utf16_result = std::pair; +using utf8_to_utf32_result = std::pair; + +/* + process_block_utf8_to_utf16 converts up to 64 bytes from 'in' from UTF-8 + to UTF-16. When tail = SIMDUTF_FULL, then the full input buffer (64 bytes) + might be used. When tail = SIMDUTF_TAIL, we take into account 'gap' which + indicates how many input bytes are relevant. + + Returns true when the result is correct, otherwise it returns false. + + The provided in and out pointers are advanced according to how many input + bytes have been processed, upon success. 
+*/ +template +simdutf_really_inline bool process_block_utf8_to_utf16(const char *&in, char16_t *&out, size_t gap) { + // constants + __m512i mask_identity = _mm512_set_epi8(63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + __m512i mask_c0c0c0c0 = _mm512_set1_epi32(0xc0c0c0c0); + __m512i mask_80808080 = _mm512_set1_epi32(0x80808080); + __m512i mask_f0f0f0f0 = _mm512_set1_epi32(0xf0f0f0f0); + __m512i mask_dfdfdfdf_tail = _mm512_set_epi64(0xffffdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf); + __m512i mask_c2c2c2c2 = _mm512_set1_epi32(0xc2c2c2c2); + __m512i mask_ffffffff = _mm512_set1_epi32(0xffffffff); + __m512i mask_d7c0d7c0 = _mm512_set1_epi32(0xd7c0d7c0); + __m512i mask_dc00dc00 = _mm512_set1_epi32(0xdc00dc00); + __m512i byteflip = _mm512_setr_epi64( + 0x0607040502030001, + 0x0e0f0c0d0a0b0809, + 0x0607040502030001, + 0x0e0f0c0d0a0b0809, + 0x0607040502030001, + 0x0e0f0c0d0a0b0809, + 0x0607040502030001, + 0x0e0f0c0d0a0b0809 + ); + // Note that 'tail' is a compile-time constant ! + __mmask64 b = (tail == SIMDUTF_FULL) ? 0xFFFFFFFFFFFFFFFF : (uint64_t(1) << gap) - 1; + __m512i input = (tail == SIMDUTF_FULL) ? _mm512_loadu_si512(in) : _mm512_maskz_loadu_epi8(b, in); + __mmask64 m1 = (tail == SIMDUTF_FULL) ? _mm512_cmplt_epu8_mask(input, mask_80808080) : _mm512_mask_cmplt_epu8_mask(b, input, mask_80808080); + if(_ktestc_mask64_u8(m1, b)) {// NOT(m1) AND b -- if all zeroes, then all ASCII + // alternatively, we could do 'if (m1 == b) { ' + if (tail == SIMDUTF_FULL) { + in += 64; // consumed 64 bytes + // we convert a full 64-byte block, writing 128 bytes. 
+ __m512i input1 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(input)); + if(big_endian) { input1 = _mm512_shuffle_epi8(input1, byteflip); } + _mm512_storeu_si512(out, input1); + out += 32; + __m512i input2 = _mm512_cvtepu8_epi16(_mm512_extracti64x4_epi64(input, 1)); + if(big_endian) { input2 = _mm512_shuffle_epi8(input2, byteflip); } + _mm512_storeu_si512(out, input2); + out += 32; + return true; // we are done + } else { + in += gap; + if (gap <= 32) { + __m512i input1 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(input)); + if(big_endian) { input1 = _mm512_shuffle_epi8(input1, byteflip); } + _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << (gap)) - 1), input1); + out += gap; + } else { + __m512i input1 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(input)); + if(big_endian) { input1 = _mm512_shuffle_epi8(input1, byteflip); } + _mm512_storeu_si512(out, input1); + out += 32; + __m512i input2 = _mm512_cvtepu8_epi16(_mm512_extracti64x4_epi64(input, 1)); + if(big_endian) { input2 = _mm512_shuffle_epi8(input2, byteflip); } + _mm512_mask_storeu_epi16(out, __mmask32((uint32_t(1) << (gap - 32)) - 1), input2); + out += gap - 32; + } + return true; // we are done + } + } + // classify characters further + __mmask64 m234 = _mm512_cmp_epu8_mask(mask_c0c0c0c0, input, + _MM_CMPINT_LE); // 0xc0 <= input, 2, 3, or 4 leading byte + __mmask64 m34 = _mm512_cmp_epu8_mask(mask_dfdfdfdf_tail, input, + _MM_CMPINT_LT); // 0xdf < input, 3 or 4 leading byte + + __mmask64 milltwobytes = _mm512_mask_cmp_epu8_mask(m234, input, mask_c2c2c2c2, + _MM_CMPINT_LT); // 0xc0 <= input < 0xc2 (illegal two byte sequence) + // Overlong 2-byte sequence + if (_ktestz_mask64_u8(milltwobytes, milltwobytes) == 0) { + // Overlong 2-byte sequence + return false; + } + if (_ktestz_mask64_u8(m34, m34) == 0) { + // We have a 3-byte sequence and/or a 2-byte sequence, or possibly even a 4-byte sequence! 
+ __mmask64 m4 = _mm512_cmp_epu8_mask(input, mask_f0f0f0f0, + _MM_CMPINT_NLT); // 0xf0 <= zmm0 (4 byte start bytes) + + __mmask64 mask_not_ascii = (tail == SIMDUTF_FULL) ? _knot_mask64(m1) : _kand_mask64(_knot_mask64(m1), b); + + __mmask64 mp1 = _kshiftli_mask64(m234, 1); + __mmask64 mp2 = _kshiftli_mask64(m34, 2); + // We could do it as follows... + // if (_kortestz_mask64_u8(m4,m4)) { // compute the bitwise OR of the 64-bit masks a and b and return 1 if all zeroes + // but GCC generates better code when we do: + if (m4 == 0) { // compute the bitwise OR of the 64-bit masks a and b and return 1 if all zeroes + // Fast path with 1,2,3 bytes + __mmask64 mc = _kor_mask64(mp1, mp2); // expected continuation bytes + __mmask64 m1234 = _kor_mask64(m1, m234); + // mismatched continuation bytes: + if (tail == SIMDUTF_FULL) { + __mmask64 xnormcm1234 = _kxnor_mask64(mc, m1234); // XNOR of mc and m1234 should be all zero if they differ + // the presence of a 1 bit indicates that they overlap. + // _kortestz_mask64_u8: compute the bitwise OR of 64-bit masksand return 1 if all zeroes. 
+ if (!_kortestz_mask64_u8(xnormcm1234, xnormcm1234)) { return false; } + } else { + __mmask64 bxorm1234 = _kxor_mask64(b, m1234); + if (mc != bxorm1234) { return false; } + } + // mend: identifying the last bytes of each sequence to be decoded + __mmask64 mend = _kshiftri_mask64(m1234, 1); + if (tail != SIMDUTF_FULL) { + mend = _kor_mask64(mend, (uint64_t(1) << (gap - 1))); + } + + + __m512i last_and_third = _mm512_maskz_compress_epi8(mend, mask_identity); + __m512i last_and_thirdu16 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(last_and_third)); + + __m512i nonasciitags = _mm512_maskz_mov_epi8(mask_not_ascii, mask_c0c0c0c0); // ASCII: 00000000 other: 11000000 + __m512i clearedbytes = _mm512_andnot_si512(nonasciitags, input); // high two bits cleared where not ASCII + __m512i lastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, last_and_thirdu16, + clearedbytes); // the last byte of each character + + __mmask64 mask_before_non_ascii = _kshiftri_mask64(mask_not_ascii, 1); // bytes that precede non-ASCII bytes + __m512i indexofsecondlastbytes = _mm512_add_epi16(mask_ffffffff, last_and_thirdu16); // indices of the second last bytes + __m512i beforeasciibytes = _mm512_maskz_mov_epi8(mask_before_non_ascii, clearedbytes); + __m512i secondlastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, indexofsecondlastbytes, + beforeasciibytes); // the second last bytes (of two, three byte seq, + // surrogates) + secondlastbytes = _mm512_slli_epi16(secondlastbytes, 6); // shifted into position + + __m512i indexofthirdlastbytes = _mm512_add_epi16(mask_ffffffff, + indexofsecondlastbytes); // indices of the second last bytes + __m512i thirdlastbyte = _mm512_maskz_mov_epi8(m34, + clearedbytes); // only those that are the third last byte of a sequece + __m512i thirdlastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, indexofthirdlastbytes, + thirdlastbyte); // the third last bytes (of three byte sequences, hi + // surrogate) + thirdlastbytes = 
_mm512_slli_epi16(thirdlastbytes, 12); // shifted into position + __m512i Wout = _mm512_ternarylogic_epi32(lastbytes, secondlastbytes, thirdlastbytes, 254); + // the elements of Wout excluding the last element if it happens to be a high surrogate: + + __mmask64 mprocessed = (tail == SIMDUTF_FULL) ? _pdep_u64(0xFFFFFFFF, mend) : _pdep_u64(0xFFFFFFFF, _kand_mask64(mend, b)); // we adjust mend at the end of the output. + + + // Encodings out of range... + { + // the location of 3-byte sequence start bytes in the input + __mmask64 m3 = m34 & (b ^ m4); + // code units in Wout corresponding to 3-byte sequences. + __mmask32 M3 = __mmask32(_pext_u64(m3 << 2, mend)); + __m512i mask_08000800 = _mm512_set1_epi32(0x08000800); + __mmask32 Msmall800 = _mm512_mask_cmplt_epu16_mask(M3, Wout, mask_08000800); + __m512i mask_d800d800 = _mm512_set1_epi32(0xd800d800); + __m512i Moutminusd800 = _mm512_sub_epi16(Wout, mask_d800d800); + __mmask32 M3s = _mm512_mask_cmplt_epu16_mask(M3, Moutminusd800, mask_08000800); + if (_kor_mask32(Msmall800, M3s)) { return false; } + } + int64_t nout = _mm_popcnt_u64(mprocessed); + in += 64 - _lzcnt_u64(mprocessed); + if(big_endian) { Wout = _mm512_shuffle_epi8(Wout, byteflip); } + _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), Wout); + out += nout; + return true; // ok + } + // + // We have a 4-byte sequence, this is the general case. + // Slow! 
+ __mmask64 mp3 = _kshiftli_mask64(m4, 3); + __mmask64 mc = _kor_mask64(_kor_mask64(mp1, mp2), mp3); // expected continuation bytes + __mmask64 m1234 = _kor_mask64(m1, m234); + + // mend: identifying the last bytes of each sequence to be decoded + __mmask64 mend = _kor_mask64(_kshiftri_mask64(_kor_mask64(mp3, m1234), 1), mp3); + if (tail != SIMDUTF_FULL) { + mend = _kor_mask64(mend, __mmask64(uint64_t(1) << (gap - 1))); + } + __m512i last_and_third = _mm512_maskz_compress_epi8(mend, mask_identity); + __m512i last_and_thirdu16 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(last_and_third)); + + __m512i nonasciitags = _mm512_maskz_mov_epi8(mask_not_ascii, mask_c0c0c0c0); // ASCII: 00000000 other: 11000000 + __m512i clearedbytes = _mm512_andnot_si512(nonasciitags, input); // high two bits cleared where not ASCII + __m512i lastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, last_and_thirdu16, + clearedbytes); // the last byte of each character + + __mmask64 mask_before_non_ascii = _kshiftri_mask64(mask_not_ascii, 1); // bytes that precede non-ASCII bytes + __m512i indexofsecondlastbytes = _mm512_add_epi16(mask_ffffffff, last_and_thirdu16); // indices of the second last bytes + __m512i beforeasciibytes = _mm512_maskz_mov_epi8(mask_before_non_ascii, clearedbytes); __m512i secondlastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, indexofsecondlastbytes, beforeasciibytes); // the second last bytes (of two, three byte seq, // surrogates) @@ -16025,7 +18064,7 @@ simdutf_really_inline bool process_block_utf8_to_utf16(const char *&in, char16_t { // the location of 3-byte sequence start bytes in the input __mmask64 m3 = m34 & (b ^ m4); - // words in Wout corresponding to 3-byte sequences. + // code units in Wout corresponding to 3-byte sequences. 
__mmask32 M3 = __mmask32(_pext_u64(m3 << 2, mend)); __m512i mask_08000800 = _mm512_set1_epi32(0x08000800); __mmask32 Msmall800 = _mm512_mask_cmplt_epu16_mask(M3, Wout, mask_08000800); @@ -16075,9 +18114,9 @@ simdutf_really_inline bool process_block_utf8_to_utf16(const char *&in, char16_t in += 64 - _lzcnt_u64(_pdep_u64(0xFFFFFFFF, continuation_or_ascii)); } __m512i lead = _mm512_maskz_compress_epi8(leading, leading2byte); // will contain zero for ascii, and the data - lead = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(lead)); // ... zero extended into words + lead = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(lead)); // ... zero extended into code units __m512i follow = _mm512_maskz_compress_epi8(continuation_or_ascii, input); // the last bytes of each sequence - follow = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(follow)); // ... zero extended into words + follow = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(follow)); // ... zero extended into code units lead = _mm512_slli_epi16(lead, 6); // shifted into position __m512i final = _mm512_add_epi16(follow, lead); // combining lead and follow @@ -16100,13 +18139,13 @@ simdutf_really_inline bool process_block_utf8_to_utf16(const char *&in, char16_t /* - utf32_to_utf16_masked converts `count` lower UTF-32 words + utf32_to_utf16_masked converts `count` lower UTF-32 code units from input `utf32` into UTF-16. It differs from utf32_to_utf16 in that it 'masks' the writes. - Returns how many 16-bit words were stored. + Returns how many 16-bit code units were stored. 
- byteflip is used for flipping 16-bit words, and it should be + byteflip is used for flipping 16-bit code units, and it should be __m512i byteflip = _mm512_setr_epi64( 0x0607040502030001, 0x0e0f0c0d0a0b0809, @@ -16139,7 +18178,7 @@ simdutf_really_inline size_t utf32_to_utf16_masked(const __m512i byteflip, __m51 } { - // build surrogate pair words in 32-bit lanes + // build surrogate pair code units in 32-bit lanes // t0 = 8 x [000000000000aaaa|aaaaaabbbbbbbbbb] const __m512i v_0001_0000 = _mm512_set1_epi32(0x00010000); @@ -16160,7 +18199,7 @@ simdutf_really_inline size_t utf32_to_utf16_masked(const __m512i byteflip, __m51 const __m512i t3 = _mm512_ternarylogic_epi32(t2, v_fc00_fc00, v_d800_dc00, 0xba); const __m512i t4 = _mm512_mask_blend_epi32(sp_mask, utf32, t3); __m512i t5 = _mm512_ror_epi32(t4, 16); - // Here we want to trim all of the upper 16-bit words from the 2-byte + // Here we want to trim all of the upper 16-bit code units from the 2-byte // characters represented as 4-byte values. We can compute it from // sp_mask or the following... It can be more optimized! const __mmask32 nonzero = _kor_mask32(0xaaaaaaaa,_mm512_cmpneq_epi16_mask(t5, _mm512_setzero_si512())); @@ -16176,12 +18215,12 @@ simdutf_really_inline size_t utf32_to_utf16_masked(const __m512i byteflip, __m51 } /* - utf32_to_utf16 converts `count` lower UTF-32 words + utf32_to_utf16 converts `count` lower UTF-32 code units from input `utf32` into UTF-16. It may overflow. - Returns how many 16-bit words were stored. + Returns how many 16-bit code units were stored. 
- byteflip is used for flipping 16-bit words, and it should be + byteflip is used for flipping 16-bit code units, and it should be __m512i byteflip = _mm512_setr_epi64( 0x0607040502030001, 0x0e0f0c0d0a0b0809, @@ -16212,7 +18251,7 @@ simdutf_really_inline size_t utf32_to_utf16(const __m512i byteflip, __m512i utf3 } { - // build surrogate pair words in 32-bit lanes + // build surrogate pair code units in 32-bit lanes // t0 = 8 x [000000000000aaaa|aaaaaabbbbbbbbbb] const __m512i v_0001_0000 = _mm512_set1_epi32(0x00010000); @@ -16299,8 +18338,8 @@ __m512i rotate_by_N_epi8(const __m512i input) { simdutf_really_inline __m512i expanded_utf8_to_utf32(__m512i char_class, __m512i utf8) { /* Input: - - utf8: bytes stored at separate 32-bit words - - valid: which words have valid UTF-8 characters + - utf8: bytes stored at separate 32-bit code units + - valid: which code units have valid UTF-8 characters Bit layout of single word. We show 4 cases for each possible UTF-8 character encoding. The `?` denotes bits we must not @@ -16448,7 +18487,6 @@ simdutf_really_inline __m512i expand_utf8_to_utf32(__m512i input) { return expanded_utf8_to_utf32(char_class, input); } /* end file src/icelake/icelake_utf8_common.inl.cpp */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=icelake/icelake_macros.inl.cpp /* begin file src/icelake/icelake_macros.inl.cpp */ /* @@ -16584,7 +18622,6 @@ simdutf_really_inline __m512i expand_utf8_to_utf32(__m512i input) { } \ } /* end file src/icelake/icelake_macros.inl.cpp */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=icelake/icelake_from_valid_utf8.inl.cpp /* begin file src/icelake/icelake_from_valid_utf8.inl.cpp */ // file included directly @@ -16723,7 +18760,6 @@ std::pair valid_utf8_to_fixed_length(const char* str, size using utf8_to_utf16_result = std::pair; /* end file src/icelake/icelake_from_valid_utf8.inl.cpp */ -// dofile: invoked with 
prepath=/Users/dlemire/CVS/github/simdutf/src, filename=icelake/icelake_utf8_validation.inl.cpp /* begin file src/icelake/icelake_utf8_validation.inl.cpp */ // file included directly @@ -16853,14 +18889,13 @@ simdutf_really_inline __m512i check_special_cases(__m512i input, const __m512i p }; // struct avx512_utf8_checker /* end file src/icelake/icelake_utf8_validation.inl.cpp */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=icelake/icelake_from_utf8.inl.cpp /* begin file src/icelake/icelake_from_utf8.inl.cpp */ // file included directly // File contains conversion procedure from possibly invalid UTF-8 strings. /** - * Attempts to convert up to len 1-byte words from in (in UTF-8 format) to + * Attempts to convert up to len 1-byte code units from in (in UTF-8 format) to * out. * Returns the position of the input and output after the processing is * completed. Upon error, the output is set to null. @@ -17028,17 +19063,422 @@ std::pair validating_utf8_to_fixed_length(const char* str, return {ptr, output}; } -// Like validating_utf8_to_fixed_length but returns as soon as an error is identified -template -std::tuple validating_utf8_to_fixed_length_with_constant_checks(const char* str, size_t len, OUTPUT* dwords) { - constexpr bool UTF32 = std::is_same::value; - constexpr bool UTF16 = std::is_same::value; - static_assert(UTF32 or UTF16, "output type has to be uint32_t (for UTF-32) or char16_t (for UTF-16)"); - static_assert(!(UTF32 and big_endian), "we do not currently support big-endian UTF-32"); - - const char* ptr = str; - const char* end = ptr + len; - __m512i byteflip = _mm512_setr_epi64( +// Like validating_utf8_to_fixed_length but returns as soon as an error is identified +template +std::tuple validating_utf8_to_fixed_length_with_constant_checks(const char* str, size_t len, OUTPUT* dwords) { + constexpr bool UTF32 = std::is_same::value; + constexpr bool UTF16 = std::is_same::value; + static_assert(UTF32 or UTF16, "output type has 
to be uint32_t (for UTF-32) or char16_t (for UTF-16)"); + static_assert(!(UTF32 and big_endian), "we do not currently support big-endian UTF-32"); + + const char* ptr = str; + const char* end = ptr + len; + __m512i byteflip = _mm512_setr_epi64( + 0x0607040502030001, + 0x0e0f0c0d0a0b0809, + 0x0607040502030001, + 0x0e0f0c0d0a0b0809, + 0x0607040502030001, + 0x0e0f0c0d0a0b0809, + 0x0607040502030001, + 0x0e0f0c0d0a0b0809 + ); + OUTPUT* output = dwords; + avx512_utf8_checker checker{}; + /** + * In the main loop, we consume 64 bytes per iteration, + * but we access 64 + 4 bytes. + * We check for ptr + 64 + 64 <= end because + * we want to be do maskless writes without overruns. + */ + while (ptr + 64 + 64 <= end) { + const __m512i utf8 = _mm512_loadu_si512((const __m512i*)ptr); + if(checker.check_next_input(utf8)) { + SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output) + output += 64; + ptr += 64; + continue; + } + if(checker.errors()) { + return {ptr, output, false}; // We found an error. + } + const __m512i lane0 = broadcast_epi128<0>(utf8); + const __m512i lane1 = broadcast_epi128<1>(utf8); + int valid_count0; + __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0); + const __m512i lane2 = broadcast_epi128<2>(utf8); + int valid_count1; + __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1); + if(valid_count0 + valid_count1 <= 16) { + vec0 = _mm512_mask_expand_epi32(vec0, __mmask16(((1<(utf8); + int valid_count2; + __m512i vec2 = expand_and_identify(lane2, lane3, valid_count2); + uint32_t tmp1; + ::memcpy(&tmp1, ptr + 64, sizeof(tmp1)); + const __m512i lane4 = _mm512_set1_epi32(tmp1); + int valid_count3; + __m512i vec3 = expand_and_identify(lane3, lane4, valid_count3); + if(valid_count2 + valid_count3 <= 16) { + vec2 = _mm512_mask_expand_epi32(vec2, __mmask16(((1<(utf8); + const __m512i lane1 = broadcast_epi128<1>(utf8); + int valid_count0; + __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0); + const __m512i lane2 = 
broadcast_epi128<2>(utf8); + int valid_count1; + __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1); + if(valid_count0 + valid_count1 <= 16) { + vec0 = _mm512_mask_expand_epi32(vec0, __mmask16(((1<(utf8); + SIMDUTF_ICELAKE_TRANSCODE16(lane2, lane3, true) + + ptr += 3*16; + } + validatedptr += 4*16; + } + { + const __m512i utf8 = _mm512_maskz_loadu_epi8((1ULL<<(end - validatedptr))-1, (const __m512i*)validatedptr); + checker.check_next_input(utf8); + } + checker.check_eof(); + if(checker.errors()) { + return {ptr, output, false}; // We found an error. + } + return {ptr, output, true}; +} +/* end file src/icelake/icelake_from_utf8.inl.cpp */ +/* begin file src/icelake/icelake_convert_utf8_to_latin1.inl.cpp */ +// file included directly + +// File contains conversion procedure from possibly invalid UTF-8 strings. + +// template +template +simdutf_really_inline size_t process_block_from_utf8_to_latin1(const char *buf, size_t len, + char *latin_output, __m512i minus64, + __m512i one, + __mmask64 *next_leading_ptr, + __mmask64 *next_bit6_ptr) { + __mmask64 load_mask = + is_remaining ? _bzhi_u64(~0ULL, (unsigned int)len) : ~0ULL; + __m512i input = _mm512_maskz_loadu_epi8(load_mask, (__m512i *)buf); + __mmask64 nonascii = _mm512_movepi8_mask(input); + + if (nonascii == 0) { + is_remaining + ? 
_mm512_mask_storeu_epi8((__m512i *)latin_output, load_mask, input) + : _mm512_storeu_si512((__m512i *)latin_output, input); + return len; + } + + __mmask64 leading = _mm512_cmpge_epu8_mask(input, minus64); + + __m512i highbits = _mm512_xor_si512(input, _mm512_set1_epi8(-62)); + __mmask64 invalid_leading_bytes = + _mm512_mask_cmpgt_epu8_mask(leading, highbits, one); + + if (invalid_leading_bytes) { + return 0; // Indicates error + } + + __mmask64 leading_shift = (leading << 1) | *next_leading_ptr; + *next_leading_ptr = leading >> 63; + + if ((nonascii ^ leading) != leading_shift) { + return 0; // Indicates error + } + + __mmask64 bit6 = _mm512_cmpeq_epi8_mask(highbits, one); + input = + _mm512_mask_sub_epi8(input, (bit6 << 1) | *next_bit6_ptr, input, minus64); + *next_bit6_ptr = bit6 >> 63; + + __mmask64 retain = ~leading & load_mask; + __m512i output = _mm512_maskz_compress_epi8(retain, input); + int64_t written_out = count_ones(retain); + __mmask64 store_mask = (1ULL << written_out) - 1; + + // *************************** + // Possible optimization? (Nick Nuon) + // This commented out line is 5% faster but sadly it'll also write past + // memory bounds for latin1_output: is_remaining ? + // _mm512_mask_storeu_epi8((__m512i *)latin_output, store_mask, output) : + // _mm512_storeu_si512((__m512i *)latin_output, output); I tried using + // _mm512_storeu_si512 and have the next process_block start from the + // "written_out" point but the compiler shuffles memory in such a way that it + // is signifcantly slower... + // **************************** + _mm512_mask_storeu_epi8((__m512i *)latin_output, store_mask, output); + + return written_out; +} + +size_t utf8_to_latin1_avx512(const char *buf, size_t len, char *latin_output) { + char *start = latin_output; + size_t pos = 0; + __m512i minus64 = _mm512_set1_epi8(-64); // 11111111111 ... 
1100 0000 + __m512i one = _mm512_set1_epi8(1); + __mmask64 next_leading = 0; + __mmask64 next_bit6 = 0; + + while (pos + 64 <= len) { + size_t written = process_block_from_utf8_to_latin1(buf + pos, 64, latin_output, minus64, + one, &next_leading, &next_bit6); + if (written == 0) { + return 0; // Indicates error + } + latin_output += written; + pos += 64; + } + + if (pos < len) { + size_t remaining = len - pos; + size_t written = + process_block_from_utf8_to_latin1(buf + pos, remaining, latin_output, minus64, one, + &next_leading, &next_bit6); + if (written == 0) { + return 0; // Indicates error + } + latin_output += written; + } + + return (size_t)(latin_output - start); +} +/* end file src/icelake/icelake_convert_utf8_to_latin1.inl.cpp */ +/* begin file src/icelake/icelake_convert_valid_utf8_to_latin1.inl.cpp */ +// file included directly + +// File contains conversion procedure from valid UTF-8 strings. + +template +simdutf_really_inline size_t process_valid_block_from_utf8_to_latin1(const char *buf, size_t len, + char *latin_output, + __m512i minus64, __m512i one, + __mmask64 *next_leading_ptr, + __mmask64 *next_bit6_ptr) { + __mmask64 load_mask = + is_remaining ? _bzhi_u64(~0ULL, (unsigned int)len) : ~0ULL; + __m512i input = _mm512_maskz_loadu_epi8(load_mask, (__m512i *)buf); + __mmask64 nonascii = _mm512_movepi8_mask(input); + + if (nonascii == 0) { + is_remaining + ? 
_mm512_mask_storeu_epi8((__m512i *)latin_output, load_mask, input) + : _mm512_storeu_si512((__m512i *)latin_output, input); + return len; + } + + __mmask64 leading = _mm512_cmpge_epu8_mask(input, minus64); + + __m512i highbits = _mm512_xor_si512(input, _mm512_set1_epi8(-62)); + + *next_leading_ptr = leading >> 63; + + __mmask64 bit6 = _mm512_cmpeq_epi8_mask(highbits, one); + input = + _mm512_mask_sub_epi8(input, (bit6 << 1) | *next_bit6_ptr, input, minus64); + *next_bit6_ptr = bit6 >> 63; + + __mmask64 retain = ~leading & load_mask; + __m512i output = _mm512_maskz_compress_epi8(retain, input); + int64_t written_out = count_ones(retain); + __mmask64 store_mask = (1ULL << written_out) - 1; + // Optimization opportunity: sometimes, masked writes are not needed. + _mm512_mask_storeu_epi8((__m512i *)latin_output, store_mask, output); + return written_out; +} + +size_t valid_utf8_to_latin1_avx512(const char *buf, size_t len, + char *latin_output) { + char *start = latin_output; + size_t pos = 0; + __m512i minus64 = _mm512_set1_epi8(-64); // 11111111111 ... 
1100 0000 + __m512i one = _mm512_set1_epi8(1); + __mmask64 next_leading = 0; + __mmask64 next_bit6 = 0; + + while (pos + 64 <= len) { + size_t written = process_valid_block_from_utf8_to_latin1( + buf + pos, 64, latin_output, minus64, one, &next_leading, &next_bit6); + latin_output += written; + pos += 64; + } + + if (pos < len) { + size_t remaining = len - pos; + size_t written = + process_valid_block_from_utf8_to_latin1(buf + pos, remaining, latin_output, minus64, + one, &next_leading, &next_bit6); + latin_output += written; + } + + return (size_t)(latin_output - start); +} +/* end file src/icelake/icelake_convert_valid_utf8_to_latin1.inl.cpp */ +/* begin file src/icelake/icelake_convert_utf16_to_latin1.inl.cpp */ +// file included directly +template +size_t icelake_convert_utf16_to_latin1(const char16_t *buf, size_t len, + char *latin1_output) { + const char16_t *end = buf + len; + __m512i v_0xFF = _mm512_set1_epi16(0xff); + __m512i byteflip = _mm512_setr_epi64(0x0607040502030001, 0x0e0f0c0d0a0b0809, + 0x0607040502030001, 0x0e0f0c0d0a0b0809, + 0x0607040502030001, 0x0e0f0c0d0a0b0809, + 0x0607040502030001, 0x0e0f0c0d0a0b0809); + __m512i shufmask = _mm512_set_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, + 36, 34, 32, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0); + while (buf + 32 <= end) { + __m512i in = _mm512_loadu_si512((__m512i *)buf); + if (big_endian) { + in = _mm512_shuffle_epi8(in, byteflip); + } + if (_mm512_cmpgt_epu16_mask(in, v_0xFF)) { + return 0; + } + _mm256_storeu_si256( + (__m256i *)latin1_output, + _mm512_castsi512_si256(_mm512_permutexvar_epi8(shufmask, in))); + latin1_output += 32; + buf += 32; + } + if (buf < end) { + uint32_t mask(uint32_t(1 << (end - buf)) - 1); + __m512i in = _mm512_maskz_loadu_epi16(mask, buf); + if (big_endian) { + in = _mm512_shuffle_epi8(in, byteflip); + } + if (_mm512_cmpgt_epu16_mask(in, 
v_0xFF)) { + return 0; + } + _mm256_mask_storeu_epi8( + latin1_output, mask, + _mm512_castsi512_si256(_mm512_permutexvar_epi8(shufmask, in))); + } + return len; +} + +template +std::pair +icelake_convert_utf16_to_latin1_with_errors(const char16_t *buf, size_t len, + char *latin1_output) { + const char16_t *end = buf + len; + const char16_t *start = buf; + __m512i byteflip = _mm512_setr_epi64(0x0607040502030001, 0x0e0f0c0d0a0b0809, + 0x0607040502030001, 0x0e0f0c0d0a0b0809, + 0x0607040502030001, 0x0e0f0c0d0a0b0809, + 0x0607040502030001, 0x0e0f0c0d0a0b0809); + __m512i v_0xFF = _mm512_set1_epi16(0xff); + __m512i shufmask = _mm512_set_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, + 36, 34, 32, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0); + while (buf + 32 <= end) { + __m512i in = _mm512_loadu_si512((__m512i *)buf); + if (big_endian) { + in = _mm512_shuffle_epi8(in, byteflip); + } + if (_mm512_cmpgt_epu16_mask(in, v_0xFF)) { + uint16_t word; + while ((word = (big_endian ? scalar::utf16::swap_bytes(uint16_t(*buf)) + : uint16_t(*buf))) <= 0xff) { + *latin1_output++ = uint8_t(word); + buf++; + } + return std::make_pair(result(error_code::TOO_LARGE, buf - start), + latin1_output); + } + _mm256_storeu_si256( + (__m256i *)latin1_output, + _mm512_castsi512_si256(_mm512_permutexvar_epi8(shufmask, in))); + latin1_output += 32; + buf += 32; + } + if (buf < end) { + uint32_t mask(uint32_t(1 << (end - buf)) - 1); + __m512i in = _mm512_maskz_loadu_epi16(mask, buf); + if (big_endian) { + in = _mm512_shuffle_epi8(in, byteflip); + } + if (_mm512_cmpgt_epu16_mask(in, v_0xFF)) { + + uint16_t word; + while ((word = (big_endian ? 
scalar::utf16::swap_bytes(uint16_t(*buf)) + : uint16_t(*buf))) <= 0xff) { + *latin1_output++ = uint8_t(word); + buf++; + } + return std::make_pair(result(error_code::TOO_LARGE, buf - start), + latin1_output); + } + _mm256_mask_storeu_epi8( + latin1_output, mask, + _mm512_castsi512_si256(_mm512_permutexvar_epi8(shufmask, in))); + } + return std::make_pair(result(error_code::SUCCESS, len), latin1_output); +} +/* end file src/icelake/icelake_convert_utf16_to_latin1.inl.cpp */ +/* begin file src/icelake/icelake_convert_utf16_to_utf8.inl.cpp */ +// file included directly + +/** + * This function converts the input (inbuf, inlen), assumed to be valid + * UTF16 (little endian) into UTF-8 (to outbuf). The number of code units written + * is written to 'outlen' and the function reports the number of input word + * consumed. + */ +template +size_t utf16_to_utf8_avx512i(const char16_t *inbuf, size_t inlen, + unsigned char *outbuf, size_t *outlen) { + __m512i in; + __mmask32 inmask = _cvtu32_mask32(0x7fffffff); + __m512i byteflip = _mm512_setr_epi64( 0x0607040502030001, 0x0e0f0c0d0a0b0809, 0x0607040502030001, @@ -17048,115 +19488,181 @@ std::tuple validating_utf8_to_fixed_length_with_cons 0x0607040502030001, 0x0e0f0c0d0a0b0809 ); - OUTPUT* output = dwords; - avx512_utf8_checker checker{}; - /** - * In the main loop, we consume 64 bytes per iteration, - * but we access 64 + 4 bytes. - * We check for ptr + 64 + 64 <= end because - * we want to be do maskless writes without overruns. - */ - while (ptr + 64 + 64 <= end) { - const __m512i utf8 = _mm512_loadu_si512((const __m512i*)ptr); - if(checker.check_next_input(utf8)) { - SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output) - output += 64; - ptr += 64; - continue; - } - if(checker.errors()) { - return {ptr, output, false}; // We found an error. 
- } - const __m512i lane0 = broadcast_epi128<0>(utf8); - const __m512i lane1 = broadcast_epi128<1>(utf8); - int valid_count0; - __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0); - const __m512i lane2 = broadcast_epi128<2>(utf8); - int valid_count1; - __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1); - if(valid_count0 + valid_count1 <= 16) { - vec0 = _mm512_mask_expand_epi32(vec0, __mmask16(((1<(utf8); - int valid_count2; - __m512i vec2 = expand_and_identify(lane2, lane3, valid_count2); - uint32_t tmp1; - ::memcpy(&tmp1, ptr + 64, sizeof(tmp1)); - const __m512i lane4 = _mm512_set1_epi32(tmp1); - int valid_count3; - __m512i vec3 = expand_and_identify(lane3, lane4, valid_count3); - if(valid_count2 + valid_count3 <= 16) { - vec2 = _mm512_mask_expand_epi32(vec2, __mmask16(((1<= 32) { + in = _mm512_loadu_si512(inbuf); + if(big_endian) { in = _mm512_shuffle_epi8(in, byteflip); } + inlen -= 31; + lastiteration: + inbuf += 31; + + failiteration: + const __mmask32 is234byte = _mm512_mask_cmp_epu16_mask( + inmask, in, _mm512_set1_epi16(0x0080), _MM_CMPINT_NLT); + + if (_ktestz_mask32_u8(inmask, is234byte)) { + // fast path for ASCII only + _mm512_mask_cvtepi16_storeu_epi8(outbuf, inmask, in); + outbuf += 31; + carry = 0; + + if (inlen < 32) { + goto tail; + } else { + continue; + } } - const char* validatedptr = ptr; // validated up to ptr - // For the final pass, we validate 64 bytes, but we only transcode - // 3*16 bytes, so we may end up double-validating 16 bytes. - if (ptr + 64 <= end) { - const __m512i utf8 = _mm512_loadu_si512((const __m512i*)ptr); - if(checker.check_next_input(utf8)) { - SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output) - output += 64; - ptr += 64; - } else if(checker.errors()) { - return {ptr, output, false}; // We found an error. 
- } else { - const __m512i lane0 = broadcast_epi128<0>(utf8); - const __m512i lane1 = broadcast_epi128<1>(utf8); - int valid_count0; - __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0); - const __m512i lane2 = broadcast_epi128<2>(utf8); - int valid_count1; - __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1); - if(valid_count0 + valid_count1 <= 16) { - vec0 = _mm512_mask_expand_epi32(vec0, __mmask16(((1<> 1); + inlen = _tzcnt_u32(hinolo | lonohi); + inmask = __mmask32(0x7fffffff & ((1 << inlen) - 1)); + in = _mm512_maskz_mov_epi16(inmask, in); + adjust = (int)inlen - 31; + inlen = 0; + goto failiteration; + } + } + + hi = _mm512_maskz_mov_epi32(_cvtu32_mask16(0x7fff),hi); + carry = carryout; + + __m512i mslo = + _mm512_multishift_epi64_epi8(_mm512_set1_epi64(0x20262c3200060c12), lo); + + __m512i mshi = + _mm512_multishift_epi64_epi8(_mm512_set1_epi64(0x20262c3200060c12), hi); + + const __mmask32 outmask = __mmask32(_kandn_mask64(losurr, inmask)); + const __mmask64 outmhi = _kshiftri_mask64(outmask, 16); + + const __mmask32 is1byte = __mmask32(_knot_mask64(is234byte)); + const __mmask64 is1bhi = _kshiftri_mask64(is1byte, 16); + const __mmask64 is12bhi = _kshiftri_mask64(is12byte, 16); + + taglo = + _mm512_mask_mov_epi32(taglo, __mmask16(is12byte), _mm512_set1_epi32(0x80c00000)); + taghi = + _mm512_mask_mov_epi32(taghi, __mmask16(is12bhi), _mm512_set1_epi32(0x80c00000)); + __m512i magiclo = _mm512_mask_blend_epi32(__mmask16(outmask), _mm512_set1_epi32(0xffffffff), + _mm512_set1_epi32(0x00010101)); + __m512i magichi = _mm512_mask_blend_epi32(__mmask16(outmhi), _mm512_set1_epi32(0xffffffff), + _mm512_set1_epi32(0x00010101)); + + + magiclo = _mm512_mask_blend_epi32(__mmask16(outmask), _mm512_set1_epi32(0xffffffff), + _mm512_set1_epi32(0x00010101)); + magichi = _mm512_mask_blend_epi32(__mmask16(outmhi), _mm512_set1_epi32(0xffffffff), + _mm512_set1_epi32(0x00010101)); + + mslo = _mm512_ternarylogic_epi32(mslo, _mm512_set1_epi32(0x3f3f3f3f), taglo, 
+ 0xea); // A&B|C + mshi = _mm512_ternarylogic_epi32(mshi, _mm512_set1_epi32(0x3f3f3f3f), taghi, + 0xea); + mslo = _mm512_mask_slli_epi32(mslo, __mmask16(is1byte), lo, 24); + + mshi = _mm512_mask_slli_epi32(mshi, __mmask16(is1bhi), hi, 24); + + const __mmask64 wantlo = _mm512_cmp_epu8_mask(mslo, magiclo, _MM_CMPINT_NLT); + const __mmask64 wanthi = _mm512_cmp_epu8_mask(mshi, magichi, _MM_CMPINT_NLT); + const __m512i outlo = _mm512_maskz_compress_epi8(wantlo, mslo); + const __m512i outhi = _mm512_maskz_compress_epi8(wanthi, mshi); + const uint64_t wantlo_uint64 = _cvtmask64_u64(wantlo); + const uint64_t wanthi_uint64 = _cvtmask64_u64(wanthi); - const __m512i lane3 = broadcast_epi128<3>(utf8); - SIMDUTF_ICELAKE_TRANSCODE16(lane2, lane3, true) + uint64_t advlo = _mm_popcnt_u64(wantlo_uint64); + uint64_t advhi = _mm_popcnt_u64(wanthi_uint64); - ptr += 3*16; - } - validatedptr += 4*16; - } - { - const __m512i utf8 = _mm512_maskz_loadu_epi8((1ULL<<(end - validatedptr))-1, (const __m512i*)validatedptr); - checker.check_next_input(utf8); - } - checker.check_eof(); - if(checker.errors()) { - return {ptr, output, false}; // We found an error. - } - return {ptr, output, true}; + _mm512_mask_storeu_epi8(outbuf, _cvtu64_mask64(_pext_u64(wantlo_uint64, wantlo_uint64)), outlo); + _mm512_mask_storeu_epi8(outbuf + advlo, _cvtu64_mask64(_pext_u64(wanthi_uint64, wanthi_uint64)), outhi); + outbuf += advlo + advhi; + } + outbuf -= adjust; + +tail: + if (inlen != 0) { + // We must have inlen < 31. 
+ inmask = _cvtu32_mask32((1 << inlen) - 1); + in = _mm512_maskz_loadu_epi16(inmask, inbuf); + if(big_endian) { in = _mm512_shuffle_epi8(in, byteflip); } + adjust = inlen - 31; + inlen = 0; + goto lastiteration; + } + *outlen = (outbuf - outbuf_orig) + adjust; + return ((inbuf - inbuf_orig) + adjust); } -/* end file src/icelake/icelake_from_utf8.inl.cpp */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=icelake/icelake_convert_utf16_to_utf32.inl.cpp +/* end file src/icelake/icelake_convert_utf16_to_utf8.inl.cpp */ /* begin file src/icelake/icelake_convert_utf16_to_utf32.inl.cpp */ // file included directly @@ -17181,7 +19687,7 @@ std::tuple convert_utf16_to_utf32(const char16 0x0607040502030001, 0x0e0f0c0d0a0b0809 ); - while (buf + 32 <= end) { + while (std::distance(buf,end) >= 32) { // Always safe because buf + 32 <= end so that end - buf >= 32 bytes: __m512i in = _mm512_loadu_si512((__m512i*)buf); if(big_endian) { in = _mm512_shuffle_epi8(in, byteflip); } @@ -17203,7 +19709,7 @@ std::tuple convert_utf16_to_utf32(const char16 |1101.11aa.aaaa.aaaa|1101.10bb.bbbb.bbbb| low surrogate high surrogate */ - /* 1. Expand all words to 32-bit words + /* 1. Expand all code units to 32-bit code units in |0000.0000.0000.0000.1101.11aa.aaaa.aaaa|0000.0000.0000.0000.1101.10bb.bbbb.bbbb| */ const __m512i first = _mm512_cvtepu16_epi32(_mm512_castsi512_si256(in)); @@ -17234,7 +19740,7 @@ std::tuple convert_utf16_to_utf32(const char16 const __m512i added_second = _mm512_mask_add_epi32(aligned_second, (__mmask16)(H>>16), aligned_second, shifted_second); const __m512i utf32_second = _mm512_mask_add_epi32(added_second, (__mmask16)(H>>16), added_second, constant); - // 5. Store all valid UTF-32 words (low surrogate positions and 32nd word are invalid) + // 5. 
Store all valid UTF-32 code units (low surrogate positions and 32nd word are invalid) const __mmask32 valid = ~L & 0x7fffffff; // We deliberately do a _mm512_maskz_compress_epi32 followed by storeu_epi32 // to ease performance portability to Zen 4. @@ -17248,7 +19754,7 @@ std::tuple convert_utf16_to_utf32(const char16 //_mm512_storeu_epi32((__m512i *) utf32_output, compressed_second); _mm512_mask_storeu_epi32((__m512i *) utf32_output, __mmask16((1<> 30) & 0x1; } else { @@ -17257,7 +19763,7 @@ std::tuple convert_utf16_to_utf32(const char16 } } else { // no surrogates - // extend all thirty-two 16-bit words to thirty-two 32-bit words + // extend all thirty-two 16-bit code units to thirty-two 32-bit code units _mm512_storeu_si512((__m512i *)(utf32_output), _mm512_cvtepu16_epi32(_mm512_castsi512_si256(in))); _mm512_storeu_si512((__m512i *)(utf32_output) + 1, _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(in,1))); utf32_output += 32; @@ -17268,7 +19774,80 @@ std::tuple convert_utf16_to_utf32(const char16 return std::make_tuple(buf+carry, utf32_output, true); } /* end file src/icelake/icelake_convert_utf16_to_utf32.inl.cpp */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=icelake/icelake_convert_utf32_to_utf8.inl.cpp +/* begin file src/icelake/icelake_convert_utf32_to_latin1.inl.cpp */ +// file included directly +size_t icelake_convert_utf32_to_latin1(const char32_t *buf, size_t len, + char *latin1_output) { + const char32_t *end = buf + len; + __m512i v_0xFF = _mm512_set1_epi32(0xff); + __m512i shufmask = _mm512_set_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 60, + 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0); + while (buf + 16 <= end) { + __m512i in = _mm512_loadu_si512((__m512i *)buf); + if (_mm512_cmpgt_epu32_mask(in, v_0xFF)) { + return 0; + } + _mm_storeu_si128((__m128i *)latin1_output, + 
_mm512_castsi512_si128(_mm512_permutexvar_epi8(shufmask, in))); + latin1_output += 16; + buf += 16; + } + if (buf < end) { + uint16_t mask = uint16_t((1 << (end - buf)) - 1); + __m512i in = _mm512_maskz_loadu_epi32(mask, buf); + if (_mm512_cmpgt_epu32_mask(in, v_0xFF)) { + return 0; + } + _mm_mask_storeu_epi8( + latin1_output, mask, + _mm512_castsi512_si128(_mm512_permutexvar_epi8(shufmask, in))); + } + return len; +} + +std::pair +icelake_convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len, + char *latin1_output) { + const char32_t *end = buf + len; + const char32_t *start = buf; + __m512i v_0xFF = _mm512_set1_epi32(0xff); + __m512i shufmask = _mm512_set_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 60, + 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0); + while (buf + 16 <= end) { + __m512i in = _mm512_loadu_si512((__m512i *)buf); + if (_mm512_cmpgt_epu32_mask(in, v_0xFF)) { + while (uint32_t(*buf) <= 0xff) { + *latin1_output++ = uint8_t(*buf++); + } + return std::make_pair(result(error_code::TOO_LARGE, buf - start), + latin1_output); + } + _mm_storeu_si128((__m128i *)latin1_output, + _mm512_castsi512_si128(_mm512_permutexvar_epi8(shufmask, in))); + latin1_output += 16; + buf += 16; + } + if (buf < end) { + uint16_t mask = uint16_t((1 << (end - buf)) - 1); + __m512i in = _mm512_maskz_loadu_epi32(mask, buf); + if (_mm512_cmpgt_epu32_mask(in, v_0xFF)) { + while (uint32_t(*buf) <= 0xff) { + *latin1_output++ = uint8_t(*buf++); + } + return std::make_pair(result(error_code::TOO_LARGE, buf - start), + latin1_output); + } + _mm_mask_storeu_epi8( + latin1_output, mask, + _mm512_castsi512_si128(_mm512_permutexvar_epi8(shufmask, in))); + } + return std::make_pair(result(error_code::SUCCESS, len), latin1_output); +} +/* end file src/icelake/icelake_convert_utf32_to_latin1.inl.cpp */ /* begin file 
src/icelake/icelake_convert_utf32_to_utf8.inl.cpp */ // file included directly @@ -17291,7 +19870,7 @@ std::pair avx512_convert_utf32_to_utf8(const char32_t* b __m256i nextin = _mm256_loadu_si256((__m256i*)buf+1); running_max = _mm256_max_epu32(_mm256_max_epu32(in, running_max), nextin); - // Pack 32-bit UTF-32 words to 16-bit UTF-16 words with unsigned saturation + // Pack 32-bit UTF-32 code units to 16-bit UTF-16 code units with unsigned saturation __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff), _mm256_and_si256(nextin, v_7fffffff)); in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000); @@ -17362,7 +19941,7 @@ std::pair avx512_convert_utf32_to_utf8(const char32_t* b const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000); const uint32_t saturation_bitmask = static_cast(_mm256_movemask_epi8(saturation_bytemask)); if (saturation_bitmask == 0xffffffff) { - // case: words from register produce either 1, 2 or 3 UTF-8 bytes + // case: code units from register produce either 1, 2 or 3 UTF-8 bytes const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800); forbidden_bytemask = _mm256_or_si256(forbidden_bytemask, _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800)); @@ -17376,7 +19955,7 @@ std::pair avx512_convert_utf32_to_utf8(const char32_t* b 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two UTF-8 bytes 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes - We expand the input word (16-bit) into two words (32-bit), thus + We expand the input word (16-bit) into two code units (32-bit), thus we have room for four bytes. However, we need five distinct bit layouts. Note that the last byte in cases #2 and #3 is the same. @@ -17387,7 +19966,7 @@ std::pair avx512_convert_utf32_to_utf8(const char32_t* b either byte 1 for case #2 or byte 2 for case #3. Note that they differ by exactly one bit. 
- Finally from these two words we build proper UTF-8 sequence, taking + Finally from these two code units we build proper UTF-8 sequence, taking into account the case (i.e, the number of bytes to write). */ /** @@ -17415,16 +19994,16 @@ std::pair avx512_convert_utf32_to_utf8(const char32_t* b const __m256i s4 = _mm256_xor_si256(s3, m0); #undef simdutf_vec - // 4. expand words 16-bit => 32-bit + // 4. expand code units 16-bit => 32-bit const __m256i out0 = _mm256_unpacklo_epi16(t2, s4); const __m256i out1 = _mm256_unpackhi_epi16(t2, s4); - // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle + // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle const uint32_t mask = (one_byte_bitmask & 0x55555555) | (one_or_two_bytes_bitmask & 0xaaaaaaaa); // Due to the wider registers, the following path is less likely to be useful. /*if(mask == 0) { - // We only have three-byte words. Use fast path. + // We only have three-byte code units. Use fast path. const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle); const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle); @@ -17536,7 +20115,7 @@ std::pair avx512_convert_utf32_to_utf8_with_errors(const char32_t return std::make_pair(result(error_code::TOO_LARGE, buf - start), utf8_output); } - // Pack 32-bit UTF-32 words to 16-bit UTF-16 words with unsigned saturation + // Pack 32-bit UTF-32 code units to 16-bit UTF-16 code units with unsigned saturation __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff), _mm256_and_si256(nextin, v_7fffffff)); in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000); @@ -17607,9 +20186,9 @@ std::pair avx512_convert_utf32_to_utf8_with_errors(const char32_t const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000); const uint32_t saturation_bitmask = 
static_cast(_mm256_movemask_epi8(saturation_bytemask)); if (saturation_bitmask == 0xffffffff) { - // case: words from register produce either 1, 2 or 3 UTF-8 bytes + // case: code units from register produce either 1, 2 or 3 UTF-8 bytes - // Check for illegal surrogate words + // Check for illegal surrogate code units const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800); const __m256i forbidden_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800); if (static_cast(_mm256_movemask_epi8(forbidden_bytemask)) != 0x0) { @@ -17626,7 +20205,7 @@ std::pair avx512_convert_utf32_to_utf8_with_errors(const char32_t 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two UTF-8 bytes 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes - We expand the input word (16-bit) into two words (32-bit), thus + We expand the input word (16-bit) into two code units (32-bit), thus we have room for four bytes. However, we need five distinct bit layouts. Note that the last byte in cases #2 and #3 is the same. @@ -17637,7 +20216,7 @@ std::pair avx512_convert_utf32_to_utf8_with_errors(const char32_t either byte 1 for case #2 or byte 2 for case #3. Note that they differ by exactly one bit. - Finally from these two words we build proper UTF-8 sequence, taking + Finally from these two code units we build proper UTF-8 sequence, taking into account the case (i.e, the number of bytes to write). */ /** @@ -17665,16 +20244,16 @@ std::pair avx512_convert_utf32_to_utf8_with_errors(const char32_t const __m256i s4 = _mm256_xor_si256(s3, m0); #undef simdutf_vec - // 4. expand words 16-bit => 32-bit + // 4. expand code units 16-bit => 32-bit const __m256i out0 = _mm256_unpacklo_epi16(t2, s4); const __m256i out1 = _mm256_unpackhi_epi16(t2, s4); - // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle + // 5. 
compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle const uint32_t mask = (one_byte_bitmask & 0x55555555) | (one_or_two_bytes_bitmask & 0xaaaaaaaa); // Due to the wider registers, the following path is less likely to be useful. /*if(mask == 0) { - // We only have three-byte words. Use fast path. + // We only have three-byte code units. Use fast path. const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle); const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle); @@ -17754,7 +20333,6 @@ std::pair avx512_convert_utf32_to_utf8_with_errors(const char32_t return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output); } /* end file src/icelake/icelake_convert_utf32_to_utf8.inl.cpp */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=icelake/icelake_convert_utf32_to_utf16.inl.cpp /* begin file src/icelake/icelake_convert_utf32_to_utf16.inl.cpp */ // file included directly @@ -17889,7 +20467,6 @@ std::pair avx512_convert_utf32_to_utf16_with_errors(const cha return std::make_pair(result(error_code::SUCCESS, buf - start), utf16_output); } /* end file src/icelake/icelake_convert_utf32_to_utf16.inl.cpp */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=icelake/icelake_ascii_validation.inl.cpp /* begin file src/icelake/icelake_ascii_validation.inl.cpp */ // file included directly @@ -17908,7 +20485,6 @@ bool validate_ascii(const char* buf, size_t len) { return (_mm512_test_epi8_mask(running_or, running_or) == 0); } /* end file src/icelake/icelake_ascii_validation.inl.cpp */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=icelake/icelake_utf32_validation.inl.cpp /* begin file src/icelake/icelake_utf32_validation.inl.cpp */ // file included directly @@ -17940,206 +20516,173 @@ const char32_t* validate_utf32(const char32_t* buf, 
size_t len) { return buf; } /* end file src/icelake/icelake_utf32_validation.inl.cpp */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=icelake/icelake_convert_utf16_to_utf8.inl.cpp -/* begin file src/icelake/icelake_convert_utf16_to_utf8.inl.cpp */ +/* begin file src/icelake/icelake_convert_latin1_to_utf8.inl.cpp */ // file included directly -/** - * This function converts the input (inbuf, inlen), assumed to be valid - * UTF16 (little endian) into UTF-8 (to outbuf). The number of words written - * is written to 'outlen' and the function reports the number of input word - * consumed. - */ -template -size_t utf16_to_utf8_avx512i(const char16_t *inbuf, size_t inlen, - unsigned char *outbuf, size_t *outlen) { - __m512i in; - __mmask32 inmask = _cvtu32_mask32(0x7fffffff); - __m512i byteflip = _mm512_setr_epi64( - 0x0607040502030001, - 0x0e0f0c0d0a0b0809, - 0x0607040502030001, - 0x0e0f0c0d0a0b0809, - 0x0607040502030001, - 0x0e0f0c0d0a0b0809, - 0x0607040502030001, - 0x0e0f0c0d0a0b0809 - ); - const char16_t * const inbuf_orig = inbuf; - const unsigned char * const outbuf_orig = outbuf; - size_t adjust = 0; - int carry = 0; - - while (inlen >= 32) { - in = _mm512_loadu_si512(inbuf); - if(big_endian) { in = _mm512_shuffle_epi8(in, byteflip); } - inlen -= 31; - lastiteration: - inbuf += 31; - - failiteration: - const __mmask32 is234byte = _mm512_mask_cmp_epu16_mask( - inmask, in, _mm512_set1_epi16(0x0080), _MM_CMPINT_NLT); - - if (_ktestz_mask32_u8(inmask, is234byte)) { - // fast path for ASCII only - _mm512_mask_cvtepi16_storeu_epi8(outbuf, inmask, in); - outbuf += 31; - carry = 0; - - if (inlen < 32) { - goto tail; - } else { - continue; - } - } - - const __mmask32 is12byte = - _mm512_cmp_epu16_mask(in, _mm512_set1_epi16(0x0800), _MM_CMPINT_LT); - - if (_ktestc_mask32_u8(is12byte, inmask)) { - // fast path for 1 and 2 byte only - - const __m512i twobytes = _mm512_ternarylogic_epi32( - _mm512_slli_epi16(in, 8), _mm512_srli_epi16(in, 6), - 
_mm512_set1_epi16(0x3f3f), 0xa8); // (A|B)&C - in = _mm512_mask_add_epi16(in, is234byte, twobytes, - _mm512_set1_epi16(int16_t(0x80c0))); - const __m512i cmpmask = - _mm512_mask_blend_epi16(inmask, _mm512_set1_epi16(int16_t(0xffff)), - _mm512_set1_epi16(0x0800)); - const __mmask64 smoosh = _mm512_cmp_epu8_mask(in, cmpmask, _MM_CMPINT_NLT); - const __m512i out = _mm512_maskz_compress_epi8(smoosh, in); - _mm512_mask_storeu_epi8(outbuf, _cvtu64_mask64(_pext_u64(_cvtmask64_u64(smoosh), _cvtmask64_u64(smoosh))), - out); - outbuf += 31 + _mm_popcnt_u32(_cvtmask32_u32(is234byte)); - carry = 0; - - if (inlen < 32) { - goto tail; - } else { - continue; - } - } - __m512i lo = _mm512_cvtepu16_epi32(_mm512_castsi512_si256(in)); - __m512i hi = _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(in, 1)); - - - __m512i taglo = _mm512_set1_epi32(0x8080e000); - __m512i taghi = taglo; - - const __m512i fc00masked = _mm512_and_epi32(in, _mm512_set1_epi16(int16_t(0xfc00))); - const __mmask32 hisurr = _mm512_mask_cmp_epu16_mask( - inmask, fc00masked, _mm512_set1_epi16(int16_t(0xd800)), _MM_CMPINT_EQ); - const __mmask32 losurr = _mm512_cmp_epu16_mask( - fc00masked, _mm512_set1_epi16(int16_t(0xdc00)), _MM_CMPINT_EQ); - - int carryout = 0; - if (!_kortestz_mask32_u8(hisurr, losurr)) { - // handle surrogates - - __m512i los = _mm512_alignr_epi32(hi, lo, 1); - __m512i his = _mm512_alignr_epi32(lo, hi, 1); - - const __mmask32 hisurrhi = _kshiftri_mask32(hisurr, 16); - taglo = - _mm512_mask_mov_epi32(taglo,__mmask16(hisurr), _mm512_set1_epi32(0x808080f0)); - taghi = - _mm512_mask_mov_epi32(taghi, __mmask16(hisurrhi), _mm512_set1_epi32(0x808080f0)); - - lo = _mm512_mask_slli_epi32(lo, __mmask16(hisurr), lo, 10); - hi = _mm512_mask_slli_epi32(hi, __mmask16(hisurrhi), hi, 10); - los = _mm512_add_epi32(los, _mm512_set1_epi32(0xfca02400)); - his = _mm512_add_epi32(his, _mm512_set1_epi32(0xfca02400)); - lo = _mm512_mask_add_epi32(lo, __mmask16(hisurr), lo, los); - hi = _mm512_mask_add_epi32(hi, 
__mmask16(hisurrhi), hi, his); - - carryout = _cvtu32_mask32(_kshiftri_mask32(hisurr, 30)); - - const uint32_t h = _cvtmask32_u32(hisurr); - const uint32_t l = _cvtmask32_u32(losurr); - // check for mismatched surrogates - if ((h + h + carry) ^ l) { - const uint32_t lonohi = l & ~(h + h + carry); - const uint32_t hinolo = h & ~(l >> 1); - inlen = _tzcnt_u32(hinolo | lonohi); - inmask = __mmask32(0x7fffffff & ((1 << inlen) - 1)); - in = _mm512_maskz_mov_epi16(inmask, in); - adjust = (int)inlen - 31; - inlen = 0; - goto failiteration; - } +static inline size_t latin1_to_utf8_avx512_vec(__m512i input, size_t input_len, char *utf8_output, int mask_output) { + __mmask64 nonascii = _mm512_movepi8_mask(input); + size_t output_size = input_len + (size_t)count_ones(nonascii); + + // Mask to denote whether the byte is a leading byte that is not ascii + __mmask64 sixth = + _mm512_cmpge_epu8_mask(input, _mm512_set1_epi8(-64)); //binary representation of -64: 1100 0000 + + const uint64_t alternate_bits = UINT64_C(0x5555555555555555); + uint64_t ascii = ~nonascii; + // the bits in ascii are inverted and zeros are interspersed in between them + uint64_t maskA = ~_pdep_u64(ascii, alternate_bits); + uint64_t maskB = ~_pdep_u64(ascii>>32, alternate_bits); + + // interleave bytes from top and bottom halves (abcd...ABCD -> aAbBcCdD) + __m512i input_interleaved = _mm512_permutexvar_epi8(_mm512_set_epi32( + 0x3f1f3e1e, 0x3d1d3c1c, 0x3b1b3a1a, 0x39193818, + 0x37173616, 0x35153414, 0x33133212, 0x31113010, + 0x2f0f2e0e, 0x2d0d2c0c, 0x2b0b2a0a, 0x29092808, + 0x27072606, 0x25052404, 0x23032202, 0x21012000 + ), input); + + // double size of each byte, and insert the leading byte 1100 0010 + +/* +upscale the bytes to 16-bit value, adding the 0b11000000 leading byte in the process. +We adjust for the bytes that have their two most significant bits. This takes care of the first 32 bytes, assuming we interleaved the bytes. 
*/ + __m512i outputA = _mm512_shldi_epi16(input_interleaved, _mm512_set1_epi8(-62), 8); + outputA = _mm512_mask_add_epi16( + outputA, + (__mmask32)sixth, + outputA, + _mm512_set1_epi16(1 - 0x4000)); // 1- 0x4000 = 1100 0000 0000 0001???? + + // in the second 32-bit half, set first or second option based on whether original input is leading byte (second case) or not (first case) + __m512i leadingB = _mm512_mask_blend_epi16( + (__mmask32)(sixth>>32), + _mm512_set1_epi16(0x00c2), // 0000 0000 1101 0010 + _mm512_set1_epi16(0x40c3));// 0100 0000 1100 0011 + __m512i outputB = _mm512_ternarylogic_epi32( + input_interleaved, + leadingB, + _mm512_set1_epi16((short)0xff00), + (240 & 170) ^ 204); // (input_interleaved & 0xff00) ^ leadingB + + // prune redundant bytes + outputA = _mm512_maskz_compress_epi8(maskA, outputA); + outputB = _mm512_maskz_compress_epi8(maskB, outputB); + + + size_t output_sizeA = (size_t)count_ones((uint32_t)nonascii) + 32; + + if(mask_output) { + if(input_len > 32) { // is the second half of the input vector used? 
+ __mmask64 write_mask = _bzhi_u64(~0ULL, (unsigned int)output_sizeA); + _mm512_mask_storeu_epi8(utf8_output, write_mask, outputA); + utf8_output += output_sizeA; + write_mask = _bzhi_u64(~0ULL, (unsigned int)(output_size - output_sizeA)); + _mm512_mask_storeu_epi8(utf8_output, write_mask, outputB); + } else { + __mmask64 write_mask = _bzhi_u64(~0ULL, (unsigned int)output_size); + _mm512_mask_storeu_epi8(utf8_output, write_mask, outputA); } - - hi = _mm512_maskz_mov_epi32(_cvtu32_mask16(0x7fff),hi); - carry = carryout; - - __m512i mslo = - _mm512_multishift_epi64_epi8(_mm512_set1_epi64(0x20262c3200060c12), lo); - - __m512i mshi = - _mm512_multishift_epi64_epi8(_mm512_set1_epi64(0x20262c3200060c12), hi); - - const __mmask32 outmask = __mmask32(_kandn_mask64(losurr, inmask)); - const __mmask64 outmhi = _kshiftri_mask64(outmask, 16); - - const __mmask32 is1byte = __mmask32(_knot_mask64(is234byte)); - const __mmask64 is1bhi = _kshiftri_mask64(is1byte, 16); - const __mmask64 is12bhi = _kshiftri_mask64(is12byte, 16); - - taglo = - _mm512_mask_mov_epi32(taglo, __mmask16(is12byte), _mm512_set1_epi32(0x80c00000)); - taghi = - _mm512_mask_mov_epi32(taghi, __mmask16(is12bhi), _mm512_set1_epi32(0x80c00000)); - __m512i magiclo = _mm512_mask_blend_epi32(__mmask16(outmask), _mm512_set1_epi32(0xffffffff), - _mm512_set1_epi32(0x00010101)); - __m512i magichi = _mm512_mask_blend_epi32(__mmask16(outmhi), _mm512_set1_epi32(0xffffffff), - _mm512_set1_epi32(0x00010101)); - - - magiclo = _mm512_mask_blend_epi32(__mmask16(outmask), _mm512_set1_epi32(0xffffffff), - _mm512_set1_epi32(0x00010101)); - magichi = _mm512_mask_blend_epi32(__mmask16(outmhi), _mm512_set1_epi32(0xffffffff), - _mm512_set1_epi32(0x00010101)); - - mslo = _mm512_ternarylogic_epi32(mslo, _mm512_set1_epi32(0x3f3f3f3f), taglo, - 0xea); // A&B|C - mshi = _mm512_ternarylogic_epi32(mshi, _mm512_set1_epi32(0x3f3f3f3f), taghi, - 0xea); - mslo = _mm512_mask_slli_epi32(mslo, __mmask16(is1byte), lo, 24); - - mshi = 
_mm512_mask_slli_epi32(mshi, __mmask16(is1bhi), hi, 24); - - const __mmask64 wantlo = _mm512_cmp_epu8_mask(mslo, magiclo, _MM_CMPINT_NLT); - const __mmask64 wanthi = _mm512_cmp_epu8_mask(mshi, magichi, _MM_CMPINT_NLT); - const __m512i outlo = _mm512_maskz_compress_epi8(wantlo, mslo); - const __m512i outhi = _mm512_maskz_compress_epi8(wanthi, mshi); - const uint64_t wantlo_uint64 = _cvtmask64_u64(wantlo); - const uint64_t wanthi_uint64 = _cvtmask64_u64(wanthi); - - uint64_t advlo = _mm_popcnt_u64(wantlo_uint64); - uint64_t advhi = _mm_popcnt_u64(wanthi_uint64); - - _mm512_mask_storeu_epi8(outbuf, _cvtu64_mask64(_pext_u64(wantlo_uint64, wantlo_uint64)), outlo); - _mm512_mask_storeu_epi8(outbuf + advlo, _cvtu64_mask64(_pext_u64(wanthi_uint64, wanthi_uint64)), outhi); - outbuf += advlo + advhi; - } - outbuf -= adjust; - -tail: - if (inlen != 0) { - // We must have inlen < 31. - inmask = _cvtu32_mask32((1 << inlen) - 1); - in = _mm512_maskz_loadu_epi16(inmask, inbuf); - if(big_endian) { in = _mm512_shuffle_epi8(in, byteflip); } - adjust = inlen - 31; - inlen = 0; - goto lastiteration; + } else { + _mm512_storeu_si512(utf8_output, outputA); + utf8_output += output_sizeA; + _mm512_storeu_si512(utf8_output, outputB); + } + return output_size; +} + +static inline size_t latin1_to_utf8_avx512_branch(__m512i input, char *utf8_output) { + __mmask64 nonascii = _mm512_movepi8_mask(input); + size_t nonascii_count = (size_t)count_ones(nonascii); + if(nonascii_count > 0){ + return latin1_to_utf8_avx512_vec(input, 64, utf8_output, 0); + } else { + _mm512_storeu_si512(utf8_output, input); + return 64 + nonascii_count;} +} + +size_t latin1_to_utf8_avx512_start(const char *buf, size_t len, char *utf8_output) { + char *start = utf8_output; + size_t pos = 0; + // if there's at least 128 bytes remaining, we don't need to mask the output + for (; pos + 128 <= len; pos += 64) { + __m512i input = _mm512_loadu_si512((__m512i *)(buf + pos)); + utf8_output += latin1_to_utf8_avx512_branch(input, 
utf8_output); + } + // in the last 128 bytes, the first 64 may require masking the output + if (pos + 64 <= len) { + __m512i input = _mm512_loadu_si512((__m512i *)(buf + pos)); + utf8_output += latin1_to_utf8_avx512_vec(input, 64, utf8_output, 1); + pos += 64; + } + // with the last 64 bytes, the input also needs to be masked + if (pos < len) { + __mmask64 load_mask = _bzhi_u64(~0ULL, (unsigned int)(len - pos)); + __m512i input = _mm512_maskz_loadu_epi8(load_mask, (__m512i *)(buf + pos)); + utf8_output += latin1_to_utf8_avx512_vec(input, len - pos, utf8_output, 1); + } + return (size_t)(utf8_output - start); +} +/* end file src/icelake/icelake_convert_latin1_to_utf8.inl.cpp */ +/* begin file src/icelake/icelake_convert_latin1_to_utf16.inl.cpp */ +// file included directly +template +size_t icelake_convert_latin1_to_utf16(const char *latin1_input, size_t len, + char16_t *utf16_output) { + size_t rounded_len = len & ~0x1F; // Round down to nearest multiple of 32 + + __m512i byteflip = _mm512_setr_epi64(0x0607040502030001, 0x0e0f0c0d0a0b0809, + 0x0607040502030001, 0x0e0f0c0d0a0b0809, + 0x0607040502030001, 0x0e0f0c0d0a0b0809, + 0x0607040502030001, 0x0e0f0c0d0a0b0809); + for (size_t i = 0; i < rounded_len; i += 32) { + // Load 32 Latin1 characters into a 256-bit register + __m256i in = _mm256_loadu_si256((__m256i *)&latin1_input[i]); + // Zero extend each set of 8 Latin1 characters to 32 16-bit integers + __m512i out = _mm512_cvtepu8_epi16(in); + if (big_endian) { + out = _mm512_shuffle_epi8(out, byteflip); + } + // Store the results back to memory + _mm512_storeu_si512((__m512i *)&utf16_output[i], out); } - *outlen = (outbuf - outbuf_orig) + adjust; - return ((inbuf - inbuf_orig) + adjust); + if (rounded_len != len) { + uint32_t mask = uint32_t(1 << (len - rounded_len)) - 1; + __m256i in = _mm256_maskz_loadu_epi8(mask, latin1_input + rounded_len); + + // Zero extend each set of 8 Latin1 characters to 32 16-bit integers + __m512i out = _mm512_cvtepu8_epi16(in); + if 
(big_endian) { + out = _mm512_shuffle_epi8(out, byteflip); + } + // Store the results back to memory + _mm512_mask_storeu_epi16(utf16_output + rounded_len, mask, out); + } + + return len; } -/* end file src/icelake/icelake_convert_utf16_to_utf8.inl.cpp */ +/* end file src/icelake/icelake_convert_latin1_to_utf16.inl.cpp */ +/* begin file src/icelake/icelake_convert_latin1_to_utf32.inl.cpp */ +std::pair avx512_convert_latin1_to_utf32(const char* buf, size_t len, char32_t* utf32_output) { + size_t rounded_len = len & ~0xF; // Round down to nearest multiple of 16 + + for (size_t i = 0; i < rounded_len; i += 16) { + // Load 16 Latin1 characters into a 128-bit register + __m128i in = _mm_loadu_si128((__m128i*)&buf[i]); + + // Zero extend each set of 8 Latin1 characters to 16 32-bit integers using vpmovzxbd + __m512i out = _mm512_cvtepu8_epi32(in); + + // Store the results back to memory + _mm512_storeu_si512((__m512i*)&utf32_output[i], out); + } + + // Return pointers pointing to where we left off + return std::make_pair(buf + rounded_len, utf32_output + rounded_len); +} +/* end file src/icelake/icelake_convert_latin1_to_utf32.inl.cpp */ + + +#include } // namespace } // namespace icelake @@ -18148,7 +20691,6 @@ size_t utf16_to_utf8_avx512i(const char16_t *inbuf, size_t inlen, namespace simdutf { namespace icelake { - simdutf_warn_unused int implementation::detect_encodings(const char *input, size_t length) const noexcept { @@ -18183,7 +20725,7 @@ implementation::detect_encodings(const char *input, // be valid UTF-16LE, at least one surrogate must be in the two most // significant bytes of a 32-bit word since they always come in pairs in // UTF-16LE. Note that we always proceed in multiple of 4 before this - // point so there is no offset in 32-bit words. + // point so there is no offset in 32-bit code units. 
if ((surrogates & 0xaaaaaaaa) != 0) { is_utf32 = false; @@ -18199,7 +20741,7 @@ implementation::detect_encodings(const char *input, if (ends_with_high) { buf += 31 * - sizeof(char16_t); // advance only by 31 words so that we start + sizeof(char16_t); // advance only by 31 code units so that we start // with the high surrogate on the next round. } else { buf += 32 * sizeof(char16_t); @@ -18368,7 +20910,7 @@ simdutf_warn_unused bool implementation::validate_utf16le(const char16_t *buf, s } bool ends_with_high = ((highsurrogates & 0x80000000) != 0); if(ends_with_high) { - buf += 31; // advance only by 31 words so that we start with the high surrogate on the next round. + buf += 31; // advance only by 31 code units so that we start with the high surrogate on the next round. } else { buf += 32; } @@ -18417,7 +20959,7 @@ simdutf_warn_unused bool implementation::validate_utf16be(const char16_t *buf, s } bool ends_with_high = ((highsurrogates & 0x80000000) != 0); if(ends_with_high) { - buf += 31; // advance only by 31 words so that we start with the high surrogate on the next round. + buf += 31; // advance only by 31 code units so that we start with the high surrogate on the next round. } else { buf += 32; } @@ -18459,7 +21001,7 @@ simdutf_warn_unused result implementation::validate_utf16le_with_errors(const ch } bool ends_with_high = ((highsurrogates & 0x80000000) != 0); if(ends_with_high) { - buf += 31; // advance only by 31 words so that we start with the high surrogate on the next round. + buf += 31; // advance only by 31 code units so that we start with the high surrogate on the next round. } else { buf += 32; } @@ -18513,7 +21055,7 @@ simdutf_warn_unused result implementation::validate_utf16be_with_errors(const ch } bool ends_with_high = ((highsurrogates & 0x80000000) != 0); if(ends_with_high) { - buf += 31; // advance only by 31 words so that we start with the high surrogate on the next round. 
+ buf += 31; // advance only by 31 code units so that we start with the high surrogate on the next round. } else { buf += 32; } @@ -18588,6 +21130,63 @@ simdutf_warn_unused result implementation::validate_utf32_with_errors(const char return result(error_code::SUCCESS, len); } +simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(const char * buf, size_t len, char* utf8_output) const noexcept { + return icelake::latin1_to_utf8_avx512_start(buf, len, utf8_output); +} + +simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept { + return icelake_convert_latin1_to_utf16(buf, len, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept { + return icelake_convert_latin1_to_utf16(buf, len, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept { + std::pair ret = avx512_convert_latin1_to_utf32(buf, len, utf32_output); + if (ret.first == nullptr) { return 0; } + size_t converted_chars = ret.second - utf32_output; + if (ret.first != buf + len) { + const size_t scalar_converted_chars = scalar::latin1_to_utf32::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_converted_chars == 0) { return 0; } + converted_chars += scalar_converted_chars; + } + return converted_chars; +} + +simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept { + return icelake::utf8_to_latin1_avx512(buf, len, latin1_output); +} + + +simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(const char* buf, size_t len, char* latin1_output) const noexcept { + // Initialize output length and input length counters + size_t inlen = 0; + + // First, try to convert as much as possible using the SIMD 
implementation. + inlen = icelake::utf8_to_latin1_avx512(buf, len, latin1_output); + + // If we have completely converted the string + if(inlen == len) { + return {simdutf::SUCCESS, len}; + } + + // Else if there are remaining bytes, use the scalar function to process them. + // Note: This is assuming scalar::utf8_to_latin1::convert_with_errors is a function that takes + // the input buffer, length, and output buffer, and returns a result object with an error code + // and the number of characters processed. + result res = scalar::utf8_to_latin1::convert_with_errors(buf + inlen, len - inlen, latin1_output + inlen); + res.count += inlen; // Add the number of characters processed by the SIMD implementation + + return res; +} + + +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept { + return icelake::valid_utf8_to_latin1_avx512(buf, len, latin1_output); +} + simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept { utf8_to_utf16_result ret = fast_avx512_convert_utf8_to_utf16(buf, len, utf16_output); if (ret.second == nullptr) { @@ -18767,6 +21366,33 @@ simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(const cha return saved_bytes; } + +simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept { + return icelake_convert_utf16_to_latin1(buf,len,latin1_output); +} + +simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept { + return icelake_convert_utf16_to_latin1(buf,len,latin1_output); +} + +simdutf_warn_unused result implementation::convert_utf16le_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) const noexcept { + return icelake_convert_utf16_to_latin1_with_errors(buf,len,latin1_output).first; +} + 
+simdutf_warn_unused result implementation::convert_utf16be_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) const noexcept { + return icelake_convert_utf16_to_latin1_with_errors(buf,len,latin1_output).first; +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept { + // optimization opportunity: implement custom function + return convert_utf16be_to_latin1(buf, len, latin1_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept { + // optimization opportunity: implement custom function + return convert_utf16le_to_latin1(buf, len, latin1_output); +} + simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { size_t outlen; size_t inlen = utf16_to_utf8_avx512i(buf, len, (unsigned char*)utf8_output, &outlen); @@ -18811,6 +21437,18 @@ simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(const c return convert_utf16be_to_utf8(buf, len, utf8_output); } +simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept { + return icelake_convert_utf32_to_latin1(buf,len,latin1_output); +} + +simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(const char32_t* buf, size_t len, char* latin1_output) const noexcept { + return icelake_convert_utf32_to_latin1_with_errors(buf,len,latin1_output).first; +} + +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept { + return icelake_convert_utf32_to_latin1(buf,len,latin1_output); +} + simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept { std::pair ret = 
avx512_convert_utf32_to_utf8(buf, len, utf8_output); @@ -18826,7 +21464,7 @@ simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t* } simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept { - // ret.first.count is always the position in the buffer, not the number of words written even if finished + // ret.first.count is always the position in the buffer, not the number of code units written even if finished std::pair ret = icelake::avx512_convert_utf32_to_utf8_with_errors(buf, len, utf8_output); if (ret.first.count != len) { result scalar_res = scalar::utf32_to_utf8::convert_with_errors( @@ -18838,7 +21476,7 @@ simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(con ret.second += scalar_res.count; } } - ret.first.count = ret.second - utf8_output; // Set count to the number of 8-bit words written + ret.first.count = ret.second - utf8_output; // Set count to the number of 8-bit code units written return ret.first; } @@ -18873,7 +21511,7 @@ simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(const char32 } simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { - // ret.first.count is always the position in the buffer, not the number of words written even if finished + // ret.first.count is always the position in the buffer, not the number of code units written even if finished std::pair ret = avx512_convert_utf32_to_utf16_with_errors(buf, len, utf16_output); if (ret.first.count != len) { result scalar_res = scalar::utf32_to_utf16::convert_with_errors( @@ -18885,12 +21523,12 @@ simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors( ret.second += scalar_res.count; } } - ret.first.count = ret.second - utf16_output; // Set count to the number of 8-bit words written + ret.first.count = ret.second 
- utf16_output; // Set count to the number of 8-bit code units written return ret.first; } simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { - // ret.first.count is always the position in the buffer, not the number of words written even if finished + // ret.first.count is always the position in the buffer, not the number of code units written even if finished std::pair ret = avx512_convert_utf32_to_utf16_with_errors(buf, len, utf16_output); if (ret.first.count != len) { result scalar_res = scalar::utf32_to_utf16::convert_with_errors( @@ -18902,7 +21540,7 @@ simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors( ret.second += scalar_res.count; } } - ret.first.count = ret.second - utf16_output; // Set count to the number of 8-bit words written + ret.first.count = ret.second - utf16_output; // Set count to the number of 8-bit code units written return ret.first; } @@ -19088,23 +21726,75 @@ simdutf_warn_unused size_t implementation::count_utf16be(const char16_t * input, simdutf_warn_unused size_t implementation::count_utf8(const char * input, size_t length) const noexcept { - const char* end = length >= 64 ? input + length - 64 : nullptr; - const char* ptr = input; + const uint8_t *str = reinterpret_cast(input); + size_t answer = length / sizeof(__m512i) * sizeof(__m512i); // Number of 512-bit chunks that fits into the length. 
+ size_t i = 0; + __m512i unrolled_popcount{0}; const __m512i continuation = _mm512_set1_epi8(char(0b10111111)); - size_t count{0}; + while (i + sizeof(__m512i) <= length) { + size_t iterations = (length - i) / sizeof(__m512i); - while (ptr <= end) { - __m512i utf8 = _mm512_loadu_si512((const __m512i*)ptr); - ptr += 64; - uint64_t continuation_bitmask = static_cast(_mm512_cmple_epi8_mask(utf8, continuation)); - count += 64 - count_ones(continuation_bitmask); + size_t max_i = i + iterations * sizeof(__m512i) - sizeof(__m512i); + for (; i + 8*sizeof(__m512i) <= max_i; i += 8*sizeof(__m512i)) { + __m512i input1 = _mm512_loadu_si512((const __m512i *)(str + i)); + __m512i input2 = _mm512_loadu_si512((const __m512i *)(str + i + sizeof(__m512i))); + __m512i input3 = _mm512_loadu_si512((const __m512i *)(str + i + 2*sizeof(__m512i))); + __m512i input4 = _mm512_loadu_si512((const __m512i *)(str + i + 3*sizeof(__m512i))); + __m512i input5 = _mm512_loadu_si512((const __m512i *)(str + i + 4*sizeof(__m512i))); + __m512i input6 = _mm512_loadu_si512((const __m512i *)(str + i + 5*sizeof(__m512i))); + __m512i input7 = _mm512_loadu_si512((const __m512i *)(str + i + 6*sizeof(__m512i))); + __m512i input8 = _mm512_loadu_si512((const __m512i *)(str + i + 7*sizeof(__m512i))); + + + __mmask64 mask1 = _mm512_cmple_epi8_mask(input1, continuation); + __mmask64 mask2 = _mm512_cmple_epi8_mask(input2, continuation); + __mmask64 mask3 = _mm512_cmple_epi8_mask(input3, continuation); + __mmask64 mask4 = _mm512_cmple_epi8_mask(input4, continuation); + __mmask64 mask5 = _mm512_cmple_epi8_mask(input5, continuation); + __mmask64 mask6 = _mm512_cmple_epi8_mask(input6, continuation); + __mmask64 mask7 = _mm512_cmple_epi8_mask(input7, continuation); + __mmask64 mask8 = _mm512_cmple_epi8_mask(input8, continuation); + + __m512i mask_register = _mm512_set_epi64(mask8, mask7, mask6, mask5, mask4, mask3, mask2, mask1); + + + unrolled_popcount = _mm512_add_epi64(unrolled_popcount, 
_mm512_popcnt_epi64(mask_register)); + } + + for (; i <= max_i; i += sizeof(__m512i)) { + __m512i more_input = _mm512_loadu_si512((const __m512i *)(str + i)); + uint64_t continuation_bitmask = static_cast(_mm512_cmple_epi8_mask(more_input, continuation)); + answer -= count_ones(continuation_bitmask); + } } - return count + scalar::utf8::count_code_points(ptr, length - (ptr - input)); + __m256i first_half = _mm512_extracti64x4_epi64(unrolled_popcount, 0); + __m256i second_half = _mm512_extracti64x4_epi64(unrolled_popcount, 1); + answer -= (size_t)_mm256_extract_epi64(first_half, 0) + + (size_t)_mm256_extract_epi64(first_half, 1) + + (size_t)_mm256_extract_epi64(first_half, 2) + + (size_t)_mm256_extract_epi64(first_half, 3) + + (size_t)_mm256_extract_epi64(second_half, 0) + + (size_t)_mm256_extract_epi64(second_half, 1) + + (size_t)_mm256_extract_epi64(second_half, 2) + + (size_t)_mm256_extract_epi64(second_half, 3); + + return answer + scalar::utf8::count_code_points(reinterpret_cast(str + i), length - i); +} + +simdutf_warn_unused size_t implementation::latin1_length_from_utf8(const char* buf, size_t len) const noexcept { + return count_utf8(buf,len); +} + +simdutf_warn_unused size_t implementation::latin1_length_from_utf16(size_t length) const noexcept { + return scalar::utf16::latin1_length_from_utf16(length); } +simdutf_warn_unused size_t implementation::latin1_length_from_utf32(size_t length) const noexcept { + return scalar::utf32::latin1_length_from_utf32(length); +} simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept { const char16_t* end = length >= 32 ? 
input + length - 32 : nullptr; @@ -19183,6 +21873,76 @@ simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char1 return implementation::count_utf16be(input, length); } +simdutf_warn_unused size_t implementation::utf16_length_from_latin1(size_t length) const noexcept { + return scalar::latin1::utf16_length_from_latin1(length); +} + + +simdutf_warn_unused size_t implementation::utf32_length_from_latin1(size_t length) const noexcept { + return scalar::latin1::utf32_length_from_latin1(length); +} + +simdutf_warn_unused size_t implementation::utf8_length_from_latin1(const char * input, size_t length) const noexcept { + const uint8_t *str = reinterpret_cast(input); + size_t answer = length / sizeof(__m512i) * sizeof(__m512i); + size_t i = 0; + unsigned char v_0xFF = 0xff; + __m512i eight_64bits = _mm512_setzero_si512(); + while (i + sizeof(__m512i) <= length) { + __m512i runner = _mm512_setzero_si512(); + size_t iterations = (length - i) / sizeof(__m512i); + if (iterations > 255) { + iterations = 255; + } + size_t max_i = i + iterations * sizeof(__m512i) - sizeof(__m512i); + for (; i + 4*sizeof(__m512i) <= max_i; i += 4*sizeof(__m512i)) { + // Load four __m512i vectors + __m512i input1 = _mm512_loadu_si512((const __m512i *)(str + i)); + __m512i input2 = _mm512_loadu_si512((const __m512i *)(str + i + sizeof(__m512i))); + __m512i input3 = _mm512_loadu_si512((const __m512i *)(str + i + 2*sizeof(__m512i))); + __m512i input4 = _mm512_loadu_si512((const __m512i *)(str + i + 3*sizeof(__m512i))); + + // Generate four masks + __mmask64 mask1 = _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), input1); + __mmask64 mask2 = _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), input2); + __mmask64 mask3 = _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), input3); + __mmask64 mask4 = _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), input4); + // Apply the masks and subtract from the runner + __m512i not_ascii1 = _mm512_mask_set1_epi8(_mm512_setzero_si512(), mask1, v_0xFF); + 
__m512i not_ascii2 = _mm512_mask_set1_epi8(_mm512_setzero_si512(), mask2, v_0xFF); + __m512i not_ascii3 = _mm512_mask_set1_epi8(_mm512_setzero_si512(), mask3, v_0xFF); + __m512i not_ascii4 = _mm512_mask_set1_epi8(_mm512_setzero_si512(), mask4, v_0xFF); + + runner = _mm512_sub_epi8(runner, not_ascii1); + runner = _mm512_sub_epi8(runner, not_ascii2); + runner = _mm512_sub_epi8(runner, not_ascii3); + runner = _mm512_sub_epi8(runner, not_ascii4); + } + + for (; i <= max_i; i += sizeof(__m512i)) { + __m512i more_input = _mm512_loadu_si512((const __m512i *)(str + i)); + + __mmask64 mask = _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), more_input); + __m512i not_ascii = _mm512_mask_set1_epi8(_mm512_setzero_si512(), mask, v_0xFF); + runner = _mm512_sub_epi8(runner, not_ascii); + } + + eight_64bits = _mm512_add_epi64(eight_64bits, _mm512_sad_epu8(runner, _mm512_setzero_si512())); + } + + __m256i first_half = _mm512_extracti64x4_epi64(eight_64bits, 0); + __m256i second_half = _mm512_extracti64x4_epi64(eight_64bits, 1); + answer += (size_t)_mm256_extract_epi64(first_half, 0) + + (size_t)_mm256_extract_epi64(first_half, 1) + + (size_t)_mm256_extract_epi64(first_half, 2) + + (size_t)_mm256_extract_epi64(first_half, 3) + + (size_t)_mm256_extract_epi64(second_half, 0) + + (size_t)_mm256_extract_epi64(second_half, 1) + + (size_t)_mm256_extract_epi64(second_half, 2) + + (size_t)_mm256_extract_epi64(second_half, 3); + return answer + scalar::latin1::utf8_length_from_latin1(reinterpret_cast(str + i), length - i); +} + simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char * input, size_t length) const noexcept { size_t pos = 0; size_t count = 0; @@ -19252,7 +22012,6 @@ simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char * i } // namespace icelake } // namespace simdutf -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/icelake/end.h /* begin file src/simdutf/icelake/end.h */ #if 
SIMDUTF_CAN_ALWAYS_RUN_ICELAKE // nothing needed. @@ -19268,10 +22027,8 @@ SIMDUTF_POP_DISABLE_WARNINGS /* end file src/icelake/implementation.cpp */ #endif #if SIMDUTF_IMPLEMENTATION_HASWELL -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=haswell/implementation.cpp /* begin file src/haswell/implementation.cpp */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/haswell/begin.h /* begin file src/simdutf/haswell/begin.h */ // redefining SIMDUTF_IMPLEMENTATION to "haswell" // #define SIMDUTF_IMPLEMENTATION haswell @@ -19314,7 +22071,6 @@ simdutf_really_inline simd8 must_be_2_3_continuation(const simd8 return simd8(is_third_byte | is_fourth_byte) > int8_t(0); } -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=haswell/avx2_detect_encodings.cpp /* begin file src/haswell/avx2_detect_encodings.cpp */ template // len is known to be a multiple of 2 when this is called @@ -19358,7 +22114,7 @@ int avx2_detect_encodings(const char * buf, size_t len) { // To be valid UTF-32, a surrogate cannot be in the two most significant bytes of any 32-bit word. // On the other hand, to be valid UTF-16LE, at least one surrogate must be in the two most significant // bytes of a 32-bit word since they always come in pairs in UTF-16LE. - // Note that we always proceed in multiple of 4 before this point so there is no offset in 32-bit words. + // Note that we always proceed in multiple of 4 before this point so there is no offset in 32-bit code units. if ((surrogates_bitmask0 & 0xaaaaaaaa) != 0) { is_utf32 = false; @@ -19504,10 +22260,9 @@ int avx2_detect_encodings(const char * buf, size_t len) { } /* end file src/haswell/avx2_detect_encodings.cpp */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=haswell/avx2_validate_utf16.cpp /* begin file src/haswell/avx2_validate_utf16.cpp */ /* - In UTF-16 words in range 0xD800 to 0xDFFF have special meaning. 
+ In UTF-16 code units in range 0xD800 to 0xDFFF have special meaning. In a vectorized algorithm we want to examine the most significant nibble in order to select a fast path. If none of highest nibbles @@ -19543,7 +22298,7 @@ int avx2_detect_encodings(const char * buf, size_t len) { 0 0 1 0 1 0 0 0 b = a << 1 1 1 1 1 1 1 1 0 c = V | a | b ^ - the last bit can be zero, we just consume 7 words + the last bit can be zero, we just consume 7 code units and recheck this word in the next iteration */ @@ -19589,7 +22344,7 @@ const char16_t* avx2_validate_utf16(const char16_t* input, size_t size) { // // Fact: high surrogate has 11th bit set (3rd bit in the higher word) - // V - non-surrogate words + // V - non-surrogate code units // V = not surrogates_wordmask const uint32_t V = ~surrogates_bitmask; @@ -19610,10 +22365,10 @@ const char16_t* avx2_validate_utf16(const char16_t* input, size_t size) { if (c == 0xffffffff) { // The whole input register contains valid UTF-16, i.e., - // either single words or proper surrogate pairs. + // either single code units or proper surrogate pairs. input += simd16::ELEMENTS * 2; } else if (c == 0x7fffffff) { - // The 31 lower words of the input register contains valid UTF-16. + // The 31 lower code units of the input register contains valid UTF-16. // The 31 word may be either a low or high surrogate. It the next // iteration we 1) check if the low surrogate is followed by a high // one, 2) reject sole high surrogate. 
@@ -19667,7 +22422,7 @@ const result avx2_validate_utf16_with_errors(const char16_t* input, size_t size) // // Fact: high surrogate has 11th bit set (3rd bit in the higher word) - // V - non-surrogate words + // V - non-surrogate code units // V = not surrogates_wordmask const uint32_t V = ~surrogates_bitmask; @@ -19688,10 +22443,10 @@ const result avx2_validate_utf16_with_errors(const char16_t* input, size_t size) if (c == 0xffffffff) { // The whole input register contains valid UTF-16, i.e., - // either single words or proper surrogate pairs. + // either single code units or proper surrogate pairs. input += simd16::ELEMENTS * 2; } else if (c == 0x7fffffff) { - // The 31 lower words of the input register contains valid UTF-16. + // The 31 lower code units of the input register contains valid UTF-16. // The 31 word may be either a low or high surrogate. It the next // iteration we 1) check if the low surrogate is followed by a high // one, 2) reject sole high surrogate. @@ -19705,7 +22460,6 @@ const result avx2_validate_utf16_with_errors(const char16_t* input, size_t size) return result(error_code::SUCCESS, input - start); } /* end file src/haswell/avx2_validate_utf16.cpp */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=haswell/avx2_validate_utf32le.cpp /* begin file src/haswell/avx2_validate_utf32le.cpp */ /* Returns: - pointer to the last unprocessed character (a scalar fallback should check the rest); @@ -19771,7 +22525,145 @@ const result avx2_validate_utf32le_with_errors(const char32_t* input, size_t siz } /* end file src/haswell/avx2_validate_utf32le.cpp */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=haswell/avx2_convert_utf8_to_utf16.cpp +/* begin file src/haswell/avx2_convert_latin1_to_utf8.cpp */ +std::pair avx2_convert_latin1_to_utf8(const char *latin1_input, size_t len, + char *utf8_output) { + const char *end = latin1_input + len; + const __m256i v_0000 = _mm256_setzero_si256(); + 
const __m256i v_c080 = _mm256_set1_epi16((int16_t)0xc080); + const __m256i v_ff80 = _mm256_set1_epi16((int16_t)0xff80); + const size_t safety_margin = 12; + + while (latin1_input + 16 + safety_margin <= end) { + __m128i in8 = _mm_loadu_si128((__m128i *)latin1_input); + // a single Latin1 byte can yield 1 or 2 UTF-8 bytes + const __m128i v_80 = _mm_set1_epi8((char)0x80); + if (_mm_testz_si128(in8, v_80)) { // ASCII fast path!!!! + // 1. store (16 bytes) + _mm_storeu_si128((__m128i *)utf8_output, in8); + // 2. adjust pointers + latin1_input += 16; + utf8_output += 16; + continue; // we are done for this round! + } + // We proceed only with the first 16 bytes. + const __m256i in = _mm256_cvtepu8_epi16((in8)); + + // 1. prepare 2-byte values + // input 16-bit word : [0000|0000|aabb|bbbb] x 16 + // expected output : [1100|00aa|10bb|bbbb] x 16 + const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00); + const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f); + + // t0 = [0000|00aa|bbbb|bb00] + const __m256i t0 = _mm256_slli_epi16(in, 2); + // t1 = [0000|00aa|0000|0000] + const __m256i t1 = _mm256_and_si256(t0, v_1f00); + // t2 = [0000|0000|00bb|bbbb] + const __m256i t2 = _mm256_and_si256(in, v_003f); + // t3 = [0000|00aa|00bb|bbbb] + const __m256i t3 = _mm256_or_si256(t1, t2); + // t4 = [1100|00aa|10bb|bbbb] + const __m256i t4 = _mm256_or_si256(t3, v_c080); + + // 2. merge ASCII and 2-byte codewords + + // no bits set above 7th bit + const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_ff80), v_0000); + const uint32_t one_byte_bitmask = static_cast(_mm256_movemask_epi8(one_byte_bytemask)); + + const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in, one_byte_bytemask); + + // 3. prepare bitmask for 8-bit lookup + const uint32_t M0 = one_byte_bitmask & 0x55555555; + const uint32_t M1 = M0 >> 7; + const uint32_t M2 = (M1 | M0) & 0x00ff00ff; + // 4. 
pack the bytes + + const uint8_t *row = + &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0]; + const uint8_t *row_2 = + &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >> 16)] + [0]; + + const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1)); + const __m128i shuffle_2 = _mm_loadu_si128((__m128i *)(row_2 + 1)); + + const __m256i utf8_packed = _mm256_shuffle_epi8( + utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2)); + // 5. store bytes + _mm_storeu_si128((__m128i *)utf8_output, + _mm256_castsi256_si128(utf8_packed)); + utf8_output += row[0]; + _mm_storeu_si128((__m128i *)utf8_output, + _mm256_extractf128_si256(utf8_packed, 1)); + utf8_output += row_2[0]; + + // 6. adjust pointers + latin1_input += 16; + continue; + + } // while + return std::make_pair(latin1_input, utf8_output); +} +/* end file src/haswell/avx2_convert_latin1_to_utf8.cpp */ +/* begin file src/haswell/avx2_convert_latin1_to_utf16.cpp */ +template +std::pair avx2_convert_latin1_to_utf16(const char* latin1_input, size_t len, char16_t* utf16_output) { + size_t rounded_len = len & ~0xF; // Round down to nearest multiple of 16 + + size_t i = 0; + for (; i < rounded_len; i += 16) { + // Load 16 bytes from the address (input + i) into a xmm register + __m128i xmm0 = _mm_loadu_si128(reinterpret_cast(latin1_input + i)); + + // Zero extend each byte in xmm0 to word and put it in another xmm register + __m128i xmm1 = _mm_cvtepu8_epi16(xmm0); + + // Shift xmm0 to the right by 8 bytes + xmm0 = _mm_srli_si128(xmm0, 8); + + // Zero extend each byte in the shifted xmm0 to word in xmm0 + xmm0 = _mm_cvtepu8_epi16(xmm0); + + if (big_endian) { + const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + xmm0 = _mm_shuffle_epi8(xmm0, swap); + xmm1 = _mm_shuffle_epi8(xmm1, swap); + } + + // Store the contents of xmm1 into the address pointed by (output + i) + _mm_storeu_si128(reinterpret_cast<__m128i*>(utf16_output + i), xmm1); + + // Store the 
contents of xmm0 into the address pointed by (output + i + 8) + _mm_storeu_si128(reinterpret_cast<__m128i*>(utf16_output + i + 8), xmm0); + } + + return std::make_pair(latin1_input + rounded_len, utf16_output + rounded_len); + +} +/* end file src/haswell/avx2_convert_latin1_to_utf16.cpp */ +/* begin file src/haswell/avx2_convert_latin1_to_utf32.cpp */ +std::pair avx2_convert_latin1_to_utf32(const char* buf, size_t len, char32_t* utf32_output) { + size_t rounded_len = ((len | 7) ^ 7); // Round down to nearest multiple of 8 + + for (size_t i = 0; i < rounded_len; i += 8) { + // Load 8 Latin1 characters into a 64-bit register + __m128i in = _mm_loadl_epi64((__m128i*)&buf[i]); + + // Zero extend each set of 8 Latin1 characters to 8 32-bit integers using vpmovzxbd + __m256i out = _mm256_cvtepu8_epi32(in); + + // Store the results back to memory + _mm256_storeu_si256((__m256i*)&utf32_output[i], out); + } + + // return pointers pointing to where we left off + return std::make_pair(buf + rounded_len, utf32_output + rounded_len); +} + +/* end file src/haswell/avx2_convert_latin1_to_utf32.cpp */ + /* begin file src/haswell/avx2_convert_utf8_to_utf16.cpp */ // depends on "tables/utf8_to_utf16_tables.h" @@ -19811,7 +22703,7 @@ size_t convert_masked_utf8_to_utf16(const char *input, return 16; // We consumed 16 bytes. } if(((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa)) { - // We want to take 8 2-byte UTF-8 words and turn them into 8 2-byte UTF-16 words. + // We want to take 8 2-byte UTF-8 code units and turn them into 8 2-byte UTF-16 code units. // There is probably a more efficient sequence, but the following might do. const __m128i sh = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); const __m128i perm = _mm_shuffle_epi8(in, sh); @@ -19824,7 +22716,7 @@ size_t convert_masked_utf8_to_utf16(const char *input, return 16; } if(input_utf8_end_of_code_point_mask == 0x924) { - // We want to take 4 3-byte UTF-8 words and turn them into 4 2-byte UTF-16 words. 
+ // We want to take 4 3-byte UTF-8 code units and turn them into 4 2-byte UTF-16 code units. // There is probably a more efficient sequence, but the following might do. const __m128i sh = _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1); const __m128i perm = _mm_shuffle_epi8(in, sh); @@ -19850,10 +22742,10 @@ size_t convert_masked_utf8_to_utf16(const char *input, const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1]; if (idx < 64) { - // SIX (6) input code-words + // SIX (6) input code-code units // this is a relatively easy scenario - // we process SIX (6) input code-words. The max length in bytes of six code - // words spanning between 1 and 2 bytes each is 12 bytes. On processors + // we process SIX (6) input code-code units. The max length in bytes of six code + // code units spanning between 1 and 2 bytes each is 12 bytes. On processors // where pdep/pext is fast, we might be able to use a small lookup table. const __m128i sh = _mm_loadu_si128((const __m128i *)simdutf::tables::utf8_to_utf16::shufutf8[idx]); @@ -19865,7 +22757,7 @@ size_t convert_masked_utf8_to_utf16(const char *input, _mm_storeu_si128((__m128i *)utf16_output, composed); utf16_output += 6; // We wrote 12 bytes, 6 code points. There is a potential overflow of 4 bytes. } else if (idx < 145) { - // FOUR (4) input code-words + // FOUR (4) input code-code units const __m128i sh = _mm_loadu_si128((const __m128i *)simdutf::tables::utf8_to_utf16::shufutf8[idx]); const __m128i perm = _mm_shuffle_epi8(in, sh); @@ -19884,7 +22776,7 @@ size_t convert_masked_utf8_to_utf16(const char *input, _mm_storeu_si128((__m128i *)utf16_output, composed_repacked); utf16_output += 4; // Here we overflow by 8 bytes. 
} else if (idx < 209) { - // TWO (2) input code-words + // TWO (2) input code-code units ////////////// // There might be garbage inputs where a leading byte mascarades as a four-byte // leading byte (by being followed by 3 continuation byte), but is not greater than @@ -19954,7 +22846,6 @@ size_t convert_masked_utf8_to_utf16(const char *input, return consumed; } /* end file src/haswell/avx2_convert_utf8_to_utf16.cpp */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=haswell/avx2_convert_utf8_to_utf32.cpp /* begin file src/haswell/avx2_convert_utf8_to_utf32.cpp */ // depends on "tables/utf8_to_utf16_tables.h" @@ -19987,7 +22878,7 @@ size_t convert_masked_utf8_to_utf32(const char *input, return 16; // We consumed 16 bytes. } if(((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa)) { - // We want to take 8 2-byte UTF-8 words and turn them into 8 4-byte UTF-32 words. + // We want to take 8 2-byte UTF-8 code units and turn them into 8 4-byte UTF-32 code units. // There is probably a more efficient sequence, but the following might do. const __m128i sh = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); const __m128i perm = _mm_shuffle_epi8(in, sh); @@ -19999,7 +22890,7 @@ size_t convert_masked_utf8_to_utf32(const char *input, return 16; } if(input_utf8_end_of_code_point_mask == 0x924) { - // We want to take 4 3-byte UTF-8 words and turn them into 4 4-byte UTF-32 words. + // We want to take 4 3-byte UTF-8 code units and turn them into 4 4-byte UTF-32 code units. // There is probably a more efficient sequence, but the following might do. 
const __m128i sh = _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1); const __m128i perm = _mm_shuffle_epi8(in, sh); @@ -20024,10 +22915,10 @@ size_t convert_masked_utf8_to_utf32(const char *input, const uint8_t consumed = tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1]; if (idx < 64) { - // SIX (6) input code-words + // SIX (6) input code-code units // this is a relatively easy scenario - // we process SIX (6) input code-words. The max length in bytes of six code - // words spanning between 1 and 2 bytes each is 12 bytes. On processors + // we process SIX (6) input code-code units. The max length in bytes of six code + // code units spanning between 1 and 2 bytes each is 12 bytes. On processors // where pdep/pext is fast, we might be able to use a small lookup table. const __m128i sh = _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]); @@ -20039,7 +22930,7 @@ size_t convert_masked_utf8_to_utf32(const char *input, utf32_output += 6; // We wrote 24 bytes, 6 code points. There is a potential // overflow of 32 - 24 = 8 bytes. 
} else if (idx < 145) { - // FOUR (4) input code-words + // FOUR (4) input code-code units const __m128i sh = _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]); const __m128i perm = _mm_shuffle_epi8(in, sh); @@ -20056,7 +22947,7 @@ size_t convert_masked_utf8_to_utf32(const char *input, _mm_storeu_si128((__m128i *)utf32_output, composed); utf32_output += 4; } else if (idx < 209) { - // TWO (2) input code-words + // TWO (2) input code-code units const __m128i sh = _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]); const __m128i perm = _mm_shuffle_epi8(in, sh); @@ -20083,11 +22974,97 @@ size_t convert_masked_utf8_to_utf32(const char *input, } /* end file src/haswell/avx2_convert_utf8_to_utf32.cpp */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=haswell/avx2_convert_utf16_to_utf8.cpp +/* begin file src/haswell/avx2_convert_utf16_to_latin1.cpp */ +template +std::pair +avx2_convert_utf16_to_latin1(const char16_t *buf, size_t len, + char *latin1_output) { + const char16_t *end = buf + len; + while (buf + 16 <= end) { + // Load 16 UTF-16 characters into 256-bit AVX2 register + __m256i in = _mm256_loadu_si256(reinterpret_cast(buf)); + + if (!match_system(big_endian)) { + const __m256i swap = _mm256_setr_epi8( + 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18, + 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30); + in = _mm256_shuffle_epi8(in, swap); + } + + __m256i high_byte_mask = _mm256_set1_epi16((int16_t)0xFF00); + if (_mm256_testz_si256(in, high_byte_mask)) { + // Pack 16-bit characters into 8-bit and store in latin1_output + __m128i lo = _mm256_extractf128_si256(in, 0); + __m128i hi = _mm256_extractf128_si256(in, 1); + __m128i latin1_packed_lo = _mm_packus_epi16(lo, lo); + __m128i latin1_packed_hi = _mm_packus_epi16(hi, hi); + _mm_storel_epi64(reinterpret_cast<__m128i *>(latin1_output), + latin1_packed_lo); + _mm_storel_epi64(reinterpret_cast<__m128i *>(latin1_output + 8), + 
latin1_packed_hi); + // Adjust pointers for next iteration + buf += 16; + latin1_output += 16; + } else { + return std::make_pair(nullptr, reinterpret_cast(latin1_output)); + } + } // while + return std::make_pair(buf, latin1_output); +} + +template +std::pair +avx2_convert_utf16_to_latin1_with_errors(const char16_t *buf, size_t len, + char *latin1_output) { + const char16_t *start = buf; + const char16_t *end = buf + len; + while (buf + 16 <= end) { + __m256i in = _mm256_loadu_si256(reinterpret_cast(buf)); + + if (!match_system(big_endian)) { // swap bytes only when the input byte order differs from the host's, exactly as the scalar fallback below decides + const __m256i swap = _mm256_setr_epi8( + 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18, + 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30); + in = _mm256_shuffle_epi8(in, swap); + } + + __m256i high_byte_mask = _mm256_set1_epi16((int16_t)0xFF00); + if (_mm256_testz_si256(in, high_byte_mask)) { // no code unit has a bit set in its high byte, so all 16 fit in Latin1 + __m128i lo = _mm256_extractf128_si256(in, 0); + __m128i hi = _mm256_extractf128_si256(in, 1); + __m128i latin1_packed_lo = _mm_packus_epi16(lo, lo); + __m128i latin1_packed_hi = _mm_packus_epi16(hi, hi); + _mm_storel_epi64(reinterpret_cast<__m128i *>(latin1_output), + latin1_packed_lo); + _mm_storel_epi64(reinterpret_cast<__m128i *>(latin1_output + 8), + latin1_packed_hi); + buf += 16; + latin1_output += 16; + } else { + // Fallback to scalar code to locate the first code unit above 0xff and report its index + for (int k = 0; k < 16; k++) { + uint16_t word = !match_system(big_endian) + ? scalar::utf16::swap_bytes(buf[k]) + : buf[k]; + if (word <= 0xff) { + *latin1_output++ = char(word); + } else { + return std::make_pair( + result{error_code::TOO_LARGE, (size_t)(buf - start + k)}, + latin1_output); + } + } + buf += 16; + } + } // while + return std::make_pair(result{error_code::SUCCESS, (size_t)(buf - start)}, + latin1_output); +} +/* end file src/haswell/avx2_convert_utf16_to_latin1.cpp */ /* begin file src/haswell/avx2_convert_utf16_to_utf8.cpp */ /* The vectorized algorithm works on single SSE register i.e., it - loads eight 16-bit words. 
+ loads eight 16-bit code units. We consider three cases: 1. an input register contains no surrogates and each value @@ -20099,7 +23076,7 @@ size_t convert_masked_utf8_to_utf32(const char *input, Ad 1. - When values are less than 0x0800, it means that a 16-bit words + When values are less than 0x0800, it means that a 16-bit code unit can be converted into: 1) single UTF8 byte (when it's an ASCII char) or 2) two UTF8 bytes. @@ -20113,7 +23090,7 @@ size_t convert_masked_utf8_to_utf32(const char *input, Ad 2. - When values fit in 16-bit words, but are above 0x07ff, then + When values fit in 16-bit code units, but are above 0x07ff, then a single word may produce one, two or three UTF8 bytes. We prepare data for all these three cases in two registers. @@ -20230,7 +23207,7 @@ std::pair avx2_convert_utf16_to_utf8(const char16_t* buf // It might seem like checking for surrogates_bitmask == 0xc000 could help. However, // it is likely an uncommon occurrence. if (surrogates_bitmask == 0x00000000) { - // case: words from register produce either 1, 2 or 3 UTF-8 bytes + // case: code units from register produce either 1, 2 or 3 UTF-8 bytes const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e, 0x0000, 0x0202, 0x0404, 0x0606, @@ -20241,7 +23218,7 @@ std::pair avx2_convert_utf16_to_utf8(const char16_t* buf 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two UTF-8 bytes 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes - We expand the input word (16-bit) into two words (32-bit), thus + We expand the input word (16-bit) into two code units (32-bit), thus we have room for four bytes. However, we need five distinct bit layouts. Note that the last byte in cases #2 and #3 is the same. @@ -20252,7 +23229,7 @@ std::pair avx2_convert_utf16_to_utf8(const char16_t* buf either byte 1 for case #2 or byte 2 for case #3. Note that they differ by exactly one bit. 
- Finally from these two words we build proper UTF-8 sequence, taking + Finally from these two code units we build proper UTF-8 sequence, taking into account the case (i.e, the number of bytes to write). */ /** @@ -20280,16 +23257,16 @@ std::pair avx2_convert_utf16_to_utf8(const char16_t* buf const __m256i s4 = _mm256_xor_si256(s3, m0); #undef simdutf_vec - // 4. expand words 16-bit => 32-bit + // 4. expand code units 16-bit => 32-bit const __m256i out0 = _mm256_unpacklo_epi16(t2, s4); const __m256i out1 = _mm256_unpackhi_epi16(t2, s4); - // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle + // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle const uint32_t mask = (one_byte_bitmask & 0x55555555) | (one_or_two_bytes_bitmask & 0xaaaaaaaa); // Due to the wider registers, the following path is less likely to be useful. /*if(mask == 0) { - // We only have three-byte words. Use fast path. + // We only have three-byte code units. Use fast path. const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle); const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle); @@ -20473,7 +23450,7 @@ std::pair avx2_convert_utf16_to_utf8_with_errors(const char16_t* // It might seem like checking for surrogates_bitmask == 0xc000 could help. However, // it is likely an uncommon occurrence. if (surrogates_bitmask == 0x00000000) { - // case: words from register produce either 1, 2 or 3 UTF-8 bytes + // case: code units from register produce either 1, 2 or 3 UTF-8 bytes const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e, 0x0000, 0x0202, 0x0404, 0x0606, @@ -20484,7 +23461,7 @@ std::pair avx2_convert_utf16_to_utf8_with_errors(const char16_t* 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two UTF-8 bytes 3. 
[aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes - We expand the input word (16-bit) into two words (32-bit), thus + We expand the input word (16-bit) into two code units (32-bit), thus we have room for four bytes. However, we need five distinct bit layouts. Note that the last byte in cases #2 and #3 is the same. @@ -20495,7 +23472,7 @@ std::pair avx2_convert_utf16_to_utf8_with_errors(const char16_t* either byte 1 for case #2 or byte 2 for case #3. Note that they differ by exactly one bit. - Finally from these two words we build proper UTF-8 sequence, taking + Finally from these two code units we build proper UTF-8 sequence, taking into account the case (i.e, the number of bytes to write). */ /** @@ -20523,16 +23500,16 @@ std::pair avx2_convert_utf16_to_utf8_with_errors(const char16_t* const __m256i s4 = _mm256_xor_si256(s3, m0); #undef simdutf_vec - // 4. expand words 16-bit => 32-bit + // 4. expand code units 16-bit => 32-bit const __m256i out0 = _mm256_unpacklo_epi16(t2, s4); const __m256i out1 = _mm256_unpackhi_epi16(t2, s4); - // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle + // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle const uint32_t mask = (one_byte_bitmask & 0x55555555) | (one_or_two_bytes_bitmask & 0xaaaaaaaa); // Due to the wider registers, the following path is less likely to be useful. /*if(mask == 0) { - // We only have three-byte words. Use fast path. + // We only have three-byte code units. Use fast path. 
const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle); const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle); @@ -20613,67 +23590,134 @@ std::pair avx2_convert_utf16_to_utf8_with_errors(const char16_t* buf += k; } } // while - return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output); + return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output); +} +/* end file src/haswell/avx2_convert_utf16_to_utf8.cpp */ +/* begin file src/haswell/avx2_convert_utf16_to_utf32.cpp */ +/* + The vectorized algorithm works on single SSE register i.e., it + loads eight 16-bit code units. + + We consider three cases: + 1. an input register contains no surrogates and each value + is in range 0x0000 .. 0x07ff. + 2. an input register contains no surrogates and values are + is in range 0x0000 .. 0xffff. + 3. an input register contains surrogates --- i.e. codepoints + can have 16 or 32 bits. + + Ad 1. + + When values are less than 0x0800, it means that a 16-bit code unit + can be converted into: 1) single UTF8 byte (when it's an ASCII + char) or 2) two UTF8 bytes. + + For this case we do only some shuffle to obtain these 2-byte + codes and finally compress the whole SSE register with a single + shuffle. + + We need 256-entry lookup table to get a compression pattern + and the number of output bytes in the compressed vector register. + Each entry occupies 17 bytes. + + Ad 2. + + When values fit in 16-bit code units, but are above 0x07ff, then + a single word may produce one, two or three UTF8 bytes. + + We prepare data for all these three cases in two registers. + The first register contains lower two UTF8 bytes (used in all + cases), while the second one contains just the third byte for + the three-UTF8-bytes case. + + Finally these two registers are interleaved forming eight-element + array of 32-bit values. 
The array spans two SSE registers. + The bytes from the registers are compressed using two shuffles. + + We need 256-entry lookup table to get a compression pattern + and the number of output bytes in the compressed vector register. + Each entry occupies 17 bytes. + + + To summarize: + - We need two 256-entry tables that have 8704 bytes in total. +*/ + + +/* + Returns a pair: the first unprocessed byte from buf and utf32_output + A scalar routing should carry on the conversion of the tail. +*/ +template +std::pair avx2_convert_utf16_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) { + const char16_t* end = buf + len; + const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800); + const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800); + + while (buf + 16 <= end) { + __m256i in = _mm256_loadu_si256((__m256i*)buf); + if (big_endian) { + const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, + 17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30); + in = _mm256_shuffle_epi8(in, swap); + } + + // 1. Check if there are any surrogate word in the input chunk. + // We have also deal with situation when there is a surrogate word + // at the end of a chunk. + const __m256i surrogates_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800); + + // bitmask = 0x0000 if there are no surrogates + // = 0xc000 if the last word is a surrogate + const uint32_t surrogates_bitmask = static_cast(_mm256_movemask_epi8(surrogates_bytemask)); + // It might seem like checking for surrogates_bitmask == 0xc000 could help. However, + // it is likely an uncommon occurrence. 
+ if (surrogates_bitmask == 0x00000000) { + // case: we extend all sixteen 16-bit code units to sixteen 32-bit code units + _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output), _mm256_cvtepu16_epi32(_mm256_castsi256_si128(in))); + _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output + 8), _mm256_cvtepu16_epi32(_mm256_extractf128_si256(in,1))); + utf32_output += 16; + buf += 16; + // surrogate pair(s) in a register + } else { + // Let us do a scalar fallback. + // It may seem wasteful to use scalar code, but being efficient with SIMD + // in the presence of surrogate pairs may require non-trivial tables. + size_t forward = 15; + size_t k = 0; + if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} + for(; k < forward; k++) { + uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k]; + if((word &0xF800 ) != 0xD800) { + // No surrogate pair + *utf32_output++ = char32_t(word); + } else { + // must be a surrogate pair + uint16_t diff = uint16_t(word - 0xD800); + uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k+1]) : buf[k+1]; + k++; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if((diff | diff2) > 0x3FF) { return std::make_pair(nullptr, utf32_output); } + uint32_t value = (diff << 10) + diff2 + 0x10000; + *utf32_output++ = char32_t(value); + } + } + buf += k; + } + } // while + return std::make_pair(buf, utf32_output); } -/* end file src/haswell/avx2_convert_utf16_to_utf8.cpp */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=haswell/avx2_convert_utf16_to_utf32.cpp -/* begin file src/haswell/avx2_convert_utf16_to_utf32.cpp */ -/* - The vectorized algorithm works on single SSE register i.e., it - loads eight 16-bit words. - - We consider three cases: - 1. an input register contains no surrogates and each value - is in range 0x0000 .. 0x07ff. - 2. an input register contains no surrogates and values are - is in range 0x0000 .. 0xffff. - 3. 
an input register contains surrogates --- i.e. codepoints - can have 16 or 32 bits. - - Ad 1. - - When values are less than 0x0800, it means that a 16-bit words - can be converted into: 1) single UTF8 byte (when it's an ASCII - char) or 2) two UTF8 bytes. - - For this case we do only some shuffle to obtain these 2-byte - codes and finally compress the whole SSE register with a single - shuffle. - - We need 256-entry lookup table to get a compression pattern - and the number of output bytes in the compressed vector register. - Each entry occupies 17 bytes. - - Ad 2. - - When values fit in 16-bit words, but are above 0x07ff, then - a single word may produce one, two or three UTF8 bytes. - - We prepare data for all these three cases in two registers. - The first register contains lower two UTF8 bytes (used in all - cases), while the second one contains just the third byte for - the three-UTF8-bytes case. - - Finally these two registers are interleaved forming eight-element - array of 32-bit values. The array spans two SSE registers. - The bytes from the registers are compressed using two shuffles. - - We need 256-entry lookup table to get a compression pattern - and the number of output bytes in the compressed vector register. - Each entry occupies 17 bytes. - - - To summarize: - - We need two 256-entry tables that have 8704 bytes in total. -*/ /* - Returns a pair: the first unprocessed byte from buf and utf32_output - A scalar routing should carry on the conversion of the tail. + Returns a pair: a result struct and utf32_output. + If there is an error, the count field of the result is the position of the error. + Otherwise, it is the position of the first unprocessed code unit in buf (even if finished). + A scalar routine should carry on the conversion of the tail if needed. 
*/ template -std::pair avx2_convert_utf16_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) { +std::pair avx2_convert_utf16_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) { + const char16_t* start = buf; const char16_t* end = buf + len; const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800); const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800); @@ -20697,7 +23741,7 @@ std::pair avx2_convert_utf16_to_utf32(const char16_t // It might seem like checking for surrogates_bitmask == 0xc000 could help. However, // it is likely an uncommon occurrence. if (surrogates_bitmask == 0x00000000) { - // case: we extend all sixteen 16-bit words to sixteen 32-bit words + // case: we extend all sixteen 16-bit code units to sixteen 32-bit code units _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output), _mm256_cvtepu16_epi32(_mm256_castsi256_si128(in))); _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output + 8), _mm256_cvtepu16_epi32(_mm256_extractf128_si256(in,1))); utf32_output += 16; @@ -20721,7 +23765,7 @@ std::pair avx2_convert_utf16_to_utf32(const char16_t uint16_t next_word = big_endian ? 
scalar::utf16::swap_bytes(buf[k+1]) : buf[k+1]; k++; uint16_t diff2 = uint16_t(next_word - 0xDC00); - if((diff | diff2) > 0x3FF) { return std::make_pair(nullptr, utf32_output); } + if((diff | diff2) > 0x3FF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k - 1), utf32_output); } uint32_t value = (diff << 10) + diff2 + 0x10000; *utf32_output++ = char32_t(value); } @@ -20729,99 +23773,369 @@ std::pair avx2_convert_utf16_to_utf32(const char16_t buf += k; } } // while - return std::make_pair(buf, utf32_output); + return std::make_pair(result(error_code::SUCCESS, buf - start), utf32_output); +} +/* end file src/haswell/avx2_convert_utf16_to_utf32.cpp */ + +/* begin file src/haswell/avx2_convert_utf32_to_latin1.cpp */ +std::pair +avx2_convert_utf32_to_latin1(const char32_t *buf, size_t len, + char *latin1_output) { + const size_t rounded_len = + len & ~0x1F; // Round down to nearest multiple of 32 + + __m256i high_bytes_mask = _mm256_set1_epi32(0xFFFFFF00); + + __m256i shufmask = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, 12, 8, 4, 0, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, 12, 8, 4, 0); + + for (size_t i = 0; i < rounded_len; i += 16) { + __m256i in1 = _mm256_loadu_si256((__m256i *)buf); + __m256i in2 = _mm256_loadu_si256((__m256i *)(buf + 8)); + + __m256i check_combined = _mm256_or_si256(in1, in2); + + if (!_mm256_testz_si256(check_combined, high_bytes_mask)) { + return std::make_pair(nullptr, latin1_output); + } + + //Turn UTF32 bytes into latin 1 bytes + __m256i shuffled1 = _mm256_shuffle_epi8(in1, shufmask); + __m256i shuffled2 = _mm256_shuffle_epi8(in2, shufmask); + + //move Latin1 bytes to their correct spot + __m256i idx1 = _mm256_set_epi32(-1, -1,-1,-1,-1,-1,4,0); + __m256i idx2 = _mm256_set_epi32(-1, -1,-1,-1,4,0,-1,-1); + __m256i reshuffled1 = _mm256_permutevar8x32_epi32(shuffled1, idx1); + __m256i reshuffled2 = _mm256_permutevar8x32_epi32(shuffled2, idx2); + + __m256i result = _mm256_or_si256(reshuffled1, 
reshuffled2); + _mm_storeu_si128((__m128i *)latin1_output, + _mm256_castsi256_si128(result)); + + latin1_output += 16; + buf += 16; + } + + return std::make_pair(buf, latin1_output); +} +std::pair +avx2_convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len, + char *latin1_output) { + const size_t rounded_len = + len & ~0x1F; // Round down to nearest multiple of 32 + + __m256i high_bytes_mask = _mm256_set1_epi32(0xFFFFFF00); + __m256i shufmask = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, 12, 8, 4, 0, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, 12, 8, 4, 0); + + const char32_t *start = buf; + + for (size_t i = 0; i < rounded_len; i += 16) { + __m256i in1 = _mm256_loadu_si256((__m256i *)buf); + __m256i in2 = _mm256_loadu_si256((__m256i *)(buf + 8)); + + __m256i check_combined = _mm256_or_si256(in1, in2); + + if (!_mm256_testz_si256(check_combined, high_bytes_mask)) { + // Fallback to scalar code for handling errors + for (int k = 0; k < 8; k++) { + char32_t codepoint = buf[k]; + if (codepoint <= 0xFF) { + *latin1_output++ = static_cast(codepoint); + } else { + return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), + latin1_output); + } + } + buf += 8; + } else { + __m256i shuffled1 = _mm256_shuffle_epi8(in1, shufmask); + __m256i shuffled2 = _mm256_shuffle_epi8(in2, shufmask); + + __m256i idx1 = _mm256_set_epi32(-1, -1, -1, -1, -1, -1, 4, 0); + __m256i idx2 = _mm256_set_epi32(-1, -1, -1, -1, 4, 0, -1, -1); + __m256i reshuffled1 = _mm256_permutevar8x32_epi32(shuffled1, idx1); + __m256i reshuffled2 = _mm256_permutevar8x32_epi32(shuffled2, idx2); + + __m256i result = _mm256_or_si256(reshuffled1, reshuffled2); + _mm_storeu_si128((__m128i *)latin1_output, _mm256_castsi256_si128(result)); + + latin1_output += 16; + buf += 16; + } + } + + return std::make_pair(result(error_code::SUCCESS, buf - start), latin1_output); } +/* end file src/haswell/avx2_convert_utf32_to_latin1.cpp */ +/* begin file 
src/haswell/avx2_convert_utf32_to_utf8.cpp */ +std::pair avx2_convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) { + const char32_t* end = buf + len; + const __m256i v_0000 = _mm256_setzero_si256(); + const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000); + const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80); + const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800); + const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080); + const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff); + __m256i running_max = _mm256_setzero_si256(); + __m256i forbidden_bytemask = _mm256_setzero_si256(); + + const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92 + + while (buf + 16 + safety_margin <= end) { + __m256i in = _mm256_loadu_si256((__m256i*)buf); + __m256i nextin = _mm256_loadu_si256((__m256i*)buf+1); + running_max = _mm256_max_epu32(_mm256_max_epu32(in, running_max), nextin); + + // Pack 32-bit UTF-32 code units to 16-bit UTF-16 code units with unsigned saturation + __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff), _mm256_and_si256(nextin, v_7fffffff)); + in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000); + + // Try to apply UTF-16 => UTF-8 routine on 256 bits (haswell/avx2_convert_utf16_to_utf8.cpp) + + if(_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!! + // 1. pack the bytes + const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in_16),_mm256_extractf128_si256(in_16,1)); + // 2. store (16 bytes) + _mm_storeu_si128((__m128i*)utf8_output, utf8_packed); + // 3. adjust pointers + buf += 16; + utf8_output += 16; + continue; // we are done for this round! 
+ } + // no bits set above 7th bit + const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000); + const uint32_t one_byte_bitmask = static_cast(_mm256_movemask_epi8(one_byte_bytemask)); + + // no bits set above 11th bit + const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000); + const uint32_t one_or_two_bytes_bitmask = static_cast(_mm256_movemask_epi8(one_or_two_bytes_bytemask)); + if (one_or_two_bytes_bitmask == 0xffffffff) { + // 1. prepare 2-byte values + // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 + // expected output : [110a|aaaa|10bb|bbbb] x 8 + const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00); + const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f); + + // t0 = [000a|aaaa|bbbb|bb00] + const __m256i t0 = _mm256_slli_epi16(in_16, 2); + // t1 = [000a|aaaa|0000|0000] + const __m256i t1 = _mm256_and_si256(t0, v_1f00); + // t2 = [0000|0000|00bb|bbbb] + const __m256i t2 = _mm256_and_si256(in_16, v_003f); + // t3 = [000a|aaaa|00bb|bbbb] + const __m256i t3 = _mm256_or_si256(t1, t2); + // t4 = [110a|aaaa|10bb|bbbb] + const __m256i t4 = _mm256_or_si256(t3, v_c080); + + // 2. merge ASCII and 2-byte codewords + const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in_16, one_byte_bytemask); + + // 3. prepare bitmask for 8-bit lookup + const uint32_t M0 = one_byte_bitmask & 0x55555555; + const uint32_t M1 = M0 >> 7; + const uint32_t M2 = (M1 | M0) & 0x00ff00ff; + // 4. pack the bytes + + const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0]; + const uint8_t* row_2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2>>16)][0]; + + const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1)); + const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1)); + + const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle,shuffle_2)); + // 5. 
store bytes + _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed)); + utf8_output += row[0]; + _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed,1)); + utf8_output += row_2[0]; + + // 6. adjust pointers + buf += 16; + continue; + } + // Must check for overflow in packing + const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000); + const uint32_t saturation_bitmask = static_cast(_mm256_movemask_epi8(saturation_bytemask)); + if (saturation_bitmask == 0xffffffff) { + // case: code units from register produce either 1, 2 or 3 UTF-8 bytes + const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800); + forbidden_bytemask = _mm256_or_si256(forbidden_bytemask, _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800)); + + const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606, + 0x0808, 0x0a0a, 0x0c0c, 0x0e0e, + 0x0000, 0x0202, 0x0404, 0x0606, + 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); + + /* In this branch we handle three cases: + 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - single UFT-8 byte + 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two UTF-8 bytes + 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes + + We expand the input word (16-bit) into two code units (32-bit), thus + we have room for four bytes. However, we need five distinct bit + layouts. Note that the last byte in cases #2 and #3 is the same. + + We precompute byte 1 for case #1 and the common byte for cases #2 & #3 + in register t2. + + We precompute byte 1 for case #3 and -- **conditionally** -- precompute + either byte 1 for case #2 or byte 2 for case #3. Note that they + differ by exactly one bit. + + Finally from these two code units we build proper UTF-8 sequence, taking + into account the case (i.e, the number of bytes to write). 
+ */ + /** + * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: + * t2 => [0ccc|cccc] [10cc|cccc] + * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) + */ +#define simdutf_vec(x) _mm256_set1_epi16(static_cast(x)) + // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] + const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even); + // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] + const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111)); + // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] + const __m256i t2 = _mm256_or_si256 (t1, simdutf_vec(0b1000000000000000)); + + // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc] + const __m256i s0 = _mm256_srli_epi16(in_16, 4); + // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00] + const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100)); + // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa] + const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140)); + // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] + const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000)); + const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000)); + const __m256i s4 = _mm256_xor_si256(s3, m0); +#undef simdutf_vec + + // 4. expand code units 16-bit => 32-bit + const __m256i out0 = _mm256_unpacklo_epi16(t2, s4); + const __m256i out1 = _mm256_unpackhi_epi16(t2, s4); + + // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle + const uint32_t mask = (one_byte_bitmask & 0x55555555) | + (one_or_two_bytes_bitmask & 0xaaaaaaaa); + // Due to the wider registers, the following path is less likely to be useful. + /*if(mask == 0) { + // We only have three-byte code units. Use fast path. 
+ const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); + const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle); + const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle); + _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0)); + utf8_output += 12; + _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1)); + utf8_output += 12; + _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1)); + utf8_output += 12; + _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1)); + utf8_output += 12; + buf += 16; + continue; + }*/ + const uint8_t mask0 = uint8_t(mask); + const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; + const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1)); + const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0); + const uint8_t mask1 = static_cast(mask >> 8); + const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; + const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1)); + const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1); -/* - Returns a pair: a result struct and utf8_output. - If there is an error, the count field of the result is the position of the error. - Otherwise, it is the position of the first unprocessed byte in buf (even if finished). - A scalar routing should carry on the conversion of the tail if needed. 
-*/ -template -std::pair avx2_convert_utf16_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) { - const char16_t* start = buf; - const char16_t* end = buf + len; - const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800); - const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800); + const uint8_t mask2 = static_cast(mask >> 16); + const uint8_t* row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0]; + const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1)); + const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0,1), shuffle2); - while (buf + 16 <= end) { - __m256i in = _mm256_loadu_si256((__m256i*)buf); - if (big_endian) { - const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, - 17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30); - in = _mm256_shuffle_epi8(in, swap); - } - // 1. Check if there are any surrogate word in the input chunk. - // We have also deal with situation when there is a surrogate word - // at the end of a chunk. - const __m256i surrogates_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800); + const uint8_t mask3 = static_cast(mask >> 24); + const uint8_t* row3 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0]; + const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1)); + const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1,1), shuffle3); - // bitmask = 0x0000 if there are no surrogates - // = 0xc000 if the last word is a surrogate - const uint32_t surrogates_bitmask = static_cast(_mm256_movemask_epi8(surrogates_bytemask)); - // It might seem like checking for surrogates_bitmask == 0xc000 could help. However, - // it is likely an uncommon occurrence. 
- if (surrogates_bitmask == 0x00000000) { - // case: we extend all sixteen 16-bit words to sixteen 32-bit words - _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output), _mm256_cvtepu16_epi32(_mm256_castsi256_si128(in))); - _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output + 8), _mm256_cvtepu16_epi32(_mm256_extractf128_si256(in,1))); - utf32_output += 16; - buf += 16; - // surrogate pair(s) in a register + _mm_storeu_si128((__m128i*)utf8_output, utf8_0); + utf8_output += row0[0]; + _mm_storeu_si128((__m128i*)utf8_output, utf8_1); + utf8_output += row1[0]; + _mm_storeu_si128((__m128i*)utf8_output, utf8_2); + utf8_output += row2[0]; + _mm_storeu_si128((__m128i*)utf8_output, utf8_3); + utf8_output += row3[0]; + buf += 16; } else { + // case: at least one 32-bit word is larger than 0xFFFF <=> it will produce four UTF-8 bytes. // Let us do a scalar fallback. // It may seem wasteful to use scalar code, but being efficient with SIMD - // in the presence of surrogate pairs may require non-trivial tables. + // may require large, non-trivial tables? size_t forward = 15; size_t k = 0; if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} for(; k < forward; k++) { - uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k]; - if((word &0xF800 ) != 0xD800) { - // No surrogate pair - *utf32_output++ = char32_t(word); - } else { - // must be a surrogate pair - uint16_t diff = uint16_t(word - 0xD800); - uint16_t next_word = big_endian ? 
scalar::utf16::swap_bytes(buf[k+1]) : buf[k+1]; - k++; - uint16_t diff2 = uint16_t(next_word - 0xDC00); - if((diff | diff2) > 0x3FF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k - 1), utf32_output); } - uint32_t value = (diff << 10) + diff2 + 0x10000; - *utf32_output++ = char32_t(value); + uint32_t word = buf[k]; + if((word & 0xFFFFFF80)==0) { // 1-byte (ASCII) + *utf8_output++ = char(word); + } else if((word & 0xFFFFF800)==0) { // 2-byte + *utf8_output++ = char((word>>6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else if((word & 0xFFFF0000 )==0) { // 3-byte + if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, utf8_output); } + *utf8_output++ = char((word>>12) | 0b11100000); + *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else { // 4-byte + if (word > 0x10FFFF) { return std::make_pair(nullptr, utf8_output); } + *utf8_output++ = char((word>>18) | 0b11110000); + *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); } } buf += k; } } // while - return std::make_pair(result(error_code::SUCCESS, buf - start), utf32_output); + + // check for invalid input + const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff); + if(static_cast(_mm256_movemask_epi8(_mm256_cmpeq_epi32(_mm256_max_epu32(running_max, v_10ffff), v_10ffff))) != 0xffffffff) { + return std::make_pair(nullptr, utf8_output); + } + + if (static_cast(_mm256_movemask_epi8(forbidden_bytemask)) != 0) { return std::make_pair(nullptr, utf8_output); } + + return std::make_pair(buf, utf8_output); } -/* end file src/haswell/avx2_convert_utf16_to_utf32.cpp */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=haswell/avx2_convert_utf32_to_utf8.cpp -/* begin file src/haswell/avx2_convert_utf32_to_utf8.cpp */ 
-std::pair avx2_convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) { + +std::pair avx2_convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) { const char32_t* end = buf + len; + const char32_t* start = buf; + const __m256i v_0000 = _mm256_setzero_si256(); const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000); const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80); const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800); const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080); const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff); - __m256i running_max = _mm256_setzero_si256(); - __m256i forbidden_bytemask = _mm256_setzero_si256(); + const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff); const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92 while (buf + 16 + safety_margin <= end) { __m256i in = _mm256_loadu_si256((__m256i*)buf); __m256i nextin = _mm256_loadu_si256((__m256i*)buf+1); - running_max = _mm256_max_epu32(_mm256_max_epu32(in, running_max), nextin); + // Check for too large input + const __m256i max_input = _mm256_max_epu32(_mm256_max_epu32(in, nextin), v_10ffff); + if(static_cast(_mm256_movemask_epi8(_mm256_cmpeq_epi32(max_input, v_10ffff))) != 0xffffffff) { + return std::make_pair(result(error_code::TOO_LARGE, buf - start), utf8_output); + } - // Pack 32-bit UTF-32 words to 16-bit UTF-16 words with unsigned saturation + // Pack 32-bit UTF-32 code units to 16-bit UTF-16 code units with unsigned saturation __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff), _mm256_and_si256(nextin, v_7fffffff)); in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000); @@ -20892,9 +24206,14 @@ std::pair avx2_convert_utf32_to_utf8(const char32_t* buf const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000); const uint32_t saturation_bitmask = 
static_cast(_mm256_movemask_epi8(saturation_bytemask)); if (saturation_bitmask == 0xffffffff) { - // case: words from register produce either 1, 2 or 3 UTF-8 bytes + // case: code units from register produce either 1, 2 or 3 UTF-8 bytes + + // Check for illegal surrogate code units const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800); - forbidden_bytemask = _mm256_or_si256(forbidden_bytemask, _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800)); + const __m256i forbidden_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800); + if (static_cast(_mm256_movemask_epi8(forbidden_bytemask)) != 0x0) { + return std::make_pair(result(error_code::SURROGATE, buf - start), utf8_output); + } const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e, @@ -20906,7 +24225,7 @@ std::pair avx2_convert_utf32_to_utf8(const char32_t* buf 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two UTF-8 bytes 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes - We expand the input word (16-bit) into two words (32-bit), thus + We expand the input word (16-bit) into two code units (32-bit), thus we have room for four bytes. However, we need five distinct bit layouts. Note that the last byte in cases #2 and #3 is the same. @@ -20917,7 +24236,7 @@ std::pair avx2_convert_utf32_to_utf8(const char32_t* buf either byte 1 for case #2 or byte 2 for case #3. Note that they differ by exactly one bit. - Finally from these two words we build proper UTF-8 sequence, taking + Finally from these two code units we build proper UTF-8 sequence, taking into account the case (i.e, the number of bytes to write). */ /** @@ -20945,16 +24264,16 @@ std::pair avx2_convert_utf32_to_utf8(const char32_t* buf const __m256i s4 = _mm256_xor_si256(s3, m0); #undef simdutf_vec - // 4. expand words 16-bit => 32-bit + // 4. 
expand code units 16-bit => 32-bit const __m256i out0 = _mm256_unpacklo_epi16(t2, s4); const __m256i out1 = _mm256_unpackhi_epi16(t2, s4); - // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle + // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle const uint32_t mask = (one_byte_bitmask & 0x55555555) | (one_or_two_bytes_bitmask & 0xaaaaaaaa); // Due to the wider registers, the following path is less likely to be useful. /*if(mask == 0) { - // We only have three-byte words. Use fast path. + // We only have three-byte code units. Use fast path. const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle); const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle); @@ -20985,545 +24304,757 @@ std::pair avx2_convert_utf32_to_utf8(const char32_t* buf const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0,1), shuffle2); - const uint8_t mask3 = static_cast(mask >> 24); - const uint8_t* row3 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0]; - const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1)); - const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1,1), shuffle3); + const uint8_t mask3 = static_cast(mask >> 24); + const uint8_t* row3 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0]; + const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1)); + const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1,1), shuffle3); + + _mm_storeu_si128((__m128i*)utf8_output, utf8_0); + utf8_output += row0[0]; + _mm_storeu_si128((__m128i*)utf8_output, utf8_1); + utf8_output += row1[0]; + _mm_storeu_si128((__m128i*)utf8_output, utf8_2); + utf8_output += row2[0]; + _mm_storeu_si128((__m128i*)utf8_output, utf8_3); + utf8_output += row3[0]; + buf += 16; + } else { + // case: at least one 32-bit word is larger than 0xFFFF <=> it will produce four UTF-8 
bytes. + // Let us do a scalar fallback. + // It may seem wasteful to use scalar code, but being efficient with SIMD + // may require large, non-trivial tables? + size_t forward = 15; + size_t k = 0; + if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} + for(; k < forward; k++) { + uint32_t word = buf[k]; + if((word & 0xFFFFFF80)==0) { // 1-byte (ASCII) + *utf8_output++ = char(word); + } else if((word & 0xFFFFF800)==0) { // 2-byte + *utf8_output++ = char((word>>6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else if((word & 0xFFFF0000 )==0) { // 3-byte + if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf8_output); } + *utf8_output++ = char((word>>12) | 0b11100000); + *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else { // 4-byte + if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), utf8_output); } + *utf8_output++ = char((word>>18) | 0b11110000); + *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } + } + buf += k; + } + } // while + + return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output); +} +/* end file src/haswell/avx2_convert_utf32_to_utf8.cpp */ +/* begin file src/haswell/avx2_convert_utf32_to_utf16.cpp */ +template +std::pair avx2_convert_utf32_to_utf16(const char32_t* buf, size_t len, char16_t* utf16_output) { + const char32_t* end = buf + len; + + const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92 + __m256i forbidden_bytemask = _mm256_setzero_si256(); + + + while (buf + 8 + safety_margin <= end) { + __m256i in = _mm256_loadu_si256((__m256i*)buf); + + const __m256i v_00000000 = _mm256_setzero_si256(); + const __m256i 
v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000); + + // no bits set above 16th bit <=> can pack to UTF16 without surrogate pairs + const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000); + const uint32_t saturation_bitmask = static_cast(_mm256_movemask_epi8(saturation_bytemask)); + + if (saturation_bitmask == 0xffffffff) { + const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800); + const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800); + forbidden_bytemask = _mm256_or_si256(forbidden_bytemask, _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800)); + + __m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in),_mm256_extractf128_si256(in,1)); + if (big_endian) { + const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + utf16_packed = _mm_shuffle_epi8(utf16_packed, swap); + } + _mm_storeu_si128((__m128i*)utf16_output, utf16_packed); + utf16_output += 8; + buf += 8; + } else { + size_t forward = 7; + size_t k = 0; + if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} + for(; k < forward; k++) { + uint32_t word = buf[k]; + if((word & 0xFFFF0000)==0) { + // will not generate a surrogate pair + if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, utf16_output); } + *utf16_output++ = big_endian ? 
char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word); + } else { + // will generate a surrogate pair + if (word > 0x10FFFF) { return std::make_pair(nullptr, utf16_output); } + word -= 0x10000; + uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); + uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); + if (big_endian) { + high_surrogate = uint16_t((high_surrogate >> 8) | (high_surrogate << 8)); + low_surrogate = uint16_t((low_surrogate >> 8) | (low_surrogate << 8)); + } + *utf16_output++ = char16_t(high_surrogate); + *utf16_output++ = char16_t(low_surrogate); + } + } + buf += k; + } + } + + // check for invalid input + if (static_cast(_mm256_movemask_epi8(forbidden_bytemask)) != 0) { return std::make_pair(nullptr, utf16_output); } + + return std::make_pair(buf, utf16_output); +} + + +template +std::pair avx2_convert_utf32_to_utf16_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) { + const char32_t* start = buf; + const char32_t* end = buf + len; + + const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92 + + while (buf + 8 + safety_margin <= end) { + __m256i in = _mm256_loadu_si256((__m256i*)buf); + + const __m256i v_00000000 = _mm256_setzero_si256(); + const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000); + + // no bits set above 16th bit <=> can pack to UTF16 without surrogate pairs + const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000); + const uint32_t saturation_bitmask = static_cast(_mm256_movemask_epi8(saturation_bytemask)); + + if (saturation_bitmask == 0xffffffff) { + const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800); + const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800); + const __m256i forbidden_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800); + if (static_cast(_mm256_movemask_epi8(forbidden_bytemask)) != 0x0) { + return 
std::make_pair(result(error_code::SURROGATE, buf - start), utf16_output); + } - _mm_storeu_si128((__m128i*)utf8_output, utf8_0); - utf8_output += row0[0]; - _mm_storeu_si128((__m128i*)utf8_output, utf8_1); - utf8_output += row1[0]; - _mm_storeu_si128((__m128i*)utf8_output, utf8_2); - utf8_output += row2[0]; - _mm_storeu_si128((__m128i*)utf8_output, utf8_3); - utf8_output += row3[0]; - buf += 16; + __m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in),_mm256_extractf128_si256(in,1)); + if (big_endian) { + const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + utf16_packed = _mm_shuffle_epi8(utf16_packed, swap); + } + _mm_storeu_si128((__m128i*)utf16_output, utf16_packed); + utf16_output += 8; + buf += 8; } else { - // case: at least one 32-bit word is larger than 0xFFFF <=> it will produce four UTF-8 bytes. - // Let us do a scalar fallback. - // It may seem wasteful to use scalar code, but being efficient with SIMD - // may require large, non-trivial tables? 
- size_t forward = 15; + size_t forward = 7; size_t k = 0; if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} for(; k < forward; k++) { uint32_t word = buf[k]; - if((word & 0xFFFFFF80)==0) { // 1-byte (ASCII) - *utf8_output++ = char(word); - } else if((word & 0xFFFFF800)==0) { // 2-byte - *utf8_output++ = char((word>>6) | 0b11000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - } else if((word & 0xFFFF0000 )==0) { // 3-byte - if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, utf8_output); } - *utf8_output++ = char((word>>12) | 0b11100000); - *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - } else { // 4-byte - if (word > 0x10FFFF) { return std::make_pair(nullptr, utf8_output); } - *utf8_output++ = char((word>>18) | 0b11110000); - *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000); - *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); + if((word & 0xFFFF0000)==0) { + // will not generate a surrogate pair + if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf16_output); } + *utf16_output++ = big_endian ? 
char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word); + } else { + // will generate a surrogate pair + if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), utf16_output); } + word -= 0x10000; + uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); + uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); + if (big_endian) { + high_surrogate = uint16_t((high_surrogate >> 8) | (high_surrogate << 8)); + low_surrogate = uint16_t((low_surrogate >> 8) | (low_surrogate << 8)); + } + *utf16_output++ = char16_t(high_surrogate); + *utf16_output++ = char16_t(low_surrogate); } } buf += k; } - } // while - - // check for invalid input - const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff); - if(static_cast(_mm256_movemask_epi8(_mm256_cmpeq_epi32(_mm256_max_epu32(running_max, v_10ffff), v_10ffff))) != 0xffffffff) { - return std::make_pair(nullptr, utf8_output); } - if (static_cast(_mm256_movemask_epi8(forbidden_bytemask)) != 0) { return std::make_pair(nullptr, utf8_output); } - - return std::make_pair(buf, utf8_output); + return std::make_pair(result(error_code::SUCCESS, buf - start), utf16_output); } +/* end file src/haswell/avx2_convert_utf32_to_utf16.cpp */ +/* begin file src/haswell/avx2_convert_utf8_to_latin1.cpp */ +// depends on "tables/utf8_to_utf16_tables.h" -std::pair avx2_convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) { - const char32_t* end = buf + len; - const char32_t* start = buf; +// Convert up to 12 bytes from utf8 to latin1 using a mask indicating the +// end of the code points. Only the least significant 12 bits of the mask +// are accessed. +// It returns how many bytes were consumed (up to 12). +size_t convert_masked_utf8_to_latin1(const char *input, + uint64_t utf8_end_of_code_point_mask, + char *&latin1_output) { + // we use an approach where we try to process up to 12 input bytes. + // Why 12 input bytes and not 16? 
Because we are concerned with the size of + // the lookup tables. Also 12 is nicely divisible by two and three. + // + // + // Optimization note: our main path below is load-latency dependent. Thus it is maybe + // beneficial to have fast paths that depend on branch prediction but have less latency. + // This results in more instructions but, potentially, also higher speeds. + // + const __m128i in = _mm_loadu_si128((__m128i *)input); + const __m128i in_second_half = _mm_loadu_si128((__m128i *)(input + 16)); - const __m256i v_0000 = _mm256_setzero_si256(); - const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000); - const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80); - const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800); - const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080); - const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff); - const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff); + const uint16_t input_utf8_end_of_code_point_mask = + utf8_end_of_code_point_mask & 0xfff; //we're only processing 12 bytes in case it`s not all ASCII - const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92 + if((input_utf8_end_of_code_point_mask & 0xffffffff) == 0xffffffff) { + // Load the next 128 bits. - while (buf + 16 + safety_margin <= end) { - __m256i in = _mm256_loadu_si256((__m256i*)buf); - __m256i nextin = _mm256_loadu_si256((__m256i*)buf+1); - // Check for too large input - const __m256i max_input = _mm256_max_epu32(_mm256_max_epu32(in, nextin), v_10ffff); - if(static_cast(_mm256_movemask_epi8(_mm256_cmpeq_epi32(max_input, v_10ffff))) != 0xffffffff) { - return std::make_pair(result(error_code::TOO_LARGE, buf - start), utf8_output); - } + // Combine the two 128-bit registers into a single 256-bit register. 
+ __m256i in_combined = _mm256_set_m128i(in_second_half, in); - // Pack 32-bit UTF-32 words to 16-bit UTF-16 words with unsigned saturation - __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff), _mm256_and_si256(nextin, v_7fffffff)); - in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000); + // We process the data in chunks of 32 bytes. + _mm256_storeu_si256(reinterpret_cast<__m256i *>(latin1_output), in_combined); - // Try to apply UTF-16 => UTF-8 routine on 256 bits (haswell/avx2_convert_utf16_to_utf8.cpp) + latin1_output += 32; // We wrote 32 characters. + return 32; // We consumed 32 bytes. + } - if(_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!! - // 1. pack the bytes - const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in_16),_mm256_extractf128_si256(in_16,1)); - // 2. store (16 bytes) - _mm_storeu_si128((__m128i*)utf8_output, utf8_packed); - // 3. adjust pointers - buf += 16; - utf8_output += 16; - continue; // we are done for this round! - } - // no bits set above 7th bit - const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000); - const uint32_t one_byte_bitmask = static_cast(_mm256_movemask_epi8(one_byte_bytemask)); - // no bits set above 11th bit - const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000); - const uint32_t one_or_two_bytes_bitmask = static_cast(_mm256_movemask_epi8(one_or_two_bytes_bytemask)); - if (one_or_two_bytes_bitmask == 0xffffffff) { - // 1. prepare 2-byte values - // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 - // expected output : [110a|aaaa|10bb|bbbb] x 8 - const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00); - const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f); + if(((utf8_end_of_code_point_mask & 0xffff) == 0xffff)) { + // We process the data in chunks of 16 bytes. + _mm_storeu_si128(reinterpret_cast<__m128i *>(latin1_output), in); + latin1_output += 16; // We wrote 16 characters. 
+ return 16; // We consumed 16 bytes. + } + /// We do not have a fast path available, so we fallback. + const uint8_t idx = + tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0]; + const uint8_t consumed = + tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1]; + // this indicates an invalid input: + if(idx >= 64) { return consumed; } + // Here we should have (idx < 64), if not, there is a bug in the validation or elsewhere. + // SIX (6) input code-code units + // this is a relatively easy scenario + // we process SIX (6) input code-code units. The max length in bytes of six code + // code units spanning between 1 and 2 bytes each is 12 bytes. On processors + // where pdep/pext is fast, we might be able to use a small lookup table. + const __m128i sh = + _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f)); + const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00)); + __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2)); + const __m128i latin1_packed = _mm_packus_epi16(composed,composed); + // writing 8 bytes even though we only care about the first 6 bytes. + // performance note: it would be faster to use _mm_storeu_si128, we should investigate. + _mm_storel_epi64((__m128i *)latin1_output, latin1_packed); + latin1_output += 6; // We wrote 6 bytes. 
+ return consumed; +} +/* end file src/haswell/avx2_convert_utf8_to_latin1.cpp */ - // t0 = [000a|aaaa|bbbb|bb00] - const __m256i t0 = _mm256_slli_epi16(in_16, 2); - // t1 = [000a|aaaa|0000|0000] - const __m256i t1 = _mm256_and_si256(t0, v_1f00); - // t2 = [0000|0000|00bb|bbbb] - const __m256i t2 = _mm256_and_si256(in_16, v_003f); - // t3 = [000a|aaaa|00bb|bbbb] - const __m256i t3 = _mm256_or_si256(t1, t2); - // t4 = [110a|aaaa|10bb|bbbb] - const __m256i t4 = _mm256_or_si256(t3, v_c080); +} // unnamed namespace +} // namespace haswell +} // namespace simdutf - // 2. merge ASCII and 2-byte codewords - const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in_16, one_byte_bytemask); +/* begin file src/generic/buf_block_reader.h */ +namespace simdutf { +namespace haswell { +namespace { - // 3. prepare bitmask for 8-bit lookup - const uint32_t M0 = one_byte_bitmask & 0x55555555; - const uint32_t M1 = M0 >> 7; - const uint32_t M2 = (M1 | M0) & 0x00ff00ff; - // 4. pack the bytes +// Walks through a buffer in block-sized increments, loading the last part with spaces +template +struct buf_block_reader { +public: + simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len); + simdutf_really_inline size_t block_index(); + simdutf_really_inline bool has_full_block() const; + simdutf_really_inline const uint8_t *full_block() const; + /** + * Get the last block, padded with spaces. + * + * There will always be a last block, with at least 1 byte, unless len == 0 (in which case this + * function fills the buffer with spaces and returns 0. In particular, if len == STEP_SIZE there + * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding. + * + * @return the number of effective characters in the last block. 
+ */ + simdutf_really_inline size_t get_remainder(uint8_t *dst) const; + simdutf_really_inline void advance(); +private: + const uint8_t *buf; + const size_t len; + const size_t lenminusstep; + size_t idx; +}; - const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0]; - const uint8_t* row_2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2>>16)][0]; +// Routines to print masks and text for debugging bitmask operations +simdutf_unused static char * format_input_text_64(const uint8_t *text) { + static char *buf = reinterpret_cast(malloc(sizeof(simd8x64) + 1)); + for (size_t i=0; i); i++) { + buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]); + } + buf[sizeof(simd8x64)] = '\0'; + return buf; +} - const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1)); - const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1)); +// Routines to print masks and text for debugging bitmask operations +simdutf_unused static char * format_input_text(const simd8x64& in) { + static char *buf = reinterpret_cast(malloc(sizeof(simd8x64) + 1)); + in.store(reinterpret_cast(buf)); + for (size_t i=0; i); i++) { + if (buf[i] < ' ') { buf[i] = '_'; } + } + buf[sizeof(simd8x64)] = '\0'; + return buf; +} - const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle,shuffle_2)); - // 5. store bytes - _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed)); - utf8_output += row[0]; - _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed,1)); - utf8_output += row_2[0]; +simdutf_unused static char * format_mask(uint64_t mask) { + static char *buf = reinterpret_cast(malloc(64 + 1)); + for (size_t i=0; i<64; i++) { + buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' '; + } + buf[64] = '\0'; + return buf; +} - // 6. 
adjust pointers - buf += 16; - continue; - } - // Must check for overflow in packing - const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000); - const uint32_t saturation_bitmask = static_cast(_mm256_movemask_epi8(saturation_bytemask)); - if (saturation_bitmask == 0xffffffff) { - // case: words from register produce either 1, 2 or 3 UTF-8 bytes +template +simdutf_really_inline buf_block_reader::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {} - // Check for illegal surrogate words - const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800); - const __m256i forbidden_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800); - if (static_cast(_mm256_movemask_epi8(forbidden_bytemask)) != 0x0) { - return std::make_pair(result(error_code::SURROGATE, buf - start), utf8_output); - } +template +simdutf_really_inline size_t buf_block_reader::block_index() { return idx; } - const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606, - 0x0808, 0x0a0a, 0x0c0c, 0x0e0e, - 0x0000, 0x0202, 0x0404, 0x0606, - 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); +template +simdutf_really_inline bool buf_block_reader::has_full_block() const { + return idx < lenminusstep; +} - /* In this branch we handle three cases: - 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - single UFT-8 byte - 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two UTF-8 bytes - 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes +template +simdutf_really_inline const uint8_t *buf_block_reader::full_block() const { + return &buf[idx]; +} - We expand the input word (16-bit) into two words (32-bit), thus - we have room for four bytes. However, we need five distinct bit - layouts. Note that the last byte in cases #2 and #3 is the same. 
+template +simdutf_really_inline size_t buf_block_reader::get_remainder(uint8_t *dst) const { + if(len == idx) { return 0; } // memcpy(dst, null, 0) will trigger an error with some sanitizers + std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once. + std::memcpy(dst, buf + idx, len - idx); + return len - idx; +} - We precompute byte 1 for case #1 and the common byte for cases #2 & #3 - in register t2. +template +simdutf_really_inline void buf_block_reader::advance() { + idx += STEP_SIZE; +} - We precompute byte 1 for case #3 and -- **conditionally** -- precompute - either byte 1 for case #2 or byte 2 for case #3. Note that they - differ by exactly one bit. +} // unnamed namespace +} // namespace haswell +} // namespace simdutf +/* end file src/generic/buf_block_reader.h */ +/* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */ +namespace simdutf { +namespace haswell { +namespace { +namespace utf8_validation { - Finally from these two words we build proper UTF-8 sequence, taking - into account the case (i.e, the number of bytes to write). 
- */ - /** - * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: - * t2 => [0ccc|cccc] [10cc|cccc] - * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) - */ -#define simdutf_vec(x) _mm256_set1_epi16(static_cast(x)) - // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] - const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even); - // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] - const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111)); - // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] - const __m256i t2 = _mm256_or_si256 (t1, simdutf_vec(0b1000000000000000)); +using namespace simd; - // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc] - const __m256i s0 = _mm256_srli_epi16(in_16, 4); - // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00] - const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100)); - // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa] - const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140)); - // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] - const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000)); - const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000)); - const __m256i s4 = _mm256_xor_si256(s3, m0); -#undef simdutf_vec + simdutf_really_inline simd8 check_special_cases(const simd8 input, const simd8 prev1) { +// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) +// Bit 1 = Too Long (ASCII followed by continuation) +// Bit 2 = Overlong 3-byte +// Bit 4 = Surrogate +// Bit 5 = Overlong 2-byte +// Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1<<0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1<<1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1<<2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1<<4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1<<5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1<<7; // 10______ 10______ + constexpr 
const uint8_t TOO_LARGE = 1<<3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1<<6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1<<6; // 11110000 1000____ - // 4. expand words 16-bit => 32-bit - const __m256i out0 = _mm256_unpacklo_epi16(t2, s4); - const __m256i out1 = _mm256_unpackhi_epi16(t2, s4); + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + TOO_SHORT, + // 1110____ ________ + TOO_SHORT | OVERLONG_3 | SURROGATE, + // 1111____ ________ + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4 + ); + constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . + const simd8 byte_1_low = (prev1 & 0x0F).lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, + CARRY, - // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle - const uint32_t mask = (one_byte_bitmask & 0x55555555) | - (one_or_two_bytes_bitmask & 0xaaaaaaaa); - // Due to the wider registers, the following path is less likely to be useful. - /*if(mask == 0) { - // We only have three-byte words. Use fast path. 
- const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); - const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle); - const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle); - _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0)); - utf8_output += 12; - _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1)); - utf8_output += 12; - _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1)); - utf8_output += 12; - _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1)); - utf8_output += 12; - buf += 16; - continue; - }*/ - const uint8_t mask0 = uint8_t(mask); - const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; - const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1)); - const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0); + // ____0100 ________ + CARRY | TOO_LARGE, + // ____0101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____011_ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, - const uint8_t mask1 = static_cast(mask >> 8); - const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; - const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1)); - const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1); + // ____1___ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____1101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000 + ); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, - const uint8_t mask2 
= static_cast(mask >> 16); - const uint8_t* row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0]; - const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1)); - const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0,1), shuffle2); + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + // ________ 11______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT + ); + return (byte_1_high & byte_1_low & byte_2_high); + } + simdutf_really_inline simd8 check_multibyte_lengths(const simd8 input, + const simd8 prev_input, const simd8 sc) { + simd8 prev2 = input.prev<2>(prev_input); + simd8 prev3 = input.prev<3>(prev_input); + simd8 must23 = simd8(must_be_2_3_continuation(prev2, prev3)); + simd8 must23_80 = must23 & uint8_t(0x80); + return must23_80 ^ sc; + } - const uint8_t mask3 = static_cast(mask >> 24); - const uint8_t* row3 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0]; - const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1)); - const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1,1), shuffle3); + // + // Return nonzero if there are incomplete multibyte characters at the end of the block: + // e.g. if there is a 4-byte character, but it's 3 bytes from the end. + // + simdutf_really_inline simd8 is_incomplete(const simd8 input) { + // If the previous input's last 3 bytes match this, they're too short (they ended at EOF): + // ... 
1111____ 111_____ 11______ + static const uint8_t max_array[32] = { + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 0b11110000u-1, 0b11100000u-1, 0b11000000u-1 + }; + const simd8 max_value(&max_array[sizeof(max_array)-sizeof(simd8)]); + return input.gt_bits(max_value); + } - _mm_storeu_si128((__m128i*)utf8_output, utf8_0); - utf8_output += row0[0]; - _mm_storeu_si128((__m128i*)utf8_output, utf8_1); - utf8_output += row1[0]; - _mm_storeu_si128((__m128i*)utf8_output, utf8_2); - utf8_output += row2[0]; - _mm_storeu_si128((__m128i*)utf8_output, utf8_3); - utf8_output += row3[0]; - buf += 16; - } else { - // case: at least one 32-bit word is larger than 0xFFFF <=> it will produce four UTF-8 bytes. - // Let us do a scalar fallback. - // It may seem wasteful to use scalar code, but being efficient with SIMD - // may require large, non-trivial tables? - size_t forward = 15; - size_t k = 0; - if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} - for(; k < forward; k++) { - uint32_t word = buf[k]; - if((word & 0xFFFFFF80)==0) { // 1-byte (ASCII) - *utf8_output++ = char(word); - } else if((word & 0xFFFFF800)==0) { // 2-byte - *utf8_output++ = char((word>>6) | 0b11000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - } else if((word & 0xFFFF0000 )==0) { // 3-byte - if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf8_output); } - *utf8_output++ = char((word>>12) | 0b11100000); - *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - } else { // 4-byte - if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), utf8_output); } - *utf8_output++ = char((word>>18) | 0b11110000); - *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000); - *utf8_output++ = char(((word>>6) & 0b111111) 
| 0b10000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); + struct utf8_checker { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; + // The last input we received + simd8 prev_input_block; + // Whether the last input we received was incomplete (used for ASCII fast path) + simd8 prev_incomplete; + + // + // Check whether the current bytes are valid UTF-8. + // + simdutf_really_inline void check_utf8_bytes(const simd8 input, const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes + // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + simd8 sc = check_special_cases(input, prev1); + this->error |= check_multibyte_lengths(input, prev_input, sc); + } + + // The only problem that can happen at EOF is that a multibyte character is too short + // or a byte value too large in the last bytes: check_special_cases only checks for bytes + // too large in the first of two bytes. + simdutf_really_inline void check_eof() { + // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't + // possibly finish them. + this->error |= this->prev_incomplete; + } + + simdutf_really_inline void check_next_input(const simd8x64& input) { + if(simdutf_likely(is_ascii(input))) { + this->error |= this->prev_incomplete; + } else { + // you might think that a for-loop would work, but under Visual Studio, it is not good enough. 
+ static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + if(simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], this->prev_input_block); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if(simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], this->prev_input_block); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); } + this->prev_incomplete = is_incomplete(input.chunks[simd8x64::NUM_CHUNKS-1]); + this->prev_input_block = input.chunks[simd8x64::NUM_CHUNKS-1]; + } - buf += k; } - } // while - - return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output); -} -/* end file src/haswell/avx2_convert_utf32_to_utf8.cpp */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=haswell/avx2_convert_utf32_to_utf16.cpp -/* begin file src/haswell/avx2_convert_utf32_to_utf16.cpp */ -template -std::pair avx2_convert_utf32_to_utf16(const char32_t* buf, size_t len, char16_t* utf16_output) { - const char32_t* end = buf + len; - const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92 - __m256i forbidden_bytemask = _mm256_setzero_si256(); + // do not forget to call check_eof! 
+ simdutf_really_inline bool errors() const { + return this->error.any_bits_set_anywhere(); + } + }; // struct utf8_checker +} // namespace utf8_validation - while (buf + 8 + safety_margin <= end) { - __m256i in = _mm256_loadu_si256((__m256i*)buf); +using utf8_validation::utf8_checker; - const __m256i v_00000000 = _mm256_setzero_si256(); - const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000); +} // unnamed namespace +} // namespace haswell +} // namespace simdutf +/* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */ +/* begin file src/generic/utf8_validation/utf8_validator.h */ +namespace simdutf { +namespace haswell { +namespace { +namespace utf8_validation { - // no bits set above 16th bit <=> can pack to UTF16 without surrogate pairs - const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000); - const uint32_t saturation_bitmask = static_cast(_mm256_movemask_epi8(saturation_bytemask)); +/** + * Validates that the string is actual UTF-8. 
+ */ +template +bool generic_validate_utf8(const uint8_t * input, size_t length) { + checker c{}; + buf_block_reader<64> reader(input, length); + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + c.check_next_input(in); + reader.advance(); + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + c.check_next_input(in); + reader.advance(); + c.check_eof(); + return !c.errors(); +} - if (saturation_bitmask == 0xffffffff) { - const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800); - const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800); - forbidden_bytemask = _mm256_or_si256(forbidden_bytemask, _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800)); +bool generic_validate_utf8(const char * input, size_t length) { + return generic_validate_utf8(reinterpret_cast(input),length); +} - __m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in),_mm256_extractf128_si256(in,1)); - if (big_endian) { - const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); - utf16_packed = _mm_shuffle_epi8(utf16_packed, swap); +/** + * Validates that the string is actual UTF-8 and stops on errors. 
+ */ +template +result generic_validate_utf8_with_errors(const uint8_t * input, size_t length) { + checker c{}; + buf_block_reader<64> reader(input, length); + size_t count{0}; + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + c.check_next_input(in); + if(c.errors()) { + if (count != 0) { count--; } // Sometimes the error is only detected in the next chunk + result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast(input), reinterpret_cast(input + count), length - count); + res.count += count; + return res; } - _mm_storeu_si128((__m128i*)utf16_output, utf16_packed); - utf16_output += 8; - buf += 8; + reader.advance(); + count += 64; + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + c.check_next_input(in); + reader.advance(); + c.check_eof(); + if (c.errors()) { + if (count != 0) { count--; } // Sometimes the error is only detected in the next chunk + result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast(input), reinterpret_cast(input) + count, length - count); + res.count += count; + return res; } else { - size_t forward = 7; - size_t k = 0; - if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} - for(; k < forward; k++) { - uint32_t word = buf[k]; - if((word & 0xFFFF0000)==0) { - // will not generate a surrogate pair - if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, utf16_output); } - *utf16_output++ = big_endian ? 
char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word); - } else { - // will generate a surrogate pair - if (word > 0x10FFFF) { return std::make_pair(nullptr, utf16_output); } - word -= 0x10000; - uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); - uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); - if (big_endian) { - high_surrogate = uint16_t((high_surrogate >> 8) | (high_surrogate << 8)); - low_surrogate = uint16_t((low_surrogate >> 8) | (low_surrogate << 8)); - } - *utf16_output++ = char16_t(high_surrogate); - *utf16_output++ = char16_t(low_surrogate); - } - } - buf += k; + return result(error_code::SUCCESS, length); } - } - - // check for invalid input - if (static_cast(_mm256_movemask_epi8(forbidden_bytemask)) != 0) { return std::make_pair(nullptr, utf16_output); } - - return std::make_pair(buf, utf16_output); } +result generic_validate_utf8_with_errors(const char * input, size_t length) { + return generic_validate_utf8_with_errors(reinterpret_cast(input),length); +} -template -std::pair avx2_convert_utf32_to_utf16_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) { - const char32_t* start = buf; - const char32_t* end = buf + len; - - const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92 - - while (buf + 8 + safety_margin <= end) { - __m256i in = _mm256_loadu_si256((__m256i*)buf); - - const __m256i v_00000000 = _mm256_setzero_si256(); - const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000); - - // no bits set above 16th bit <=> can pack to UTF16 without surrogate pairs - const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000); - const uint32_t saturation_bitmask = static_cast(_mm256_movemask_epi8(saturation_bytemask)); +template +bool generic_validate_ascii(const uint8_t * input, size_t length) { + buf_block_reader<64> reader(input, length); + uint8_t blocks[64]{}; + simd::simd8x64 
running_or(blocks); + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + running_or |= in; + reader.advance(); + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + running_or |= in; + return running_or.is_ascii(); +} - if (saturation_bitmask == 0xffffffff) { - const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800); - const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800); - const __m256i forbidden_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800); - if (static_cast(_mm256_movemask_epi8(forbidden_bytemask)) != 0x0) { - return std::make_pair(result(error_code::SURROGATE, buf - start), utf16_output); - } +bool generic_validate_ascii(const char * input, size_t length) { + return generic_validate_ascii(reinterpret_cast(input),length); +} - __m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in),_mm256_extractf128_si256(in,1)); - if (big_endian) { - const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); - utf16_packed = _mm_shuffle_epi8(utf16_packed, swap); - } - _mm_storeu_si128((__m128i*)utf16_output, utf16_packed); - utf16_output += 8; - buf += 8; - } else { - size_t forward = 7; - size_t k = 0; - if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} - for(; k < forward; k++) { - uint32_t word = buf[k]; - if((word & 0xFFFF0000)==0) { - // will not generate a surrogate pair - if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf16_output); } - *utf16_output++ = big_endian ? 
char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word); - } else { - // will generate a surrogate pair - if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), utf16_output); } - word -= 0x10000; - uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); - uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); - if (big_endian) { - high_surrogate = uint16_t((high_surrogate >> 8) | (high_surrogate << 8)); - low_surrogate = uint16_t((low_surrogate >> 8) | (low_surrogate << 8)); - } - *utf16_output++ = char16_t(high_surrogate); - *utf16_output++ = char16_t(low_surrogate); - } - } - buf += k; +template +result generic_validate_ascii_with_errors(const uint8_t * input, size_t length) { + buf_block_reader<64> reader(input, length); + size_t count{0}; + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + if (!in.is_ascii()) { + result res = scalar::ascii::validate_with_errors(reinterpret_cast(input + count), length - count); + return result(res.error, count + res.count); } + reader.advance(); + + count += 64; + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + if (!in.is_ascii()) { + result res = scalar::ascii::validate_with_errors(reinterpret_cast(input + count), length - count); + return result(res.error, count + res.count); + } else { + return result(error_code::SUCCESS, length); } +} - return std::make_pair(result(error_code::SUCCESS, buf - start), utf16_output); +result generic_validate_ascii_with_errors(const char * input, size_t length) { + return generic_validate_ascii_with_errors(reinterpret_cast(input),length); } -/* end file src/haswell/avx2_convert_utf32_to_utf16.cpp */ + +} // namespace utf8_validation } // unnamed namespace } // namespace haswell } // namespace simdutf +/* end file src/generic/utf8_validation/utf8_validator.h */ +// transcoding from UTF-8 to UTF-16 +/* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */ + -// 
dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/buf_block_reader.h -/* begin file src/generic/buf_block_reader.h */ namespace simdutf { namespace haswell { namespace { +namespace utf8_to_utf16 { -// Walks through a buffer in block-sized increments, loading the last part with spaces -template -struct buf_block_reader { -public: - simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len); - simdutf_really_inline size_t block_index(); - simdutf_really_inline bool has_full_block() const; - simdutf_really_inline const uint8_t *full_block() const; - /** - * Get the last block, padded with spaces. - * - * There will always be a last block, with at least 1 byte, unless len == 0 (in which case this - * function fills the buffer with spaces and returns 0. In particular, if len == STEP_SIZE there - * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding. - * - * @return the number of effective characters in the last block. - */ - simdutf_really_inline size_t get_remainder(uint8_t *dst) const; - simdutf_really_inline void advance(); -private: - const uint8_t *buf; - const size_t len; - const size_t lenminusstep; - size_t idx; -}; - -// Routines to print masks and text for debugging bitmask operations -simdutf_unused static char * format_input_text_64(const uint8_t *text) { - static char *buf = reinterpret_cast(malloc(sizeof(simd8x64) + 1)); - for (size_t i=0; i); i++) { - buf[i] = int8_t(text[i]) < ' ' ? 
'_' : int8_t(text[i]); - } - buf[sizeof(simd8x64)] = '\0'; - return buf; -} - -// Routines to print masks and text for debugging bitmask operations -simdutf_unused static char * format_input_text(const simd8x64& in) { - static char *buf = reinterpret_cast(malloc(sizeof(simd8x64) + 1)); - in.store(reinterpret_cast(buf)); - for (size_t i=0; i); i++) { - if (buf[i] < ' ') { buf[i] = '_'; } - } - buf[sizeof(simd8x64)] = '\0'; - return buf; -} +using namespace simd; -simdutf_unused static char * format_mask(uint64_t mask) { - static char *buf = reinterpret_cast(malloc(64 + 1)); - for (size_t i=0; i<64; i++) { - buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' '; +template +simdutf_warn_unused size_t convert_valid(const char* input, size_t size, + char16_t* utf16_output) noexcept { + // The implementation is not specific to haswell and should be moved to the generic directory. + size_t pos = 0; + char16_t* start{utf16_output}; + const size_t safety_margin = 16; // to avoid overruns! + while(pos + 64 + safety_margin <= size) { + // this loop could be unrolled further. For example, we could process the mask + // far more than 64 bytes. + simd8x64 in(reinterpret_cast(input + pos)); + if(in.is_ascii()) { + in.store_ascii_as_utf16(utf16_output); + utf16_output += 64; + pos += 64; + } else { + // Slow path. We hope that the compiler will recognize that this is a slow path. + // Anything that is not a continuation mask is a 'leading byte', that is, the + // start of a new code point. + uint64_t utf8_continuation_mask = in.lt(-65 + 1); + // -65 is 0b10111111 in two-complement's, so largest possible continuation byte + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + // The *start* of code points is not so useful, rather, we want the *end* of code points. + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. 
For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times when using solely + // the slow/regular path, and at least four times if there are fast paths. + while(pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + // + // Thus we may allow convert_masked_utf8_to_utf16 to process + // more bytes at a time under a fast-path mode where 16 bytes + // are consumed at once (e.g., when encountering ASCII). + size_t consumed = convert_masked_utf8_to_utf16(input + pos, + utf8_end_of_code_point_mask, utf16_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } } - buf[64] = '\0'; - return buf; -} - -template -simdutf_really_inline buf_block_reader::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 
0 : len - STEP_SIZE}, idx{0} {} - -template -simdutf_really_inline size_t buf_block_reader::block_index() { return idx; } - -template -simdutf_really_inline bool buf_block_reader::has_full_block() const { - return idx < lenminusstep; -} - -template -simdutf_really_inline const uint8_t *buf_block_reader::full_block() const { - return &buf[idx]; -} - -template -simdutf_really_inline size_t buf_block_reader::get_remainder(uint8_t *dst) const { - if(len == idx) { return 0; } // memcpy(dst, null, 0) will trigger an error with some sanitizers - std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once. - std::memcpy(dst, buf + idx, len - idx); - return len - idx; -} - -template -simdutf_really_inline void buf_block_reader::advance() { - idx += STEP_SIZE; + utf16_output += scalar::utf8_to_utf16::convert_valid(input + pos, size - pos, utf16_output); + return utf16_output - start; } +} // namespace utf8_to_utf16 } // unnamed namespace } // namespace haswell } // namespace simdutf -/* end file src/generic/buf_block_reader.h */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_validation/utf8_lookup4_algorithm.h -/* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */ +/* end file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */ +/* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */ + + namespace simdutf { namespace haswell { namespace { -namespace utf8_validation { - +namespace utf8_to_utf16 { using namespace simd; + simdutf_really_inline simd8 check_special_cases(const simd8 input, const simd8 prev1) { // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) // Bit 1 = Too Long (ASCII followed by continuation) @@ -21615,306 +25146,264 @@ using namespace simd; return (byte_1_high & byte_1_low & byte_2_high); } simdutf_really_inline simd8 check_multibyte_lengths(const simd8 input, - const simd8 prev_input, const simd8 sc) { - simd8 prev2 
= input.prev<2>(prev_input); - simd8 prev3 = input.prev<3>(prev_input); - simd8 must23 = simd8(must_be_2_3_continuation(prev2, prev3)); - simd8 must23_80 = must23 & uint8_t(0x80); - return must23_80 ^ sc; - } - - // - // Return nonzero if there are incomplete multibyte characters at the end of the block: - // e.g. if there is a 4-byte character, but it's 3 bytes from the end. - // - simdutf_really_inline simd8 is_incomplete(const simd8 input) { - // If the previous input's last 3 bytes match this, they're too short (they ended at EOF): - // ... 1111____ 111_____ 11______ - static const uint8_t max_array[32] = { - 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 0b11110000u-1, 0b11100000u-1, 0b11000000u-1 - }; - const simd8 max_value(&max_array[sizeof(max_array)-sizeof(simd8)]); - return input.gt_bits(max_value); - } - - struct utf8_checker { - // If this is nonzero, there has been a UTF-8 error. - simd8 error; - // The last input we received - simd8 prev_input_block; - // Whether the last input we received was incomplete (used for ASCII fast path) - simd8 prev_incomplete; - - // - // Check whether the current bytes are valid UTF-8. - // - simdutf_really_inline void check_utf8_bytes(const simd8 input, const simd8 prev_input) { - // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes - // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers) - simd8 prev1 = input.prev<1>(prev_input); - simd8 sc = check_special_cases(input, prev1); - this->error |= check_multibyte_lengths(input, prev_input, sc); - } - - // The only problem that can happen at EOF is that a multibyte character is too short - // or a byte value too large in the last bytes: check_special_cases only checks for bytes - // too large in the first of two bytes. 
- simdutf_really_inline void check_eof() { - // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't - // possibly finish them. - this->error |= this->prev_incomplete; - } - - simdutf_really_inline void check_next_input(const simd8x64& input) { - if(simdutf_likely(is_ascii(input))) { - this->error |= this->prev_incomplete; - } else { - // you might think that a for-loop would work, but under Visual Studio, it is not good enough. - static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), - "We support either two or four chunks per 64-byte block."); - if(simd8x64::NUM_CHUNKS == 2) { - this->check_utf8_bytes(input.chunks[0], this->prev_input_block); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - } else if(simd8x64::NUM_CHUNKS == 4) { - this->check_utf8_bytes(input.chunks[0], this->prev_input_block); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - this->check_utf8_bytes(input.chunks[2], input.chunks[1]); - this->check_utf8_bytes(input.chunks[3], input.chunks[2]); - } - this->prev_incomplete = is_incomplete(input.chunks[simd8x64::NUM_CHUNKS-1]); - this->prev_input_block = input.chunks[simd8x64::NUM_CHUNKS-1]; - - } - } - - // do not forget to call check_eof! 
- simdutf_really_inline bool errors() const { - return this->error.any_bits_set_anywhere(); - } - - }; // struct utf8_checker -} // namespace utf8_validation + const simd8 prev_input, const simd8 sc) { + simd8 prev2 = input.prev<2>(prev_input); + simd8 prev3 = input.prev<3>(prev_input); + simd8 must23 = simd8(must_be_2_3_continuation(prev2, prev3)); + simd8 must23_80 = must23 & uint8_t(0x80); + return must23_80 ^ sc; + } -using utf8_validation::utf8_checker; -} // unnamed namespace -} // namespace haswell -} // namespace simdutf -/* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_validation/utf8_validator.h -/* begin file src/generic/utf8_validation/utf8_validator.h */ -namespace simdutf { -namespace haswell { -namespace { -namespace utf8_validation { + struct validating_transcoder { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; -/** - * Validates that the string is actual UTF-8. - */ -template -bool generic_validate_utf8(const uint8_t * input, size_t length) { - checker c{}; - buf_block_reader<64> reader(input, length); - while (reader.has_full_block()) { - simd::simd8x64 in(reader.full_block()); - c.check_next_input(in); - reader.advance(); + validating_transcoder() : error(uint8_t(0)) {} + // + // Check whether the current bytes are valid UTF-8. 
+ // + simdutf_really_inline void check_utf8_bytes(const simd8 input, const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes + // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + simd8 sc = check_special_cases(input, prev1); + this->error |= check_multibyte_lengths(input, prev_input, sc); } - uint8_t block[64]{}; - reader.get_remainder(block); - simd::simd8x64 in(block); - c.check_next_input(in); - reader.advance(); - c.check_eof(); - return !c.errors(); -} -bool generic_validate_utf8(const char * input, size_t length) { - return generic_validate_utf8(reinterpret_cast(input),length); -} -/** - * Validates that the string is actual UTF-8 and stops on errors. - */ -template -result generic_validate_utf8_with_errors(const uint8_t * input, size_t length) { - checker c{}; - buf_block_reader<64> reader(input, length); - size_t count{0}; - while (reader.has_full_block()) { - simd::simd8x64 in(reader.full_block()); - c.check_next_input(in); - if(c.errors()) { - if (count != 0) { count--; } // Sometimes the error is only detected in the next chunk - result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast(input), reinterpret_cast(input + count), length - count); - res.count += count; - return res; + template + simdutf_really_inline size_t convert(const char* in, size_t size, char16_t* utf16_output) { + size_t pos = 0; + char16_t* start{utf16_output}; + // In the worst case, we have the haswell kernel which can cause an overflow of + // 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the last 16 bytes, + // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate + // much more than 8 bytes. However, you cannot generally assume that you have valid + // UTF-8 input, so we are going to go back from the end counting 8 leading bytes, + // to give us a good margin. 
+ size_t leading_byte = 0; + size_t margin = size; + for(; margin > 0 && leading_byte < 8; margin--) { + leading_byte += (int8_t(in[margin-1]) > -65); } - reader.advance(); - count += 64; - } - uint8_t block[64]{}; - reader.get_remainder(block); - simd::simd8x64 in(block); - c.check_next_input(in); - reader.advance(); - c.check_eof(); - if (c.errors()) { - if (count != 0) { count--; } // Sometimes the error is only detected in the next chunk - result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast(input), reinterpret_cast(input) + count, length - count); - res.count += count; - return res; - } else { - return result(error_code::SUCCESS, length); + // If the input is long enough, then we have that margin-1 is the eight last leading byte. + const size_t safety_margin = size - margin + 1; // to avoid overruns! + while(pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if(input.is_ascii()) { + input.store_ascii_as_utf16(utf16_output); + utf16_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, it is not good enough. 
+ static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if(simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if(simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while(pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_utf16(in + pos, + utf8_end_of_code_point_mask, utf16_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). 
In practice we expect an + // 85% to 90% efficiency. + } + } + if(errors()) { return 0; } + if(pos < size) { + size_t howmany = scalar::utf8_to_utf16::convert(in + pos, size - pos, utf16_output); + if(howmany == 0) { return 0; } + utf16_output += howmany; + } + return utf16_output - start; } -} - -result generic_validate_utf8_with_errors(const char * input, size_t length) { - return generic_validate_utf8_with_errors(reinterpret_cast(input),length); -} -template -bool generic_validate_ascii(const uint8_t * input, size_t length) { - buf_block_reader<64> reader(input, length); - uint8_t blocks[64]{}; - simd::simd8x64 running_or(blocks); - while (reader.has_full_block()) { - simd::simd8x64 in(reader.full_block()); - running_or |= in; - reader.advance(); + template + simdutf_really_inline result convert_with_errors(const char* in, size_t size, char16_t* utf16_output) { + size_t pos = 0; + char16_t* start{utf16_output}; + // In the worst case, we have the haswell kernel which can cause an overflow of + // 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the last 16 bytes, + // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate + // much more than 8 bytes. However, you cannot generally assume that you have valid + // UTF-8 input, so we are going to go back from the end counting 8 leading bytes, + // to give us a good margin. + size_t leading_byte = 0; + size_t margin = size; + for(; margin > 0 && leading_byte < 8; margin--) { + leading_byte += (int8_t(in[margin-1]) > -65); + } + // If the input is long enough, then we have that margin-1 is the eight last leading byte. + const size_t safety_margin = size - margin + 1; // to avoid overruns! 
+ while(pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if(input.is_ascii()) { + input.store_ascii_as_utf16(utf16_output); + utf16_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, it is not good enough. + static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if(simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if(simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + if (errors()) { + // rewind_and_convert_with_errors will seek a potential error from in+pos onward, + // with the ability to go back up to pos bytes, and read size-pos bytes forward. + result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf16_output); + res.count += pos; + return res; + } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while(pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. 
+ // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_utf16(in + pos, + utf8_end_of_code_point_mask, utf16_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + if(errors()) { + // rewind_and_convert_with_errors will seek a potential error from in+pos onward, + // with the ability to go back up to pos bytes, and read size-pos bytes forward. + result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf16_output); + res.count += pos; + return res; + } + if(pos < size) { + // rewind_and_convert_with_errors will seek a potential error from in+pos onward, + // with the ability to go back up to pos bytes, and read size-pos bytes forward. 
+ result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf16_output); + if (res.error) { // In case of error, we want the error position + res.count += pos; + return res; + } else { // In case of success, we want the number of word written + utf16_output += res.count; + } + } + return result(error_code::SUCCESS, utf16_output - start); } - uint8_t block[64]{}; - reader.get_remainder(block); - simd::simd8x64 in(block); - running_or |= in; - return running_or.is_ascii(); -} - -bool generic_validate_ascii(const char * input, size_t length) { - return generic_validate_ascii(reinterpret_cast(input),length); -} -template -result generic_validate_ascii_with_errors(const uint8_t * input, size_t length) { - buf_block_reader<64> reader(input, length); - size_t count{0}; - while (reader.has_full_block()) { - simd::simd8x64 in(reader.full_block()); - if (!in.is_ascii()) { - result res = scalar::ascii::validate_with_errors(reinterpret_cast(input + count), length - count); - return result(res.error, count + res.count); + simdutf_really_inline bool errors() const { + return this->error.any_bits_set_anywhere(); } - reader.advance(); - - count += 64; - } - uint8_t block[64]{}; - reader.get_remainder(block); - simd::simd8x64 in(block); - if (!in.is_ascii()) { - result res = scalar::ascii::validate_with_errors(reinterpret_cast(input + count), length - count); - return result(res.error, count + res.count); - } else { - return result(error_code::SUCCESS, length); - } -} - -result generic_validate_ascii_with_errors(const char * input, size_t length) { - return generic_validate_ascii_with_errors(reinterpret_cast(input),length); -} -} // namespace utf8_validation + }; // struct utf8_checker +} // utf8_to_utf16 namespace } // unnamed namespace } // namespace haswell } // namespace simdutf -/* end file src/generic/utf8_validation/utf8_validator.h */ -// transcoding from UTF-8 to UTF-16 -// dofile: invoked with 
prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf16/valid_utf8_to_utf16.h -/* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */ - +/* end file src/generic/utf8_to_utf16/utf8_to_utf16.h */ +// transcoding from UTF-8 to UTF-32 +/* begin file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */ namespace simdutf { namespace haswell { namespace { -namespace utf8_to_utf16 { +namespace utf8_to_utf32 { using namespace simd; -template + simdutf_warn_unused size_t convert_valid(const char* input, size_t size, - char16_t* utf16_output) noexcept { - // The implementation is not specific to haswell and should be moved to the generic directory. + char32_t* utf32_output) noexcept { size_t pos = 0; - char16_t* start{utf16_output}; + char32_t* start{utf32_output}; const size_t safety_margin = 16; // to avoid overruns! while(pos + 64 + safety_margin <= size) { - // this loop could be unrolled further. For example, we could process the mask - // far more than 64 bytes. simd8x64 in(reinterpret_cast(input + pos)); if(in.is_ascii()) { - in.store_ascii_as_utf16(utf16_output); - utf16_output += 64; + in.store_ascii_as_utf32(utf32_output); + utf32_output += 64; pos += 64; } else { - // Slow path. We hope that the compiler will recognize that this is a slow path. - // Anything that is not a continuation mask is a 'leading byte', that is, the - // start of a new code point. - uint64_t utf8_continuation_mask = in.lt(-65 + 1); - // -65 is 0b10111111 in two-complement's, so largest possible continuation byte - uint64_t utf8_leading_mask = ~utf8_continuation_mask; - // The *start* of code points is not so useful, rather, we want the *end* of code points. - uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; - // We process in blocks of up to 12 bytes except possibly - // for fast paths which may process up to 16 bytes. For the - // slow path to work, we should have at least 12 input bytes left. 
- size_t max_starting_point = (pos + 64) - 12; - // Next loop is going to run at least five times when using solely - // the slow/regular path, and at least four times if there are fast paths. - while(pos < max_starting_point) { - // Performance note: our ability to compute 'consumed' and - // then shift and recompute is critical. If there is a - // latency of, say, 4 cycles on getting 'consumed', then - // the inner loop might have a total latency of about 6 cycles. - // Yet we process between 6 to 12 inputs bytes, thus we get - // a speed limit between 1 cycle/byte and 0.5 cycle/byte - // for this section of the code. Hence, there is a limit - // to how much we can further increase this latency before - // it seriously harms performance. - // - // Thus we may allow convert_masked_utf8_to_utf16 to process - // more bytes at a time under a fast-path mode where 16 bytes - // are consumed at once (e.g., when encountering ASCII). - size_t consumed = convert_masked_utf8_to_utf16(input + pos, - utf8_end_of_code_point_mask, utf16_output); - pos += consumed; - utf8_end_of_code_point_mask >>= consumed; + // -65 is 0b10111111 in two-complement's, so largest possible continuation byte + uint64_t utf8_continuation_mask = in.lt(-65 + 1); + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; + size_t max_starting_point = (pos + 64) - 12; + while(pos < max_starting_point) { + size_t consumed = convert_masked_utf8_to_utf32(input + pos, + utf8_end_of_code_point_mask, utf32_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; } - // At this point there may remain between 0 and 12 bytes in the - // 64-byte block. These bytes will be processed again. So we have an - // 80% efficiency (in the worst case). In practice we expect an - // 85% to 90% efficiency. 
} } - utf16_output += scalar::utf8_to_utf16::convert_valid(input + pos, size - pos, utf16_output); - return utf16_output - start; + utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos, utf32_output); + return utf32_output - start; } -} // namespace utf8_to_utf16 + +} // namespace utf8_to_utf32 } // unnamed namespace } // namespace haswell } // namespace simdutf -/* end file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf16/utf8_to_utf16.h -/* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */ +/* end file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */ +/* begin file src/generic/utf8_to_utf32/utf8_to_utf32.h */ namespace simdutf { namespace haswell { namespace { -namespace utf8_to_utf16 { +namespace utf8_to_utf32 { using namespace simd; @@ -22035,28 +25524,28 @@ using namespace simd; } - template - simdutf_really_inline size_t convert(const char* in, size_t size, char16_t* utf16_output) { + + simdutf_really_inline size_t convert(const char* in, size_t size, char32_t* utf32_output) { size_t pos = 0; - char16_t* start{utf16_output}; + char32_t* start{utf32_output}; // In the worst case, we have the haswell kernel which can cause an overflow of - // 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the last 16 bytes, + // 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes, // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate // much more than 8 bytes. However, you cannot generally assume that you have valid - // UTF-8 input, so we are going to go back from the end counting 8 leading bytes, + // UTF-8 input, so we are going to go back from the end counting 4 leading bytes, // to give us a good margin. 
size_t leading_byte = 0; size_t margin = size; - for(; margin > 0 && leading_byte < 8; margin--) { + for(; margin > 0 && leading_byte < 4; margin--) { leading_byte += (int8_t(in[margin-1]) > -65); } - // If the input is long enough, then we have that margin-1 is the eight last leading byte. + // If the input is long enough, then we have that margin-1 is the fourth last leading byte. const size_t safety_margin = size - margin + 1; // to avoid overruns! while(pos + 64 + safety_margin <= size) { simd8x64 input(reinterpret_cast(in + pos)); if(input.is_ascii()) { - input.store_ascii_as_utf16(utf16_output); - utf16_output += 64; + input.store_ascii_as_utf32(utf32_output); + utf32_output += 64; pos += 64; } else { // you might think that a for-loop would work, but under Visual Studio, it is not good enough. @@ -22090,8 +25579,8 @@ using namespace simd; // for this section of the code. Hence, there is a limit // to how much we can further increase this latency before // it seriously harms performance. 
- size_t consumed = convert_masked_utf8_to_utf16(in + pos, - utf8_end_of_code_point_mask, utf16_output); + size_t consumed = convert_masked_utf8_to_utf32(in + pos, + utf8_end_of_code_point_mask, utf32_output); pos += consumed; utf8_end_of_code_point_mask >>= consumed; } @@ -22103,35 +25592,34 @@ using namespace simd; } if(errors()) { return 0; } if(pos < size) { - size_t howmany = scalar::utf8_to_utf16::convert(in + pos, size - pos, utf16_output); + size_t howmany = scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output); if(howmany == 0) { return 0; } - utf16_output += howmany; + utf32_output += howmany; } - return utf16_output - start; + return utf32_output - start; } - template - simdutf_really_inline result convert_with_errors(const char* in, size_t size, char16_t* utf16_output) { + simdutf_really_inline result convert_with_errors(const char* in, size_t size, char32_t* utf32_output) { size_t pos = 0; - char16_t* start{utf16_output}; + char32_t* start{utf32_output}; // In the worst case, we have the haswell kernel which can cause an overflow of - // 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the last 16 bytes, + // 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes, // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate // much more than 8 bytes. However, you cannot generally assume that you have valid - // UTF-8 input, so we are going to go back from the end counting 8 leading bytes, + // UTF-8 input, so we are going to go back from the end counting 4 leading bytes, // to give us a good margin. size_t leading_byte = 0; size_t margin = size; - for(; margin > 0 && leading_byte < 8; margin--) { + for(; margin > 0 && leading_byte < 4; margin--) { leading_byte += (int8_t(in[margin-1]) > -65); } - // If the input is long enough, then we have that margin-1 is the eight last leading byte. 
+ // If the input is long enough, then we have that margin-1 is the fourth last leading byte. const size_t safety_margin = size - margin + 1; // to avoid overruns! while(pos + 64 + safety_margin <= size) { simd8x64 input(reinterpret_cast(in + pos)); if(input.is_ascii()) { - input.store_ascii_as_utf16(utf16_output); - utf16_output += 64; + input.store_ascii_as_utf32(utf32_output); + utf32_output += 64; pos += 64; } else { // you might think that a for-loop would work, but under Visual Studio, it is not good enough. @@ -22148,9 +25636,7 @@ using namespace simd; this->check_utf8_bytes(input.chunks[3], input.chunks[2]); } if (errors()) { - // rewind_and_convert_with_errors will seek a potential error from in+pos onward, - // with the ability to go back up to pos bytes, and read size-pos bytes forward. - result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf16_output); + result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output); res.count += pos; return res; } @@ -22172,8 +25658,8 @@ using namespace simd; // for this section of the code. Hence, there is a limit // to how much we can further increase this latency before // it seriously harms performance. - size_t consumed = convert_masked_utf8_to_utf16(in + pos, - utf8_end_of_code_point_mask, utf16_output); + size_t consumed = convert_masked_utf8_to_utf32(in + pos, + utf8_end_of_code_point_mask, utf32_output); pos += consumed; utf8_end_of_code_point_mask >>= consumed; } @@ -22184,24 +25670,20 @@ using namespace simd; } } if(errors()) { - // rewind_and_convert_with_errors will seek a potential error from in+pos onward, - // with the ability to go back up to pos bytes, and read size-pos bytes forward. 
- result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf16_output); + result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output); res.count += pos; return res; } if(pos < size) { - // rewind_and_convert_with_errors will seek a potential error from in+pos onward, - // with the ability to go back up to pos bytes, and read size-pos bytes forward. - result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf16_output); + result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output); if (res.error) { // In case of error, we want the error position res.count += pos; return res; } else { // In case of success, we want the number of word written - utf16_output += res.count; + utf32_output += res.count; } } - return result(error_code::SUCCESS, utf16_output - start); + return result(error_code::SUCCESS, utf32_output - start); } simdutf_really_inline bool errors() const { @@ -22209,70 +25691,138 @@ using namespace simd; } }; // struct utf8_checker -} // utf8_to_utf16 namespace +} // utf8_to_utf32 namespace } // unnamed namespace } // namespace haswell } // namespace simdutf -/* end file src/generic/utf8_to_utf16/utf8_to_utf16.h */ -// transcoding from UTF-8 to UTF-32 -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf32/valid_utf8_to_utf32.h -/* begin file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */ +/* end file src/generic/utf8_to_utf32/utf8_to_utf32.h */ +// other functions +/* begin file src/generic/utf8.h */ namespace simdutf { namespace haswell { namespace { -namespace utf8_to_utf32 { +namespace utf8 { + +using namespace simd; + +simdutf_really_inline size_t count_code_points(const char* in, size_t size) { + size_t pos = 0; + size_t count = 0; + for(;pos + 64 <= size; pos += 64) { + simd8x64 input(reinterpret_cast(in + pos)); + uint64_t 
utf8_continuation_mask = input.gt(-65); + count += count_ones(utf8_continuation_mask); + } + return count + scalar::utf8::count_code_points(in + pos, size - pos); +} + +simdutf_really_inline size_t utf16_length_from_utf8(const char* in, size_t size) { + size_t pos = 0; + size_t count = 0; + // This algorithm could no doubt be improved! + for(;pos + 64 <= size; pos += 64) { + simd8x64 input(reinterpret_cast(in + pos)); + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + // We count one word for anything that is not a continuation (so + // leading bytes). + count += 64 - count_ones(utf8_continuation_mask); + int64_t utf8_4byte = input.gteq_unsigned(240); + count += count_ones(utf8_4byte); + } + return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos); +} + + +simdutf_really_inline size_t utf32_length_from_utf8(const char* in, size_t size) { + return count_code_points(in, size); +} +} // utf8 namespace +} // unnamed namespace +} // namespace haswell +} // namespace simdutf +/* end file src/generic/utf8.h */ +/* begin file src/generic/utf16.h */ +namespace simdutf { +namespace haswell { +namespace { +namespace utf16 { + +template +simdutf_really_inline size_t count_code_points(const char16_t* in, size_t size) { + size_t pos = 0; + size_t count = 0; + for(;pos < size/32*32; pos += 32) { + simd16x32 input(reinterpret_cast(in + pos)); + if (!match_system(big_endian)) { input.swap_bytes(); } + uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF); + count += count_ones(not_pair) / 2; + } + return count + scalar::utf16::count_code_points(in + pos, size - pos); +} -using namespace simd; +template +simdutf_really_inline size_t utf8_length_from_utf16(const char16_t* in, size_t size) { + size_t pos = 0; + size_t count = 0; + // This algorithm could no doubt be improved! 
+ for(;pos < size/32*32; pos += 32) { + simd16x32 input(reinterpret_cast(in + pos)); + if (!match_system(big_endian)) { input.swap_bytes(); } + uint64_t ascii_mask = input.lteq(0x7F); + uint64_t twobyte_mask = input.lteq(0x7FF); + uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF); + + size_t ascii_count = count_ones(ascii_mask) / 2; + size_t twobyte_count = count_ones(twobyte_mask & ~ ascii_mask) / 2; + size_t threebyte_count = count_ones(not_pair_mask & ~ twobyte_mask) / 2; + size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2; + count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count + ascii_count; + } + return count + scalar::utf16::utf8_length_from_utf16(in + pos, size - pos); +} +template +simdutf_really_inline size_t utf32_length_from_utf16(const char16_t* in, size_t size) { + return count_code_points(in, size); +} -simdutf_warn_unused size_t convert_valid(const char* input, size_t size, - char32_t* utf32_output) noexcept { +simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t size, char16_t* output) { size_t pos = 0; - char32_t* start{utf32_output}; - const size_t safety_margin = 16; // to avoid overruns! 
- while(pos + 64 + safety_margin <= size) { - simd8x64 in(reinterpret_cast(input + pos)); - if(in.is_ascii()) { - in.store_ascii_as_utf32(utf32_output); - utf32_output += 64; - pos += 64; - } else { - // -65 is 0b10111111 in two-complement's, so largest possible continuation byte - uint64_t utf8_continuation_mask = in.lt(-65 + 1); - uint64_t utf8_leading_mask = ~utf8_continuation_mask; - uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; - size_t max_starting_point = (pos + 64) - 12; - while(pos < max_starting_point) { - size_t consumed = convert_masked_utf8_to_utf32(input + pos, - utf8_end_of_code_point_mask, utf32_output); - pos += consumed; - utf8_end_of_code_point_mask >>= consumed; - } - } + + while (pos < size/32*32) { + simd16x32 input(reinterpret_cast(in + pos)); + input.swap_bytes(); + input.store(reinterpret_cast(output)); + pos += 32; + output += 32; } - utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos, utf32_output); - return utf32_output - start; -} + scalar::utf16::change_endianness_utf16(in + pos, size - pos, output); +} -} // namespace utf8_to_utf32 +} // utf16 } // unnamed namespace } // namespace haswell } // namespace simdutf -/* end file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf32/utf8_to_utf32.h -/* begin file src/generic/utf8_to_utf32/utf8_to_utf32.h */ +/* end file src/generic/utf16.h */ + + +// transcoding from UTF-8 to Latin 1 +/* begin file src/generic/utf8_to_latin1/utf8_to_latin1.h */ namespace simdutf { namespace haswell { namespace { -namespace utf8_to_utf32 { +namespace utf8_to_latin1 { using namespace simd; simdutf_really_inline simd8 check_special_cases(const simd8 input, const simd8 prev1) { +// For UTF-8 to Latin 1, we can allow any ASCII character, and any continuation byte, +// but the non-ASCII leading bytes must be 0b11000011 or 0b11000010 and nothing else. 
+// // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) // Bit 1 = Too Long (ASCII followed by continuation) // Bit 2 = Overlong 3-byte @@ -22299,6 +25849,7 @@ using namespace simd; // 1111011_ 1000____ // 11111___ 1000____ constexpr const uint8_t OVERLONG_4 = 1<<6; // 11110000 1000____ + constexpr const uint8_t FORBIDDEN = 0xff; const simd8 byte_1_high = prev1.shr<4>().lookup_16( // 0_______ ________ @@ -22309,11 +25860,11 @@ using namespace simd; // 1100____ ________ TOO_SHORT | OVERLONG_2, // 1101____ ________ - TOO_SHORT, + FORBIDDEN, // 1110____ ________ - TOO_SHORT | OVERLONG_3 | SURROGATE, + FORBIDDEN, // 1111____ ________ - TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4 + FORBIDDEN ); constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . const simd8 byte_1_low = (prev1 & 0x0F).lookup_16( @@ -22326,23 +25877,23 @@ using namespace simd; CARRY, // ____0100 ________ - CARRY | TOO_LARGE, + FORBIDDEN, // ____0101 ________ - CARRY | TOO_LARGE | TOO_LARGE_1000, + FORBIDDEN, // ____011_ ________ - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, + FORBIDDEN, + FORBIDDEN, // ____1___ ________ - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, + FORBIDDEN, + FORBIDDEN, + FORBIDDEN, + FORBIDDEN, + FORBIDDEN, // ____1101 ________ - CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000 + FORBIDDEN, + FORBIDDEN, + FORBIDDEN ); const simd8 byte_2_high = input.shr<4>().lookup_16( // ________ 0_______ @@ -22362,15 +25913,6 @@ using namespace simd; ); return (byte_1_high & byte_1_low & byte_2_high); } - simdutf_really_inline simd8 check_multibyte_lengths(const simd8 input, - const simd8 prev_input, const simd8 sc) { - simd8 prev2 = input.prev<2>(prev_input); - simd8 prev3 = 
input.prev<3>(prev_input); - simd8 must23 = simd8(must_be_2_3_continuation(prev2, prev3)); - simd8 must23_80 = must23 & uint8_t(0x80); - return must23_80 ^ sc; - } - struct validating_transcoder { // If this is nonzero, there has been a UTF-8 error. @@ -22384,33 +25926,31 @@ using namespace simd; // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers) simd8 prev1 = input.prev<1>(prev_input); - simd8 sc = check_special_cases(input, prev1); - this->error |= check_multibyte_lengths(input, prev_input, sc); + this->error |= check_special_cases(input, prev1); } - - simdutf_really_inline size_t convert(const char* in, size_t size, char32_t* utf32_output) { + simdutf_really_inline size_t convert(const char* in, size_t size, char* latin1_output) { size_t pos = 0; - char32_t* start{utf32_output}; + char* start{latin1_output}; // In the worst case, we have the haswell kernel which can cause an overflow of - // 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes, + // 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the last 16 bytes, // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate // much more than 8 bytes. However, you cannot generally assume that you have valid - // UTF-8 input, so we are going to go back from the end counting 4 leading bytes, + // UTF-8 input, so we are going to go back from the end counting 8 leading bytes, // to give us a good margin. size_t leading_byte = 0; size_t margin = size; - for(; margin > 0 && leading_byte < 4; margin--) { - leading_byte += (int8_t(in[margin-1]) > -65); + for(; margin > 0 && leading_byte < 8; margin--) { + leading_byte += (int8_t(in[margin-1]) > -65); //twos complement of -65 is 1011 1111 ... } - // If the input is long enough, then we have that margin-1 is the fourth last leading byte. 
+ // If the input is long enough, then we have that margin-1 is the eight last leading byte. const size_t safety_margin = size - margin + 1; // to avoid overruns! while(pos + 64 + safety_margin <= size) { simd8x64 input(reinterpret_cast(in + pos)); if(input.is_ascii()) { - input.store_ascii_as_utf32(utf32_output); - utf32_output += 64; + input.store((int8_t*)latin1_output); + latin1_output += 64; pos += 64; } else { // you might think that a for-loop would work, but under Visual Studio, it is not good enough. @@ -22426,7 +25966,7 @@ using namespace simd; this->check_utf8_bytes(input.chunks[2], input.chunks[1]); this->check_utf8_bytes(input.chunks[3], input.chunks[2]); } - uint64_t utf8_continuation_mask = input.lt(-65 + 1); + uint64_t utf8_continuation_mask = input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in this case, we also have ASCII to account for. uint64_t utf8_leading_mask = ~utf8_continuation_mask; uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; // We process in blocks of up to 12 bytes except possibly @@ -22444,8 +25984,8 @@ using namespace simd; // for this section of the code. Hence, there is a limit // to how much we can further increase this latency before // it seriously harms performance. 
- size_t consumed = convert_masked_utf8_to_utf32(in + pos, - utf8_end_of_code_point_mask, utf32_output); + size_t consumed = convert_masked_utf8_to_latin1(in + pos, + utf8_end_of_code_point_mask, latin1_output); pos += consumed; utf8_end_of_code_point_mask >>= consumed; } @@ -22457,34 +25997,34 @@ using namespace simd; } if(errors()) { return 0; } if(pos < size) { - size_t howmany = scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output); + size_t howmany = scalar::utf8_to_latin1::convert(in + pos, size - pos, latin1_output); if(howmany == 0) { return 0; } - utf32_output += howmany; + latin1_output += howmany; } - return utf32_output - start; + return latin1_output - start; } - simdutf_really_inline result convert_with_errors(const char* in, size_t size, char32_t* utf32_output) { + simdutf_really_inline result convert_with_errors(const char* in, size_t size, char* latin1_output) { size_t pos = 0; - char32_t* start{utf32_output}; + char* start{latin1_output}; // In the worst case, we have the haswell kernel which can cause an overflow of - // 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes, + // 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the last 16 bytes, // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate // much more than 8 bytes. However, you cannot generally assume that you have valid - // UTF-8 input, so we are going to go back from the end counting 4 leading bytes, + // UTF-8 input, so we are going to go back from the end counting 8 leading bytes, // to give us a good margin. size_t leading_byte = 0; size_t margin = size; - for(; margin > 0 && leading_byte < 4; margin--) { + for(; margin > 0 && leading_byte < 8; margin--) { leading_byte += (int8_t(in[margin-1]) > -65); } - // If the input is long enough, then we have that margin-1 is the fourth last leading byte. + // If the input is long enough, then we have that margin-1 is the eight last leading byte. 
const size_t safety_margin = size - margin + 1; // to avoid overruns! while(pos + 64 + safety_margin <= size) { simd8x64 input(reinterpret_cast(in + pos)); if(input.is_ascii()) { - input.store_ascii_as_utf32(utf32_output); - utf32_output += 64; + input.store((int8_t*)latin1_output); + latin1_output += 64; pos += 64; } else { // you might think that a for-loop would work, but under Visual Studio, it is not good enough. @@ -22501,7 +26041,9 @@ using namespace simd; this->check_utf8_bytes(input.chunks[3], input.chunks[2]); } if (errors()) { - result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output); + // rewind_and_convert_with_errors will seek a potential error from in+pos onward, + // with the ability to go back up to pos bytes, and read size-pos bytes forward. + result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(pos, in + pos, size - pos, latin1_output); res.count += pos; return res; } @@ -22523,8 +26065,8 @@ using namespace simd; // for this section of the code. Hence, there is a limit // to how much we can further increase this latency before // it seriously harms performance. - size_t consumed = convert_masked_utf8_to_utf32(in + pos, - utf8_end_of_code_point_mask, utf32_output); + size_t consumed = convert_masked_utf8_to_latin1(in + pos, + utf8_end_of_code_point_mask, latin1_output); pos += consumed; utf8_end_of_code_point_mask >>= consumed; } @@ -22535,20 +26077,24 @@ using namespace simd; } } if(errors()) { - result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output); + // rewind_and_convert_with_errors will seek a potential error from in+pos onward, + // with the ability to go back up to pos bytes, and read size-pos bytes forward. 
+ result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(pos, in + pos, size - pos, latin1_output); res.count += pos; return res; } if(pos < size) { - result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output); + // rewind_and_convert_with_errors will seek a potential error from in+pos onward, + // with the ability to go back up to pos bytes, and read size-pos bytes forward. + result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(pos, in + pos, size - pos, latin1_output); if (res.error) { // In case of error, we want the error position res.count += pos; return res; } else { // In case of success, we want the number of word written - utf32_output += res.count; + latin1_output += res.count; } } - return result(error_code::SUCCESS, utf32_output - start); + return result(error_code::SUCCESS, latin1_output - start); } simdutf_really_inline bool errors() const { @@ -22556,124 +26102,88 @@ using namespace simd; } }; // struct utf8_checker -} // utf8_to_utf32 namespace +} // utf8_to_latin1 namespace } // unnamed namespace } // namespace haswell } // namespace simdutf -/* end file src/generic/utf8_to_utf32/utf8_to_utf32.h */ -// other functions -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8.h -/* begin file src/generic/utf8.h */ - -namespace simdutf { -namespace haswell { -namespace { -namespace utf8 { - -using namespace simd; - -simdutf_really_inline size_t count_code_points(const char* in, size_t size) { - size_t pos = 0; - size_t count = 0; - for(;pos + 64 <= size; pos += 64) { - simd8x64 input(reinterpret_cast(in + pos)); - uint64_t utf8_continuation_mask = input.lt(-65 + 1); - count += 64 - count_ones(utf8_continuation_mask); - } - return count + scalar::utf8::count_code_points(in + pos, size - pos); -} - - -simdutf_really_inline size_t utf16_length_from_utf8(const char* in, size_t size) { - size_t pos = 0; - size_t count = 0; - // This algorithm 
could no doubt be improved! - for(;pos + 64 <= size; pos += 64) { - simd8x64 input(reinterpret_cast(in + pos)); - uint64_t utf8_continuation_mask = input.lt(-65 + 1); - // We count one word for anything that is not a continuation (so - // leading bytes). - count += 64 - count_ones(utf8_continuation_mask); - int64_t utf8_4byte = input.gteq_unsigned(240); - count += count_ones(utf8_4byte); - } - return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos); -} +/* end file src/generic/utf8_to_latin1/utf8_to_latin1.h */ +/* begin file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */ -simdutf_really_inline size_t utf32_length_from_utf8(const char* in, size_t size) { - return count_code_points(in, size); -} -} // utf8 namespace -} // unnamed namespace -} // namespace haswell -} // namespace simdutf -/* end file src/generic/utf8.h */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf16.h -/* begin file src/generic/utf16.h */ namespace simdutf { namespace haswell { namespace { -namespace utf16 { - -template -simdutf_really_inline size_t count_code_points(const char16_t* in, size_t size) { - size_t pos = 0; - size_t count = 0; - for(;pos + 32 <= size; pos += 32) { - simd16x32 input(reinterpret_cast(in + pos)); - if (!match_system(big_endian)) input.swap_bytes(); - uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF); - count += count_ones(not_pair) / 2; - } - return count + scalar::utf16::count_code_points(in + pos, size - pos); -} +namespace utf8_to_latin1 { +using namespace simd; -template -simdutf_really_inline size_t utf8_length_from_utf16(const char16_t* in, size_t size) { - size_t pos = 0; - size_t count = 0; - // This algorithm could no doubt be improved! 
- for(;pos + 32 <= size; pos += 32) { - simd16x32 input(reinterpret_cast(in + pos)); - if (!match_system(big_endian)) input.swap_bytes(); - uint64_t ascii_mask = input.lteq(0x7F); - uint64_t twobyte_mask = input.lteq(0x7FF); - uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF); - size_t ascii_count = count_ones(ascii_mask) / 2; - size_t twobyte_count = count_ones(twobyte_mask & ~ ascii_mask) / 2; - size_t threebyte_count = count_ones(not_pair_mask & ~ twobyte_mask) / 2; - size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2; - count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count + ascii_count; + simdutf_really_inline size_t convert_valid(const char* in, size_t size, char* latin1_output) { + size_t pos = 0; + char* start{latin1_output}; + // In the worst case, we have the haswell kernel which can cause an overflow of + // 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the last 16 bytes, + // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate + // much more than 8 bytes. However, you cannot generally assume that you have valid + // UTF-8 input, so we are going to go back from the end counting 8 leading bytes, + // to give us a good margin. + size_t leading_byte = 0; + size_t margin = size; + for(; margin > 0 && leading_byte < 8; margin--) { + leading_byte += (int8_t(in[margin-1]) > -65); //twos complement of -65 is 1011 1111 ... + } + // If the input is long enough, then we have that margin-1 is the eight last leading byte. + const size_t safety_margin = size - margin + 1; // to avoid overruns! + while(pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if(input.is_ascii()) { + input.store((int8_t*)latin1_output); + latin1_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, it is not good enough. + uint64_t utf8_continuation_mask = input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. 
Note: in this case, we also have ASCII to account for. + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while(pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_latin1(in + pos, + utf8_end_of_code_point_mask, latin1_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. 
+ } + } + if(pos < size) { + size_t howmany = scalar::utf8_to_latin1::convert(in + pos, size - pos, latin1_output); + if(howmany == 0) { return 0; } + latin1_output += howmany; + } + return latin1_output - start; } - return count + scalar::utf16::utf8_length_from_utf16(in + pos, size - pos); -} - -template -simdutf_really_inline size_t utf32_length_from_utf16(const char16_t* in, size_t size) { - return count_code_points(in, size); -} - -simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t size, char16_t* output) { - size_t pos = 0; - - while (pos + 32 <= size) { - simd16x32 input(reinterpret_cast(in + pos)); - input.swap_bytes(); - input.store(reinterpret_cast(output)); - pos += 32; - output += 32; - } - scalar::utf16::change_endianness_utf16(in + pos, size - pos, output); -} - -} // utf16 -} // unnamed namespace -} // namespace haswell -} // namespace simdutf -/* end file src/generic/utf16.h */ + }; +} // utf8_to_latin1 namespace +} // unnamed namespace +} // namespace haswell + // namespace simdutf +/* end file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */ namespace simdutf { namespace haswell { @@ -22766,6 +26276,73 @@ simdutf_warn_unused result implementation::validate_utf32_with_errors(const char } } +simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(const char * buf, size_t len, char* utf8_output) const noexcept { + std::pair ret = avx2_convert_latin1_to_utf8(buf, len, utf8_output); + size_t converted_chars = ret.second - utf8_output; + + if (ret.first != buf + len) { + const size_t scalar_converted_chars = scalar::latin1_to_utf8::convert( + ret.first, len - (ret.first - buf), ret.second); + converted_chars += scalar_converted_chars; + } + + return converted_chars; +} + +simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept { + std::pair ret = avx2_convert_latin1_to_utf16(buf, len, utf16_output); + if (ret.first == nullptr) { 
return 0; } + size_t converted_chars = ret.second - utf16_output; + if (ret.first != buf + len) { + const size_t scalar_converted_chars = scalar::latin1_to_utf16::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_converted_chars == 0) { return 0; } + converted_chars += scalar_converted_chars; + } + return converted_chars; +} + +simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept { + std::pair ret = avx2_convert_latin1_to_utf16(buf, len, utf16_output); + if (ret.first == nullptr) { return 0; } + size_t converted_chars = ret.second - utf16_output; + if (ret.first != buf + len) { + const size_t scalar_converted_chars = scalar::latin1_to_utf16::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_converted_chars == 0) { return 0; } + converted_chars += scalar_converted_chars; + } + return converted_chars; +} + +simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept { + std::pair ret = avx2_convert_latin1_to_utf32(buf, len, utf32_output); + if (ret.first == nullptr) { return 0; } + size_t converted_chars = ret.second - utf32_output; + if (ret.first != buf + len) { + const size_t scalar_converted_chars = scalar::latin1_to_utf32::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_converted_chars == 0) { return 0; } + converted_chars += scalar_converted_chars; + } + return converted_chars; +} + +simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept { + utf8_to_latin1::validating_transcoder converter; + return converter.convert(buf, len, latin1_output); +} + +simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(const char* buf, size_t len, char* latin1_output) const noexcept { + utf8_to_latin1::validating_transcoder converter; + return 
converter.convert_with_errors(buf, len, latin1_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(const char* input, size_t size, + char* latin1_output) const noexcept { + return utf8_to_latin1::convert_valid(input, size, latin1_output); +} + simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept { utf8_to_utf16::validating_transcoder converter; return converter.convert(buf, len, utf16_output); @@ -22811,6 +26388,77 @@ simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(const cha return utf8_to_utf32::convert_valid(input, size, utf32_output); } + +simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept { + std::pair ret = haswell::avx2_convert_utf16_to_latin1(buf, len, latin1_output); + if (ret.first == nullptr) { return 0; } + size_t saved_bytes = ret.second - latin1_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = scalar::utf16_to_latin1::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { return 0; } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept { + std::pair ret = haswell::avx2_convert_utf16_to_latin1(buf, len, latin1_output); + if (ret.first == nullptr) { return 0; } + size_t saved_bytes = ret.second - latin1_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = scalar::utf16_to_latin1::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { return 0; } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused result implementation::convert_utf16le_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) const 
noexcept { + std::pair ret = avx2_convert_utf16_to_latin1_with_errors(buf, len, latin1_output); + if (ret.first.error) { return ret.first; } // Can return directly since scalar fallback already found correct ret.first.count + if (ret.first.count != len) { // All good so far, but not finished + result scalar_res = scalar::utf16_to_latin1::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = ret.second - latin1_output; // Set count to the number of 8-bit code units written + return ret.first; +} + +simdutf_warn_unused result implementation::convert_utf16be_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) const noexcept { + std::pair ret = avx2_convert_utf16_to_latin1_with_errors(buf, len, latin1_output); + if (ret.first.error) { return ret.first; } // Can return directly since scalar fallback already found correct ret.first.count + if (ret.first.count != len) { // All good so far, but not finished + result scalar_res = scalar::utf16_to_latin1::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = ret.second - latin1_output; // Set count to the number of 8-bit code units written + return ret.first; +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept { + // optimization opportunity: implement a custom function + return convert_utf16be_to_latin1(buf, len, latin1_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept { + // optimization opportunity: implement a custom function + 
return convert_utf16le_to_latin1(buf, len, latin1_output); +} + simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { std::pair ret = haswell::avx2_convert_utf16_to_utf8(buf, len, utf8_output); if (ret.first == nullptr) { return 0; } @@ -22838,7 +26486,7 @@ simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(const char16_ } simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept { - // ret.first.count is always the position in the buffer, not the number of words written even if finished + // ret.first.count is always the position in the buffer, not the number of code units written even if finished std::pair ret = haswell::avx2_convert_utf16_to_utf8_with_errors(buf, len, utf8_output); if (ret.first.error) { return ret.first; } // Can return directly since scalar fallback already found correct ret.first.count if (ret.first.count != len) { // All good so far, but not finished @@ -22851,12 +26499,12 @@ simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(c ret.second += scalar_res.count; } } - ret.first.count = ret.second - utf8_output; // Set count to the number of 8-bit words written + ret.first.count = ret.second - utf8_output; // Set count to the number of 8-bit code units written return ret.first; } simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept { - // ret.first.count is always the position in the buffer, not the number of words written even if finished + // ret.first.count is always the position in the buffer, not the number of code units written even if finished std::pair ret = haswell::avx2_convert_utf16_to_utf8_with_errors(buf, len, utf8_output); if (ret.first.error) { return ret.first; } // Can return directly since scalar fallback already found correct 
ret.first.count if (ret.first.count != len) { // All good so far, but not finished @@ -22869,7 +26517,7 @@ simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(c ret.second += scalar_res.count; } } - ret.first.count = ret.second - utf8_output; // Set count to the number of 8-bit words written + ret.first.count = ret.second - utf8_output; // Set count to the number of 8-bit code units written return ret.first; } @@ -22894,8 +26542,43 @@ simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t* return saved_bytes; } +simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept { + std::pair ret = avx2_convert_utf32_to_latin1(buf, len, latin1_output); + if (ret.first == nullptr) { return 0; } + size_t saved_bytes = ret.second - latin1_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = scalar::utf32_to_latin1::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { return 0; } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(const char32_t* buf, size_t len, char* latin1_output) const noexcept { + return scalar::utf32_to_latin1::convert_with_errors(buf,len,latin1_output); // NOTE(review): this unconditional return makes the AVX2 path below unreachable; confirm whether the scalar-only behavior is an intentional workaround + // ret.first.count is always the position in the buffer, not the number of code units written even if finished + std::pair ret = avx2_convert_utf32_to_latin1_with_errors(buf, len, latin1_output); + if (ret.first.count != len) { + result scalar_res = scalar::utf32_to_latin1::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = ret.second - latin1_output; // Set count to the number of 8-bit code units written + return ret.first; +} +
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept { + return convert_utf32_to_latin1(buf,len,latin1_output); +} + simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept { - // ret.first.count is always the position in the buffer, not the number of words written even if finished + // ret.first.count is always the position in the buffer, not the number of code units written even if finished std::pair ret = haswell::avx2_convert_utf32_to_utf8_with_errors(buf, len, utf8_output); if (ret.first.count != len) { result scalar_res = scalar::utf32_to_utf8::convert_with_errors( @@ -22907,7 +26590,7 @@ simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(con ret.second += scalar_res.count; } } - ret.first.count = ret.second - utf8_output; // Set count to the number of 8-bit words written + ret.first.count = ret.second - utf8_output; // Set count to the number of 8-bit code units written return ret.first; } @@ -22938,7 +26621,7 @@ simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(const char16 } simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { - // ret.first.count is always the position in the buffer, not the number of words written even if finished + // ret.first.count is always the position in the buffer, not the number of code units written even if finished std::pair ret = haswell::avx2_convert_utf16_to_utf32_with_errors(buf, len, utf32_output); if (ret.first.error) { return ret.first; } // Can return directly since scalar fallback already found correct ret.first.count if (ret.first.count != len) { // All good so far, but not finished @@ -22951,12 +26634,12 @@ simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors( ret.second += 
scalar_res.count; } } - ret.first.count = ret.second - utf32_output; // Set count to the number of 8-bit words written + ret.first.count = ret.second - utf32_output; // Set count to the number of 32-bit code units written return ret.first; } simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { - // ret.first.count is always the position in the buffer, not the number of words written even if finished + // ret.first.count is always the position in the buffer, not the number of code units written even if finished std::pair ret = haswell::avx2_convert_utf16_to_utf32_with_errors(buf, len, utf32_output); if (ret.first.error) { return ret.first; } // Can return directly since scalar fallback already found correct ret.first.count if (ret.first.count != len) { // All good so far, but not finished @@ -22969,7 +26652,7 @@ simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors( ret.second += scalar_res.count; } } - ret.first.count = ret.second - utf32_output; // Set count to the number of 8-bit words written + ret.first.count = ret.second - utf32_output; // Set count to the number of 32-bit code units written return ret.first; } @@ -23004,7 +26687,7 @@ simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(const char32 } simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { - // ret.first.count is always the position in the buffer, not the number of words written even if finished + // ret.first.count is always the position in the buffer, not the number of code units written even if finished std::pair ret = haswell::avx2_convert_utf32_to_utf16_with_errors(buf, len, utf16_output); if (ret.first.count != len) { result scalar_res = scalar::utf32_to_utf16::convert_with_errors( @@ -23016,12 +26699,12 @@ simdutf_warn_unused result
implementation::convert_utf32_to_utf16le_with_errors( ret.second += scalar_res.count; } } - ret.first.count = ret.second - utf16_output; // Set count to the number of 8-bit words written + ret.first.count = ret.second - utf16_output; // Set count to the number of 16-bit code units written return ret.first; } simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { - // ret.first.count is always the position in the buffer, not the number of words written even if finished + // ret.first.count is always the position in the buffer, not the number of code units written even if finished std::pair ret = haswell::avx2_convert_utf32_to_utf16_with_errors(buf, len, utf16_output); if (ret.first.count != len) { result scalar_res = scalar::utf32_to_utf16::convert_with_errors( @@ -23033,7 +26716,7 @@ simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors( ret.second += scalar_res.count; } } - ret.first.count = ret.second - utf16_output; // Set count to the number of 8-bit words written + ret.first.count = ret.second - utf16_output; // Set count to the number of 16-bit code units written return ret.first; } @@ -23069,6 +26752,18 @@ simdutf_warn_unused size_t implementation::count_utf8(const char * input, size_t return utf8::count_code_points(input, length); } +simdutf_warn_unused size_t implementation::latin1_length_from_utf8(const char* buf, size_t len) const noexcept { + return count_utf8(buf,len); +} + +simdutf_warn_unused size_t implementation::latin1_length_from_utf16(size_t length) const noexcept { + return scalar::utf16::latin1_length_from_utf16(length); +} + +simdutf_warn_unused size_t implementation::latin1_length_from_utf32(size_t length) const noexcept { + return scalar::utf32::latin1_length_from_utf32(length); +} + simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept { return
utf16::utf8_length_from_utf16(input, length); } @@ -23085,10 +26780,61 @@ simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char1 return utf16::utf32_length_from_utf16(input, length); } + +simdutf_warn_unused size_t implementation::utf16_length_from_latin1(size_t length) const noexcept { + return scalar::latin1::utf16_length_from_latin1(length); +} + simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char * input, size_t length) const noexcept { return utf8::utf16_length_from_utf8(input, length); } + +simdutf_warn_unused size_t implementation::utf32_length_from_latin1(size_t length) const noexcept { + return scalar::latin1::utf32_length_from_latin1(length); +} + +simdutf_warn_unused size_t implementation::utf8_length_from_latin1(const char *input, size_t len) const noexcept { + const uint8_t *data = reinterpret_cast(input); + size_t answer = len / sizeof(__m256i) * sizeof(__m256i); + size_t i = 0; + __m256i four_64bits = _mm256_setzero_si256(); + while (i + sizeof(__m256i) <= len) { + __m256i runner = _mm256_setzero_si256(); + // We can do up to 255 loops without overflow. 
+ size_t iterations = (len - i) / sizeof(__m256i); + if (iterations > 255) { + iterations = 255; + } + size_t max_i = i + iterations * sizeof(__m256i) - sizeof(__m256i); + for (; i + 4*sizeof(__m256i) <= max_i; i += 4*sizeof(__m256i)) { + __m256i input1 = _mm256_loadu_si256((const __m256i *)(data + i)); + __m256i input2 = _mm256_loadu_si256((const __m256i *)(data + i + sizeof(__m256i))); + __m256i input3 = _mm256_loadu_si256((const __m256i *)(data + i + 2*sizeof(__m256i))); + __m256i input4 = _mm256_loadu_si256((const __m256i *)(data + i + 3*sizeof(__m256i))); + __m256i input12 = _mm256_add_epi8(_mm256_cmpgt_epi8(_mm256_setzero_si256(), input1), + _mm256_cmpgt_epi8(_mm256_setzero_si256(), input2)); + __m256i input23 = _mm256_add_epi8(_mm256_cmpgt_epi8(_mm256_setzero_si256(), input3), + _mm256_cmpgt_epi8(_mm256_setzero_si256(), input4)); + __m256i input1234 = _mm256_add_epi8(input12, input23); + runner = _mm256_sub_epi8( + runner, input1234); + } + for (; i <= max_i; i += sizeof(__m256i)) { + __m256i input_256_chunk = _mm256_loadu_si256((const __m256i *)(data + i)); + runner = _mm256_sub_epi8( + runner, _mm256_cmpgt_epi8(_mm256_setzero_si256(), input_256_chunk)); + } + four_64bits = _mm256_add_epi64( + four_64bits, _mm256_sad_epu8(runner, _mm256_setzero_si256())); + } + answer += _mm256_extract_epi64(four_64bits, 0) + + _mm256_extract_epi64(four_64bits, 1) + + _mm256_extract_epi64(four_64bits, 2) + + _mm256_extract_epi64(four_64bits, 3); + return answer + scalar::latin1::utf8_length_from_latin1(reinterpret_cast(data + i), len - i); +} + simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept { const __m256i v_00000000 = _mm256_setzero_si256(); const __m256i v_ffffff80 = _mm256_set1_epi32((uint32_t)0xffffff80); @@ -23131,13 +26877,12 @@ simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_ } simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char * input, 
size_t length) const noexcept { - return scalar::utf8::count_code_points(input, length); + return utf8::count_code_points(input, length); } } // namespace haswell } // namespace simdutf -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/haswell/end.h /* begin file src/simdutf/haswell/end.h */ #if SIMDUTF_CAN_ALWAYS_RUN_HASWELL // nothing needed. @@ -23153,14 +26898,12 @@ SIMDUTF_POP_DISABLE_WARNINGS /* end file src/haswell/implementation.cpp */ #endif #if SIMDUTF_IMPLEMENTATION_PPC64 -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=ppc64/implementation.cpp /* begin file src/ppc64/implementation.cpp */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/ppc64/begin.h /* begin file src/simdutf/ppc64/begin.h */ // redefining SIMDUTF_IMPLEMENTATION to "ppc64" // #define SIMDUTF_IMPLEMENTATION ppc64 @@ -23198,7 +26941,6 @@ simdutf_really_inline simd8 must_be_2_3_continuation(const simd8 } // namespace ppc64 } // namespace simdutf -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/buf_block_reader.h /* begin file src/generic/buf_block_reader.h */ namespace simdutf { namespace ppc64 { @@ -23293,7 +27035,6 @@ simdutf_really_inline void buf_block_reader::advance() { } // namespace ppc64 } // namespace simdutf /* end file src/generic/buf_block_reader.h */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_validation/utf8_lookup4_algorithm.h /* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */ namespace simdutf { namespace ppc64 { @@ -23482,7 +27223,6 @@ using utf8_validation::utf8_checker; } // namespace ppc64 } // namespace simdutf /* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_validation/utf8_validator.h /* begin file 
src/generic/utf8_validation/utf8_validator.h */ namespace simdutf { namespace ppc64 { @@ -23610,7 +27350,6 @@ result generic_validate_ascii_with_errors(const char * input, size_t length) { } // namespace simdutf /* end file src/generic/utf8_validation/utf8_validator.h */ // transcoding from UTF-8 to UTF-16 -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf16/valid_utf8_to_utf16.h /* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */ @@ -23685,7 +27424,6 @@ simdutf_warn_unused size_t convert_valid(const char* input, size_t size, } // namespace ppc64 } // namespace simdutf /* end file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf16/utf8_to_utf16.h /* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */ @@ -23993,7 +27731,6 @@ using namespace simd; } // namespace simdutf /* end file src/generic/utf8_to_utf16/utf8_to_utf16.h */ // transcoding from UTF-8 to UTF-32 -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf32/valid_utf8_to_utf32.h /* begin file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */ namespace simdutf { @@ -24039,7 +27776,6 @@ simdutf_warn_unused size_t convert_valid(const char* input, size_t size, } // namespace ppc64 } // namespace simdutf /* end file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf32/utf8_to_utf32.h /* begin file src/generic/utf8_to_utf32/utf8_to_utf32.h */ @@ -24340,7 +28076,6 @@ using namespace simd; } // namespace simdutf /* end file src/generic/utf8_to_utf32/utf8_to_utf32.h */ // other functions -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8.h /* begin file src/generic/utf8.h */ namespace simdutf { @@ -24355,13 +28090,12 @@ simdutf_really_inline size_t 
count_code_points(const char* in, size_t size) { size_t count = 0; for(;pos + 64 <= size; pos += 64) { simd8x64 input(reinterpret_cast(in + pos)); - uint64_t utf8_continuation_mask = input.lt(-65 + 1); - count += 64 - count_ones(utf8_continuation_mask); + uint64_t utf8_continuation_mask = input.gt(-65); + count += count_ones(utf8_continuation_mask); } return count + scalar::utf8::count_code_points(in + pos, size - pos); } - simdutf_really_inline size_t utf16_length_from_utf8(const char* in, size_t size) { size_t pos = 0; size_t count = 0; @@ -24387,7 +28121,6 @@ simdutf_really_inline size_t utf32_length_from_utf8(const char* in, size_t size) } // namespace ppc64 } // namespace simdutf /* end file src/generic/utf8.h */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf16.h /* begin file src/generic/utf16.h */ namespace simdutf { namespace ppc64 { @@ -24398,9 +28131,9 @@ template simdutf_really_inline size_t count_code_points(const char16_t* in, size_t size) { size_t pos = 0; size_t count = 0; - for(;pos + 32 <= size; pos += 32) { + for(;pos < size/32*32; pos += 32) { simd16x32 input(reinterpret_cast(in + pos)); - if (!match_system(big_endian)) input.swap_bytes(); + if (!match_system(big_endian)) { input.swap_bytes(); } uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF); count += count_ones(not_pair) / 2; } @@ -24412,9 +28145,9 @@ simdutf_really_inline size_t utf8_length_from_utf16(const char16_t* in, size_t s size_t pos = 0; size_t count = 0; // This algorithm could no doubt be improved! 
- for(;pos + 32 <= size; pos += 32) { + for(;pos < size/32*32; pos += 32) { simd16x32 input(reinterpret_cast(in + pos)); - if (!match_system(big_endian)) input.swap_bytes(); + if (!match_system(big_endian)) { input.swap_bytes(); } uint64_t ascii_mask = input.lteq(0x7F); uint64_t twobyte_mask = input.lteq(0x7FF); uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF); @@ -24436,7 +28169,7 @@ simdutf_really_inline size_t utf32_length_from_utf16(const char16_t* in, size_t simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t size, char16_t* output) { size_t pos = 0; - while (pos + 32 <= size) { + while (pos < size/32*32) { simd16x32 input(reinterpret_cast(in + pos)); input.swap_bytes(); input.store(reinterpret_cast(output)); @@ -24686,15 +28419,12 @@ simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char * i } // namespace ppc64 } // namespace simdutf -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/ppc64/end.h /* begin file src/simdutf/ppc64/end.h */ /* end file src/simdutf/ppc64/end.h */ /* end file src/ppc64/implementation.cpp */ #endif #if SIMDUTF_IMPLEMENTATION_WESTMERE -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=westmere/implementation.cpp /* begin file src/westmere/implementation.cpp */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/westmere/begin.h /* begin file src/simdutf/westmere/begin.h */ // redefining SIMDUTF_IMPLEMENTATION to "westmere" // #define SIMDUTF_IMPLEMENTATION westmere @@ -24732,7 +28462,84 @@ simdutf_really_inline simd8 must_be_2_3_continuation(const simd8 return simd8(is_third_byte | is_fourth_byte) > int8_t(0); } -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=westmere/sse_detect_encodings.cpp +/* begin file src/westmere/internal/loader.cpp */ +namespace internal { +namespace westmere { + +/* begin file 
src/westmere/internal/write_v_u16_11bits_to_utf8.cpp */ +/* +* reads a vector of uint16 values +* bits after 11th are ignored +* first 11 bits are encoded into utf8 +* !important! utf8_output must have at least 16 writable bytes +*/ + +inline void write_v_u16_11bits_to_utf8( + const __m128i v_u16, + char*& utf8_output, + const __m128i one_byte_bytemask, + const uint16_t one_byte_bitmask +) { + // 0b1100_0000_1000_0000 + const __m128i v_c080 = _mm_set1_epi16((int16_t)0xc080); + // 0b0001_1111_0000_0000 + const __m128i v_1f00 = _mm_set1_epi16((int16_t)0x1f00); + // 0b0000_0000_0011_1111 + const __m128i v_003f = _mm_set1_epi16((int16_t)0x003f); + + // 1. prepare 2-byte values + // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 + // expected output : [110a|aaaa|10bb|bbbb] x 8 + + // t0 = [000a|aaaa|bbbb|bb00] + const __m128i t0 = _mm_slli_epi16(v_u16, 2); + // t1 = [000a|aaaa|0000|0000] + const __m128i t1 = _mm_and_si128(t0, v_1f00); + // t2 = [0000|0000|00bb|bbbb] + const __m128i t2 = _mm_and_si128(v_u16, v_003f); + // t3 = [000a|aaaa|00bb|bbbb] + const __m128i t3 = _mm_or_si128(t1, t2); + // t4 = [110a|aaaa|10bb|bbbb] + const __m128i t4 = _mm_or_si128(t3, v_c080); + + // 2. merge ASCII and 2-byte codewords + const __m128i utf8_unpacked = _mm_blendv_epi8(t4, v_u16, one_byte_bytemask); + + // 3. prepare bitmask for 8-bit lookup + // one_byte_bitmask = hhggffeeddccbbaa -- the bits are doubled (h - MSB, a - LSB) + const uint16_t m0 = one_byte_bitmask & 0x5555; // m0 = 0h0g0f0e0d0c0b0a + const uint16_t m1 = static_cast(m0 >> 7); // m1 = 00000000h0g0f0e0 + const uint8_t m2 = static_cast((m0 | m1) & 0xff); // m2 = hdgcfbea + // 4. pack the bytes + const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0]; + const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1)); + const __m128i utf8_packed = _mm_shuffle_epi8(utf8_unpacked, shuffle); + + // 5. store bytes + _mm_storeu_si128((__m128i*)utf8_output, utf8_packed); + + // 6. 
adjust pointers + utf8_output += row[0]; +}; + +inline void write_v_u16_11bits_to_utf8( + const __m128i v_u16, + char*& utf8_output, + const __m128i v_0000, + const __m128i v_ff80 +) { + // no bits set above 7th bit + const __m128i one_byte_bytemask = _mm_cmpeq_epi16(_mm_and_si128(v_u16, v_ff80), v_0000); + const uint16_t one_byte_bitmask = static_cast(_mm_movemask_epi8(one_byte_bytemask)); + + write_v_u16_11bits_to_utf8( + v_u16, utf8_output, one_byte_bytemask, one_byte_bitmask); +}; +/* end file src/westmere/internal/write_v_u16_11bits_to_utf8.cpp */ + +} // namespace westmere +} // namespace internal +/* end file src/westmere/internal/loader.cpp */ /* begin file src/westmere/sse_detect_encodings.cpp */ template // len is known to be a multiple of 2 when this is called @@ -24785,7 +28592,7 @@ int sse_detect_encodings(const char * buf, size_t len) { // To be valid UTF-32, a surrogate cannot be in the two most significant bytes of any 32-bit word. // On the other hand, to be valid UTF-16LE, at least one surrogate must be in the two most significant // bytes of a 32-bit word since they always come in pairs in UTF-16LE. - // Note that we always proceed in multiple of 4 before this point so there is no offset in 32-bit words. + // Note that we always proceed in multiple of 4 before this point so there is no offset in 32-bit code units. if (((surrogates_bitmask0 | surrogates_bitmask1) & 0xaaaa) != 0) { is_utf32 = false; @@ -24942,10 +28749,9 @@ int sse_detect_encodings(const char * buf, size_t len) { } /* end file src/westmere/sse_detect_encodings.cpp */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=westmere/sse_validate_utf16.cpp /* begin file src/westmere/sse_validate_utf16.cpp */ /* - In UTF-16 words in range 0xD800 to 0xDFFF have special meaning. + In UTF-16 code units in range 0xD800 to 0xDFFF have special meaning. In a vectorized algorithm we want to examine the most significant nibble in order to select a fast path. 
If none of highest nibbles @@ -24981,7 +28787,7 @@ int sse_detect_encodings(const char * buf, size_t len) { 0 0 1 0 1 0 0 0 b = a << 1 1 1 1 1 1 1 1 0 c = V | a | b ^ - the last bit can be zero, we just consume 7 words + the last bit can be zero, we just consume 7 code units and recheck this word in the next iteration */ @@ -25026,7 +28832,7 @@ const char16_t* sse_validate_utf16(const char16_t* input, size_t size) { // // Fact: high surrogate has 11th bit set (3rd bit in the higher word) - // V - non-surrogate words + // V - non-surrogate code units // V = not surrogates_wordmask const uint16_t V = static_cast(~surrogates_bitmask); @@ -25047,10 +28853,10 @@ const char16_t* sse_validate_utf16(const char16_t* input, size_t size) { if (c == 0xffff) { // The whole input register contains valid UTF-16, i.e., - // either single words or proper surrogate pairs. + // either single code units or proper surrogate pairs. input += 16; } else if (c == 0x7fff) { - // The 15 lower words of the input register contains valid UTF-16. + // The 15 lower code units of the input register contains valid UTF-16. // The 15th word may be either a low or high surrogate. It the next // iteration we 1) check if the low surrogate is followed by a high // one, 2) reject sole high surrogate. @@ -25104,7 +28910,7 @@ const result sse_validate_utf16_with_errors(const char16_t* input, size_t size) // // Fact: high surrogate has 11th bit set (3rd bit in the higher word) - // V - non-surrogate words + // V - non-surrogate code units // V = not surrogates_wordmask const uint16_t V = static_cast(~surrogates_bitmask); @@ -25125,10 +28931,10 @@ const result sse_validate_utf16_with_errors(const char16_t* input, size_t size) if (c == 0xffff) { // The whole input register contains valid UTF-16, i.e., - // either single words or proper surrogate pairs. + // either single code units or proper surrogate pairs. 
input += 16; } else if (c == 0x7fff) { - // The 15 lower words of the input register contains valid UTF-16. + // The 15 lower code units of the input register contains valid UTF-16. // The 15th word may be either a low or high surrogate. It the next // iteration we 1) check if the low surrogate is followed by a high // one, 2) reject sole high surrogate. @@ -25142,7 +28948,6 @@ const result sse_validate_utf16_with_errors(const char16_t* input, size_t size) return result(error_code::SUCCESS, input - start); } /* end file src/westmere/sse_validate_utf16.cpp */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=westmere/sse_validate_utf32le.cpp /* begin file src/westmere/sse_validate_utf32le.cpp */ /* Returns: - pointer to the last unprocessed character (a scalar fallback should check the rest); @@ -25208,7 +29013,144 @@ const result sse_validate_utf32le_with_errors(const char32_t* input, size_t size } /* end file src/westmere/sse_validate_utf32le.cpp */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=westmere/sse_convert_utf8_to_utf16.cpp +/* begin file src/westmere/sse_convert_latin1_to_utf8.cpp */ +std::pair sse_convert_latin1_to_utf8( + const char* latin_input, + const size_t latin_input_length, + char* utf8_output) { + const char* end = latin_input + latin_input_length; + + const __m128i v_0000 = _mm_setzero_si128(); + // 0b1000_0000 + const __m128i v_80 = _mm_set1_epi8((uint8_t)0x80); + // 0b1111_1111_1000_0000 + const __m128i v_ff80 = _mm_set1_epi16((uint16_t)0xff80); + + const __m128i latin_1_half_into_u16_byte_mask = _mm_setr_epi8( + 0, '\x80', + 1, '\x80', + 2, '\x80', + 3, '\x80', + 4, '\x80', + 5, '\x80', + 6, '\x80', + 7, '\x80' + ); + + const __m128i latin_2_half_into_u16_byte_mask = _mm_setr_epi8( + 8, '\x80', + 9, '\x80', + 10, '\x80', + 11, '\x80', + 12, '\x80', + 13, '\x80', + 14, '\x80', + 15, '\x80' + ); + + // each latin1 takes 1-2 utf8 bytes + // slow path writes useful 8-15 bytes 
twice (eagerly writes 16 bytes and then adjust the pointer) + // so the last write can exceed the utf8_output size by 8-1 bytes + // by reserving 8 extra input bytes, we expect the output to have 8-16 bytes free + while (latin_input + 16 + 8 <= end) { + // Load 16 Latin1 characters (16 bytes) into a 128-bit register + __m128i v_latin = _mm_loadu_si128((__m128i*)latin_input); + + + if (_mm_testz_si128(v_latin, v_80)) {// ASCII fast path!!!! + _mm_storeu_si128((__m128i*)utf8_output, v_latin); + latin_input += 16; + utf8_output += 16; + continue; + } + + + // assuming a/b are bytes and A/B are uint16 of the same value + // aaaa_aaaa_bbbb_bbbb -> AAAA_AAAA + __m128i v_u16_latin_1_half = _mm_shuffle_epi8(v_latin, latin_1_half_into_u16_byte_mask); + // aaaa_aaaa_bbbb_bbbb -> BBBB_BBBB + __m128i v_u16_latin_2_half = _mm_shuffle_epi8(v_latin, latin_2_half_into_u16_byte_mask); + + + internal::westmere::write_v_u16_11bits_to_utf8(v_u16_latin_1_half, utf8_output, v_0000, v_ff80); + internal::westmere::write_v_u16_11bits_to_utf8(v_u16_latin_2_half, utf8_output, v_0000, v_ff80); + latin_input += 16; + } + + if (latin_input + 16 <= end) { + // Load 16 Latin1 characters (16 bytes) into a 128-bit register + __m128i v_latin = _mm_loadu_si128((__m128i*)latin_input); + + if (_mm_testz_si128(v_latin, v_80)) {// ASCII fast path!!!! 
+ _mm_storeu_si128((__m128i*)utf8_output, v_latin); + latin_input += 16; + utf8_output += 16; + } else { + // assuming a/b are bytes and A/B are uint16 of the same value + // aaaa_aaaa_bbbb_bbbb -> AAAA_AAAA + __m128i v_u16_latin_1_half = _mm_shuffle_epi8(v_latin, latin_1_half_into_u16_byte_mask); + internal::westmere::write_v_u16_11bits_to_utf8(v_u16_latin_1_half, utf8_output, v_0000, v_ff80); + latin_input += 8; + } + } + + return std::make_pair(latin_input, utf8_output); +}; +/* end file src/westmere/sse_convert_latin1_to_utf8.cpp */ +/* begin file src/westmere/sse_convert_latin1_to_utf16.cpp */ +template +std::pair sse_convert_latin1_to_utf16(const char *latin1_input, size_t len, + char16_t *utf16_output) { + size_t rounded_len = len & ~0xF; // Round down to nearest multiple of 16 + for (size_t i = 0; i < rounded_len; i += 16) { + // Load 16 Latin1 characters into a 128-bit register + __m128i in = _mm_loadu_si128(reinterpret_cast(&latin1_input[i])); + __m128i out1 = big_endian ? _mm_unpacklo_epi8(_mm_setzero_si128(), in) + : _mm_unpacklo_epi8(in, _mm_setzero_si128()); + __m128i out2 = big_endian ? 
_mm_unpackhi_epi8(_mm_setzero_si128(), in) + : _mm_unpackhi_epi8(in, _mm_setzero_si128()); + // Zero extend each Latin1 character to 16-bit integers and store the results back to memory + _mm_storeu_si128(reinterpret_cast<__m128i*>(&utf16_output[i]), out1); + _mm_storeu_si128(reinterpret_cast<__m128i*>(&utf16_output[i + 8]), out2); + } + // return pointers pointing to where we left off + return std::make_pair(latin1_input + rounded_len, utf16_output + rounded_len); +} +/* end file src/westmere/sse_convert_latin1_to_utf16.cpp */ +/* begin file src/westmere/sse_convert_latin1_to_utf32.cpp */ +std::pair sse_convert_latin1_to_utf32(const char* buf, size_t len, char32_t* utf32_output) { + const char* end = buf + len; + + while (buf + 16 <= end) { + // Load 16 Latin1 characters (16 bytes) into a 128-bit register + __m128i in = _mm_loadu_si128((__m128i*)buf); + + // Shift input to process next 4 bytes + __m128i in_shifted1 = _mm_srli_si128(in, 4); + __m128i in_shifted2 = _mm_srli_si128(in, 8); + __m128i in_shifted3 = _mm_srli_si128(in, 12); + + // expand 8-bit to 32-bit unit + __m128i out1 = _mm_cvtepu8_epi32(in); + __m128i out2 = _mm_cvtepu8_epi32(in_shifted1); + __m128i out3 = _mm_cvtepu8_epi32(in_shifted2); + __m128i out4 = _mm_cvtepu8_epi32(in_shifted3); + + _mm_storeu_si128((__m128i*)utf32_output, out1); + _mm_storeu_si128((__m128i*)(utf32_output + 4), out2); + _mm_storeu_si128((__m128i*)(utf32_output + 8), out3); + _mm_storeu_si128((__m128i*)(utf32_output + 12), out4); + + utf32_output += 16; + buf += 16; + } + + return std::make_pair(buf, utf32_output); +} + +/* end file src/westmere/sse_convert_latin1_to_utf32.cpp */ + + /* begin file src/westmere/sse_convert_utf8_to_utf16.cpp */ // depends on "tables/utf8_to_utf16_tables.h" @@ -25249,7 +29191,7 @@ size_t convert_masked_utf8_to_utf16(const char *input, return 16; // We consumed 16 bytes. 
} if(((utf8_end_of_code_point_mask & 0xFFFF) == 0xaaaa)) { - // We want to take 8 2-byte UTF-8 words and turn them into 8 2-byte UTF-16 words. + // We want to take 8 2-byte UTF-8 code units and turn them into 8 2-byte UTF-16 code units. // There is probably a more efficient sequence, but the following might do. const __m128i sh = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); const __m128i perm = _mm_shuffle_epi8(in, sh); @@ -25262,7 +29204,7 @@ size_t convert_masked_utf8_to_utf16(const char *input, return 16; } if(input_utf8_end_of_code_point_mask == 0x924) { - // We want to take 4 3-byte UTF-8 words and turn them into 4 2-byte UTF-16 words. + // We want to take 4 3-byte UTF-8 code units and turn them into 4 2-byte UTF-16 code units. // There is probably a more efficient sequence, but the following might do. const __m128i sh = _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1); const __m128i perm = _mm_shuffle_epi8(in, sh); @@ -25289,10 +29231,10 @@ size_t convert_masked_utf8_to_utf16(const char *input, const uint8_t consumed = tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1]; if (idx < 64) { - // SIX (6) input code-words + // SIX (6) input code-code units // this is a relatively easy scenario - // we process SIX (6) input code-words. The max length in bytes of six code - // words spanning between 1 and 2 bytes each is 12 bytes. On processors + // we process SIX (6) input code-code units. The max length in bytes of six code + // code units spanning between 1 and 2 bytes each is 12 bytes. On processors // where pdep/pext is fast, we might be able to use a small lookup table. const __m128i sh = _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]); @@ -25304,7 +29246,7 @@ size_t convert_masked_utf8_to_utf16(const char *input, _mm_storeu_si128((__m128i *)utf16_output, composed); utf16_output += 6; // We wrote 12 bytes, 6 code points. 
} else if (idx < 145) { - // FOUR (4) input code-words + // FOUR (4) input code-code units const __m128i sh = _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]); const __m128i perm = _mm_shuffle_epi8(in, sh); @@ -25323,7 +29265,7 @@ size_t convert_masked_utf8_to_utf16(const char *input, _mm_storeu_si128((__m128i *)utf16_output, composed_repacked); utf16_output += 4; } else if (idx < 209) { - // TWO (2) input code-words + // TWO (2) input code-code units ////////////// // There might be garbage inputs where a leading byte mascarades as a four-byte // leading byte (by being followed by 3 continuation byte), but is not greater than @@ -25393,7 +29335,6 @@ size_t convert_masked_utf8_to_utf16(const char *input, return consumed; } /* end file src/westmere/sse_convert_utf8_to_utf16.cpp */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=westmere/sse_convert_utf8_to_utf32.cpp /* begin file src/westmere/sse_convert_utf8_to_utf32.cpp */ // depends on "tables/utf8_to_utf16_tables.h" @@ -25428,7 +29369,7 @@ size_t convert_masked_utf8_to_utf32(const char *input, return 16; // We consumed 16 bytes. } if(((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa)) { - // We want to take 8 2-byte UTF-8 words and turn them into 8 4-byte UTF-32 words. + // We want to take 8 2-byte UTF-8 code units and turn them into 8 4-byte UTF-32 code units. // There is probably a more efficient sequence, but the following might do. const __m128i sh = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); const __m128i perm = _mm_shuffle_epi8(in, sh); @@ -25441,7 +29382,7 @@ size_t convert_masked_utf8_to_utf32(const char *input, return 16; } if(input_utf8_end_of_code_point_mask == 0x924) { - // We want to take 4 3-byte UTF-8 words and turn them into 4 4-byte UTF-32 words. + // We want to take 4 3-byte UTF-8 code units and turn them into 4 4-byte UTF-32 code units. // There is probably a more efficient sequence, but the following might do. 
const __m128i sh = _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1); const __m128i perm = _mm_shuffle_epi8(in, sh); @@ -25466,10 +29407,10 @@ size_t convert_masked_utf8_to_utf32(const char *input, const uint8_t consumed = tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1]; if (idx < 64) { - // SIX (6) input code-words + // SIX (6) input code-code units // this is a relatively easy scenario - // we process SIX (6) input code-words. The max length in bytes of six code - // words spanning between 1 and 2 bytes each is 12 bytes. On processors + // we process SIX (6) input code-code units. The max length in bytes of six code + // code units spanning between 1 and 2 bytes each is 12 bytes. On processors // where pdep/pext is fast, we might be able to use a small lookup table. const __m128i sh = _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]); @@ -25481,7 +29422,7 @@ size_t convert_masked_utf8_to_utf32(const char *input, _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output+4), _mm_cvtepu16_epi32(_mm_srli_si128(composed,8))); utf32_output += 6; // We wrote 12 bytes, 6 code points. 
} else if (idx < 145) { - // FOUR (4) input code-words + // FOUR (4) input code-code units const __m128i sh = _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]); const __m128i perm = _mm_shuffle_epi8(in, sh); @@ -25498,7 +29439,7 @@ size_t convert_masked_utf8_to_utf32(const char *input, _mm_storeu_si128((__m128i *)utf32_output, composed); utf32_output += 4; } else if (idx < 209) { - // TWO (2) input code-words + // TWO (2) input code-code units const __m128i sh = _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]); const __m128i perm = _mm_shuffle_epi8(in, sh); @@ -25523,13 +29464,130 @@ size_t convert_masked_utf8_to_utf32(const char *input, } return consumed; } -/* end file src/westmere/sse_convert_utf8_to_utf32.cpp */ - -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=westmere/sse_convert_utf16_to_utf8.cpp +/* end file src/westmere/sse_convert_utf8_to_utf32.cpp */ +/* begin file src/westmere/sse_convert_utf8_to_latin1.cpp */ +// depends on "tables/utf8_to_utf16_tables.h" + + +// Convert up to 12 bytes from utf8 to latin1 using a mask indicating the +// end of the code points. Only the least significant 12 bits of the mask +// are accessed. +// It returns how many bytes were consumed (up to 12). +size_t convert_masked_utf8_to_latin1(const char *input, + uint64_t utf8_end_of_code_point_mask, + char *&latin1_output) { + // we use an approach where we try to process up to 12 input bytes. + // Why 12 input bytes and not 16? Because we are concerned with the size of + // the lookup tables. Also 12 is nicely divisible by two and three. + // + // + // Optimization note: our main path below is load-latency dependent. Thus it is maybe + // beneficial to have fast paths that depend on branch prediction but have less latency. + // This results in more instructions but, potentially, also higher speeds. 
+ // + const __m128i in = _mm_loadu_si128((__m128i *)input); + const uint16_t input_utf8_end_of_code_point_mask = + utf8_end_of_code_point_mask & 0xfff; //we're only processing 12 bytes in case it`s not all ASCII + if(((utf8_end_of_code_point_mask & 0xffff) == 0xffff)) { + // We process the data in chunks of 16 bytes. + _mm_storeu_si128(reinterpret_cast<__m128i *>(latin1_output), in); + latin1_output += 16; // We wrote 16 characters. + return 16; // We consumed 16 bytes. + } + /// We do not have a fast path available, so we fallback. + const uint8_t idx = + tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0]; + const uint8_t consumed = + tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1]; + // this indicates an invalid input: + if(idx >= 64) { return consumed; } + // Here we should have (idx < 64), if not, there is a bug in the validation or elsewhere. + // SIX (6) input code-code units + // this is a relatively easy scenario + // we process SIX (6) input code-code units. The max length in bytes of six code + // code units spanning between 1 and 2 bytes each is 12 bytes. On processors + // where pdep/pext is fast, we might be able to use a small lookup table. + const __m128i sh = + _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f)); + const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00)); + __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2)); + const __m128i latin1_packed = _mm_packus_epi16(composed,composed); + // writing 8 bytes even though we only care about the first 6 bytes. + // performance note: it would be faster to use _mm_storeu_si128, we should investigate. + _mm_storel_epi64((__m128i *)latin1_output, latin1_packed); + latin1_output += 6; // We wrote 6 bytes. 
+ return consumed; +} +/* end file src/westmere/sse_convert_utf8_to_latin1.cpp */ + +/* begin file src/westmere/sse_convert_utf16_to_latin1.cpp */ +template +std::pair sse_convert_utf16_to_latin1(const char16_t* buf, size_t len, char* latin1_output) { + const char16_t* end = buf + len; + while (buf + 8 <= end) { + // Load 8 UTF-16 characters into 128-bit SSE register + __m128i in = _mm_loadu_si128(reinterpret_cast(buf)); + + if (!match_system(big_endian)) { + const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + in = _mm_shuffle_epi8(in, swap); + } + + __m128i high_byte_mask = _mm_set1_epi16((int16_t)0xFF00); + if (_mm_testz_si128(in, high_byte_mask)) { + // Pack 16-bit characters into 8-bit and store in latin1_output + __m128i latin1_packed = _mm_packus_epi16(in, in); + _mm_storel_epi64(reinterpret_cast<__m128i*>(latin1_output), latin1_packed); + // Adjust pointers for next iteration + buf += 8; + latin1_output += 8; + } else { + return std::make_pair(nullptr, reinterpret_cast(latin1_output)); + } + } // while + return std::make_pair(buf, latin1_output); +} + +template +std::pair sse_convert_utf16_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) { + const char16_t* start = buf; + const char16_t* end = buf + len; + while (buf + 8 <= end) { + __m128i in = _mm_loadu_si128(reinterpret_cast(buf)); + + if (!big_endian) { + const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + in = _mm_shuffle_epi8(in, swap); + } + + __m128i high_byte_mask = _mm_set1_epi16((int16_t)0xFF00); + if (_mm_testz_si128(in, high_byte_mask)) { + __m128i latin1_packed = _mm_packus_epi16(in, in); + _mm_storel_epi64(reinterpret_cast<__m128i*>(latin1_output), latin1_packed); + buf += 8; + latin1_output += 8; + } else { + // Fallback to scalar code for handling errors + for(int k = 0; k < 8; k++) { + uint16_t word = !match_system(big_endian) ? 
scalar::utf16::swap_bytes(buf[k]) : buf[k]; + if(word <= 0xff) { + *latin1_output++ = char(word); + } else { + return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), latin1_output); + } + } + buf += 8; + } + } // while + return std::make_pair(result(error_code::SUCCESS, buf - start), latin1_output); +} +/* end file src/westmere/sse_convert_utf16_to_latin1.cpp */ /* begin file src/westmere/sse_convert_utf16_to_utf8.cpp */ /* The vectorized algorithm works on single SSE register i.e., it - loads eight 16-bit words. + loads eight 16-bit code units. We consider three cases: 1. an input register contains no surrogates and each value @@ -25541,7 +29599,7 @@ size_t convert_masked_utf8_to_utf32(const char *input, Ad 1. - When values are less than 0x0800, it means that a 16-bit words + When values are less than 0x0800, it means that a 16-bit code unit can be converted into: 1) single UTF8 byte (when it's an ASCII char) or 2) two UTF8 bytes. @@ -25555,7 +29613,7 @@ size_t convert_masked_utf8_to_utf32(const char *input, Ad 2. - When values fit in 16-bit words, but are above 0x07ff, then + When values fit in 16-bit code units, but are above 0x07ff, then a single word may produce one, two or three UTF8 bytes. We prepare data for all these three cases in two registers. @@ -25588,7 +29646,6 @@ std::pair sse_convert_utf16_to_utf8(const char16_t* buf, const __m128i v_0000 = _mm_setzero_si128(); const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800); const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800); - const __m128i v_c080 = _mm_set1_epi16((int16_t)0xc080); const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92 while (buf + 16 + safety_margin <= end) { @@ -25637,44 +29694,9 @@ std::pair sse_convert_utf16_to_utf8(const char16_t* buf, const uint16_t one_or_two_bytes_bitmask = static_cast(_mm_movemask_epi8(one_or_two_bytes_bytemask)); if (one_or_two_bytes_bitmask == 0xffff) { - // 1. 
prepare 2-byte values - // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 - // expected output : [110a|aaaa|10bb|bbbb] x 8 - const __m128i v_1f00 = _mm_set1_epi16((int16_t)0x1f00); - const __m128i v_003f = _mm_set1_epi16((int16_t)0x003f); - - // t0 = [000a|aaaa|bbbb|bb00] - const __m128i t0 = _mm_slli_epi16(in, 2); - // t1 = [000a|aaaa|0000|0000] - const __m128i t1 = _mm_and_si128(t0, v_1f00); - // t2 = [0000|0000|00bb|bbbb] - const __m128i t2 = _mm_and_si128(in, v_003f); - // t3 = [000a|aaaa|00bb|bbbb] - const __m128i t3 = _mm_or_si128(t1, t2); - // t4 = [110a|aaaa|10bb|bbbb] - const __m128i t4 = _mm_or_si128(t3, v_c080); - - // 2. merge ASCII and 2-byte codewords - const __m128i utf8_unpacked = _mm_blendv_epi8(t4, in, one_byte_bytemask); - - // 3. prepare bitmask for 8-bit lookup - // one_byte_bitmask = hhggffeeddccbbaa -- the bits are doubled (h - MSB, a - LSB) - const uint16_t m0 = one_byte_bitmask & 0x5555; // m0 = 0h0g0f0e0d0c0b0a - const uint16_t m1 = static_cast(m0 >> 7); // m1 = 00000000h0g0f0e0 - const uint8_t m2 = static_cast((m0 | m1) & 0xff); // m2 = hdgcfbea - // 4. pack the bytes - const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0]; - const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1)); - const __m128i utf8_packed = _mm_shuffle_epi8(utf8_unpacked, shuffle); - - // 5. store bytes - _mm_storeu_si128((__m128i*)utf8_output, utf8_packed); - - // 6. adjust pointers - buf += 8; - utf8_output += row[0]; - continue; - + internal::westmere::write_v_u16_11bits_to_utf8(in, utf8_output, one_byte_bytemask, one_byte_bitmask); + buf += 8; + continue; } // 1. Check if there are any surrogate word in the input chunk. @@ -25688,7 +29710,7 @@ std::pair sse_convert_utf16_to_utf8(const char16_t* buf, // It might seem like checking for surrogates_bitmask == 0xc000 could help. However, // it is likely an uncommon occurrence. 
if (surrogates_bitmask == 0x0000) { - // case: words from register produce either 1, 2 or 3 UTF-8 bytes + // case: code units from register produce either 1, 2 or 3 UTF-8 bytes const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); @@ -25697,7 +29719,7 @@ std::pair sse_convert_utf16_to_utf8(const char16_t* buf, 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two UTF-8 bytes 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes - We expand the input word (16-bit) into two words (32-bit), thus + We expand the input word (16-bit) into two code units (32-bit), thus we have room for four bytes. However, we need five distinct bit layouts. Note that the last byte in cases #2 and #3 is the same. @@ -25708,7 +29730,7 @@ std::pair sse_convert_utf16_to_utf8(const char16_t* buf, either byte 1 for case #2 or byte 2 for case #3. Note that they differ by exactly one bit. - Finally from these two words we build proper UTF-8 sequence, taking + Finally from these two code units we build proper UTF-8 sequence, taking into account the case (i.e, the number of bytes to write). */ /** @@ -25736,15 +29758,15 @@ std::pair sse_convert_utf16_to_utf8(const char16_t* buf, const __m128i s4 = _mm_xor_si128(s3, m0); #undef simdutf_vec - // 4. expand words 16-bit => 32-bit + // 4. expand code units 16-bit => 32-bit const __m128i out0 = _mm_unpacklo_epi16(t2, s4); const __m128i out1 = _mm_unpackhi_epi16(t2, s4); - // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle + // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle const uint16_t mask = (one_byte_bitmask & 0x5555) | (one_or_two_bytes_bitmask & 0xaaaa); if(mask == 0) { - // We only have three-byte words. Use fast path. + // We only have three-byte code units. Use fast path. 
const __m128i shuffle = _mm_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle); const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle); @@ -25828,7 +29850,6 @@ std::pair sse_convert_utf16_to_utf8_with_errors(const char16_t* b const __m128i v_0000 = _mm_setzero_si128(); const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800); const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800); - const __m128i v_c080 = _mm_set1_epi16((int16_t)0xc080); const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92 while (buf + 16 + safety_margin <= end) { @@ -25877,44 +29898,9 @@ std::pair sse_convert_utf16_to_utf8_with_errors(const char16_t* b const uint16_t one_or_two_bytes_bitmask = static_cast(_mm_movemask_epi8(one_or_two_bytes_bytemask)); if (one_or_two_bytes_bitmask == 0xffff) { - // 1. prepare 2-byte values - // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 - // expected output : [110a|aaaa|10bb|bbbb] x 8 - const __m128i v_1f00 = _mm_set1_epi16((int16_t)0x1f00); - const __m128i v_003f = _mm_set1_epi16((int16_t)0x003f); - - // t0 = [000a|aaaa|bbbb|bb00] - const __m128i t0 = _mm_slli_epi16(in, 2); - // t1 = [000a|aaaa|0000|0000] - const __m128i t1 = _mm_and_si128(t0, v_1f00); - // t2 = [0000|0000|00bb|bbbb] - const __m128i t2 = _mm_and_si128(in, v_003f); - // t3 = [000a|aaaa|00bb|bbbb] - const __m128i t3 = _mm_or_si128(t1, t2); - // t4 = [110a|aaaa|10bb|bbbb] - const __m128i t4 = _mm_or_si128(t3, v_c080); - - // 2. merge ASCII and 2-byte codewords - const __m128i utf8_unpacked = _mm_blendv_epi8(t4, in, one_byte_bytemask); - - // 3. prepare bitmask for 8-bit lookup - // one_byte_bitmask = hhggffeeddccbbaa -- the bits are doubled (h - MSB, a - LSB) - const uint16_t m0 = one_byte_bitmask & 0x5555; // m0 = 0h0g0f0e0d0c0b0a - const uint16_t m1 = static_cast(m0 >> 7); // m1 = 00000000h0g0f0e0 - const uint8_t m2 = static_cast((m0 | m1) & 0xff); // m2 = hdgcfbea - // 4. 
pack the bytes - const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0]; - const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1)); - const __m128i utf8_packed = _mm_shuffle_epi8(utf8_unpacked, shuffle); - - // 5. store bytes - _mm_storeu_si128((__m128i*)utf8_output, utf8_packed); - - // 6. adjust pointers - buf += 8; - utf8_output += row[0]; - continue; - + internal::westmere::write_v_u16_11bits_to_utf8(in, utf8_output, one_byte_bytemask, one_byte_bitmask); + buf += 8; + continue; } // 1. Check if there are any surrogate word in the input chunk. @@ -25928,7 +29914,7 @@ std::pair sse_convert_utf16_to_utf8_with_errors(const char16_t* b // It might seem like checking for surrogates_bitmask == 0xc000 could help. However, // it is likely an uncommon occurrence. if (surrogates_bitmask == 0x0000) { - // case: words from register produce either 1, 2 or 3 UTF-8 bytes + // case: code units from register produce either 1, 2 or 3 UTF-8 bytes const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); @@ -25937,7 +29923,7 @@ std::pair sse_convert_utf16_to_utf8_with_errors(const char16_t* b 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two UTF-8 bytes 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes - We expand the input word (16-bit) into two words (32-bit), thus + We expand the input word (16-bit) into two code units (32-bit), thus we have room for four bytes. However, we need five distinct bit layouts. Note that the last byte in cases #2 and #3 is the same. @@ -25948,7 +29934,7 @@ std::pair sse_convert_utf16_to_utf8_with_errors(const char16_t* b either byte 1 for case #2 or byte 2 for case #3. Note that they differ by exactly one bit. - Finally from these two words we build proper UTF-8 sequence, taking + Finally from these two code units we build proper UTF-8 sequence, taking into account the case (i.e, the number of bytes to write). 
*/ /** @@ -25976,15 +29962,15 @@ std::pair sse_convert_utf16_to_utf8_with_errors(const char16_t* b const __m128i s4 = _mm_xor_si128(s3, m0); #undef simdutf_vec - // 4. expand words 16-bit => 32-bit + // 4. expand code units 16-bit => 32-bit const __m128i out0 = _mm_unpacklo_epi16(t2, s4); const __m128i out1 = _mm_unpackhi_epi16(t2, s4); - // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle + // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle const uint16_t mask = (one_byte_bitmask & 0x5555) | (one_or_two_bytes_bitmask & 0xaaaa); if(mask == 0) { - // We only have three-byte words. Use fast path. + // We only have three-byte code units. Use fast path. const __m128i shuffle = _mm_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle); const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle); @@ -26053,11 +30039,10 @@ std::pair sse_convert_utf16_to_utf8_with_errors(const char16_t* b return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output); } /* end file src/westmere/sse_convert_utf16_to_utf8.cpp */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=westmere/sse_convert_utf16_to_utf32.cpp /* begin file src/westmere/sse_convert_utf16_to_utf32.cpp */ /* The vectorized algorithm works on single SSE register i.e., it - loads eight 16-bit words. + loads eight 16-bit code units. We consider three cases: 1. an input register contains no surrogates and each value @@ -26069,7 +30054,7 @@ std::pair sse_convert_utf16_to_utf8_with_errors(const char16_t* b Ad 1. - When values are less than 0x0800, it means that a 16-bit words + When values are less than 0x0800, it means that a 16-bit code unit can be converted into: 1) single UTF8 byte (when it's an ASCII char) or 2) two UTF8 bytes. @@ -26083,7 +30068,7 @@ std::pair sse_convert_utf16_to_utf8_with_errors(const char16_t* b Ad 2. 
- When values fit in 16-bit words, but are above 0x07ff, then + When values fit in 16-bit code units, but are above 0x07ff, then a single word may produce one, two or three UTF8 bytes. We prepare data for all these three cases in two registers. @@ -26115,7 +30100,7 @@ std::pair sse_convert_utf16_to_utf32(const char16_t* const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800); const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800); - while (buf + 16 <= end) { + while (buf + 8 <= end) { __m128i in = _mm_loadu_si128((__m128i*)buf); if (big_endian) { @@ -26134,7 +30119,7 @@ std::pair sse_convert_utf16_to_utf32(const char16_t* // It might seem like checking for surrogates_bitmask == 0xc000 could help. However, // it is likely an uncommon occurrence. if (surrogates_bitmask == 0x0000) { - // case: no surrogate pair, extend 16-bit words to 32-bit words + // case: no surrogate pair, extend 16-bit code units to 32-bit code units _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output), _mm_cvtepu16_epi32(in)); _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output+4), _mm_cvtepu16_epi32(_mm_srli_si128(in,8))); utf32_output += 8; @@ -26183,7 +30168,7 @@ std::pair sse_convert_utf16_to_utf32_with_errors(const char16 const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800); const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800); - while (buf + 16 <= end) { + while (buf + 8 <= end) { __m128i in = _mm_loadu_si128((__m128i*)buf); if (big_endian) { @@ -26202,7 +30187,7 @@ std::pair sse_convert_utf16_to_utf32_with_errors(const char16 // It might seem like checking for surrogates_bitmask == 0xc000 could help. However, // it is likely an uncommon occurrence. 
if (surrogates_bitmask == 0x0000) { - // case: no surrogate pair, extend 16-bit words to 32-bit words + // case: no surrogate pair, extend 16-bit code units to 32-bit code units _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output), _mm_cvtepu16_epi32(in)); _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output+4), _mm_cvtepu16_epi32(_mm_srli_si128(in,8))); utf32_output += 8; @@ -26237,51 +30222,154 @@ std::pair sse_convert_utf16_to_utf32_with_errors(const char16 } /* end file src/westmere/sse_convert_utf16_to_utf32.cpp */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=westmere/sse_convert_utf32_to_utf8.cpp +/* begin file src/westmere/sse_convert_utf32_to_latin1.cpp */ +std::pair +sse_convert_utf32_to_latin1(const char32_t *buf, size_t len, + char *latin1_output) { + const size_t rounded_len = len & ~0xF; // Round down to nearest multiple of 16 + + __m128i high_bytes_mask = _mm_set1_epi32(0xFFFFFF00); + __m128i shufmask_1 = + _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 8, 4, 0); + + for (size_t i = 0; i < rounded_len; i += 16) { + __m128i in1 = _mm_loadu_si128((__m128i *)buf); + __m128i in2 = _mm_loadu_si128((__m128i *)(buf + 4)); + __m128i in3 = _mm_loadu_si128((__m128i *)(buf + 8)); + __m128i in4 = _mm_loadu_si128((__m128i *)(buf + 12)); + + __m128i check_combined = _mm_or_si128(in1, in2); + check_combined = _mm_or_si128(check_combined, in3); + check_combined = _mm_or_si128(check_combined, in4); + + if (!_mm_testz_si128(check_combined, high_bytes_mask)) { + return std::make_pair(nullptr, latin1_output); + } + + __m128i shuffled1 = _mm_shuffle_epi8(in1, shufmask_1); + _mm_storeu_si64(latin1_output, shuffled1); + __m128i shuffled2 = _mm_shuffle_epi8(in2, shufmask_1); + _mm_storeu_si64(latin1_output + 4, shuffled2); + __m128i shuffled3 = _mm_shuffle_epi8(in3, shufmask_1); + _mm_storeu_si64(latin1_output + 8, shuffled3); + __m128i shuffled4 = _mm_shuffle_epi8(in4, shufmask_1); + + 
*reinterpret_cast(latin1_output + 12) = + _mm_cvtsi128_si32(shuffled4); + + latin1_output += 16; + buf += 16; + } + + return std::make_pair(buf, latin1_output); +} + +std::pair +sse_convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len, + char *latin1_output) { + const char32_t *start = buf; + const size_t rounded_len = len & ~0xF; // Round down to nearest multiple of 16 + + __m128i high_bytes_mask = _mm_set1_epi32(0xFFFFFF00); + __m128i shufmask = + _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 8, 4, 0); + + for (size_t i = 0; i < rounded_len; i += 16) { + __m128i in1 = _mm_loadu_si128((__m128i *)buf); + __m128i in2 = _mm_loadu_si128((__m128i *)(buf + 4)); + __m128i in3 = _mm_loadu_si128((__m128i *)(buf + 8)); + __m128i in4 = _mm_loadu_si128((__m128i *)(buf + 12)); + + __m128i check_combined = _mm_or_si128(in1, in2); + check_combined = _mm_or_si128(check_combined, in3); + check_combined = _mm_or_si128(check_combined, in4); + + if (!_mm_testz_si128(check_combined, high_bytes_mask)) { + // Fallback to scalar code for handling errors + for (int k = 0; k < 16; k++) { + char32_t codepoint = buf[k]; + if (codepoint <= 0xff) { + *latin1_output++ = char(codepoint); + } else { + return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), + latin1_output); + } + } + buf += 16; + continue; + } + + __m128i shuffled1 = _mm_shuffle_epi8(in1, shufmask); + _mm_storeu_si64(latin1_output, shuffled1); + __m128i shuffled2 = _mm_shuffle_epi8(in2, shufmask); + _mm_storeu_si64(latin1_output + 4, shuffled2); + __m128i shuffled3 = _mm_shuffle_epi8(in3, shufmask); + _mm_storeu_si64(latin1_output + 8, shuffled3); + __m128i shuffled4 = _mm_shuffle_epi8(in4, shufmask); + *reinterpret_cast(latin1_output + 12) = + _mm_cvtsi128_si32(shuffled4); + + latin1_output += 16; + buf += 16; + } + + return std::make_pair(result(error_code::SUCCESS, buf - start), + latin1_output); +} +/* end file src/westmere/sse_convert_utf32_to_latin1.cpp */ /* begin file 
src/westmere/sse_convert_utf32_to_utf8.cpp */ std::pair sse_convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) { const char32_t* end = buf + len; - const __m128i v_0000 = _mm_setzero_si128(); - const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800); - const __m128i v_c080 = _mm_set1_epi16((uint16_t)0xc080); - const __m128i v_ff80 = _mm_set1_epi16((uint16_t)0xff80); - const __m128i v_ffff0000 = _mm_set1_epi32((uint32_t)0xffff0000); - const __m128i v_7fffffff = _mm_set1_epi32((uint32_t)0x7fffffff); + const __m128i v_0000 = _mm_setzero_si128();//__m128 = 128 bits + const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800); //1111 1000 0000 0000 + const __m128i v_c080 = _mm_set1_epi16((uint16_t)0xc080); //1100 0000 1000 0000 + const __m128i v_ff80 = _mm_set1_epi16((uint16_t)0xff80); //1111 1111 1000 0000 + const __m128i v_ffff0000 = _mm_set1_epi32((uint32_t)0xffff0000); //1111 1111 1111 1111 0000 0000 0000 0000 + const __m128i v_7fffffff = _mm_set1_epi32((uint32_t)0x7fffffff); //0111 1111 1111 1111 1111 1111 1111 1111 __m128i running_max = _mm_setzero_si128(); __m128i forbidden_bytemask = _mm_setzero_si128(); const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92 - while (buf + 16 + safety_margin <= end) { + while (buf + 16 + safety_margin <= end) { //buf is a char32_t pointer, each char32_t has 4 bytes or 32 bits, thus buf + 16 * char_32t = 512 bits = 64 bytes // We load two 16 bytes registers for a total of 32 bytes or 16 characters. 
__m128i in = _mm_loadu_si128((__m128i*)buf); - __m128i nextin = _mm_loadu_si128((__m128i*)buf+1); - running_max = _mm_max_epu32(_mm_max_epu32(in, running_max), nextin); - - // Pack 32-bit UTF-32 words to 16-bit UTF-16 words with unsigned saturation - __m128i in_16 = _mm_packus_epi32(_mm_and_si128(in, v_7fffffff), _mm_and_si128(nextin, v_7fffffff)); + __m128i nextin = _mm_loadu_si128((__m128i*)buf+1);//These two values can hold only 8 UTF32 chars + running_max = _mm_max_epu32( + _mm_max_epu32(in, running_max), //take element-wise max char32_t from in and running_max vector + nextin); //and take element-wise max element from nextin and running_max vector + + // Pack 32-bit UTF-32 code units to 16-bit UTF-16 code units with unsigned saturation + __m128i in_16 = _mm_packus_epi32( + _mm_and_si128(in, v_7fffffff), + _mm_and_si128(nextin, v_7fffffff) + );//in this context pack the two __m128 into a single + //By ensuring the highest bit is set to 0(&v_7fffffff), we're making sure all values are interpreted as non-negative, or specifically, the values are within the range of valid Unicode code points. + //remember : having leading byte 0 means a positive number by the two complements system. Unicode is well beneath the range where you'll start getting issues so that's OK. // Try to apply UTF-16 => UTF-8 from ./sse_convert_utf16_to_utf8.cpp - // Check for ASCII fast path - if(_mm_testz_si128(in_16, v_ff80)) { // ASCII fast path!!!! + // Check for ASCII fast path + + // ASCII fast path!!!! // We eagerly load another 32 bytes, hoping that they will be ASCII too. // The intuition is that we try to collect 16 ASCII characters which requires // a total of 64 bytes of input. If we fail, we just pass thirdin and fourthin // as our new inputs. 
+ if(_mm_testz_si128(in_16, v_ff80)) { //if the first two blocks are ASCII __m128i thirdin = _mm_loadu_si128((__m128i*)buf+2); __m128i fourthin = _mm_loadu_si128((__m128i*)buf+3); - running_max = _mm_max_epu32(_mm_max_epu32(thirdin, running_max), fourthin); - __m128i nextin_16 = _mm_packus_epi32(_mm_and_si128(thirdin, v_7fffffff), _mm_and_si128(fourthin, v_7fffffff)); - if(!_mm_testz_si128(nextin_16, v_ff80)) { + running_max = _mm_max_epu32(_mm_max_epu32(thirdin, running_max), fourthin);//take the running max of all 4 vectors thus far + __m128i nextin_16 = _mm_packus_epi32(_mm_and_si128(thirdin, v_7fffffff), _mm_and_si128(fourthin, v_7fffffff));//pack into 1 vector, now you have two + if(!_mm_testz_si128(nextin_16, v_ff80)) { //checks if the second packed vector is ASCII, if not: // 1. pack the bytes // obviously suboptimal. - const __m128i utf8_packed = _mm_packus_epi16(in_16,in_16); + const __m128i utf8_packed = _mm_packus_epi16(in_16,in_16); //creates two copy of in_16 in 1 vector // 2. store (16 bytes) - _mm_storeu_si128((__m128i*)utf8_output, utf8_packed); + _mm_storeu_si128((__m128i*)utf8_output, utf8_packed); //put them into the output // 3. adjust pointers - buf += 8; - utf8_output += 8; + buf += 8; //the char32_t buffer pointer goes up 8 char32_t chars* 32 bits = 256 bits + utf8_output += 8; //same with output, e.g. lift the first two blocks alone. // Proceed with next input in_16 = nextin_16; // We need to update in and nextin because they are used later. 
@@ -26299,32 +30387,36 @@ std::pair sse_convert_utf32_to_utf8(const char32_t* buf, } } - // no bits set above 7th bit - const __m128i one_byte_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in_16, v_ff80), v_0000); - const uint16_t one_byte_bitmask = static_cast(_mm_movemask_epi8(one_byte_bytemask)); + // no bits set above 7th bit -- find out all the ASCII characters + const __m128i one_byte_bytemask = _mm_cmpeq_epi16( // this takes four bytes at a time and compares: + _mm_and_si128(in_16, v_ff80), // the vector that get only the first 9 bits of each 16-bit/2-byte units + v_0000 // + ); // they should be all zero if they are ASCII. E.g. ASCII in UTF32 is of format 0000 0000 0000 0XXX XXXX + // _mm_cmpeq_epi16 should now return a 1111 1111 1111 1111 for equals, and 0000 0000 0000 0000 if not for each 16-bit/2-byte units + const uint16_t one_byte_bitmask = static_cast(_mm_movemask_epi8(one_byte_bytemask)); // collect the MSB from previous vector and put them into uint16_t mas // no bits set above 11th bit const __m128i one_or_two_bytes_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_0000); const uint16_t one_or_two_bytes_bitmask = static_cast(_mm_movemask_epi8(one_or_two_bytes_bytemask)); if (one_or_two_bytes_bitmask == 0xffff) { - // case: all words either produce 1 or 2 UTF-8 bytes (at least one produces 2 bytes) + // case: all code units either produce 1 or 2 UTF-8 bytes (at least one produces 2 bytes) // 1. 
prepare 2-byte values // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 // expected output : [110a|aaaa|10bb|bbbb] x 8 - const __m128i v_1f00 = _mm_set1_epi16((int16_t)0x1f00); - const __m128i v_003f = _mm_set1_epi16((int16_t)0x003f); + const __m128i v_1f00 = _mm_set1_epi16((int16_t)0x1f00); // 0001 1111 0000 0000 + const __m128i v_003f = _mm_set1_epi16((int16_t)0x003f); // 0000 0000 0011 1111 // t0 = [000a|aaaa|bbbb|bb00] - const __m128i t0 = _mm_slli_epi16(in_16, 2); + const __m128i t0 = _mm_slli_epi16(in_16, 2); // shift packed vector by two // t1 = [000a|aaaa|0000|0000] - const __m128i t1 = _mm_and_si128(t0, v_1f00); + const __m128i t1 = _mm_and_si128(t0, v_1f00); // potentital first utf8 byte // t2 = [0000|0000|00bb|bbbb] - const __m128i t2 = _mm_and_si128(in_16, v_003f); + const __m128i t2 = _mm_and_si128(in_16, v_003f);// potential second utf8 byte // t3 = [000a|aaaa|00bb|bbbb] - const __m128i t3 = _mm_or_si128(t1, t2); + const __m128i t3 = _mm_or_si128(t1, t2); // first and second potential utf8 byte together // t4 = [110a|aaaa|10bb|bbbb] - const __m128i t4 = _mm_or_si128(t3, v_c080); + const __m128i t4 = _mm_or_si128(t3, v_c080); // t3 | 1100 0000 1000 0000 = full potential 2-byte utf8 unit // 2. 
merge ASCII and 2-byte codewords const __m128i utf8_unpacked = _mm_blendv_epi8(t4, in_16, one_byte_bytemask); @@ -26353,7 +30445,7 @@ std::pair sse_convert_utf32_to_utf8(const char32_t* buf, const __m128i saturation_bytemask = _mm_cmpeq_epi32(_mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000); const uint32_t saturation_bitmask = static_cast(_mm_movemask_epi8(saturation_bytemask)); if (saturation_bitmask == 0xffff) { - // case: words from register produce either 1, 2 or 3 UTF-8 bytes + // case: code units from register produce either 1, 2 or 3 UTF-8 bytes const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800); forbidden_bytemask = _mm_or_si128(forbidden_bytemask, _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_d800)); @@ -26365,7 +30457,7 @@ std::pair sse_convert_utf32_to_utf8(const char32_t* buf, 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two UTF-8 bytes 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes - We expand the input word (16-bit) into two words (32-bit), thus + We expand the input word (16-bit) into two code units (32-bit), thus we have room for four bytes. However, we need five distinct bit layouts. Note that the last byte in cases #2 and #3 is the same. @@ -26376,7 +30468,7 @@ std::pair sse_convert_utf32_to_utf8(const char32_t* buf, either byte 1 for case #2 or byte 2 for case #3. Note that they differ by exactly one bit. - Finally from these two words we build proper UTF-8 sequence, taking + Finally from these two code units we build proper UTF-8 sequence, taking into account the case (i.e, the number of bytes to write). */ /** @@ -26404,15 +30496,15 @@ std::pair sse_convert_utf32_to_utf8(const char32_t* buf, const __m128i s4 = _mm_xor_si128(s3, m0); #undef simdutf_vec - // 4. expand words 16-bit => 32-bit + // 4. expand code units 16-bit => 32-bit const __m128i out0 = _mm_unpacklo_epi16(t2, s4); const __m128i out1 = _mm_unpackhi_epi16(t2, s4); - // 5. 
compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle + // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle const uint16_t mask = (one_byte_bitmask & 0x5555) | (one_or_two_bytes_bitmask & 0xaaaa); if(mask == 0) { - // We only have three-byte words. Use fast path. + // We only have three-byte code units. Use fast path. const __m128i shuffle = _mm_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle); const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle); @@ -26511,7 +30603,7 @@ std::pair sse_convert_utf32_to_utf8_with_errors(const char32_t* b return std::make_pair(result(error_code::TOO_LARGE, buf - start), utf8_output); } - // Pack 32-bit UTF-32 words to 16-bit UTF-16 words with unsigned saturation + // Pack 32-bit UTF-32 code units to 16-bit UTF-16 code units with unsigned saturation __m128i in_16 = _mm_packus_epi32(_mm_and_si128(in, v_7fffffff), _mm_and_si128(nextin, v_7fffffff)); // Try to apply UTF-16 => UTF-8 from ./sse_convert_utf16_to_utf8.cpp @@ -26564,7 +30656,7 @@ std::pair sse_convert_utf32_to_utf8_with_errors(const char32_t* b const uint16_t one_or_two_bytes_bitmask = static_cast(_mm_movemask_epi8(one_or_two_bytes_bytemask)); if (one_or_two_bytes_bitmask == 0xffff) { - // case: all words either produce 1 or 2 UTF-8 bytes (at least one produces 2 bytes) + // case: all code units either produce 1 or 2 UTF-8 bytes (at least one produces 2 bytes) // 1. 
prepare 2-byte values // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 // expected output : [110a|aaaa|10bb|bbbb] x 8 @@ -26610,9 +30702,9 @@ std::pair sse_convert_utf32_to_utf8_with_errors(const char32_t* b const uint32_t saturation_bitmask = static_cast(_mm_movemask_epi8(saturation_bytemask)); if (saturation_bitmask == 0xffff) { - // case: words from register produce either 1, 2 or 3 UTF-8 bytes + // case: code units from register produce either 1, 2 or 3 UTF-8 bytes - // Check for illegal surrogate words + // Check for illegal surrogate code units const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800); const __m128i forbidden_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_d800); if (static_cast(_mm_movemask_epi8(forbidden_bytemask)) != 0) { @@ -26627,7 +30719,7 @@ std::pair sse_convert_utf32_to_utf8_with_errors(const char32_t* b 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two UTF-8 bytes 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes - We expand the input word (16-bit) into two words (32-bit), thus + We expand the input word (16-bit) into two code units (32-bit), thus we have room for four bytes. However, we need five distinct bit layouts. Note that the last byte in cases #2 and #3 is the same. @@ -26638,7 +30730,7 @@ std::pair sse_convert_utf32_to_utf8_with_errors(const char32_t* b either byte 1 for case #2 or byte 2 for case #3. Note that they differ by exactly one bit. - Finally from these two words we build proper UTF-8 sequence, taking + Finally from these two code units we build proper UTF-8 sequence, taking into account the case (i.e, the number of bytes to write). */ /** @@ -26666,321 +30758,709 @@ std::pair sse_convert_utf32_to_utf8_with_errors(const char32_t* b const __m128i s4 = _mm_xor_si128(s3, m0); #undef simdutf_vec - // 4. expand words 16-bit => 32-bit - const __m128i out0 = _mm_unpacklo_epi16(t2, s4); - const __m128i out1 = _mm_unpackhi_epi16(t2, s4); + // 4. 
expand code units 16-bit => 32-bit + const __m128i out0 = _mm_unpacklo_epi16(t2, s4); + const __m128i out1 = _mm_unpackhi_epi16(t2, s4); + + // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle + const uint16_t mask = (one_byte_bitmask & 0x5555) | + (one_or_two_bytes_bitmask & 0xaaaa); + if(mask == 0) { + // We only have three-byte code units. Use fast path. + const __m128i shuffle = _mm_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); + const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle); + const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle); + _mm_storeu_si128((__m128i*)utf8_output, utf8_0); + utf8_output += 12; + _mm_storeu_si128((__m128i*)utf8_output, utf8_1); + utf8_output += 12; + buf += 8; + continue; + } + const uint8_t mask0 = uint8_t(mask); + + const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; + const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1)); + const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0); + + const uint8_t mask1 = static_cast(mask >> 8); + + const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; + const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1)); + const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1); + + _mm_storeu_si128((__m128i*)utf8_output, utf8_0); + utf8_output += row0[0]; + _mm_storeu_si128((__m128i*)utf8_output, utf8_1); + utf8_output += row1[0]; + + buf += 8; + } else { + // case: at least one 32-bit word produce a surrogate pair in UTF-16 <=> will produce four UTF-8 bytes + // Let us do a scalar fallback. + // It may seem wasteful to use scalar code, but being efficient with SIMD + // in the presence of surrogate pairs may require non-trivial tables. 
+ size_t forward = 15; + size_t k = 0; + if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} + for(; k < forward; k++) { + uint32_t word = buf[k]; + if((word & 0xFFFFFF80)==0) { + *utf8_output++ = char(word); + } else if((word & 0xFFFFF800)==0) { + *utf8_output++ = char((word>>6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else if((word &0xFFFF0000 )==0) { + if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf8_output); } + *utf8_output++ = char((word>>12) | 0b11100000); + *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else { + if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf- start + k), utf8_output); } + *utf8_output++ = char((word>>18) | 0b11110000); + *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } + } + buf += k; + } + } // while + + return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output); +} +/* end file src/westmere/sse_convert_utf32_to_utf8.cpp */ +/* begin file src/westmere/sse_convert_utf32_to_utf16.cpp */ +template +std::pair sse_convert_utf32_to_utf16(const char32_t* buf, size_t len, char16_t* utf16_output) { + + const char32_t* end = buf + len; + + const __m128i v_0000 = _mm_setzero_si128(); + const __m128i v_ffff0000 = _mm_set1_epi32((int32_t)0xffff0000); + __m128i forbidden_bytemask = _mm_setzero_si128(); + + while (buf + 8 <= end) { + __m128i in = _mm_loadu_si128((__m128i*)buf); + __m128i nextin = _mm_loadu_si128((__m128i*)buf+1); + const __m128i saturation_bytemask = _mm_cmpeq_epi32(_mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000); + const uint32_t saturation_bitmask = static_cast(_mm_movemask_epi8(saturation_bytemask)); + + // Check if no bits set above 16th + if 
(saturation_bitmask == 0xffff) { + // Pack UTF-32 to UTF-16 + __m128i utf16_packed = _mm_packus_epi32(in, nextin); + + const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800); + const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800); + forbidden_bytemask = _mm_or_si128(forbidden_bytemask, _mm_cmpeq_epi16(_mm_and_si128(utf16_packed, v_f800), v_d800)); + + if (big_endian) { + const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + utf16_packed = _mm_shuffle_epi8(utf16_packed, swap); + } + + _mm_storeu_si128((__m128i*)utf16_output, utf16_packed); + utf16_output += 8; + buf += 8; + } else { + size_t forward = 7; + size_t k = 0; + if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} + for(; k < forward; k++) { + uint32_t word = buf[k]; + if((word & 0xFFFF0000)==0) { + // will not generate a surrogate pair + if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, utf16_output); } + *utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word); + } else { + // will generate a surrogate pair + if (word > 0x10FFFF) { return std::make_pair(nullptr, utf16_output); } + word -= 0x10000; + uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); + uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); + if (big_endian) { + high_surrogate = uint16_t((high_surrogate >> 8) | (high_surrogate << 8)); + low_surrogate = uint16_t((low_surrogate >> 8) | (low_surrogate << 8)); + } + *utf16_output++ = char16_t(high_surrogate); + *utf16_output++ = char16_t(low_surrogate); + } + } + buf += k; + } + } + + // check for invalid input + if (static_cast(_mm_movemask_epi8(forbidden_bytemask)) != 0) { return std::make_pair(nullptr, utf16_output); } + + return std::make_pair(buf, utf16_output); +} - // 5. 
compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle - const uint16_t mask = (one_byte_bitmask & 0x5555) | - (one_or_two_bytes_bitmask & 0xaaaa); - if(mask == 0) { - // We only have three-byte words. Use fast path. - const __m128i shuffle = _mm_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); - const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle); - const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle); - _mm_storeu_si128((__m128i*)utf8_output, utf8_0); - utf8_output += 12; - _mm_storeu_si128((__m128i*)utf8_output, utf8_1); - utf8_output += 12; - buf += 8; - continue; - } - const uint8_t mask0 = uint8_t(mask); - const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; - const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1)); - const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0); +template +std::pair sse_convert_utf32_to_utf16_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) { + const char32_t* start = buf; + const char32_t* end = buf + len; - const uint8_t mask1 = static_cast(mask >> 8); + const __m128i v_0000 = _mm_setzero_si128(); + const __m128i v_ffff0000 = _mm_set1_epi32((int32_t)0xffff0000); - const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; - const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1)); - const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1); + while (buf + 8 <= end) { + __m128i in = _mm_loadu_si128((__m128i*)buf); + __m128i nextin = _mm_loadu_si128((__m128i*)buf+1); + const __m128i saturation_bytemask = _mm_cmpeq_epi32(_mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000); + const uint32_t saturation_bitmask = static_cast(_mm_movemask_epi8(saturation_bytemask)); - _mm_storeu_si128((__m128i*)utf8_output, utf8_0); - utf8_output += row0[0]; - _mm_storeu_si128((__m128i*)utf8_output, utf8_1); - utf8_output += row1[0]; + // Check if no bits set above 16th + if (saturation_bitmask == 0xffff) { + // Pack UTF-32 to UTF-16 + 
__m128i utf16_packed = _mm_packus_epi32(in, nextin); + + const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800); + const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800); + const __m128i forbidden_bytemask = _mm_cmpeq_epi16(_mm_and_si128(utf16_packed, v_f800), v_d800); + if (static_cast(_mm_movemask_epi8(forbidden_bytemask)) != 0) { + return std::make_pair(result(error_code::SURROGATE, buf - start), utf16_output); + } + + if (big_endian) { + const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + utf16_packed = _mm_shuffle_epi8(utf16_packed, swap); + } + _mm_storeu_si128((__m128i*)utf16_output, utf16_packed); + utf16_output += 8; buf += 8; } else { - // case: at least one 32-bit word produce a surrogate pair in UTF-16 <=> will produce four UTF-8 bytes - // Let us do a scalar fallback. - // It may seem wasteful to use scalar code, but being efficient with SIMD - // in the presence of surrogate pairs may require non-trivial tables. - size_t forward = 15; + size_t forward = 7; size_t k = 0; if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} for(; k < forward; k++) { uint32_t word = buf[k]; - if((word & 0xFFFFFF80)==0) { - *utf8_output++ = char(word); - } else if((word & 0xFFFFF800)==0) { - *utf8_output++ = char((word>>6) | 0b11000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - } else if((word &0xFFFF0000 )==0) { - if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf8_output); } - *utf8_output++ = char((word>>12) | 0b11100000); - *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); + if((word & 0xFFFF0000)==0) { + // will not generate a surrogate pair + if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf16_output); } + *utf16_output++ = big_endian ? 
char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word); } else { - if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf- start + k), utf8_output); } - *utf8_output++ = char((word>>18) | 0b11110000); - *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000); - *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); + // will generate a surrogate pair + if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), utf16_output); } + word -= 0x10000; + uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); + uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); + if (big_endian) { + high_surrogate = uint16_t((high_surrogate >> 8) | (high_surrogate << 8)); + low_surrogate = uint16_t((low_surrogate >> 8) | (low_surrogate << 8)); + } + *utf16_output++ = char16_t(high_surrogate); + *utf16_output++ = char16_t(low_surrogate); } } buf += k; } - } // while + } - return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output); + return std::make_pair(result(error_code::SUCCESS, buf - start), utf16_output); } -/* end file src/westmere/sse_convert_utf32_to_utf8.cpp */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=westmere/sse_convert_utf32_to_utf16.cpp -/* begin file src/westmere/sse_convert_utf32_to_utf16.cpp */ -template -std::pair sse_convert_utf32_to_utf16(const char32_t* buf, size_t len, char16_t* utf16_output) { +/* end file src/westmere/sse_convert_utf32_to_utf16.cpp */ + +} // unnamed namespace +} // namespace westmere +} // namespace simdutf + +/* begin file src/generic/buf_block_reader.h */ +namespace simdutf { +namespace westmere { +namespace { + +// Walks through a buffer in block-sized increments, loading the last part with spaces +template +struct buf_block_reader { +public: + simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len); + 
simdutf_really_inline size_t block_index(); + simdutf_really_inline bool has_full_block() const; + simdutf_really_inline const uint8_t *full_block() const; + /** + * Get the last block, padded with spaces. + * + * There will always be a last block, with at least 1 byte, unless len == 0 (in which case this + * function fills the buffer with spaces and returns 0. In particular, if len == STEP_SIZE there + * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding. + * + * @return the number of effective characters in the last block. + */ + simdutf_really_inline size_t get_remainder(uint8_t *dst) const; + simdutf_really_inline void advance(); +private: + const uint8_t *buf; + const size_t len; + const size_t lenminusstep; + size_t idx; +}; + +// Routines to print masks and text for debugging bitmask operations +simdutf_unused static char * format_input_text_64(const uint8_t *text) { + static char *buf = reinterpret_cast(malloc(sizeof(simd8x64) + 1)); + for (size_t i=0; i); i++) { + buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]); + } + buf[sizeof(simd8x64)] = '\0'; + return buf; +} + +// Routines to print masks and text for debugging bitmask operations +simdutf_unused static char * format_input_text(const simd8x64& in) { + static char *buf = reinterpret_cast(malloc(sizeof(simd8x64) + 1)); + in.store(reinterpret_cast(buf)); + for (size_t i=0; i); i++) { + if (buf[i] < ' ') { buf[i] = '_'; } + } + buf[sizeof(simd8x64)] = '\0'; + return buf; +} + +simdutf_unused static char * format_mask(uint64_t mask) { + static char *buf = reinterpret_cast(malloc(64 + 1)); + for (size_t i=0; i<64; i++) { + buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' '; + } + buf[64] = '\0'; + return buf; +} + +template +simdutf_really_inline buf_block_reader::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 
0 : len - STEP_SIZE}, idx{0} {} + +template +simdutf_really_inline size_t buf_block_reader::block_index() { return idx; } + +template +simdutf_really_inline bool buf_block_reader::has_full_block() const { + return idx < lenminusstep; +} + +template +simdutf_really_inline const uint8_t *buf_block_reader::full_block() const { + return &buf[idx]; +} + +template +simdutf_really_inline size_t buf_block_reader::get_remainder(uint8_t *dst) const { + if(len == idx) { return 0; } // memcpy(dst, null, 0) will trigger an error with some sanitizers + std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once. + std::memcpy(dst, buf + idx, len - idx); + return len - idx; +} + +template +simdutf_really_inline void buf_block_reader::advance() { + idx += STEP_SIZE; +} + +} // unnamed namespace +} // namespace westmere +} // namespace simdutf +/* end file src/generic/buf_block_reader.h */ +/* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */ +namespace simdutf { +namespace westmere { +namespace { +namespace utf8_validation { + +using namespace simd; + + simdutf_really_inline simd8 check_special_cases(const simd8 input, const simd8 prev1) { +// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) +// Bit 1 = Too Long (ASCII followed by continuation) +// Bit 2 = Overlong 3-byte +// Bit 4 = Surrogate +// Bit 5 = Overlong 2-byte +// Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1<<0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1<<1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1<<2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1<<4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1<<5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1<<7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1<<3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ 
+ // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1<<6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1<<6; // 11110000 1000____ + + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + TOO_SHORT, + // 1110____ ________ + TOO_SHORT | OVERLONG_3 | SURROGATE, + // 1111____ ________ + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4 + ); + constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . + const simd8 byte_1_low = (prev1 & 0x0F).lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, + CARRY, - const char32_t* end = buf + len; + // ____0100 ________ + CARRY | TOO_LARGE, + // ____0101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____011_ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, - const __m128i v_0000 = _mm_setzero_si128(); - const __m128i v_ffff0000 = _mm_set1_epi32((int32_t)0xffff0000); - __m128i forbidden_bytemask = _mm_setzero_si128(); + // ____1___ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____1101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000 + ); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, - while (buf 
+ 8 <= end) { - __m128i in = _mm_loadu_si128((__m128i*)buf); - __m128i nextin = _mm_loadu_si128((__m128i*)buf+1); - const __m128i saturation_bytemask = _mm_cmpeq_epi32(_mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000); - const uint32_t saturation_bitmask = static_cast(_mm_movemask_epi8(saturation_bytemask)); + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, - // Check if no bits set above 16th - if (saturation_bitmask == 0xffff) { - // Pack UTF-32 to UTF-16 - __m128i utf16_packed = _mm_packus_epi32(in, nextin); + // ________ 11______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT + ); + return (byte_1_high & byte_1_low & byte_2_high); + } + simdutf_really_inline simd8 check_multibyte_lengths(const simd8 input, + const simd8 prev_input, const simd8 sc) { + simd8 prev2 = input.prev<2>(prev_input); + simd8 prev3 = input.prev<3>(prev_input); + simd8 must23 = simd8(must_be_2_3_continuation(prev2, prev3)); + simd8 must23_80 = must23 & uint8_t(0x80); + return must23_80 ^ sc; + } - const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800); - const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800); - forbidden_bytemask = _mm_or_si128(forbidden_bytemask, _mm_cmpeq_epi16(_mm_and_si128(utf16_packed, v_f800), v_d800)); + // + // Return nonzero if there are incomplete multibyte characters at the end of the block: + // e.g. if there is a 4-byte character, but it's 3 bytes from the end. + // + simdutf_really_inline simd8 is_incomplete(const simd8 input) { + // If the previous input's last 3 bytes match this, they're too short (they ended at EOF): + // ... 
1111____ 111_____ 11______ + static const uint8_t max_array[32] = { + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 0b11110000u-1, 0b11100000u-1, 0b11000000u-1 + }; + const simd8 max_value(&max_array[sizeof(max_array)-sizeof(simd8)]); + return input.gt_bits(max_value); + } - if (big_endian) { - const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); - utf16_packed = _mm_shuffle_epi8(utf16_packed, swap); - } + struct utf8_checker { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; + // The last input we received + simd8 prev_input_block; + // Whether the last input we received was incomplete (used for ASCII fast path) + simd8 prev_incomplete; - _mm_storeu_si128((__m128i*)utf16_output, utf16_packed); - utf16_output += 8; - buf += 8; - } else { - size_t forward = 7; - size_t k = 0; - if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} - for(; k < forward; k++) { - uint32_t word = buf[k]; - if((word & 0xFFFF0000)==0) { - // will not generate a surrogate pair - if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, utf16_output); } - *utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word); - } else { - // will generate a surrogate pair - if (word > 0x10FFFF) { return std::make_pair(nullptr, utf16_output); } - word -= 0x10000; - uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); - uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); - if (big_endian) { - high_surrogate = uint16_t((high_surrogate >> 8) | (high_surrogate << 8)); - low_surrogate = uint16_t((low_surrogate >> 8) | (low_surrogate << 8)); - } - *utf16_output++ = char16_t(high_surrogate); - *utf16_output++ = char16_t(low_surrogate); - } - } - buf += k; + // + // Check whether the current bytes are valid UTF-8. 
+ // + simdutf_really_inline void check_utf8_bytes(const simd8 input, const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes + // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + simd8 sc = check_special_cases(input, prev1); + this->error |= check_multibyte_lengths(input, prev_input, sc); } - } - // check for invalid input - if (static_cast(_mm_movemask_epi8(forbidden_bytemask)) != 0) { return std::make_pair(nullptr, utf16_output); } + // The only problem that can happen at EOF is that a multibyte character is too short + // or a byte value too large in the last bytes: check_special_cases only checks for bytes + // too large in the first of two bytes. + simdutf_really_inline void check_eof() { + // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't + // possibly finish them. + this->error |= this->prev_incomplete; + } - return std::make_pair(buf, utf16_output); -} + simdutf_really_inline void check_next_input(const simd8x64& input) { + if(simdutf_likely(is_ascii(input))) { + this->error |= this->prev_incomplete; + } else { + // you might think that a for-loop would work, but under Visual Studio, it is not good enough. 
+ static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + if(simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], this->prev_input_block); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if(simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], this->prev_input_block); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + this->prev_incomplete = is_incomplete(input.chunks[simd8x64::NUM_CHUNKS-1]); + this->prev_input_block = input.chunks[simd8x64::NUM_CHUNKS-1]; + } + } -template -std::pair sse_convert_utf32_to_utf16_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) { - const char32_t* start = buf; - const char32_t* end = buf + len; + // do not forget to call check_eof! + simdutf_really_inline bool errors() const { + return this->error.any_bits_set_anywhere(); + } - const __m128i v_0000 = _mm_setzero_si128(); - const __m128i v_ffff0000 = _mm_set1_epi32((int32_t)0xffff0000); + }; // struct utf8_checker +} // namespace utf8_validation - while (buf + 8 <= end) { - __m128i in = _mm_loadu_si128((__m128i*)buf); - __m128i nextin = _mm_loadu_si128((__m128i*)buf+1); - const __m128i saturation_bytemask = _mm_cmpeq_epi32(_mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000); - const uint32_t saturation_bitmask = static_cast(_mm_movemask_epi8(saturation_bytemask)); +using utf8_validation::utf8_checker; - // Check if no bits set above 16th - if (saturation_bitmask == 0xffff) { - // Pack UTF-32 to UTF-16 - __m128i utf16_packed = _mm_packus_epi32(in, nextin); +} // unnamed namespace +} // namespace westmere +} // namespace simdutf +/* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */ +/* begin file src/generic/utf8_validation/utf8_validator.h */ +namespace simdutf { +namespace 
westmere { +namespace { +namespace utf8_validation { - const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800); - const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800); - const __m128i forbidden_bytemask = _mm_cmpeq_epi16(_mm_and_si128(utf16_packed, v_f800), v_d800); - if (static_cast(_mm_movemask_epi8(forbidden_bytemask)) != 0) { - return std::make_pair(result(error_code::SURROGATE, buf - start), utf16_output); - } +/** + * Validates that the string is actual UTF-8. + */ +template +bool generic_validate_utf8(const uint8_t * input, size_t length) { + checker c{}; + buf_block_reader<64> reader(input, length); + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + c.check_next_input(in); + reader.advance(); + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + c.check_next_input(in); + reader.advance(); + c.check_eof(); + return !c.errors(); +} - if (big_endian) { - const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); - utf16_packed = _mm_shuffle_epi8(utf16_packed, swap); - } +bool generic_validate_utf8(const char * input, size_t length) { + return generic_validate_utf8(reinterpret_cast(input),length); +} - _mm_storeu_si128((__m128i*)utf16_output, utf16_packed); - utf16_output += 8; - buf += 8; - } else { - size_t forward = 7; - size_t k = 0; - if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} - for(; k < forward; k++) { - uint32_t word = buf[k]; - if((word & 0xFFFF0000)==0) { - // will not generate a surrogate pair - if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf16_output); } - *utf16_output++ = big_endian ? 
char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word); - } else { - // will generate a surrogate pair - if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), utf16_output); } - word -= 0x10000; - uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); - uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); - if (big_endian) { - high_surrogate = uint16_t((high_surrogate >> 8) | (high_surrogate << 8)); - low_surrogate = uint16_t((low_surrogate >> 8) | (low_surrogate << 8)); - } - *utf16_output++ = char16_t(high_surrogate); - *utf16_output++ = char16_t(low_surrogate); - } +/** + * Validates that the string is actual UTF-8 and stops on errors. + */ +template +result generic_validate_utf8_with_errors(const uint8_t * input, size_t length) { + checker c{}; + buf_block_reader<64> reader(input, length); + size_t count{0}; + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + c.check_next_input(in); + if(c.errors()) { + if (count != 0) { count--; } // Sometimes the error is only detected in the next chunk + result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast(input), reinterpret_cast(input + count), length - count); + res.count += count; + return res; } - buf += k; + reader.advance(); + count += 64; } - } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + c.check_next_input(in); + reader.advance(); + c.check_eof(); + if (c.errors()) { + if (count != 0) { count--; } // Sometimes the error is only detected in the next chunk + result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast(input), reinterpret_cast(input) + count, length - count); + res.count += count; + return res; + } else { + return result(error_code::SUCCESS, length); + } +} - return std::make_pair(result(error_code::SUCCESS, buf - start), utf16_output); +result generic_validate_utf8_with_errors(const char * input, size_t length) { + return 
generic_validate_utf8_with_errors(reinterpret_cast(input),length); } -/* end file src/westmere/sse_convert_utf32_to_utf16.cpp */ -} // unnamed namespace -} // namespace westmere -} // namespace simdutf +template +bool generic_validate_ascii(const uint8_t * input, size_t length) { + buf_block_reader<64> reader(input, length); + uint8_t blocks[64]{}; + simd::simd8x64 running_or(blocks); + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + running_or |= in; + reader.advance(); + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + running_or |= in; + return running_or.is_ascii(); +} -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/buf_block_reader.h -/* begin file src/generic/buf_block_reader.h */ -namespace simdutf { -namespace westmere { -namespace { +bool generic_validate_ascii(const char * input, size_t length) { + return generic_validate_ascii(reinterpret_cast(input),length); +} -// Walks through a buffer in block-sized increments, loading the last part with spaces -template -struct buf_block_reader { -public: - simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len); - simdutf_really_inline size_t block_index(); - simdutf_really_inline bool has_full_block() const; - simdutf_really_inline const uint8_t *full_block() const; - /** - * Get the last block, padded with spaces. - * - * There will always be a last block, with at least 1 byte, unless len == 0 (in which case this - * function fills the buffer with spaces and returns 0. In particular, if len == STEP_SIZE there - * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding. - * - * @return the number of effective characters in the last block. 
- */ - simdutf_really_inline size_t get_remainder(uint8_t *dst) const; - simdutf_really_inline void advance(); -private: - const uint8_t *buf; - const size_t len; - const size_t lenminusstep; - size_t idx; -}; +template +result generic_validate_ascii_with_errors(const uint8_t * input, size_t length) { + buf_block_reader<64> reader(input, length); + size_t count{0}; + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + if (!in.is_ascii()) { + result res = scalar::ascii::validate_with_errors(reinterpret_cast(input + count), length - count); + return result(res.error, count + res.count); + } + reader.advance(); -// Routines to print masks and text for debugging bitmask operations -simdutf_unused static char * format_input_text_64(const uint8_t *text) { - static char *buf = reinterpret_cast(malloc(sizeof(simd8x64) + 1)); - for (size_t i=0; i); i++) { - buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]); + count += 64; } - buf[sizeof(simd8x64)] = '\0'; - return buf; -} - -// Routines to print masks and text for debugging bitmask operations -simdutf_unused static char * format_input_text(const simd8x64& in) { - static char *buf = reinterpret_cast(malloc(sizeof(simd8x64) + 1)); - in.store(reinterpret_cast(buf)); - for (size_t i=0; i); i++) { - if (buf[i] < ' ') { buf[i] = '_'; } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + if (!in.is_ascii()) { + result res = scalar::ascii::validate_with_errors(reinterpret_cast(input + count), length - count); + return result(res.error, count + res.count); + } else { + return result(error_code::SUCCESS, length); } - buf[sizeof(simd8x64)] = '\0'; - return buf; } -simdutf_unused static char * format_mask(uint64_t mask) { - static char *buf = reinterpret_cast(malloc(64 + 1)); - for (size_t i=0; i<64; i++) { - buf[i] = (mask & (size_t(1) << i)) ? 
'X' : ' '; - } - buf[64] = '\0'; - return buf; +result generic_validate_ascii_with_errors(const char * input, size_t length) { + return generic_validate_ascii_with_errors(reinterpret_cast(input),length); } -template -simdutf_really_inline buf_block_reader::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {} - -template -simdutf_really_inline size_t buf_block_reader::block_index() { return idx; } +} // namespace utf8_validation +} // unnamed namespace +} // namespace westmere +} // namespace simdutf +/* end file src/generic/utf8_validation/utf8_validator.h */ +// transcoding from UTF-8 to UTF-16 +/* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */ -template -simdutf_really_inline bool buf_block_reader::has_full_block() const { - return idx < lenminusstep; -} -template -simdutf_really_inline const uint8_t *buf_block_reader::full_block() const { - return &buf[idx]; -} +namespace simdutf { +namespace westmere { +namespace { +namespace utf8_to_utf16 { -template -simdutf_really_inline size_t buf_block_reader::get_remainder(uint8_t *dst) const { - if(len == idx) { return 0; } // memcpy(dst, null, 0) will trigger an error with some sanitizers - std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once. - std::memcpy(dst, buf + idx, len - idx); - return len - idx; -} +using namespace simd; -template -simdutf_really_inline void buf_block_reader::advance() { - idx += STEP_SIZE; +template +simdutf_warn_unused size_t convert_valid(const char* input, size_t size, + char16_t* utf16_output) noexcept { + // The implementation is not specific to haswell and should be moved to the generic directory. + size_t pos = 0; + char16_t* start{utf16_output}; + const size_t safety_margin = 16; // to avoid overruns! + while(pos + 64 + safety_margin <= size) { + // this loop could be unrolled further. 
For example, we could process the mask + // far more than 64 bytes. + simd8x64 in(reinterpret_cast(input + pos)); + if(in.is_ascii()) { + in.store_ascii_as_utf16(utf16_output); + utf16_output += 64; + pos += 64; + } else { + // Slow path. We hope that the compiler will recognize that this is a slow path. + // Anything that is not a continuation mask is a 'leading byte', that is, the + // start of a new code point. + uint64_t utf8_continuation_mask = in.lt(-65 + 1); + // -65 is 0b10111111 in two-complement's, so largest possible continuation byte + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + // The *start* of code points is not so useful, rather, we want the *end* of code points. + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times when using solely + // the slow/regular path, and at least four times if there are fast paths. + while(pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + // + // Thus we may allow convert_masked_utf8_to_utf16 to process + // more bytes at a time under a fast-path mode where 16 bytes + // are consumed at once (e.g., when encountering ASCII). 
+ size_t consumed = convert_masked_utf8_to_utf16(input + pos, + utf8_end_of_code_point_mask, utf16_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + utf16_output += scalar::utf8_to_utf16::convert_valid(input + pos, size - pos, utf16_output); + return utf16_output - start; } +} // namespace utf8_to_utf16 } // unnamed namespace } // namespace westmere } // namespace simdutf -/* end file src/generic/buf_block_reader.h */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_validation/utf8_lookup4_algorithm.h -/* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */ +/* end file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */ +/* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */ + + namespace simdutf { namespace westmere { namespace { -namespace utf8_validation { - +namespace utf8_to_utf16 { using namespace simd; + simdutf_really_inline simd8 check_special_cases(const simd8 input, const simd8 prev1) { // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) // Bit 1 = Too Long (ASCII followed by continuation) @@ -27074,37 +31554,18 @@ using namespace simd; simdutf_really_inline simd8 check_multibyte_lengths(const simd8 input, const simd8 prev_input, const simd8 sc) { simd8 prev2 = input.prev<2>(prev_input); - simd8 prev3 = input.prev<3>(prev_input); - simd8 must23 = simd8(must_be_2_3_continuation(prev2, prev3)); - simd8 must23_80 = must23 & uint8_t(0x80); - return must23_80 ^ sc; - } - - // - // Return nonzero if there are incomplete multibyte characters at the end of the block: - // e.g. if there is a 4-byte character, but it's 3 bytes from the end. 
- // - simdutf_really_inline simd8 is_incomplete(const simd8 input) { - // If the previous input's last 3 bytes match this, they're too short (they ended at EOF): - // ... 1111____ 111_____ 11______ - static const uint8_t max_array[32] = { - 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 0b11110000u-1, 0b11100000u-1, 0b11000000u-1 - }; - const simd8 max_value(&max_array[sizeof(max_array)-sizeof(simd8)]); - return input.gt_bits(max_value); + simd8 prev3 = input.prev<3>(prev_input); + simd8 must23 = simd8(must_be_2_3_continuation(prev2, prev3)); + simd8 must23_80 = must23 & uint8_t(0x80); + return must23_80 ^ sc; } - struct utf8_checker { + + struct validating_transcoder { // If this is nonzero, there has been a UTF-8 error. simd8 error; - // The last input we received - simd8 prev_input_block; - // Whether the last input we received was incomplete (used for ASCII fast path) - simd8 prev_incomplete; + validating_transcoder() : error(uint8_t(0)) {} // // Check whether the current bytes are valid UTF-8. // @@ -27116,262 +31577,239 @@ using namespace simd; this->error |= check_multibyte_lengths(input, prev_input, sc); } - // The only problem that can happen at EOF is that a multibyte character is too short - // or a byte value too large in the last bytes: check_special_cases only checks for bytes - // too large in the first of two bytes. - simdutf_really_inline void check_eof() { - // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't - // possibly finish them. - this->error |= this->prev_incomplete; - } - simdutf_really_inline void check_next_input(const simd8x64& input) { - if(simdutf_likely(is_ascii(input))) { - this->error |= this->prev_incomplete; - } else { - // you might think that a for-loop would work, but under Visual Studio, it is not good enough. 
- static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), - "We support either two or four chunks per 64-byte block."); - if(simd8x64::NUM_CHUNKS == 2) { - this->check_utf8_bytes(input.chunks[0], this->prev_input_block); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - } else if(simd8x64::NUM_CHUNKS == 4) { - this->check_utf8_bytes(input.chunks[0], this->prev_input_block); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - this->check_utf8_bytes(input.chunks[2], input.chunks[1]); - this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + template + simdutf_really_inline size_t convert(const char* in, size_t size, char16_t* utf16_output) { + size_t pos = 0; + char16_t* start{utf16_output}; + // In the worst case, we have the haswell kernel which can cause an overflow of + // 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the last 16 bytes, + // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate + // much more than 8 bytes. However, you cannot generally assume that you have valid + // UTF-8 input, so we are going to go back from the end counting 8 leading bytes, + // to give us a good margin. + size_t leading_byte = 0; + size_t margin = size; + for(; margin > 0 && leading_byte < 8; margin--) { + leading_byte += (int8_t(in[margin-1]) > -65); + } + // If the input is long enough, then we have that margin-1 is the eight last leading byte. + const size_t safety_margin = size - margin + 1; // to avoid overruns! + while(pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if(input.is_ascii()) { + input.store_ascii_as_utf16(utf16_output); + utf16_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, it is not good enough. 
+ static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if(simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if(simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while(pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_utf16(in + pos, + utf8_end_of_code_point_mask, utf16_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). 
In practice we expect an + // 85% to 90% efficiency. } - this->prev_incomplete = is_incomplete(input.chunks[simd8x64::NUM_CHUNKS-1]); - this->prev_input_block = input.chunks[simd8x64::NUM_CHUNKS-1]; - } + if(errors()) { return 0; } + if(pos < size) { + size_t howmany = scalar::utf8_to_utf16::convert(in + pos, size - pos, utf16_output); + if(howmany == 0) { return 0; } + utf16_output += howmany; + } + return utf16_output - start; } - // do not forget to call check_eof! - simdutf_really_inline bool errors() const { - return this->error.any_bits_set_anywhere(); - } - - }; // struct utf8_checker -} // namespace utf8_validation - -using utf8_validation::utf8_checker; - -} // unnamed namespace -} // namespace westmere -} // namespace simdutf -/* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_validation/utf8_validator.h -/* begin file src/generic/utf8_validation/utf8_validator.h */ -namespace simdutf { -namespace westmere { -namespace { -namespace utf8_validation { - -/** - * Validates that the string is actual UTF-8. - */ -template -bool generic_validate_utf8(const uint8_t * input, size_t length) { - checker c{}; - buf_block_reader<64> reader(input, length); - while (reader.has_full_block()) { - simd::simd8x64 in(reader.full_block()); - c.check_next_input(in); - reader.advance(); - } - uint8_t block[64]{}; - reader.get_remainder(block); - simd::simd8x64 in(block); - c.check_next_input(in); - reader.advance(); - c.check_eof(); - return !c.errors(); -} - -bool generic_validate_utf8(const char * input, size_t length) { - return generic_validate_utf8(reinterpret_cast(input),length); -} - -/** - * Validates that the string is actual UTF-8 and stops on errors. 
- */ -template -result generic_validate_utf8_with_errors(const uint8_t * input, size_t length) { - checker c{}; - buf_block_reader<64> reader(input, length); - size_t count{0}; - while (reader.has_full_block()) { - simd::simd8x64 in(reader.full_block()); - c.check_next_input(in); - if(c.errors()) { - if (count != 0) { count--; } // Sometimes the error is only detected in the next chunk - result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast(input), reinterpret_cast(input + count), length - count); - res.count += count; + template + simdutf_really_inline result convert_with_errors(const char* in, size_t size, char16_t* utf16_output) { + size_t pos = 0; + char16_t* start{utf16_output}; + // In the worst case, we have the haswell kernel which can cause an overflow of + // 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the last 16 bytes, + // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate + // much more than 8 bytes. However, you cannot generally assume that you have valid + // UTF-8 input, so we are going to go back from the end counting 8 leading bytes, + // to give us a good margin. + size_t leading_byte = 0; + size_t margin = size; + for(; margin > 0 && leading_byte < 8; margin--) { + leading_byte += (int8_t(in[margin-1]) > -65); + } + // If the input is long enough, then we have that margin-1 is the eight last leading byte. + const size_t safety_margin = size - margin + 1; // to avoid overruns! + while(pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if(input.is_ascii()) { + input.store_ascii_as_utf16(utf16_output); + utf16_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, it is not good enough. 
+ static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if(simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if(simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + if (errors()) { + // rewind_and_convert_with_errors will seek a potential error from in+pos onward, + // with the ability to go back up to pos bytes, and read size-pos bytes forward. + result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf16_output); + res.count += pos; + return res; + } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while(pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. 
+ size_t consumed = convert_masked_utf8_to_utf16(in + pos, + utf8_end_of_code_point_mask, utf16_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + if(errors()) { + // rewind_and_convert_with_errors will seek a potential error from in+pos onward, + // with the ability to go back up to pos bytes, and read size-pos bytes forward. + result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf16_output); + res.count += pos; return res; } - reader.advance(); - count += 64; - } - uint8_t block[64]{}; - reader.get_remainder(block); - simd::simd8x64 in(block); - c.check_next_input(in); - reader.advance(); - c.check_eof(); - if (c.errors()) { - if (count != 0) { count--; } // Sometimes the error is only detected in the next chunk - result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast(input), reinterpret_cast(input) + count, length - count); - res.count += count; - return res; - } else { - return result(error_code::SUCCESS, length); - } -} - -result generic_validate_utf8_with_errors(const char * input, size_t length) { - return generic_validate_utf8_with_errors(reinterpret_cast(input),length); -} - -template -bool generic_validate_ascii(const uint8_t * input, size_t length) { - buf_block_reader<64> reader(input, length); - uint8_t blocks[64]{}; - simd::simd8x64 running_or(blocks); - while (reader.has_full_block()) { - simd::simd8x64 in(reader.full_block()); - running_or |= in; - reader.advance(); + if(pos < size) { + // rewind_and_convert_with_errors will seek a potential error from in+pos onward, + // with the ability to go back up to pos bytes, and read size-pos bytes forward. 
+ result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf16_output); + if (res.error) { // In case of error, we want the error position + res.count += pos; + return res; + } else { // In case of success, we want the number of word written + utf16_output += res.count; + } + } + return result(error_code::SUCCESS, utf16_output - start); } - uint8_t block[64]{}; - reader.get_remainder(block); - simd::simd8x64 in(block); - running_or |= in; - return running_or.is_ascii(); -} - -bool generic_validate_ascii(const char * input, size_t length) { - return generic_validate_ascii(reinterpret_cast(input),length); -} -template -result generic_validate_ascii_with_errors(const uint8_t * input, size_t length) { - buf_block_reader<64> reader(input, length); - size_t count{0}; - while (reader.has_full_block()) { - simd::simd8x64 in(reader.full_block()); - if (!in.is_ascii()) { - result res = scalar::ascii::validate_with_errors(reinterpret_cast(input + count), length - count); - return result(res.error, count + res.count); + simdutf_really_inline bool errors() const { + return this->error.any_bits_set_anywhere(); } - reader.advance(); - - count += 64; - } - uint8_t block[64]{}; - reader.get_remainder(block); - simd::simd8x64 in(block); - if (!in.is_ascii()) { - result res = scalar::ascii::validate_with_errors(reinterpret_cast(input + count), length - count); - return result(res.error, count + res.count); - } else { - return result(error_code::SUCCESS, length); - } -} - -result generic_validate_ascii_with_errors(const char * input, size_t length) { - return generic_validate_ascii_with_errors(reinterpret_cast(input),length); -} -} // namespace utf8_validation + }; // struct utf8_checker +} // utf8_to_utf16 namespace } // unnamed namespace } // namespace westmere } // namespace simdutf -/* end file src/generic/utf8_validation/utf8_validator.h */ -// transcoding from UTF-8 to UTF-16 -// dofile: invoked with 
prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf16/valid_utf8_to_utf16.h -/* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */ - +/* end file src/generic/utf8_to_utf16/utf8_to_utf16.h */ +// transcoding from UTF-8 to UTF-32 +/* begin file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */ namespace simdutf { namespace westmere { namespace { -namespace utf8_to_utf16 { +namespace utf8_to_utf32 { using namespace simd; -template + simdutf_warn_unused size_t convert_valid(const char* input, size_t size, - char16_t* utf16_output) noexcept { - // The implementation is not specific to haswell and should be moved to the generic directory. + char32_t* utf32_output) noexcept { size_t pos = 0; - char16_t* start{utf16_output}; + char32_t* start{utf32_output}; const size_t safety_margin = 16; // to avoid overruns! while(pos + 64 + safety_margin <= size) { - // this loop could be unrolled further. For example, we could process the mask - // far more than 64 bytes. simd8x64 in(reinterpret_cast(input + pos)); if(in.is_ascii()) { - in.store_ascii_as_utf16(utf16_output); - utf16_output += 64; + in.store_ascii_as_utf32(utf32_output); + utf32_output += 64; pos += 64; } else { - // Slow path. We hope that the compiler will recognize that this is a slow path. - // Anything that is not a continuation mask is a 'leading byte', that is, the - // start of a new code point. - uint64_t utf8_continuation_mask = in.lt(-65 + 1); - // -65 is 0b10111111 in two-complement's, so largest possible continuation byte - uint64_t utf8_leading_mask = ~utf8_continuation_mask; - // The *start* of code points is not so useful, rather, we want the *end* of code points. - uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; - // We process in blocks of up to 12 bytes except possibly - // for fast paths which may process up to 16 bytes. For the - // slow path to work, we should have at least 12 input bytes left. 
- size_t max_starting_point = (pos + 64) - 12; - // Next loop is going to run at least five times when using solely - // the slow/regular path, and at least four times if there are fast paths. - while(pos < max_starting_point) { - // Performance note: our ability to compute 'consumed' and - // then shift and recompute is critical. If there is a - // latency of, say, 4 cycles on getting 'consumed', then - // the inner loop might have a total latency of about 6 cycles. - // Yet we process between 6 to 12 inputs bytes, thus we get - // a speed limit between 1 cycle/byte and 0.5 cycle/byte - // for this section of the code. Hence, there is a limit - // to how much we can further increase this latency before - // it seriously harms performance. - // - // Thus we may allow convert_masked_utf8_to_utf16 to process - // more bytes at a time under a fast-path mode where 16 bytes - // are consumed at once (e.g., when encountering ASCII). - size_t consumed = convert_masked_utf8_to_utf16(input + pos, - utf8_end_of_code_point_mask, utf16_output); - pos += consumed; - utf8_end_of_code_point_mask >>= consumed; + // -65 is 0b10111111 in two-complement's, so largest possible continuation byte + uint64_t utf8_continuation_mask = in.lt(-65 + 1); + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; + size_t max_starting_point = (pos + 64) - 12; + while(pos < max_starting_point) { + size_t consumed = convert_masked_utf8_to_utf32(input + pos, + utf8_end_of_code_point_mask, utf32_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; } - // At this point there may remain between 0 and 12 bytes in the - // 64-byte block. These bytes will be processed again. So we have an - // 80% efficiency (in the worst case). In practice we expect an - // 85% to 90% efficiency. 
} } - utf16_output += scalar::utf8_to_utf16::convert_valid(input + pos, size - pos, utf16_output); - return utf16_output - start; + utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos, utf32_output); + return utf32_output - start; } -} // namespace utf8_to_utf16 + +} // namespace utf8_to_utf32 } // unnamed namespace } // namespace westmere } // namespace simdutf -/* end file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf16/utf8_to_utf16.h -/* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */ +/* end file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */ +/* begin file src/generic/utf8_to_utf32/utf8_to_utf32.h */ namespace simdutf { namespace westmere { namespace { -namespace utf8_to_utf16 { +namespace utf8_to_utf32 { using namespace simd; @@ -27492,28 +31930,28 @@ using namespace simd; } - template - simdutf_really_inline size_t convert(const char* in, size_t size, char16_t* utf16_output) { + + simdutf_really_inline size_t convert(const char* in, size_t size, char32_t* utf32_output) { size_t pos = 0; - char16_t* start{utf16_output}; + char32_t* start{utf32_output}; // In the worst case, we have the haswell kernel which can cause an overflow of - // 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the last 16 bytes, + // 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes, // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate // much more than 8 bytes. However, you cannot generally assume that you have valid - // UTF-8 input, so we are going to go back from the end counting 8 leading bytes, + // UTF-8 input, so we are going to go back from the end counting 4 leading bytes, // to give us a good margin. 
size_t leading_byte = 0; size_t margin = size; - for(; margin > 0 && leading_byte < 8; margin--) { + for(; margin > 0 && leading_byte < 4; margin--) { leading_byte += (int8_t(in[margin-1]) > -65); } - // If the input is long enough, then we have that margin-1 is the eight last leading byte. + // If the input is long enough, then we have that margin-1 is the fourth last leading byte. const size_t safety_margin = size - margin + 1; // to avoid overruns! while(pos + 64 + safety_margin <= size) { simd8x64 input(reinterpret_cast(in + pos)); if(input.is_ascii()) { - input.store_ascii_as_utf16(utf16_output); - utf16_output += 64; + input.store_ascii_as_utf32(utf32_output); + utf32_output += 64; pos += 64; } else { // you might think that a for-loop would work, but under Visual Studio, it is not good enough. @@ -27547,8 +31985,8 @@ using namespace simd; // for this section of the code. Hence, there is a limit // to how much we can further increase this latency before // it seriously harms performance. 
- size_t consumed = convert_masked_utf8_to_utf16(in + pos, - utf8_end_of_code_point_mask, utf16_output); + size_t consumed = convert_masked_utf8_to_utf32(in + pos, + utf8_end_of_code_point_mask, utf32_output); pos += consumed; utf8_end_of_code_point_mask >>= consumed; } @@ -27560,35 +31998,34 @@ using namespace simd; } if(errors()) { return 0; } if(pos < size) { - size_t howmany = scalar::utf8_to_utf16::convert(in + pos, size - pos, utf16_output); + size_t howmany = scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output); if(howmany == 0) { return 0; } - utf16_output += howmany; + utf32_output += howmany; } - return utf16_output - start; + return utf32_output - start; } - template - simdutf_really_inline result convert_with_errors(const char* in, size_t size, char16_t* utf16_output) { + simdutf_really_inline result convert_with_errors(const char* in, size_t size, char32_t* utf32_output) { size_t pos = 0; - char16_t* start{utf16_output}; + char32_t* start{utf32_output}; // In the worst case, we have the haswell kernel which can cause an overflow of - // 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the last 16 bytes, + // 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes, // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate // much more than 8 bytes. However, you cannot generally assume that you have valid - // UTF-8 input, so we are going to go back from the end counting 8 leading bytes, + // UTF-8 input, so we are going to go back from the end counting 4 leading bytes, // to give us a good margin. size_t leading_byte = 0; size_t margin = size; - for(; margin > 0 && leading_byte < 8; margin--) { + for(; margin > 0 && leading_byte < 4; margin--) { leading_byte += (int8_t(in[margin-1]) > -65); } - // If the input is long enough, then we have that margin-1 is the eight last leading byte. 
+ // If the input is long enough, then we have that margin-1 is the fourth last leading byte. const size_t safety_margin = size - margin + 1; // to avoid overruns! while(pos + 64 + safety_margin <= size) { simd8x64 input(reinterpret_cast(in + pos)); if(input.is_ascii()) { - input.store_ascii_as_utf16(utf16_output); - utf16_output += 64; + input.store_ascii_as_utf32(utf32_output); + utf32_output += 64; pos += 64; } else { // you might think that a for-loop would work, but under Visual Studio, it is not good enough. @@ -27605,9 +32042,7 @@ using namespace simd; this->check_utf8_bytes(input.chunks[3], input.chunks[2]); } if (errors()) { - // rewind_and_convert_with_errors will seek a potential error from in+pos onward, - // with the ability to go back up to pos bytes, and read size-pos bytes forward. - result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf16_output); + result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output); res.count += pos; return res; } @@ -27629,8 +32064,8 @@ using namespace simd; // for this section of the code. Hence, there is a limit // to how much we can further increase this latency before // it seriously harms performance. - size_t consumed = convert_masked_utf8_to_utf16(in + pos, - utf8_end_of_code_point_mask, utf16_output); + size_t consumed = convert_masked_utf8_to_utf32(in + pos, + utf8_end_of_code_point_mask, utf32_output); pos += consumed; utf8_end_of_code_point_mask >>= consumed; } @@ -27641,95 +32076,157 @@ using namespace simd; } } if(errors()) { - // rewind_and_convert_with_errors will seek a potential error from in+pos onward, - // with the ability to go back up to pos bytes, and read size-pos bytes forward. 
- result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf16_output); + result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output); res.count += pos; return res; } if(pos < size) { - // rewind_and_convert_with_errors will seek a potential error from in+pos onward, - // with the ability to go back up to pos bytes, and read size-pos bytes forward. - result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf16_output); + result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output); if (res.error) { // In case of error, we want the error position res.count += pos; return res; } else { // In case of success, we want the number of word written - utf16_output += res.count; + utf32_output += res.count; } } - return result(error_code::SUCCESS, utf16_output - start); + return result(error_code::SUCCESS, utf32_output - start); + } + + simdutf_really_inline bool errors() const { + return this->error.any_bits_set_anywhere(); + } + + }; // struct utf8_checker +} // utf8_to_utf32 namespace +} // unnamed namespace +} // namespace westmere +} // namespace simdutf +/* end file src/generic/utf8_to_utf32/utf8_to_utf32.h */ +// other functions +/* begin file src/generic/utf8.h */ + +namespace simdutf { +namespace westmere { +namespace { +namespace utf8 { + +using namespace simd; + +simdutf_really_inline size_t count_code_points(const char* in, size_t size) { + size_t pos = 0; + size_t count = 0; + for(;pos + 64 <= size; pos += 64) { + simd8x64 input(reinterpret_cast(in + pos)); + uint64_t utf8_continuation_mask = input.gt(-65); + count += count_ones(utf8_continuation_mask); + } + return count + scalar::utf8::count_code_points(in + pos, size - pos); +} + +simdutf_really_inline size_t utf16_length_from_utf8(const char* in, size_t size) { + size_t pos = 0; + size_t count = 0; + // This algorithm could no doubt be 
improved! + for(;pos + 64 <= size; pos += 64) { + simd8x64 input(reinterpret_cast(in + pos)); + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + // We count one word for anything that is not a continuation (so + // leading bytes). + count += 64 - count_ones(utf8_continuation_mask); + int64_t utf8_4byte = input.gteq_unsigned(240); + count += count_ones(utf8_4byte); } + return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos); +} - simdutf_really_inline bool errors() const { - return this->error.any_bits_set_anywhere(); - } - }; // struct utf8_checker -} // utf8_to_utf16 namespace +simdutf_really_inline size_t utf32_length_from_utf8(const char* in, size_t size) { + return count_code_points(in, size); +} +} // utf8 namespace } // unnamed namespace } // namespace westmere } // namespace simdutf -/* end file src/generic/utf8_to_utf16/utf8_to_utf16.h */ -// transcoding from UTF-8 to UTF-32 -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf32/valid_utf8_to_utf32.h -/* begin file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */ - +/* end file src/generic/utf8.h */ +/* begin file src/generic/utf16.h */ namespace simdutf { namespace westmere { namespace { -namespace utf8_to_utf32 { +namespace utf16 { -using namespace simd; +template +simdutf_really_inline size_t count_code_points(const char16_t* in, size_t size) { + size_t pos = 0; + size_t count = 0; + for(;pos < size/32*32; pos += 32) { + simd16x32 input(reinterpret_cast(in + pos)); + if (!match_system(big_endian)) { input.swap_bytes(); } + uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF); + count += count_ones(not_pair) / 2; + } + return count + scalar::utf16::count_code_points(in + pos, size - pos); +} +template +simdutf_really_inline size_t utf8_length_from_utf16(const char16_t* in, size_t size) { + size_t pos = 0; + size_t count = 0; + // This algorithm could no doubt be improved! 
+ for(;pos < size/32*32; pos += 32) { + simd16x32 input(reinterpret_cast(in + pos)); + if (!match_system(big_endian)) { input.swap_bytes(); } + uint64_t ascii_mask = input.lteq(0x7F); + uint64_t twobyte_mask = input.lteq(0x7FF); + uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF); -simdutf_warn_unused size_t convert_valid(const char* input, size_t size, - char32_t* utf32_output) noexcept { - size_t pos = 0; - char32_t* start{utf32_output}; - const size_t safety_margin = 16; // to avoid overruns! - while(pos + 64 + safety_margin <= size) { - simd8x64 in(reinterpret_cast(input + pos)); - if(in.is_ascii()) { - in.store_ascii_as_utf32(utf32_output); - utf32_output += 64; - pos += 64; - } else { - // -65 is 0b10111111 in two-complement's, so largest possible continuation byte - uint64_t utf8_continuation_mask = in.lt(-65 + 1); - uint64_t utf8_leading_mask = ~utf8_continuation_mask; - uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; - size_t max_starting_point = (pos + 64) - 12; - while(pos < max_starting_point) { - size_t consumed = convert_masked_utf8_to_utf32(input + pos, - utf8_end_of_code_point_mask, utf32_output); - pos += consumed; - utf8_end_of_code_point_mask >>= consumed; - } + size_t ascii_count = count_ones(ascii_mask) / 2; + size_t twobyte_count = count_ones(twobyte_mask & ~ ascii_mask) / 2; + size_t threebyte_count = count_ones(not_pair_mask & ~ twobyte_mask) / 2; + size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2; + count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count + ascii_count; } - } - utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos, utf32_output); - return utf32_output - start; + return count + scalar::utf16::utf8_length_from_utf16(in + pos, size - pos); } +template +simdutf_really_inline size_t utf32_length_from_utf16(const char16_t* in, size_t size) { + return count_code_points(in, size); +} -} // namespace utf8_to_utf32 +simdutf_really_inline void 
change_endianness_utf16(const char16_t* in, size_t size, char16_t* output) { + size_t pos = 0; + + while (pos < size/32*32) { + simd16x32 input(reinterpret_cast(in + pos)); + input.swap_bytes(); + input.store(reinterpret_cast(output)); + pos += 32; + output += 32; + } + + scalar::utf16::change_endianness_utf16(in + pos, size - pos, output); +} + +} // utf16 } // unnamed namespace } // namespace westmere } // namespace simdutf -/* end file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf32/utf8_to_utf32.h -/* begin file src/generic/utf8_to_utf32/utf8_to_utf32.h */ +/* end file src/generic/utf16.h */ +// transcoding from UTF-8 to Latin 1 +/* begin file src/generic/utf8_to_latin1/utf8_to_latin1.h */ namespace simdutf { namespace westmere { namespace { -namespace utf8_to_utf32 { +namespace utf8_to_latin1 { using namespace simd; simdutf_really_inline simd8 check_special_cases(const simd8 input, const simd8 prev1) { +// For UTF-8 to Latin 1, we can allow any ASCII character, and any continuation byte, +// but the non-ASCII leading bytes must be 0b11000011 or 0b11000010 and nothing else. 
+// // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) // Bit 1 = Too Long (ASCII followed by continuation) // Bit 2 = Overlong 3-byte @@ -27756,6 +32253,7 @@ using namespace simd; // 1111011_ 1000____ // 11111___ 1000____ constexpr const uint8_t OVERLONG_4 = 1<<6; // 11110000 1000____ + constexpr const uint8_t FORBIDDEN = 0xff; const simd8 byte_1_high = prev1.shr<4>().lookup_16( // 0_______ ________ @@ -27766,11 +32264,11 @@ using namespace simd; // 1100____ ________ TOO_SHORT | OVERLONG_2, // 1101____ ________ - TOO_SHORT, + FORBIDDEN, // 1110____ ________ - TOO_SHORT | OVERLONG_3 | SURROGATE, + FORBIDDEN, // 1111____ ________ - TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4 + FORBIDDEN ); constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . const simd8 byte_1_low = (prev1 & 0x0F).lookup_16( @@ -27783,23 +32281,23 @@ using namespace simd; CARRY, // ____0100 ________ - CARRY | TOO_LARGE, + FORBIDDEN, // ____0101 ________ - CARRY | TOO_LARGE | TOO_LARGE_1000, + FORBIDDEN, // ____011_ ________ - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, + FORBIDDEN, + FORBIDDEN, // ____1___ ________ - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, + FORBIDDEN, + FORBIDDEN, + FORBIDDEN, + FORBIDDEN, + FORBIDDEN, // ____1101 ________ - CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000 + FORBIDDEN, + FORBIDDEN, + FORBIDDEN ); const simd8 byte_2_high = input.shr<4>().lookup_16( // ________ 0_______ @@ -27819,15 +32317,6 @@ using namespace simd; ); return (byte_1_high & byte_1_low & byte_2_high); } - simdutf_really_inline simd8 check_multibyte_lengths(const simd8 input, - const simd8 prev_input, const simd8 sc) { - simd8 prev2 = input.prev<2>(prev_input); - simd8 prev3 = 
input.prev<3>(prev_input); - simd8 must23 = simd8(must_be_2_3_continuation(prev2, prev3)); - simd8 must23_80 = must23 & uint8_t(0x80); - return must23_80 ^ sc; - } - struct validating_transcoder { // If this is nonzero, there has been a UTF-8 error. @@ -27841,33 +32330,31 @@ using namespace simd; // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers) simd8 prev1 = input.prev<1>(prev_input); - simd8 sc = check_special_cases(input, prev1); - this->error |= check_multibyte_lengths(input, prev_input, sc); + this->error |= check_special_cases(input, prev1); } - - simdutf_really_inline size_t convert(const char* in, size_t size, char32_t* utf32_output) { + simdutf_really_inline size_t convert(const char* in, size_t size, char* latin1_output) { size_t pos = 0; - char32_t* start{utf32_output}; + char* start{latin1_output}; // In the worst case, we have the haswell kernel which can cause an overflow of - // 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes, + // 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the last 16 bytes, // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate // much more than 8 bytes. However, you cannot generally assume that you have valid - // UTF-8 input, so we are going to go back from the end counting 4 leading bytes, + // UTF-8 input, so we are going to go back from the end counting 8 leading bytes, // to give us a good margin. size_t leading_byte = 0; size_t margin = size; - for(; margin > 0 && leading_byte < 4; margin--) { - leading_byte += (int8_t(in[margin-1]) > -65); + for(; margin > 0 && leading_byte < 8; margin--) { + leading_byte += (int8_t(in[margin-1]) > -65); //twos complement of -65 is 1011 1111 ... } - // If the input is long enough, then we have that margin-1 is the fourth last leading byte. 
+ // If the input is long enough, then we have that margin-1 is the eight last leading byte. const size_t safety_margin = size - margin + 1; // to avoid overruns! while(pos + 64 + safety_margin <= size) { simd8x64 input(reinterpret_cast(in + pos)); if(input.is_ascii()) { - input.store_ascii_as_utf32(utf32_output); - utf32_output += 64; + input.store((int8_t*)latin1_output); + latin1_output += 64; pos += 64; } else { // you might think that a for-loop would work, but under Visual Studio, it is not good enough. @@ -27883,7 +32370,7 @@ using namespace simd; this->check_utf8_bytes(input.chunks[2], input.chunks[1]); this->check_utf8_bytes(input.chunks[3], input.chunks[2]); } - uint64_t utf8_continuation_mask = input.lt(-65 + 1); + uint64_t utf8_continuation_mask = input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in this case, we also have ASCII to account for. uint64_t utf8_leading_mask = ~utf8_continuation_mask; uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; // We process in blocks of up to 12 bytes except possibly @@ -27901,8 +32388,8 @@ using namespace simd; // for this section of the code. Hence, there is a limit // to how much we can further increase this latency before // it seriously harms performance. 
- size_t consumed = convert_masked_utf8_to_utf32(in + pos, - utf8_end_of_code_point_mask, utf32_output); + size_t consumed = convert_masked_utf8_to_latin1(in + pos, + utf8_end_of_code_point_mask, latin1_output); pos += consumed; utf8_end_of_code_point_mask >>= consumed; } @@ -27914,55 +32401,151 @@ using namespace simd; } if(errors()) { return 0; } if(pos < size) { - size_t howmany = scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output); + size_t howmany = scalar::utf8_to_latin1::convert(in + pos, size - pos, latin1_output); if(howmany == 0) { return 0; } - utf32_output += howmany; + latin1_output += howmany; } - return utf32_output - start; + return latin1_output - start; } - simdutf_really_inline result convert_with_errors(const char* in, size_t size, char32_t* utf32_output) { + simdutf_really_inline result convert_with_errors(const char* in, size_t size, char* latin1_output) { size_t pos = 0; - char32_t* start{utf32_output}; + char* start{latin1_output}; // In the worst case, we have the haswell kernel which can cause an overflow of - // 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes, + // 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the last 16 bytes, // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate // much more than 8 bytes. However, you cannot generally assume that you have valid - // UTF-8 input, so we are going to go back from the end counting 4 leading bytes, + // UTF-8 input, so we are going to go back from the end counting 8 leading bytes, // to give us a good margin. size_t leading_byte = 0; size_t margin = size; - for(; margin > 0 && leading_byte < 4; margin--) { + for(; margin > 0 && leading_byte < 8; margin--) { leading_byte += (int8_t(in[margin-1]) > -65); } - // If the input is long enough, then we have that margin-1 is the fourth last leading byte. + // If the input is long enough, then we have that margin-1 is the eight last leading byte. 
const size_t safety_margin = size - margin + 1; // to avoid overruns! while(pos + 64 + safety_margin <= size) { simd8x64 input(reinterpret_cast(in + pos)); if(input.is_ascii()) { - input.store_ascii_as_utf32(utf32_output); - utf32_output += 64; + input.store((int8_t*)latin1_output); + latin1_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, it is not good enough. + static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if(simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if(simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + if (errors()) { + // rewind_and_convert_with_errors will seek a potential error from in+pos onward, + // with the ability to go back up to pos bytes, and read size-pos bytes forward. + result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(pos, in + pos, size - pos, latin1_output); + res.count += pos; + return res; + } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while(pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. 
If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_latin1(in + pos, + utf8_end_of_code_point_mask, latin1_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + if(errors()) { + // rewind_and_convert_with_errors will seek a potential error from in+pos onward, + // with the ability to go back up to pos bytes, and read size-pos bytes forward. + result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(pos, in + pos, size - pos, latin1_output); + res.count += pos; + return res; + } + if(pos < size) { + // rewind_and_convert_with_errors will seek a potential error from in+pos onward, + // with the ability to go back up to pos bytes, and read size-pos bytes forward. 
+ result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(pos, in + pos, size - pos, latin1_output); + if (res.error) { // In case of error, we want the error position + res.count += pos; + return res; + } else { // In case of success, we want the number of word written + latin1_output += res.count; + } + } + return result(error_code::SUCCESS, latin1_output - start); + } + + simdutf_really_inline bool errors() const { + return this->error.any_bits_set_anywhere(); + } + + }; // struct utf8_checker +} // utf8_to_latin1 namespace +} // unnamed namespace +} // namespace westmere +} // namespace simdutf +/* end file src/generic/utf8_to_latin1/utf8_to_latin1.h */ +/* begin file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */ + + +namespace simdutf { +namespace westmere { +namespace { +namespace utf8_to_latin1 { +using namespace simd; + + + simdutf_really_inline size_t convert_valid(const char* in, size_t size, char* latin1_output) { + size_t pos = 0; + char* start{latin1_output}; + // In the worst case, we have the haswell kernel which can cause an overflow of + // 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the last 16 bytes, + // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate + // much more than 8 bytes. However, you cannot generally assume that you have valid + // UTF-8 input, so we are going to go back from the end counting 8 leading bytes, + // to give us a good margin. + size_t leading_byte = 0; + size_t margin = size; + for(; margin > 0 && leading_byte < 8; margin--) { + leading_byte += (int8_t(in[margin-1]) > -65); //twos complement of -65 is 1011 1111 ... + } + // If the input is long enough, then we have that margin-1 is the eight last leading byte. + const size_t safety_margin = size - margin + 1; // to avoid overruns! 
+ while(pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if(input.is_ascii()) { + input.store((int8_t*)latin1_output); + latin1_output += 64; pos += 64; } else { // you might think that a for-loop would work, but under Visual Studio, it is not good enough. - static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), - "We support either two or four chunks per 64-byte block."); - auto zero = simd8{uint8_t(0)}; - if(simd8x64::NUM_CHUNKS == 2) { - this->check_utf8_bytes(input.chunks[0], zero); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - } else if(simd8x64::NUM_CHUNKS == 4) { - this->check_utf8_bytes(input.chunks[0], zero); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - this->check_utf8_bytes(input.chunks[2], input.chunks[1]); - this->check_utf8_bytes(input.chunks[3], input.chunks[2]); - } - if (errors()) { - result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output); - res.count += pos; - return res; - } - uint64_t utf8_continuation_mask = input.lt(-65 + 1); + uint64_t utf8_continuation_mask = input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in this case, we also have ASCII to account for. uint64_t utf8_leading_mask = ~utf8_continuation_mask; uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; // We process in blocks of up to 12 bytes except possibly @@ -27980,8 +32563,8 @@ using namespace simd; // for this section of the code. Hence, there is a limit // to how much we can further increase this latency before // it seriously harms performance. - size_t consumed = convert_masked_utf8_to_utf32(in + pos, - utf8_end_of_code_point_mask, utf32_output); + size_t consumed = convert_masked_utf8_to_latin1(in + pos, + utf8_end_of_code_point_mask, latin1_output); pos += consumed; utf8_end_of_code_point_mask >>= consumed; } @@ -27991,146 +32574,22 @@ using namespace simd; // 85% to 90% efficiency. 
} } - if(errors()) { - result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output); - res.count += pos; - return res; - } if(pos < size) { - result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output); - if (res.error) { // In case of error, we want the error position - res.count += pos; - return res; - } else { // In case of success, we want the number of word written - utf32_output += res.count; - } + size_t howmany = scalar::utf8_to_latin1::convert(in + pos, size - pos, latin1_output); + if(howmany == 0) { return 0; } + latin1_output += howmany; } - return result(error_code::SUCCESS, utf32_output - start); - } - - simdutf_really_inline bool errors() const { - return this->error.any_bits_set_anywhere(); - } - - }; // struct utf8_checker -} // utf8_to_utf32 namespace -} // unnamed namespace -} // namespace westmere -} // namespace simdutf -/* end file src/generic/utf8_to_utf32/utf8_to_utf32.h */ -// other functions -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8.h -/* begin file src/generic/utf8.h */ - -namespace simdutf { -namespace westmere { -namespace { -namespace utf8 { - -using namespace simd; - -simdutf_really_inline size_t count_code_points(const char* in, size_t size) { - size_t pos = 0; - size_t count = 0; - for(;pos + 64 <= size; pos += 64) { - simd8x64 input(reinterpret_cast(in + pos)); - uint64_t utf8_continuation_mask = input.lt(-65 + 1); - count += 64 - count_ones(utf8_continuation_mask); + return latin1_output - start; } - return count + scalar::utf8::count_code_points(in + pos, size - pos); -} - - -simdutf_really_inline size_t utf16_length_from_utf8(const char* in, size_t size) { - size_t pos = 0; - size_t count = 0; - // This algorithm could no doubt be improved! 
- for(;pos + 64 <= size; pos += 64) { - simd8x64 input(reinterpret_cast(in + pos)); - uint64_t utf8_continuation_mask = input.lt(-65 + 1); - // We count one word for anything that is not a continuation (so - // leading bytes). - count += 64 - count_ones(utf8_continuation_mask); - int64_t utf8_4byte = input.gteq_unsigned(240); - count += count_ones(utf8_4byte); - } - return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos); -} - - -simdutf_really_inline size_t utf32_length_from_utf8(const char* in, size_t size) { - return count_code_points(in, size); -} -} // utf8 namespace -} // unnamed namespace -} // namespace westmere -} // namespace simdutf -/* end file src/generic/utf8.h */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf16.h -/* begin file src/generic/utf16.h */ -namespace simdutf { -namespace westmere { -namespace { -namespace utf16 { - -template -simdutf_really_inline size_t count_code_points(const char16_t* in, size_t size) { - size_t pos = 0; - size_t count = 0; - for(;pos + 32 <= size; pos += 32) { - simd16x32 input(reinterpret_cast(in + pos)); - if (!match_system(big_endian)) input.swap_bytes(); - uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF); - count += count_ones(not_pair) / 2; - } - return count + scalar::utf16::count_code_points(in + pos, size - pos); -} - -template -simdutf_really_inline size_t utf8_length_from_utf16(const char16_t* in, size_t size) { - size_t pos = 0; - size_t count = 0; - // This algorithm could no doubt be improved! 
- for(;pos + 32 <= size; pos += 32) { - simd16x32 input(reinterpret_cast(in + pos)); - if (!match_system(big_endian)) input.swap_bytes(); - uint64_t ascii_mask = input.lteq(0x7F); - uint64_t twobyte_mask = input.lteq(0x7FF); - uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF); - - size_t ascii_count = count_ones(ascii_mask) / 2; - size_t twobyte_count = count_ones(twobyte_mask & ~ ascii_mask) / 2; - size_t threebyte_count = count_ones(not_pair_mask & ~ twobyte_mask) / 2; - size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2; - count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count + ascii_count; - } - return count + scalar::utf16::utf8_length_from_utf16(in + pos, size - pos); -} -template -simdutf_really_inline size_t utf32_length_from_utf16(const char16_t* in, size_t size) { - return count_code_points(in, size); -} - -simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t size, char16_t* output) { - size_t pos = 0; - - while (pos + 32 <= size) { - simd16x32 input(reinterpret_cast(in + pos)); - input.swap_bytes(); - input.store(reinterpret_cast(output)); - pos += 32; - output += 32; - } + }; +} // utf8_to_latin1 namespace +} // unnamed namespace +} // namespace westmere + // namespace simdutf +/* end file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */ - scalar::utf16::change_endianness_utf16(in + pos, size - pos, output); -} -} // utf16 -} // unnamed namespace -} // namespace westmere -} // namespace simdutf -/* end file src/generic/utf16.h */ // // Implementation-specific overrides // @@ -28226,6 +32685,74 @@ simdutf_warn_unused result implementation::validate_utf32_with_errors(const char } } +simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(const char * buf, size_t len, char* utf8_output) const noexcept { + + std::pair ret = sse_convert_latin1_to_utf8(buf, len, utf8_output); + size_t converted_chars = ret.second - utf8_output; + + if (ret.first != buf + len) { + const size_t 
scalar_converted_chars = scalar::latin1_to_utf8::convert( + ret.first, len - (ret.first - buf), ret.second); + converted_chars += scalar_converted_chars; + } + + return converted_chars; +} + +simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept { + std::pair ret = sse_convert_latin1_to_utf16(buf, len, utf16_output); + if (ret.first == nullptr) { return 0; } + size_t converted_chars = ret.second - utf16_output; + if (ret.first != buf + len) { + const size_t scalar_converted_chars = scalar::latin1_to_utf16::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_converted_chars == 0) { return 0; } + converted_chars += scalar_converted_chars; + } + return converted_chars; +} + +simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept { + std::pair ret = sse_convert_latin1_to_utf16(buf, len, utf16_output); + if (ret.first == nullptr) { return 0; } + size_t converted_chars = ret.second - utf16_output; + if (ret.first != buf + len) { + const size_t scalar_converted_chars = scalar::latin1_to_utf16::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_converted_chars == 0) { return 0; } + converted_chars += scalar_converted_chars; + } + return converted_chars; +} + +simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept { + std::pair ret = sse_convert_latin1_to_utf32(buf, len, utf32_output); + if (ret.first == nullptr) { return 0; } + size_t converted_chars = ret.second - utf32_output; + if (ret.first != buf + len) { + const size_t scalar_converted_chars = scalar::latin1_to_utf32::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_converted_chars == 0) { return 0; } + converted_chars += scalar_converted_chars; + } + return converted_chars; +} + + +simdutf_warn_unused size_t 
implementation::convert_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept { + utf8_to_latin1::validating_transcoder converter; + return converter.convert(buf, len, latin1_output); +} + +simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(const char* buf, size_t len, char* latin1_output) const noexcept { + utf8_to_latin1::validating_transcoder converter; + return converter.convert_with_errors(buf, len, latin1_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept { + return westmere::utf8_to_latin1::convert_valid(buf,len,latin1_output); +} + simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept { utf8_to_utf16::validating_transcoder converter; return converter.convert(buf, len, utf16_output); @@ -28272,6 +32799,79 @@ simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(const cha return utf8_to_utf32::convert_valid(input, size, utf32_output); } +simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept { + std::pair ret = sse_convert_utf16_to_latin1(buf, len, latin1_output); + if (ret.first == nullptr) { return 0; } + size_t saved_bytes = ret.second - latin1_output; + + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = scalar::utf16_to_latin1::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { return 0; } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept { + std::pair ret = sse_convert_utf16_to_latin1(buf, len, latin1_output); + if (ret.first == nullptr) { return 0; } + size_t saved_bytes = ret.second - latin1_output; + + if 
(ret.first != buf + len) { + const size_t scalar_saved_bytes = scalar::utf16_to_latin1::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { return 0; } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused result implementation::convert_utf16le_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) const noexcept { + std::pair ret = sse_convert_utf16_to_latin1_with_errors(buf, len, latin1_output); + if (ret.first.error) { return ret.first; } // Can return directly since scalar fallback already found correct ret.first.count + if (ret.first.count != len) { // All good so far, but not finished + result scalar_res = scalar::utf16_to_latin1::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = ret.second - latin1_output; // Set count to the number of 8-bit code units written + return ret.first; +} + +simdutf_warn_unused result implementation::convert_utf16be_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) const noexcept { + std::pair ret = sse_convert_utf16_to_latin1_with_errors(buf, len, latin1_output); + if (ret.first.error) { return ret.first; } // Can return directly since scalar fallback already found correct ret.first.count + if (ret.first.count != len) { // All good so far, but not finished + result scalar_res = scalar::utf16_to_latin1::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = ret.second - latin1_output; // Set count to the number of 8-bit code units written + return ret.first; +} + + +simdutf_warn_unused size_t 
implementation::convert_valid_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept { + // optimization opportunity: we could provide an optimized function. + return convert_utf16be_to_latin1(buf, len, latin1_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept { + // optimization opportunity: we could provide an optimized function. + return convert_utf16le_to_latin1(buf, len, latin1_output); +} + simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { std::pair ret = sse_convert_utf16_to_utf8(buf, len, utf8_output); if (ret.first == nullptr) { return 0; } @@ -28299,7 +32899,7 @@ simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(const char16_ } simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept { - // ret.first.count is always the position in the buffer, not the number of words written even if finished + // ret.first.count is always the position in the buffer, not the number of code units written even if finished std::pair ret = westmere::sse_convert_utf16_to_utf8_with_errors(buf, len, utf8_output); if (ret.first.error) { return ret.first; } // Can return directly since scalar fallback already found correct ret.first.count if (ret.first.count != len) { // All good so far, but not finished @@ -28312,12 +32912,12 @@ simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(c ret.second += scalar_res.count; } } - ret.first.count = ret.second - utf8_output; // Set count to the number of 8-bit words written + ret.first.count = ret.second - utf8_output; // Set count to the number of 8-bit code units written return ret.first; } simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(const char16_t* buf, 
size_t len, char* utf8_output) const noexcept { - // ret.first.count is always the position in the buffer, not the number of words written even if finished + // ret.first.count is always the position in the buffer, not the number of code units written even if finished std::pair ret = westmere::sse_convert_utf16_to_utf8_with_errors(buf, len, utf8_output); if (ret.first.error) { return ret.first; } // Can return directly since scalar fallback already found correct ret.first.count if (ret.first.count != len) { // All good so far, but not finished @@ -28330,7 +32930,7 @@ simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(c ret.second += scalar_res.count; } } - ret.first.count = ret.second - utf8_output; // Set count to the number of 8-bit words written + ret.first.count = ret.second - utf8_output; // Set count to the number of 8-bit code units written return ret.first; } @@ -28342,6 +32942,43 @@ simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(const c return convert_utf16be_to_utf8(buf, len, utf8_output); } +simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept { + std::pair ret = sse_convert_utf32_to_latin1(buf, len, latin1_output); + if (ret.first == nullptr) { return 0; } + size_t saved_bytes = ret.second - latin1_output; + // if (ret.first != buf + len) { + if (ret.first < buf + len) { + const size_t scalar_saved_bytes = scalar::utf32_to_latin1::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { return 0; } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + + +simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(const char32_t* buf, size_t len, char* latin1_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of code units written even if finished + std::pair ret = 
westmere::sse_convert_utf32_to_latin1_with_errors(buf, len, latin1_output); + if (ret.first.count != len) { + result scalar_res = scalar::utf32_to_latin1::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = ret.second - latin1_output; // Set count to the number of 8-bit code units written + return ret.first; +} + +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept { + // optimization opportunity: we could provide an optimized function. + return convert_utf32_to_latin1(buf,len,latin1_output); +} + simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept { std::pair ret = sse_convert_utf32_to_utf8(buf, len, utf8_output); if (ret.first == nullptr) { return 0; } @@ -28356,7 +32993,7 @@ simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t* } simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept { - // ret.first.count is always the position in the buffer, not the number of words written even if finished + // ret.first.count is always the position in the buffer, not the number of code units written even if finished std::pair ret = westmere::sse_convert_utf32_to_utf8_with_errors(buf, len, utf8_output); if (ret.first.count != len) { result scalar_res = scalar::utf32_to_utf8::convert_with_errors( @@ -28368,7 +33005,7 @@ simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(con ret.second += scalar_res.count; } } - ret.first.count = ret.second - utf8_output; // Set count to the number of 8-bit words written + ret.first.count = ret.second - utf8_output; // Set count to the number of 8-bit 
code units written return ret.first; } @@ -28399,7 +33036,7 @@ simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(const char16 } simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { - // ret.first.count is always the position in the buffer, not the number of words written even if finished + // ret.first.count is always the position in the buffer, not the number of code units written even if finished std::pair ret = westmere::sse_convert_utf16_to_utf32_with_errors(buf, len, utf32_output); if (ret.first.error) { return ret.first; } // Can return directly since scalar fallback already found correct ret.first.count if (ret.first.count != len) { // All good so far, but not finished @@ -28412,12 +33049,12 @@ simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors( ret.second += scalar_res.count; } } - ret.first.count = ret.second - utf32_output; // Set count to the number of 8-bit words written + ret.first.count = ret.second - utf32_output; // Set count to the number of 8-bit code units written return ret.first; } simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { - // ret.first.count is always the position in the buffer, not the number of words written even if finished + // ret.first.count is always the position in the buffer, not the number of code units written even if finished std::pair ret = westmere::sse_convert_utf16_to_utf32_with_errors(buf, len, utf32_output); if (ret.first.error) { return ret.first; } // Can return directly since scalar fallback already found correct ret.first.count if (ret.first.count != len) { // All good so far, but not finished @@ -28430,7 +33067,7 @@ simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors( ret.second += scalar_res.count; } } - ret.first.count = ret.second - 
utf32_output; // Set count to the number of 8-bit words written + ret.first.count = ret.second - utf32_output; // Set count to the number of 8-bit code units written return ret.first; } @@ -28465,7 +33102,7 @@ simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(const char32 } simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { - // ret.first.count is always the position in the buffer, not the number of words written even if finished + // ret.first.count is always the position in the buffer, not the number of code units written even if finished std::pair ret = westmere::sse_convert_utf32_to_utf16_with_errors(buf, len, utf16_output); if (ret.first.count != len) { result scalar_res = scalar::utf32_to_utf16::convert_with_errors( @@ -28477,12 +33114,12 @@ simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors( ret.second += scalar_res.count; } } - ret.first.count = ret.second - utf16_output; // Set count to the number of 8-bit words written + ret.first.count = ret.second - utf16_output; // Set count to the number of 8-bit code units written return ret.first; } simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { - // ret.first.count is always the position in the buffer, not the number of words written even if finished + // ret.first.count is always the position in the buffer, not the number of code units written even if finished std::pair ret = westmere::sse_convert_utf32_to_utf16_with_errors(buf, len, utf16_output); if (ret.first.count != len) { result scalar_res = scalar::utf32_to_utf16::convert_with_errors( @@ -28494,7 +33131,7 @@ simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors( ret.second += scalar_res.count; } } - ret.first.count = ret.second - utf16_output; // Set count to the number of 8-bit 
words written + ret.first.count = ret.second - utf16_output; // Set count to the number of 8-bit code units written return ret.first; } @@ -28530,6 +33167,18 @@ simdutf_warn_unused size_t implementation::count_utf8(const char * input, size_t return utf8::count_code_points(input, length); } +simdutf_warn_unused size_t implementation::latin1_length_from_utf8(const char* buf, size_t len) const noexcept { + return count_utf8(buf,len); +} + +simdutf_warn_unused size_t implementation::latin1_length_from_utf16(size_t length) const noexcept { + return scalar::utf16::latin1_length_from_utf16(length); +} + +simdutf_warn_unused size_t implementation::latin1_length_from_utf32(size_t length) const noexcept { + return scalar::utf32::latin1_length_from_utf32(length); +} + simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept { return utf16::utf8_length_from_utf16(input, length); } @@ -28538,6 +33187,61 @@ simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16 return utf16::utf8_length_from_utf16(input, length); } +simdutf_warn_unused size_t implementation::utf16_length_from_latin1(size_t length) const noexcept { + return scalar::latin1::utf16_length_from_latin1(length); +} + +simdutf_warn_unused size_t implementation::utf32_length_from_latin1(size_t length) const noexcept { + return scalar::latin1::utf32_length_from_latin1(length); +} + +simdutf_warn_unused size_t implementation::utf8_length_from_latin1(const char * input, size_t len) const noexcept { + const uint8_t *str = reinterpret_cast(input); + size_t answer = len / sizeof(__m128i) * sizeof(__m128i); + size_t i = 0; + __m128i two_64bits = _mm_setzero_si128(); + while (i + sizeof(__m128i) <= len) { + __m128i runner = _mm_setzero_si128(); + size_t iterations = (len - i) / sizeof(__m128i); + if (iterations > 255) { + iterations = 255; + } + size_t max_i = i + iterations * sizeof(__m128i) - sizeof(__m128i); + for (; i + 
4*sizeof(__m128i) <= max_i; i += 4*sizeof(__m128i)) { + __m128i input1 = _mm_loadu_si128((const __m128i *)(str + i)); + __m128i input2 = _mm_loadu_si128((const __m128i *)(str + i + sizeof(__m128i))); + __m128i input3 = _mm_loadu_si128((const __m128i *)(str + i + 2*sizeof(__m128i))); + __m128i input4 = _mm_loadu_si128((const __m128i *)(str + i + 3*sizeof(__m128i))); + __m128i input12 = _mm_add_epi8( + _mm_cmpgt_epi8( + _mm_setzero_si128(), + input1), + _mm_cmpgt_epi8( + _mm_setzero_si128(), + input2)); + __m128i input34 = _mm_add_epi8( + _mm_cmpgt_epi8( + _mm_setzero_si128(), + input3), + _mm_cmpgt_epi8( + _mm_setzero_si128(), + input4)); + __m128i input1234 = _mm_add_epi8(input12, input34); + runner = _mm_sub_epi8(runner, input1234); + } + for (; i <= max_i; i += sizeof(__m128i)) { + __m128i more_input = _mm_loadu_si128((const __m128i *)(str + i)); + runner = _mm_sub_epi8( + runner, _mm_cmpgt_epi8(_mm_setzero_si128(), more_input)); + } + two_64bits = _mm_add_epi64( + two_64bits, _mm_sad_epu8(runner, _mm_setzero_si128())); + } + answer += _mm_extract_epi64(two_64bits, 0) + + _mm_extract_epi64(two_64bits, 1); + return answer + scalar::latin1::utf8_length_from_latin1(reinterpret_cast(str + i), len - i); +} + simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept { return utf16::utf32_length_from_utf16(input, length); } @@ -28592,13 +33296,12 @@ simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_ } simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char * input, size_t length) const noexcept { - return scalar::utf8::count_code_points(input, length); + return utf8::count_code_points(input, length); } } // namespace westmere } // namespace simdutf -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/westmere/end.h /* begin file src/simdutf/westmere/end.h */ #if SIMDUTF_CAN_ALWAYS_RUN_WESTMERE // nothing needed. 
diff --git a/deps/simdutf/simdutf.h b/deps/simdutf/simdutf.h index 6ab1c34d7b30a7..62be108b57013a 100644 --- a/deps/simdutf/simdutf.h +++ b/deps/simdutf/simdutf.h @@ -1,11 +1,9 @@ -/* auto-generated on 2023-10-08 13:48:09 -0400. Do not edit! */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/include, filename=simdutf.h +/* auto-generated on 2023-10-20 19:53:58 -0400. Do not edit! */ /* begin file include/simdutf.h */ #ifndef SIMDUTF_H #define SIMDUTF_H #include -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/include, filename=simdutf/compiler_check.h /* begin file include/simdutf/compiler_check.h */ #ifndef SIMDUTF_COMPILER_CHECK_H #define SIMDUTF_COMPILER_CHECK_H @@ -43,13 +41,11 @@ #endif // SIMDUTF_COMPILER_CHECK_H /* end file include/simdutf/compiler_check.h */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/include, filename=simdutf/common_defs.h /* begin file include/simdutf/common_defs.h */ #ifndef SIMDUTF_COMMON_DEFS_H #define SIMDUTF_COMMON_DEFS_H #include -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/include, filename=simdutf/portability.h /* begin file include/simdutf/portability.h */ #ifndef SIMDUTF_PORTABILITY_H #define SIMDUTF_PORTABILITY_H @@ -167,11 +163,8 @@ #ifdef SIMDUTF_IS_32BITS #ifndef SIMDUTF_NO_PORTABILITY_WARNING -#pragma message("The simdutf library is designed \ -for 64-bit processors and it seems that you are not \ -compiling for a known 64-bit platform. All fast kernels \ -will be disabled and performance may be poor. Please \ -use a 64-bit target such as x64, 64-bit ARM or 64-bit PPC.") +// In the future, we may want to warn users of 32-bit systems that +// the simdutf does not support accelerated kernels for such systems. 
#endif // SIMDUTF_NO_PORTABILITY_WARNING #endif // SIMDUTF_IS_32BITS @@ -280,7 +273,6 @@ use a 64-bit target such as x64, 64-bit ARM or 64-bit PPC.") #endif // SIMDUTF_PORTABILITY_H /* end file include/simdutf/portability.h */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/include, filename=simdutf/avx512.h /* begin file include/simdutf/avx512.h */ #ifndef SIMDUTF_AVX512_H_ #define SIMDUTF_AVX512_H_ @@ -483,7 +475,6 @@ use a 64-bit target such as x64, 64-bit ARM or 64-bit PPC.") #endif // SIMDUTF_COMMON_DEFS_H /* end file include/simdutf/common_defs.h */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/include, filename=simdutf/encoding_types.h /* begin file include/simdutf/encoding_types.h */ #include @@ -495,13 +486,14 @@ enum encoding_type { UTF16_BE = 4, // BOM 0xfe 0xff UTF32_LE = 8, // BOM 0xff 0xfe 0x00 0x00 UTF32_BE = 16, // BOM 0x00 0x00 0xfe 0xff + Latin1 = 32, unspecified = 0 }; enum endianness { - LITTLE, - BIG + LITTLE = 0, + BIG = 1 }; bool match_system(endianness e); @@ -514,7 +506,7 @@ namespace BOM { /** * Checks for a BOM. If not, returns unspecified * @param input the string to process - * @param length the length of the string in words + * @param length the length of the string in code units * @return the corresponding encoding */ @@ -531,7 +523,6 @@ size_t bom_byte_size(encoding_type bom); } // BOM namespace } // simdutf namespace /* end file include/simdutf/encoding_types.h */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/include, filename=simdutf/error.h /* begin file include/simdutf/error.h */ #ifndef ERROR_H #define ERROR_H @@ -545,15 +536,16 @@ enum error_code { TOO_LONG, // We either have too many consecutive continuation bytes or the string starts with a continuation byte. OVERLONG, // The decoded character must be above U+7F for two-byte characters, U+7FF for three-byte characters, // and U+FFFF for four-byte characters. 
- TOO_LARGE, // The decoded character must be less than or equal to U+10FFFF OR less than or equal than U+7F for ASCII. + TOO_LARGE, // The decoded character must be less than or equal to U+10FFFF,less than or equal than U+7F for ASCII OR less than equal than U+FF for Latin1 SURROGATE, // The decoded character must be not be in U+D800...DFFF (UTF-8 or UTF-32) OR - // a high surrogate must be followed by a low surrogate and a low surrogate must be preceded by a high surrogate (UTF-16) + // a high surrogate must be followed by a low surrogate and a low surrogate must be preceded by a high surrogate (UTF-16) OR + // there must be no surrogate at all (Latin1) OTHER // Not related to validation/transcoding. }; struct result { error_code error; - size_t count; // In case of error, indicates the position of the error. In case of success, indicates the number of words validated/written. + size_t count; // In case of error, indicates the position of the error. In case of success, indicates the number of code units validated/written. simdutf_really_inline result(); @@ -568,7 +560,6 @@ SIMDUTF_PUSH_DISABLE_WARNINGS SIMDUTF_DISABLE_UNDESIRED_WARNINGS // Public API -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/include, filename=simdutf/simdutf_version.h /* begin file include/simdutf/simdutf_version.h */ // /include/simdutf/simdutf_version.h automatically generated by release.py, // do not change by hand @@ -576,28 +567,27 @@ SIMDUTF_DISABLE_UNDESIRED_WARNINGS #define SIMDUTF_SIMDUTF_VERSION_H /** The version of simdutf being used (major.minor.revision) */ -#define SIMDUTF_VERSION "3.2.18" +#define SIMDUTF_VERSION "4.0.0" namespace simdutf { enum { /** * The major version (MAJOR.minor.revision) of simdutf being used. */ - SIMDUTF_VERSION_MAJOR = 3, + SIMDUTF_VERSION_MAJOR = 4, /** * The minor version (major.MINOR.revision) of simdutf being used. 
*/ - SIMDUTF_VERSION_MINOR = 2, + SIMDUTF_VERSION_MINOR = 0, /** * The revision (major.minor.REVISION) of simdutf being used. */ - SIMDUTF_VERSION_REVISION = 18 + SIMDUTF_VERSION_REVISION = 0 }; } // namespace simdutf #endif // SIMDUTF_SIMDUTF_VERSION_H /* end file include/simdutf/simdutf_version.h */ -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/include, filename=simdutf/implementation.h /* begin file include/simdutf/implementation.h */ #ifndef SIMDUTF_IMPLEMENTATION_H #define SIMDUTF_IMPLEMENTATION_H @@ -607,7 +597,6 @@ enum { #endif #include #include -// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/include, filename=simdutf/internal/isadetection.h /* begin file include/simdutf/internal/isadetection.h */ /* From https://github.com/endorno/pytorch/blob/master/torch/lib/TH/generic/simd/simd.h @@ -685,7 +674,8 @@ enum instruction_set { AVX512CD = 0x2000, AVX512BW = 0x4000, AVX512VL = 0x8000, - AVX512VBMI2 = 0x10000 + AVX512VBMI2 = 0x10000, + AVX512VPOPCNTDQ = 0x2000 }; #if defined(__PPC64__) @@ -840,6 +830,9 @@ static inline uint32_t detect_supported_architectures() { if (ecx & cpuid_bit::ecx::avx512vbmi2) { host_isa |= instruction_set::AVX512VBMI2; } + if (ecx & cpuid_bit::ecx::avx512vpopcnt) { + host_isa |= instruction_set::AVX512VPOPCNTDQ; + } return host_isa; } #else // fallback @@ -892,7 +885,6 @@ simdutf_really_inline simdutf_warn_unused int detect_encodings(const uint8_t * i return detect_encodings(reinterpret_cast(input), length); } - /** * Validate the UTF-8 string. This function may be best when you expect * the input to be almost always valid. Otherwise, consider using @@ -913,7 +905,7 @@ simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) noexcept; * * @param buf the UTF-8 string to validate. * @param len the length of the string in bytes. - * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful. 
+ * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of code units validated if successful. */ simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) noexcept; @@ -936,7 +928,7 @@ simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) noexcept; * * @param buf the ASCII string to validate. * @param len the length of the string in bytes. - * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful. + * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of code units validated if successful. */ simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) noexcept; @@ -950,7 +942,7 @@ simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t le * This function is not BOM-aware. * * @param buf the UTF-16 string to validate. - * @param len the length of the string in number of 2-byte words (char16_t). + * @param len the length of the string in number of 2-byte code units (char16_t). * @return true if and only if the string is valid UTF-16. */ simdutf_warn_unused bool validate_utf16(const char16_t *buf, size_t len) noexcept; @@ -965,7 +957,7 @@ simdutf_warn_unused bool validate_utf16(const char16_t *buf, size_t len) noexcep * This function is not BOM-aware. * * @param buf the UTF-16LE string to validate. - * @param len the length of the string in number of 2-byte words (char16_t). + * @param len the length of the string in number of 2-byte code units (char16_t). * @return true if and only if the string is valid UTF-16LE. 
*/ simdutf_warn_unused bool validate_utf16le(const char16_t *buf, size_t len) noexcept; @@ -980,7 +972,7 @@ simdutf_warn_unused bool validate_utf16le(const char16_t *buf, size_t len) noexc * This function is not BOM-aware. * * @param buf the UTF-16BE string to validate. - * @param len the length of the string in number of 2-byte words (char16_t). + * @param len the length of the string in number of 2-byte code units (char16_t). * @return true if and only if the string is valid UTF-16BE. */ simdutf_warn_unused bool validate_utf16be(const char16_t *buf, size_t len) noexcept; @@ -994,8 +986,8 @@ simdutf_warn_unused bool validate_utf16be(const char16_t *buf, size_t len) noexc * This function is not BOM-aware. * * @param buf the UTF-16 string to validate. - * @param len the length of the string in number of 2-byte words (char16_t). - * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful. + * @param len the length of the string in number of 2-byte code units (char16_t). + * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of code units validated if successful. */ simdutf_warn_unused result validate_utf16_with_errors(const char16_t *buf, size_t len) noexcept; @@ -1008,8 +1000,8 @@ simdutf_warn_unused result validate_utf16_with_errors(const char16_t *buf, size_ * This function is not BOM-aware. * * @param buf the UTF-16LE string to validate. - * @param len the length of the string in number of 2-byte words (char16_t). - * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful. + * @param len the length of the string in number of 2-byte code units (char16_t). 
+ * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of code units validated if successful. */ simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf, size_t len) noexcept; @@ -1022,8 +1014,8 @@ simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf, siz * This function is not BOM-aware. * * @param buf the UTF-16BE string to validate. - * @param len the length of the string in number of 2-byte words (char16_t). - * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful. + * @param len the length of the string in number of 2-byte code units (char16_t). + * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of code units validated if successful. */ simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, size_t len) noexcept; @@ -1037,7 +1029,7 @@ simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, siz * This function is not BOM-aware. * * @param buf the UTF-32 string to validate. - * @param len the length of the string in number of 4-byte words (char32_t). + * @param len the length of the string in number of 4-byte code units (char32_t). * @return true if and only if the string is valid UTF-32. */ simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) noexcept; @@ -1051,13 +1043,75 @@ simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) noexcep * This function is not BOM-aware. * * @param buf the UTF-32 string to validate. - * @param len the length of the string in number of 4-byte words (char32_t). 
- * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful. + * @param len the length of the string in number of 4-byte code units (char32_t). + * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of code units validated if successful. */ simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_t len) noexcept; + /** + * Convert Latin1 string into UTF-8 string. + * + * This function is suitable to work with inputs from untrusted sources. + * + * @param input the Latin1 string to convert + * @param length the length of the string in bytes + * @param utf8_output the pointer to buffer that can hold conversion result + * @return the number of written char; 0 if conversion is not possible + */ + simdutf_warn_unused size_t convert_latin1_to_utf8(const char * input, size_t length, char* utf8_output) noexcept; + + + /** + * Convert Latin1 string into UTF-16LE string. + * + * This function is suitable to work with inputs from untrusted sources. + * + * @param input the Latin1 string to convert + * @param length the length of the string in bytes + * @param utf16_buffer the pointer to buffer that can hold conversion result + * @return the number of written char16_t; 0 if conversion is not possible + */ + simdutf_warn_unused size_t convert_latin1_to_utf16le(const char * input, size_t length, char16_t* utf16_output) noexcept; + + /** + * Convert Latin1 string into UTF-16BE string. + * + * This function is suitable to work with inputs from untrusted sources. 
+ * + * @param input the Latin1 string to convert + * @param length the length of the string in bytes + * @param utf16_buffer the pointer to buffer that can hold conversion result + * @return the number of written char16_t; 0 if conversion is not possible + */ + simdutf_warn_unused size_t convert_latin1_to_utf16be(const char * input, size_t length, char16_t* utf16_output) noexcept; + + /** + * Convert Latin1 string into UTF-32 string. + * + * This function is suitable to work with inputs from untrusted sources. + * + * @param input the Latin1 string to convert + * @param length the length of the string in bytes + * @param utf32_buffer the pointer to buffer that can hold conversion result + * @return the number of written char32_t; 0 if conversion is not possible + */ + simdutf_warn_unused size_t convert_latin1_to_utf32(const char * input, size_t length, char32_t* utf32_buffer) noexcept; + + /** + * Convert possibly broken UTF-8 string into latin1 string. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * + * @param input the UTF-8 string to convert + * @param length the length of the string in bytes + * @param latin1_output the pointer to buffer that can hold conversion result + * @return the number of written char; 0 if the input was not valid UTF-8 string + */ + simdutf_warn_unused size_t convert_utf8_to_latin1(const char * input, size_t length, char* latin1_output) noexcept; + /** - * Using native endianness; Convert possibly broken UTF-8 string into UTF-16 string. + * Using native endianness, convert possibly broken UTF-8 string into a UTF-16 string. * * During the conversion also validation of the input string is done. * This function is suitable to work with inputs from untrusted sources. 
@@ -1069,6 +1123,17 @@ simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_ */ simdutf_warn_unused size_t convert_utf8_to_utf16(const char * input, size_t length, char16_t* utf16_output) noexcept; + +/** + * Using native endianness, convert a Latin1 string into a UTF-16 string. + * + * @param input the Latin1 string to convert + * @param length the length of the string in bytes + * @param utf16_buffer the pointer to buffer that can hold conversion result + * @return the number of written char16_t. + */ +simdutf_warn_unused size_t convert_latin1_to_utf16(const char * input, size_t length, char16_t* utf16_output) noexcept; + /** * Convert possibly broken UTF-8 string into UTF-16LE string. * @@ -1095,8 +1160,22 @@ simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * input, size_t le */ simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * input, size_t length, char16_t* utf16_output) noexcept; + + /** + * Convert possibly broken UTF-8 string into latin1 string, with error reporting. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * + * @param input the UTF-8 string to convert + * @param length the length of the string in bytes + * @param latin1_output the pointer to buffer that can hold conversion result + * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of code units validated if successful. + */ + simdutf_warn_unused result convert_utf8_to_latin1_with_errors(const char * input, size_t length, char* latin1_output) noexcept; + /** - * Using native endianness; Convert possibly broken UTF-8 string into UTF-16 + * Using native endianness, convert possibly broken UTF-8 string into UTF-16 * string and stop on error. * * During the conversion also validation of the input string is done. 
@@ -1105,7 +1184,7 @@ simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * input, size_t le * @param input the UTF-8 string to convert * @param length the length of the string in bytes * @param utf16_buffer the pointer to buffer that can hold conversion result - * @return a result pair struct with an error code and either the position of the error if any or the number of char16_t written if successful. + * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char16_t written if successful. */ simdutf_warn_unused result convert_utf8_to_utf16_with_errors(const char * input, size_t length, char16_t* utf16_output) noexcept; @@ -1118,7 +1197,7 @@ simdutf_warn_unused result convert_utf8_to_utf16_with_errors(const char * input, * @param input the UTF-8 string to convert * @param length the length of the string in bytes * @param utf16_buffer the pointer to buffer that can hold conversion result - * @return a result pair struct with an error code and either the position of the error if any or the number of char16_t written if successful. + * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char16_t written if successful. */ simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * input, size_t length, char16_t* utf16_output) noexcept; @@ -1131,7 +1210,7 @@ simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * inpu * @param input the UTF-8 string to convert * @param length the length of the string in bytes * @param utf16_buffer the pointer to buffer that can hold conversion result - * @return a result pair struct with an error code and either the position of the error if any or the number of char16_t written if successful. 
+ * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char16_t written if successful. */ simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * input, size_t length, char16_t* utf16_output) noexcept; @@ -1157,12 +1236,27 @@ simdutf_warn_unused size_t convert_utf8_to_utf32(const char * input, size_t leng * @param input the UTF-8 string to convert * @param length the length of the string in bytes * @param utf32_buffer the pointer to buffer that can hold conversion result - * @return a result pair struct with an error code and either the position of the error if any or the number of char32_t written if successful. + * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char32_t written if successful. */ simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * input, size_t length, char32_t* utf32_output) noexcept; + /** + * Convert valid UTF-8 string into latin1 string. + * + * This function assumes that the input string is valid UTF-8. + * + * This function is not BOM-aware. + * + * @param input the UTF-8 string to convert + * @param length the length of the string in bytes + * @param latin1_output the pointer to buffer that can hold conversion result + * @return the number of written char; 0 if the input was not valid UTF-8 string + */ + simdutf_warn_unused size_t convert_valid_utf8_to_latin1(const char * input, size_t length, char* latin1_output) noexcept; + + /** - * Using native endianness; Convert valid UTF-8 string into UTF-16 string. + * Using native endianness, convert valid UTF-8 string into a UTF-16 string. * * This function assumes that the input string is valid UTF-8. 
* @@ -1209,8 +1303,31 @@ simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * input, siz */ simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * input, size_t length, char32_t* utf32_buffer) noexcept; + +/** + * Return the number of bytes that this Latin1 string would require in UTF-8 format. + * + * @param input the Latin1 string to convert + * @param length the length of the string in bytes + * @return the number of bytes required to encode the Latin1 string as UTF-8 + */ +simdutf_warn_unused size_t utf8_length_from_latin1(const char * input, size_t length) noexcept; + /** - * Compute the number of 2-byte words that this UTF-8 string would require in UTF-16LE format. + * Compute the number of bytes that this UTF-8 string would require in Latin1 format. + * + * This function does not validate the input. + * + * This function is not BOM-aware. + * + * @param input the UTF-8 string to convert + * @param length the length of the string in bytes + * @return the number of bytes required to encode the UTF-8 string as Latin1 + */ +simdutf_warn_unused size_t latin1_length_from_utf8(const char * input, size_t length) noexcept; + +/** + * Compute the number of 2-byte code units that this UTF-8 string would require in UTF-16LE format. * * This function does not validate the input. * @@ -1218,12 +1335,12 @@ simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * input, size_ * * @param input the UTF-8 string to process * @param length the length of the string in bytes - * @return the number of char16_t words required to encode the UTF-8 string as UTF-16LE + * @return the number of char16_t code units required to encode the UTF-8 string as UTF-16LE */ simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) noexcept; /** - * Compute the number of 4-byte words that this UTF-8 string would require in UTF-32 format. + * Compute the number of 4-byte code units that this UTF-8 string would require in UTF-32 format. 
* * This function is equivalent to count_utf8 * @@ -1233,12 +1350,12 @@ simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t len * * @param input the UTF-8 string to process * @param length the length of the string in bytes - * @return the number of char32_t words required to encode the UTF-8 string as UTF-32 + * @return the number of char32_t code units required to encode the UTF-8 string as UTF-32 */ simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) noexcept; /** - * Using native endianness; Convert possibly broken UTF-16 string into UTF-8 string. + * Using native endianness, convert possibly broken UTF-16 string into UTF-8 string. * * During the conversion also validation of the input string is done. * This function is suitable to work with inputs from untrusted sources. @@ -1246,12 +1363,60 @@ simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t len * This function is not BOM-aware. * * @param input the UTF-16 string to convert - * @param length the length of the string in 2-byte words (char16_t) + * @param length the length of the string in 2-byte code units (char16_t) * @param utf8_buffer the pointer to buffer that can hold conversion result - * @return number of written words; 0 if input is not a valid UTF-16LE string + * @return number of written code units; 0 if input is not a valid UTF-16LE string */ simdutf_warn_unused size_t convert_utf16_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) noexcept; + + +/** + * Using native endianness, convert possibly broken UTF-16 string into Latin1 string. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * + * This function is not BOM-aware. 
+ * + * @param input the UTF-16 string to convert + * @param length the length of the string in 2-byte code units (char16_t) + * @param latin1_buffer the pointer to buffer that can hold conversion result + * @return number of written code units; 0 if input is not a valid UTF-16LE string + */ +simdutf_warn_unused size_t convert_utf16_to_latin1(const char16_t * input, size_t length, char* latin1_buffer) noexcept; + +/** + * Convert possibly broken UTF-16LE string into Latin1 string. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * + * This function is not BOM-aware. + * + * @param input the UTF-16LE string to convert + * @param length the length of the string in 2-byte code units (char16_t) + * @param latin1_buffer the pointer to buffer that can hold conversion result + * @return number of written code units; 0 if input is not a valid UTF-16LE string + */ +simdutf_warn_unused size_t convert_utf16le_to_latin1(const char16_t * input, size_t length, char* latin1_buffer) noexcept; + +/** + * Convert possibly broken UTF-16BE string into Latin1 string. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * + * This function is not BOM-aware. + * + * @param input the UTF-16BE string to convert + * @param length the length of the string in 2-byte code units (char16_t) + * @param latin1_buffer the pointer to buffer that can hold conversion result + * @return number of written code units; 0 if input is not a valid UTF-16BE string + */ +simdutf_warn_unused size_t convert_utf16be_to_latin1(const char16_t * input, size_t length, char* latin1_buffer) noexcept; + + /** * Convert possibly broken UTF-16LE string into UTF-8 string. 
* @@ -1261,9 +1426,9 @@ simdutf_warn_unused size_t convert_utf16_to_utf8(const char16_t * input, size_t * This function is not BOM-aware. * * @param input the UTF-16LE string to convert - * @param length the length of the string in 2-byte words (char16_t) + * @param length the length of the string in 2-byte code units (char16_t) * @param utf8_buffer the pointer to buffer that can hold conversion result - * @return number of written words; 0 if input is not a valid UTF-16LE string + * @return number of written code units; 0 if input is not a valid UTF-16LE string */ simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) noexcept; @@ -1276,14 +1441,57 @@ simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * input, size_ * This function is not BOM-aware. * * @param input the UTF-16BE string to convert - * @param length the length of the string in 2-byte words (char16_t) + * @param length the length of the string in 2-byte code units (char16_t) * @param utf8_buffer the pointer to buffer that can hold conversion result - * @return number of written words; 0 if input is not a valid UTF-16LE string + * @return number of written code units; 0 if input is not a valid UTF-16LE string */ simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) noexcept; /** - * Using native endianness; Convert possibly broken UTF-16 string into UTF-8 string and stop on error. + * Using native endianness, convert possibly broken UTF-16 string into Latin1 string. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * This function is not BOM-aware. 
+ * + * @param input the UTF-16 string to convert + * @param length the length of the string in 2-byte code units (char16_t) + * @param latin1_buffer the pointer to buffer that can hold conversion result + * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char written if successful. + */ +simdutf_warn_unused result convert_utf16_to_latin1_with_errors(const char16_t * input, size_t length, char* latin1_buffer) noexcept; + +/** + * Convert possibly broken UTF-16LE string into Latin1 string. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * This function is not BOM-aware. + * + * @param input the UTF-16LE string to convert + * @param length the length of the string in 2-byte code units (char16_t) + * @param latin1_buffer the pointer to buffer that can hold conversion result + * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char written if successful. + */ +simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(const char16_t * input, size_t length, char* latin1_buffer) noexcept; + +/** + * Convert possibly broken UTF-16BE string into Latin1 string. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * This function is not BOM-aware. 
+ * + * @param input the UTF-16BE string to convert + * @param length the length of the string in 2-byte code units (char16_t) + * @param latin1_buffer the pointer to buffer that can hold conversion result + * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char written if successful. + */ +simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(const char16_t * input, size_t length, char* latin1_buffer) noexcept; + + +/** + * Using native endianness, convert possibly broken UTF-16 string into UTF-8 string and stop on error. * * During the conversion also validation of the input string is done. * This function is suitable to work with inputs from untrusted sources. @@ -1291,9 +1499,9 @@ simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * input, size_ * This function is not BOM-aware. * * @param input the UTF-16 string to convert - * @param length the length of the string in 2-byte words (char16_t) + * @param length the length of the string in 2-byte code units (char16_t) * @param utf8_buffer the pointer to buffer that can hold conversion result - * @return a result pair struct with an error code and either the position of the error if any or the number of char written if successful. + * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char written if successful. */ simdutf_warn_unused result convert_utf16_to_utf8_with_errors(const char16_t * input, size_t length, char* utf8_buffer) noexcept; @@ -1306,9 +1514,9 @@ simdutf_warn_unused result convert_utf16_to_utf8_with_errors(const char16_t * in * This function is not BOM-aware. 
* * @param input the UTF-16LE string to convert - * @param length the length of the string in 2-byte words (char16_t) + * @param length the length of the string in 2-byte code units (char16_t) * @param utf8_buffer the pointer to buffer that can hold conversion result - * @return a result pair struct with an error code and either the position of the error if any or the number of char written if successful. + * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char written if successful. */ simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * input, size_t length, char* utf8_buffer) noexcept; @@ -1321,26 +1529,70 @@ simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * * This function is not BOM-aware. * * @param input the UTF-16BE string to convert - * @param length the length of the string in 2-byte words (char16_t) + * @param length the length of the string in 2-byte code units (char16_t) * @param utf8_buffer the pointer to buffer that can hold conversion result - * @return a result pair struct with an error code and either the position of the error if any or the number of char written if successful. + * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char written if successful. */ simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * input, size_t length, char* utf8_buffer) noexcept; /** - * Using native endianness; Convert valid UTF-16 string into UTF-8 string. + * Using native endianness, convert valid UTF-16 string into UTF-8 string. * * This function assumes that the input string is valid UTF-16LE. * * This function is not BOM-aware. 
* * @param input the UTF-16 string to convert - * @param length the length of the string in 2-byte words (char16_t) + * @param length the length of the string in 2-byte code units (char16_t) * @param utf8_buffer the pointer to buffer that can hold the conversion result - * @return number of written words; 0 if conversion is not possible + * @return number of written code units; 0 if conversion is not possible */ simdutf_warn_unused size_t convert_valid_utf16_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) noexcept; + +/** + * Using native endianness, convert valid UTF-16 string into Latin1 string. + * + * This function assumes that the input string is valid UTF-16. + * + * This function is not BOM-aware. + * + * @param input the UTF-16 string to convert + * @param length the length of the string in 2-byte code units (char16_t) + * @param latin1_buffer the pointer to buffer that can hold conversion result + * @return number of written code units; 0 if conversion is not possible + */ +simdutf_warn_unused size_t convert_valid_utf16_to_latin1(const char16_t * input, size_t length, char* latin1_buffer) noexcept; + +/** + * Convert valid UTF-16LE string into Latin1 string. + * + * This function assumes that the input string is valid UTF-16LE. + * + * This function is not BOM-aware. + * + * @param input the UTF-16LE string to convert + * @param length the length of the string in 2-byte code units (char16_t) + * @param latin1_buffer the pointer to buffer that can hold conversion result + * @return number of written code units; 0 if conversion is not possible + */ +simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(const char16_t * input, size_t length, char* latin1_buffer) noexcept; + +/** + * Convert valid UTF-16BE string into Latin1 string. + * + * This function assumes that the input string is valid UTF-16BE. + * + * This function is not BOM-aware. 
+ * + * @param input the UTF-16BE string to convert + * @param length the length of the string in 2-byte code units (char16_t) + * @param latin1_buffer the pointer to buffer that can hold conversion result + * @return number of written code units; 0 if conversion is not possible + */ +simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(const char16_t * input, size_t length, char* latin1_buffer) noexcept; + + /** * Convert valid UTF-16LE string into UTF-8 string. * @@ -1349,9 +1601,9 @@ simdutf_warn_unused size_t convert_valid_utf16_to_utf8(const char16_t * input, s * This function is not BOM-aware. * * @param input the UTF-16LE string to convert - * @param length the length of the string in 2-byte words (char16_t) + * @param length the length of the string in 2-byte code units (char16_t) * @param utf8_buffer the pointer to buffer that can hold the conversion result - * @return number of written words; 0 if conversion is not possible + * @return number of written code units; 0 if conversion is not possible */ simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) noexcept; @@ -1363,14 +1615,14 @@ simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * input, * This function is not BOM-aware. * * @param input the UTF-16BE string to convert - * @param length the length of the string in 2-byte words (char16_t) + * @param length the length of the string in 2-byte code units (char16_t) * @param utf8_buffer the pointer to buffer that can hold the conversion result - * @return number of written words; 0 if conversion is not possible + * @return number of written code units; 0 if conversion is not possible */ simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) noexcept; /** - * Using native endianness; Convert possibly broken UTF-16 string into UTF-32 string. 
+ * Using native endianness, convert possibly broken UTF-16 string into UTF-32 string. * * During the conversion also validation of the input string is done. * This function is suitable to work with inputs from untrusted sources. @@ -1378,9 +1630,9 @@ simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * input, * This function is not BOM-aware. * * @param input the UTF-16 string to convert - * @param length the length of the string in 2-byte words (char16_t) + * @param length the length of the string in 2-byte code units (char16_t) * @param utf32_buffer the pointer to buffer that can hold conversion result - * @return number of written words; 0 if input is not a valid UTF-16LE string + * @return number of written code units; 0 if input is not a valid UTF-16LE string */ simdutf_warn_unused size_t convert_utf16_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept; @@ -1393,9 +1645,9 @@ simdutf_warn_unused size_t convert_utf16_to_utf32(const char16_t * input, size_t * This function is not BOM-aware. * * @param input the UTF-16LE string to convert - * @param length the length of the string in 2-byte words (char16_t) + * @param length the length of the string in 2-byte code units (char16_t) * @param utf32_buffer the pointer to buffer that can hold conversion result - * @return number of written words; 0 if input is not a valid UTF-16LE string + * @return number of written code units; 0 if input is not a valid UTF-16LE string */ simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept; @@ -1408,14 +1660,14 @@ simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * input, size * This function is not BOM-aware. 
* * @param input the UTF-16BE string to convert - * @param length the length of the string in 2-byte words (char16_t) + * @param length the length of the string in 2-byte code units (char16_t) * @param utf32_buffer the pointer to buffer that can hold conversion result - * @return number of written words; 0 if input is not a valid UTF-16LE string + * @return number of written code units; 0 if input is not a valid UTF-16LE string */ simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept; /** - * Using native endianness; Convert possibly broken UTF-16 string into + * Using native endianness, convert possibly broken UTF-16 string into * UTF-32 string and stop on error. * * During the conversion also validation of the input string is done. @@ -1424,9 +1676,9 @@ simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * input, size * This function is not BOM-aware. * * @param input the UTF-16 string to convert - * @param length the length of the string in 2-byte words (char16_t) + * @param length the length of the string in 2-byte code units (char16_t) * @param utf32_buffer the pointer to buffer that can hold conversion result - * @return a result pair struct with an error code and either the position of the error if any or the number of char32_t written if successful. + * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char32_t written if successful. */ simdutf_warn_unused result convert_utf16_to_utf32_with_errors(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept; @@ -1439,9 +1691,9 @@ simdutf_warn_unused result convert_utf16_to_utf32_with_errors(const char16_t * i * This function is not BOM-aware. 
* * @param input the UTF-16LE string to convert - * @param length the length of the string in 2-byte words (char16_t) + * @param length the length of the string in 2-byte code units (char16_t) * @param utf32_buffer the pointer to buffer that can hold conversion result - * @return a result pair struct with an error code and either the position of the error if any or the number of char32_t written if successful. + * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char32_t written if successful. */ simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept; @@ -1454,23 +1706,23 @@ simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * * This function is not BOM-aware. * * @param input the UTF-16BE string to convert - * @param length the length of the string in 2-byte words (char16_t) + * @param length the length of the string in 2-byte code units (char16_t) * @param utf32_buffer the pointer to buffer that can hold conversion result - * @return a result pair struct with an error code and either the position of the error if any or the number of char32_t written if successful. + * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char32_t written if successful. */ simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept; /** - * Using native endianness; Convert valid UTF-16 string into UTF-32 string. + * Using native endianness, convert valid UTF-16 string into UTF-32 string. * * This function assumes that the input string is valid UTF-16 (native endianness). 
* * This function is not BOM-aware. * * @param input the UTF-16 string to convert - * @param length the length of the string in 2-byte words (char16_t) + * @param length the length of the string in 2-byte code units (char16_t) * @param utf32_buffer the pointer to buffer that can hold the conversion result - * @return number of written words; 0 if conversion is not possible + * @return number of written code units; 0 if conversion is not possible */ simdutf_warn_unused size_t convert_valid_utf16_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept; @@ -1482,9 +1734,9 @@ simdutf_warn_unused size_t convert_valid_utf16_to_utf32(const char16_t * input, * This function is not BOM-aware. * * @param input the UTF-16LE string to convert - * @param length the length of the string in 2-byte words (char16_t) + * @param length the length of the string in 2-byte code units (char16_t) * @param utf32_buffer the pointer to buffer that can hold the conversion result - * @return number of written words; 0 if conversion is not possible + * @return number of written code units; 0 if conversion is not possible */ simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept; @@ -1496,12 +1748,26 @@ simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * input * This function is not BOM-aware. 
* * @param input the UTF-16BE string to convert - * @param length the length of the string in 2-byte words (char16_t) + * @param length the length of the string in 2-byte code units (char16_t) * @param utf32_buffer the pointer to buffer that can hold the conversion result - * @return number of written words; 0 if conversion is not possible + * @return number of written code units; 0 if conversion is not possible */ simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept; + +/* + * Compute the number of bytes that this UTF-16LE/BE string would require in Latin1 format. + * + * This function does not validate the input. + * + * This function is not BOM-aware. + * + * @param length the length of the string in 2-byte code units (char16_t) + * @return the number of bytes required to encode the UTF-16LE string as Latin1 + */ +simdutf_warn_unused size_t latin1_length_from_utf16(size_t length) noexcept; + + /** * Using native endianness; Compute the number of bytes that this UTF-16 * string would require in UTF-8 format. @@ -1509,7 +1775,7 @@ simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * input * This function does not validate the input. * * @param input the UTF-16 string to convert - * @param length the length of the string in 2-byte words (char16_t) + * @param length the length of the string in 2-byte code units (char16_t) * @return the number of bytes required to encode the UTF-16LE string as UTF-8 */ simdutf_warn_unused size_t utf8_length_from_utf16(const char16_t * input, size_t length) noexcept; @@ -1520,7 +1786,7 @@ simdutf_warn_unused size_t utf8_length_from_utf16(const char16_t * input, size_t * This function does not validate the input. 
* * @param input the UTF-16LE string to convert - * @param length the length of the string in 2-byte words (char16_t) + * @param length the length of the string in 2-byte code units (char16_t) * @return the number of bytes required to encode the UTF-16LE string as UTF-8 */ simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) noexcept; @@ -1531,7 +1797,7 @@ simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size * This function does not validate the input. * * @param input the UTF-16BE string to convert - * @param length the length of the string in 2-byte words (char16_t) + * @param length the length of the string in 2-byte code units (char16_t) * @return the number of bytes required to encode the UTF-16BE string as UTF-8 */ simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) noexcept; @@ -1545,9 +1811,9 @@ simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size * This function is not BOM-aware. * * @param input the UTF-32 string to convert - * @param length the length of the string in 4-byte words (char32_t) + * @param length the length of the string in 4-byte code units (char32_t) * @param utf8_buffer the pointer to buffer that can hold conversion result - * @return number of written words; 0 if input is not a valid UTF-32 string + * @return number of written code units; 0 if input is not a valid UTF-32 string */ simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * input, size_t length, char* utf8_buffer) noexcept; @@ -1560,9 +1826,9 @@ simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * input, size_t * This function is not BOM-aware. 
* * @param input the UTF-32 string to convert - * @param length the length of the string in 4-byte words (char32_t) + * @param length the length of the string in 4-byte code units (char32_t) * @param utf8_buffer the pointer to buffer that can hold conversion result - * @return a result pair struct with an error code and either the position of the error if any or the number of char written if successful. + * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char written if successful. */ simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * input, size_t length, char* utf8_buffer) noexcept; @@ -1574,14 +1840,14 @@ simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * in * This function is not BOM-aware. * * @param input the UTF-32 string to convert - * @param length the length of the string in 4-byte words (char32_t) + * @param length the length of the string in 4-byte code units (char32_t) * @param utf8_buffer the pointer to buffer that can hold the conversion result - * @return number of written words; 0 if conversion is not possible + * @return number of written code units; 0 if conversion is not possible */ simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * input, size_t length, char* utf8_buffer) noexcept; /** - * Using native endianness; Convert possibly broken UTF-32 string into UTF-16 string. + * Using native endianness, convert possibly broken UTF-32 string into a UTF-16 string. * * During the conversion also validation of the input string is done. * This function is suitable to work with inputs from untrusted sources. @@ -1589,9 +1855,9 @@ simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * input, s * This function is not BOM-aware. 
* * @param input the UTF-32 string to convert - * @param length the length of the string in 4-byte words (char32_t) + * @param length the length of the string in 4-byte code units (char32_t) * @param utf16_buffer the pointer to buffer that can hold conversion result - * @return number of written words; 0 if input is not a valid UTF-32 string + * @return number of written code units; 0 if input is not a valid UTF-32 string */ simdutf_warn_unused size_t convert_utf32_to_utf16(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept; @@ -1604,12 +1870,57 @@ simdutf_warn_unused size_t convert_utf32_to_utf16(const char32_t * input, size_t * This function is not BOM-aware. * * @param input the UTF-32 string to convert - * @param length the length of the string in 4-byte words (char32_t) + * @param length the length of the string in 4-byte code units (char32_t) * @param utf16_buffer the pointer to buffer that can hold conversion result - * @return number of written words; 0 if input is not a valid UTF-32 string + * @return number of written code units; 0 if input is not a valid UTF-32 string */ simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept; +/** + * Convert possibly broken UTF-32 string into Latin1 string. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * + * This function is not BOM-aware. 
+ * + * @param input the UTF-32 string to convert + * @param length the length of the string in 4-byte code units (char32_t) + * @param latin1_buffer the pointer to buffer that can hold conversion result + * @return number of written code units; 0 if input is not a valid UTF-32 string + */ +simdutf_warn_unused size_t convert_utf32_to_latin1(const char32_t * input, size_t length, char* latin1_buffer) noexcept; + + +/** + * Convert possibly broken UTF-32 string into Latin1 string and stop on error. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * + * This function is not BOM-aware. + * + * @param input the UTF-32 string to convert + * @param length the length of the string in 4-byte code units (char32_t) + * @param latin1_buffer the pointer to buffer that can hold conversion result + * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char written if successful. + */ +simdutf_warn_unused result convert_utf32_to_latin1_with_errors(const char32_t * input, size_t length, char* latin1_buffer) noexcept; + +/** + * Convert valid UTF-32 string into Latin1 string. + * + * This function assumes that the input string is valid UTF-32. + * + * This function is not BOM-aware. + * + * @param input the UTF-32 string to convert + * @param length the length of the string in 4-byte code units (char32_t) + * @param latin1_buffer the pointer to buffer that can hold the conversion result + * @return number of written code units; 0 if conversion is not possible + */ +simdutf_warn_unused size_t convert_valid_utf32_to_latin1(const char32_t * input, size_t length, char* latin1_buffer) noexcept; + /** * Convert possibly broken UTF-32 string into UTF-16BE string. 
* @@ -1619,14 +1930,14 @@ simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * input, size * This function is not BOM-aware. * * @param input the UTF-32 string to convert - * @param length the length of the string in 4-byte words (char32_t) + * @param length the length of the string in 4-byte code units (char32_t) * @param utf16_buffer the pointer to buffer that can hold conversion result - * @return number of written words; 0 if input is not a valid UTF-32 string + * @return number of written code units; 0 if input is not a valid UTF-32 string */ simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept; /** - * Using native endianness; Convert possibly broken UTF-32 string into UTF-16 + * Using native endianness, convert possibly broken UTF-32 string into UTF-16 * string and stop on error. * * During the conversion also validation of the input string is done. @@ -1635,9 +1946,9 @@ simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * input, size * This function is not BOM-aware. * * @param input the UTF-32 string to convert - * @param length the length of the string in 4-byte words (char32_t) + * @param length the length of the string in 4-byte code units (char32_t) * @param utf16_buffer the pointer to buffer that can hold conversion result - * @return a result pair struct with an error code and either the position of the error if any or the number of char16_t written if successful. + * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char16_t written if successful. 
*/ simdutf_warn_unused result convert_utf32_to_utf16_with_errors(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept; @@ -1650,9 +1961,9 @@ simdutf_warn_unused result convert_utf32_to_utf16_with_errors(const char32_t * i * This function is not BOM-aware. * * @param input the UTF-32 string to convert - * @param length the length of the string in 4-byte words (char32_t) + * @param length the length of the string in 4-byte code units (char32_t) * @param utf16_buffer the pointer to buffer that can hold conversion result - * @return a result pair struct with an error code and either the position of the error if any or the number of char16_t written if successful. + * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char16_t written if successful. */ simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept; @@ -1665,23 +1976,23 @@ simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * * This function is not BOM-aware. * * @param input the UTF-32 string to convert - * @param length the length of the string in 4-byte words (char32_t) + * @param length the length of the string in 4-byte code units (char32_t) * @param utf16_buffer the pointer to buffer that can hold conversion result - * @return a result pair struct with an error code and either the position of the error if any or the number of char16_t written if successful. + * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char16_t written if successful. 
*/ simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept; /** - * Using native endianness; Convert valid UTF-32 string into UTF-16 string. + * Using native endianness, convert valid UTF-32 string into a UTF-16 string. * * This function assumes that the input string is valid UTF-32. * * This function is not BOM-aware. * * @param input the UTF-32 string to convert - * @param length the length of the string in 4-byte words (char32_t) + * @param length the length of the string in 4-byte code units (char32_t) * @param utf16_buffer the pointer to buffer that can hold the conversion result - * @return number of written words; 0 if conversion is not possible + * @return number of written code units; 0 if conversion is not possible */ simdutf_warn_unused size_t convert_valid_utf32_to_utf16(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept; @@ -1693,9 +2004,9 @@ simdutf_warn_unused size_t convert_valid_utf32_to_utf16(const char32_t * input, * This function is not BOM-aware. * * @param input the UTF-32 string to convert - * @param length the length of the string in 4-byte words (char32_t) + * @param length the length of the string in 4-byte code units (char32_t) * @param utf16_buffer the pointer to buffer that can hold the conversion result - * @return number of written words; 0 if conversion is not possible + * @return number of written code units; 0 if conversion is not possible */ simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept; @@ -1707,9 +2018,9 @@ simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * input * This function is not BOM-aware. 
* * @param input the UTF-32 string to convert - * @param length the length of the string in 4-byte words (char32_t) + * @param length the length of the string in 4-byte code units (char32_t) * @param utf16_buffer the pointer to buffer that can hold the conversion result - * @return number of written words; 0 if conversion is not possible + * @return number of written code units; 0 if conversion is not possible */ simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept; @@ -1722,7 +2033,7 @@ simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * input * This function is not BOM-aware. * * @param input the UTF-16 string to process - * @param length the length of the string in 2-byte words (char16_t) + * @param length the length of the string in 2-byte code units (char16_t) * @param output the pointer to buffer that can hold the conversion result */ void change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) noexcept; @@ -1733,18 +2044,18 @@ void change_endianness_utf16(const char16_t * input, size_t length, char16_t * o * This function does not validate the input. * * @param input the UTF-32 string to convert - * @param length the length of the string in 4-byte words (char32_t) + * @param length the length of the string in 4-byte code units (char32_t) * @return the number of bytes required to encode the UTF-32 string as UTF-8 */ simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) noexcept; /** - * Compute the number of two-byte words that this UTF-32 string would require in UTF-16 format. + * Compute the number of two-byte code units that this UTF-32 string would require in UTF-16 format. * * This function does not validate the input. 
* * @param input the UTF-32 string to convert - * @param length the length of the string in 4-byte words (char32_t) + * @param length the length of the string in 4-byte code units (char32_t) * @return the number of bytes required to encode the UTF-32 string as UTF-16 */ simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) noexcept; @@ -1760,7 +2071,7 @@ simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_ * This function is not BOM-aware. * * @param input the UTF-16 string to convert - * @param length the length of the string in 2-byte words (char16_t) + * @param length the length of the string in 2-byte code units (char16_t) * @return the number of bytes required to encode the UTF-16LE string as UTF-32 */ simdutf_warn_unused size_t utf32_length_from_utf16(const char16_t * input, size_t length) noexcept; @@ -1775,7 +2086,7 @@ simdutf_warn_unused size_t utf32_length_from_utf16(const char16_t * input, size_ * This function is not BOM-aware. * * @param input the UTF-16LE string to convert - * @param length the length of the string in 2-byte words (char16_t) + * @param length the length of the string in 2-byte code units (char16_t) * @return the number of bytes required to encode the UTF-16LE string as UTF-32 */ simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) noexcept; @@ -1790,7 +2101,7 @@ simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, siz * This function is not BOM-aware. 
* * @param input the UTF-16BE string to convert - * @param length the length of the string in 2-byte words (char16_t) + * @param length the length of the string in 2-byte code units (char16_t) * @return the number of bytes required to encode the UTF-16BE string as UTF-32 */ simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) noexcept; @@ -1804,7 +2115,7 @@ simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, siz * This function is not BOM-aware. * * @param input the UTF-16 string to process - * @param length the length of the string in 2-byte words (char16_t) + * @param length the length of the string in 2-byte code units (char16_t) * @return number of code points */ simdutf_warn_unused size_t count_utf16(const char16_t * input, size_t length) noexcept; @@ -1818,7 +2129,7 @@ simdutf_warn_unused size_t count_utf16(const char16_t * input, size_t length) no * This function is not BOM-aware. * * @param input the UTF-16LE string to process - * @param length the length of the string in 2-byte words (char16_t) + * @param length the length of the string in 2-byte code units (char16_t) * @return number of code points */ simdutf_warn_unused size_t count_utf16le(const char16_t * input, size_t length) noexcept; @@ -1832,7 +2143,7 @@ simdutf_warn_unused size_t count_utf16le(const char16_t * input, size_t length) * This function is not BOM-aware. * * @param input the UTF-16BE string to process - * @param length the length of the string in 2-byte words (char16_t) + * @param length the length of the string in 2-byte code units (char16_t) * @return number of code points */ simdutf_warn_unused size_t count_utf16be(const char16_t * input, size_t length) noexcept; @@ -1932,7 +2243,7 @@ class implementation { * * @param buf the UTF-8 string to validate. * @param len the length of the string in bytes. 
- * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful. + * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of code units validated if successful. */ simdutf_warn_unused virtual result validate_utf8_with_errors(const char *buf, size_t len) const noexcept = 0; @@ -1954,7 +2265,7 @@ class implementation { * * @param buf the ASCII string to validate. * @param len the length of the string in bytes. - * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful. + * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of code units validated if successful. */ simdutf_warn_unused virtual result validate_ascii_with_errors(const char *buf, size_t len) const noexcept = 0; @@ -1968,7 +2279,7 @@ class implementation { * This function is not BOM-aware. * * @param buf the UTF-16LE string to validate. - * @param len the length of the string in number of 2-byte words (char16_t). + * @param len the length of the string in number of 2-byte code units (char16_t). * @return true if and only if the string is valid UTF-16LE. */ simdutf_warn_unused virtual bool validate_utf16le(const char16_t *buf, size_t len) const noexcept = 0; @@ -1983,7 +2294,7 @@ class implementation { * This function is not BOM-aware. * * @param buf the UTF-16BE string to validate. - * @param len the length of the string in number of 2-byte words (char16_t). + * @param len the length of the string in number of 2-byte code units (char16_t). * @return true if and only if the string is valid UTF-16BE. 
*/ simdutf_warn_unused virtual bool validate_utf16be(const char16_t *buf, size_t len) const noexcept = 0; @@ -1997,8 +2308,8 @@ class implementation { * This function is not BOM-aware. * * @param buf the UTF-16LE string to validate. - * @param len the length of the string in number of 2-byte words (char16_t). - * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful. + * @param len the length of the string in number of 2-byte code units (char16_t). + * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of code units validated if successful. */ simdutf_warn_unused virtual result validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept = 0; @@ -2011,8 +2322,8 @@ class implementation { * This function is not BOM-aware. * * @param buf the UTF-16BE string to validate. - * @param len the length of the string in number of 2-byte words (char16_t). - * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful. + * @param len the length of the string in number of 2-byte code units (char16_t). + * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of code units validated if successful. */ simdutf_warn_unused virtual result validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept = 0; @@ -2024,7 +2335,7 @@ class implementation { * This function is not BOM-aware. * * @param buf the UTF-32 string to validate. - * @param len the length of the string in number of 4-byte words (char32_t). + * @param len the length of the string in number of 4-byte code units (char32_t). 
* @return true if and only if the string is valid UTF-32. */ simdutf_warn_unused virtual bool validate_utf32(const char32_t *buf, size_t len) const noexcept = 0; @@ -2037,11 +2348,101 @@ class implementation { * This function is not BOM-aware. * * @param buf the UTF-32 string to validate. - * @param len the length of the string in number of 4-byte words (char32_t). - * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful. + * @param len the length of the string in number of 4-byte code units (char32_t). + * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of code units validated if successful. */ simdutf_warn_unused virtual result validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept = 0; + /** + * Convert Latin1 string into UTF-8 string. + * + * This function is suitable to work with inputs from untrusted sources. + * + * @param input the Latin1 string to convert + * @param length the length of the string in bytes + * @param utf8_output the pointer to buffer that can hold conversion result + * @return the number of written char; 0 if conversion is not possible + */ + simdutf_warn_unused virtual size_t convert_latin1_to_utf8(const char * input, size_t length, char* utf8_output) const noexcept = 0; + + + /** + * Convert Latin1 string into UTF-16LE string. + * + * This function is suitable to work with inputs from untrusted sources.
+ * + * @param input the Latin1 string to convert + * @param length the length of the string in bytes + * @param utf16_buffer the pointer to buffer that can hold conversion result + * @return the number of written char16_t; 0 if conversion is not possible + */ + simdutf_warn_unused virtual size_t convert_latin1_to_utf16le(const char * input, size_t length, char16_t* utf16_output) const noexcept = 0; + + /** + * Convert Latin1 string into UTF-16BE string. + * + * This function is suitable to work with inputs from untrusted sources. + * + * @param input the Latin1 string to convert + * @param length the length of the string in bytes + * @param utf16_buffer the pointer to buffer that can hold conversion result + * @return the number of written char16_t; 0 if conversion is not possible + */ + simdutf_warn_unused virtual size_t convert_latin1_to_utf16be(const char * input, size_t length, char16_t* utf16_output) const noexcept = 0; + + /** + * Convert Latin1 string into UTF-32 string. + * + * This function is suitable to work with inputs from untrusted sources. + * + * @param input the Latin1 string to convert + * @param length the length of the string in bytes + * @param utf32_buffer the pointer to buffer that can hold conversion result + * @return the number of written char32_t; 0 if conversion is not possible + */ + simdutf_warn_unused virtual size_t convert_latin1_to_utf32(const char * input, size_t length, char32_t* utf32_buffer) const noexcept = 0; + + /** + * Convert possibly broken UTF-8 string into latin1 string. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. 
+ * + * @param input the UTF-8 string to convert + * @param length the length of the string in bytes + * @param latin1_output the pointer to buffer that can hold conversion result + * @return the number of written char; 0 if the input was not valid UTF-8 string + */ + simdutf_warn_unused virtual size_t convert_utf8_to_latin1(const char * input, size_t length, char* latin1_output) const noexcept = 0; + + /** + * Convert possibly broken UTF-8 string into latin1 string and stop on error. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * + * @param input the UTF-8 string to convert + * @param length the length of the string in bytes + * @param latin1_output the pointer to buffer that can hold conversion result + * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of code units validated if successful. + */ + simdutf_warn_unused virtual result convert_utf8_to_latin1_with_errors(const char * input, size_t length, char* latin1_output) const noexcept = 0; + + /** + * Convert valid UTF-8 string into latin1 string. + * + * This function assumes that the input string is valid UTF-8. + * + * This function is not BOM-aware. + * + * @param input the UTF-8 string to convert + * @param length the length of the string in bytes + * @param latin1_output the pointer to buffer that can hold conversion result + * @return the number of written char; 0 if the input was not valid UTF-8 string + */ + simdutf_warn_unused virtual size_t convert_valid_utf8_to_latin1(const char * input, size_t length, char* latin1_output) const noexcept = 0; + + + /** * Convert possibly broken UTF-8 string into UTF-16LE string.
* @@ -2077,7 +2478,7 @@ class implementation { * @param input the UTF-8 string to convert * @param length the length of the string in bytes * @param utf16_buffer the pointer to buffer that can hold conversion result - * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful. + * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of code units validated if successful. */ simdutf_warn_unused virtual result convert_utf8_to_utf16le_with_errors(const char * input, size_t length, char16_t* utf16_output) const noexcept = 0; @@ -2090,7 +2491,7 @@ class implementation { * @param input the UTF-8 string to convert * @param length the length of the string in bytes * @param utf16_buffer the pointer to buffer that can hold conversion result - * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful. + * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of code units validated if successful. */ simdutf_warn_unused virtual result convert_utf8_to_utf16be_with_errors(const char * input, size_t length, char16_t* utf16_output) const noexcept = 0; @@ -2116,7 +2517,7 @@ class implementation { * @param input the UTF-8 string to convert * @param length the length of the string in bytes * @param utf32_buffer the pointer to buffer that can hold conversion result - * @return a result pair struct with an error code and either the position of the error if any or the number of char32_t written if successful. 
+ * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char32_t written if successful. */ simdutf_warn_unused virtual result convert_utf8_to_utf32_with_errors(const char * input, size_t length, char32_t* utf32_output) const noexcept = 0; @@ -2157,18 +2558,18 @@ class implementation { simdutf_warn_unused virtual size_t convert_valid_utf8_to_utf32(const char * input, size_t length, char32_t* utf32_buffer) const noexcept = 0; /** - * Compute the number of 2-byte words that this UTF-8 string would require in UTF-16LE format. + * Compute the number of 2-byte code units that this UTF-8 string would require in UTF-16LE format. * * This function does not validate the input. * * @param input the UTF-8 string to process * @param length the length of the string in bytes - * @return the number of char16_t words required to encode the UTF-8 string as UTF-16LE + * @return the number of char16_t code units required to encode the UTF-8 string as UTF-16LE */ simdutf_warn_unused virtual size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept = 0; /** - * Compute the number of 4-byte words that this UTF-8 string would require in UTF-32 format. + * Compute the number of 4-byte code units that this UTF-8 string would require in UTF-32 format. * * This function is equivalent to count_utf8. * @@ -2176,10 +2577,96 @@ class implementation { * * @param input the UTF-8 string to process * @param length the length of the string in bytes - * @return the number of char32_t words required to encode the UTF-8 string as UTF-32 + * @return the number of char32_t code units required to encode the UTF-8 string as UTF-32 */ simdutf_warn_unused virtual size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept = 0; + /** + * Convert possibly broken UTF-16LE string into Latin1 string. 
+ * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * + * This function is not BOM-aware. + * + * @param input the UTF-16LE string to convert + * @param length the length of the string in 2-byte code units (char16_t) + * @param latin1_buffer the pointer to buffer that can hold conversion result + * @return number of written code units; 0 if input is not a valid UTF-16LE string + */ + simdutf_warn_unused virtual size_t convert_utf16le_to_latin1(const char16_t * input, size_t length, char* latin1_buffer) const noexcept = 0; + + /** + * Convert possibly broken UTF-16BE string into Latin1 string. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * + * This function is not BOM-aware. + * + * @param input the UTF-16BE string to convert + * @param length the length of the string in 2-byte code units (char16_t) + * @param latin1_buffer the pointer to buffer that can hold conversion result + * @return number of written code units; 0 if input is not a valid UTF-16BE string + */ + simdutf_warn_unused virtual size_t convert_utf16be_to_latin1(const char16_t * input, size_t length, char* latin1_buffer) const noexcept = 0; + + /** + * Convert possibly broken UTF-16LE string into Latin1 string. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * This function is not BOM-aware. 
+ * + * @param input the UTF-16LE string to convert + * @param length the length of the string in 2-byte code units (char16_t) + * @param latin1_buffer the pointer to buffer that can hold conversion result + * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char written if successful. + */ + simdutf_warn_unused virtual result convert_utf16le_to_latin1_with_errors(const char16_t * input, size_t length, char* latin1_buffer) const noexcept = 0; + + /** + * Convert possibly broken UTF-16BE string into Latin1 string. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * This function is not BOM-aware. + * + * @param input the UTF-16BE string to convert + * @param length the length of the string in 2-byte code units (char16_t) + * @param latin1_buffer the pointer to buffer that can hold conversion result + * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char written if successful. + */ + simdutf_warn_unused virtual result convert_utf16be_to_latin1_with_errors(const char16_t * input, size_t length, char* latin1_buffer) const noexcept = 0; + + /** + * Convert valid UTF-16LE string into Latin1 string. + * + * This function assumes that the input string is valid UTF-16LE. + * + * This function is not BOM-aware. 
+ * + * @param input the UTF-16LE string to convert + * @param length the length of the string in 2-byte code units (char16_t) + * @param latin1_buffer the pointer to buffer that can hold conversion result + * @return number of written code units; 0 if conversion is not possible + */ + simdutf_warn_unused virtual size_t convert_valid_utf16le_to_latin1(const char16_t * input, size_t length, char* latin1_buffer) const noexcept = 0; + + /** + * Convert valid UTF-16BE string into Latin1 string. + * + * This function assumes that the input string is valid UTF-16BE. + * + * This function is not BOM-aware. + * + * @param input the UTF-16BE string to convert + * @param length the length of the string in 2-byte code units (char16_t) + * @param latin1_buffer the pointer to buffer that can hold conversion result + * @return number of written code units; 0 if conversion is not possible + */ + simdutf_warn_unused virtual size_t convert_valid_utf16be_to_latin1(const char16_t * input, size_t length, char* latin1_buffer) const noexcept = 0; + /** * Convert possibly broken UTF-16LE string into UTF-8 string. * @@ -2189,9 +2676,9 @@ class implementation { * This function is not BOM-aware. * * @param input the UTF-16LE string to convert - * @param length the length of the string in 2-byte words (char16_t) + * @param length the length of the string in 2-byte code units (char16_t) * @param utf8_buffer the pointer to buffer that can hold conversion result - * @return number of written words; 0 if input is not a valid UTF-16LE string + * @return number of written code units; 0 if input is not a valid UTF-16LE string */ simdutf_warn_unused virtual size_t convert_utf16le_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) const noexcept = 0; @@ -2204,9 +2691,9 @@ class implementation { * This function is not BOM-aware. 
* * @param input the UTF-16BE string to convert - * @param length the length of the string in 2-byte words (char16_t) + * @param length the length of the string in 2-byte code units (char16_t) * @param utf8_buffer the pointer to buffer that can hold conversion result - * @return number of written words; 0 if input is not a valid UTF-16BE string + * @return number of written code units; 0 if input is not a valid UTF-16BE string */ simdutf_warn_unused virtual size_t convert_utf16be_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) const noexcept = 0; @@ -2219,9 +2706,9 @@ class implementation { * This function is not BOM-aware. * * @param input the UTF-16LE string to convert - * @param length the length of the string in 2-byte words (char16_t) + * @param length the length of the string in 2-byte code units (char16_t) * @param utf8_buffer the pointer to buffer that can hold conversion result - * @return a result pair struct with an error code and either the position of the error if any or the number of char written if successful. + * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char written if successful. */ simdutf_warn_unused virtual result convert_utf16le_to_utf8_with_errors(const char16_t * input, size_t length, char* utf8_buffer) const noexcept = 0; @@ -2234,9 +2721,9 @@ class implementation { * This function is not BOM-aware. * * @param input the UTF-16BE string to convert - * @param length the length of the string in 2-byte words (char16_t) + * @param length the length of the string in 2-byte code units (char16_t) * @param utf8_buffer the pointer to buffer that can hold conversion result - * @return a result pair struct with an error code and either the position of the error if any or the number of char written if successful. 
+ * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char written if successful. */ simdutf_warn_unused virtual result convert_utf16be_to_utf8_with_errors(const char16_t * input, size_t length, char* utf8_buffer) const noexcept = 0; @@ -2248,9 +2735,9 @@ class implementation { * This function is not BOM-aware. * * @param input the UTF-16LE string to convert - * @param length the length of the string in 2-byte words (char16_t) + * @param length the length of the string in 2-byte code units (char16_t) * @param utf8_buffer the pointer to buffer that can hold the conversion result - * @return number of written words; 0 if conversion is not possible + * @return number of written code units; 0 if conversion is not possible */ simdutf_warn_unused virtual size_t convert_valid_utf16le_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) const noexcept = 0; @@ -2262,9 +2749,9 @@ class implementation { * This function is not BOM-aware. * * @param input the UTF-16BE string to convert - * @param length the length of the string in 2-byte words (char16_t) + * @param length the length of the string in 2-byte code units (char16_t) * @param utf8_buffer the pointer to buffer that can hold the conversion result - * @return number of written words; 0 if conversion is not possible + * @return number of written code units; 0 if conversion is not possible */ simdutf_warn_unused virtual size_t convert_valid_utf16be_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) const noexcept = 0; @@ -2277,9 +2764,9 @@ class implementation { * This function is not BOM-aware. 
* * @param input the UTF-16LE string to convert - * @param length the length of the string in 2-byte words (char16_t) + * @param length the length of the string in 2-byte code units (char16_t) * @param utf32_buffer the pointer to buffer that can hold conversion result - * @return number of written words; 0 if input is not a valid UTF-16LE string + * @return number of written code units; 0 if input is not a valid UTF-16LE string */ simdutf_warn_unused virtual size_t convert_utf16le_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) const noexcept = 0; @@ -2292,9 +2779,9 @@ class implementation { * This function is not BOM-aware. * * @param input the UTF-16BE string to convert - * @param length the length of the string in 2-byte words (char16_t) + * @param length the length of the string in 2-byte code units (char16_t) * @param utf32_buffer the pointer to buffer that can hold conversion result - * @return number of written words; 0 if input is not a valid UTF-16BE string + * @return number of written code units; 0 if input is not a valid UTF-16BE string */ simdutf_warn_unused virtual size_t convert_utf16be_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) const noexcept = 0; @@ -2307,9 +2794,9 @@ class implementation { * This function is not BOM-aware. * * @param input the UTF-16LE string to convert - * @param length the length of the string in 2-byte words (char16_t) + * @param length the length of the string in 2-byte code units (char16_t) * @param utf32_buffer the pointer to buffer that can hold conversion result - * @return a result pair struct with an error code and either the position of the error if any or the number of char32_t written if successful. + * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char32_t written if successful. 
*/ simdutf_warn_unused virtual result convert_utf16le_to_utf32_with_errors(const char16_t * input, size_t length, char32_t* utf32_buffer) const noexcept = 0; @@ -2322,9 +2809,9 @@ class implementation { * This function is not BOM-aware. * * @param input the UTF-16BE string to convert - * @param length the length of the string in 2-byte words (char16_t) + * @param length the length of the string in 2-byte code units (char16_t) * @param utf32_buffer the pointer to buffer that can hold conversion result - * @return a result pair struct with an error code and either the position of the error if any or the number of char32_t written if successful. + * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char32_t written if successful. */ simdutf_warn_unused virtual result convert_utf16be_to_utf32_with_errors(const char16_t * input, size_t length, char32_t* utf32_buffer) const noexcept = 0; @@ -2336,9 +2823,9 @@ class implementation { * This function is not BOM-aware. * * @param input the UTF-16LE string to convert - * @param length the length of the string in 2-byte words (char16_t) + * @param length the length of the string in 2-byte code units (char16_t) * @param utf32_buffer the pointer to buffer that can hold the conversion result - * @return number of written words; 0 if conversion is not possible + * @return number of written code units; 0 if conversion is not possible */ simdutf_warn_unused virtual size_t convert_valid_utf16le_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) const noexcept = 0; @@ -2350,9 +2837,9 @@ class implementation { * This function is not BOM-aware. 
* * @param input the UTF-16BE string to convert - * @param length the length of the string in 2-byte words (char16_t) + * @param length the length of the string in 2-byte code units (char16_t) * @param utf32_buffer the pointer to buffer that can hold the conversion result - * @return number of written words; 0 if conversion is not possible + * @return number of written code units; 0 if conversion is not possible */ simdutf_warn_unused virtual size_t convert_valid_utf16be_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) const noexcept = 0; @@ -2364,7 +2851,7 @@ class implementation { * This function is not BOM-aware. * * @param input the UTF-16LE string to convert - * @param length the length of the string in 2-byte words (char16_t) + * @param length the length of the string in 2-byte code units (char16_t) * @return the number of bytes required to encode the UTF-16LE string as UTF-8 */ simdutf_warn_unused virtual size_t utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept = 0; @@ -2377,11 +2864,57 @@ class implementation { * This function is not BOM-aware. * * @param input the UTF-16BE string to convert - * @param length the length of the string in 2-byte words (char16_t) + * @param length the length of the string in 2-byte code units (char16_t) * @return the number of bytes required to encode the UTF-16BE string as UTF-8 */ simdutf_warn_unused virtual size_t utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept = 0; + /** + * Convert possibly broken UTF-32 string into Latin1 string. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * + * This function is not BOM-aware. 
+ * + * @param input the UTF-32 string to convert + * @param length the length of the string in 4-byte code units (char32_t) + * @param latin1_buffer the pointer to buffer that can hold conversion result + * @return number of written code units; 0 if input is not a valid UTF-32 string + */ + + simdutf_warn_unused virtual size_t convert_utf32_to_latin1(const char32_t * input, size_t length, char* latin1_buffer) const noexcept = 0; + + /** + * Convert possibly broken UTF-32 string into Latin1 string and stop on error. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * + * This function is not BOM-aware. + * + * @param input the UTF-32 string to convert + * @param length the length of the string in 4-byte code units (char32_t) + * @param latin1_buffer the pointer to buffer that can hold conversion result + * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char written if successful. + */ + + simdutf_warn_unused virtual result convert_utf32_to_latin1_with_errors(const char32_t * input, size_t length, char* latin1_buffer) const noexcept = 0; + + /** + * Convert valid UTF-32 string into Latin1 string. + * + * This function assumes that the input string is valid UTF-32. + * + * This function is not BOM-aware. + * + * @param input the UTF-32 string to convert + * @param length the length of the string in 4-byte code units (char32_t) + * @param latin1_buffer the pointer to buffer that can hold the conversion result + * @return number of written code units; 0 if conversion is not possible + */ + simdutf_warn_unused virtual size_t convert_valid_utf32_to_latin1(const char32_t * input, size_t length, char* latin1_buffer) const noexcept = 0; + /** * Convert possibly broken UTF-32 string into UTF-8 string. 
* @@ -2391,9 +2924,9 @@ class implementation { * This function is not BOM-aware. * * @param input the UTF-32 string to convert - * @param length the length of the string in 4-byte words (char32_t) + * @param length the length of the string in 4-byte code units (char32_t) * @param utf8_buffer the pointer to buffer that can hold conversion result - * @return number of written words; 0 if input is not a valid UTF-32 string + * @return number of written code units; 0 if input is not a valid UTF-32 string */ simdutf_warn_unused virtual size_t convert_utf32_to_utf8(const char32_t * input, size_t length, char* utf8_buffer) const noexcept = 0; @@ -2406,9 +2939,9 @@ class implementation { * This function is not BOM-aware. * * @param input the UTF-32 string to convert - * @param length the length of the string in 4-byte words (char32_t) + * @param length the length of the string in 4-byte code units (char32_t) * @param utf8_buffer the pointer to buffer that can hold conversion result - * @return a result pair struct with an error code and either the position of the error if any or the number of char written if successful. + * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char written if successful. */ simdutf_warn_unused virtual result convert_utf32_to_utf8_with_errors(const char32_t * input, size_t length, char* utf8_buffer) const noexcept = 0; @@ -2420,12 +2953,23 @@ class implementation { * This function is not BOM-aware. 
* * @param input the UTF-32 string to convert - * @param length the length of the string in 4-byte words (char32_t) + * @param length the length of the string in 4-byte code units (char32_t) * @param utf8_buffer the pointer to buffer that can hold the conversion result - * @return number of written words; 0 if conversion is not possible + * @return number of written code units; 0 if conversion is not possible */ simdutf_warn_unused virtual size_t convert_valid_utf32_to_utf8(const char32_t * input, size_t length, char* utf8_buffer) const noexcept = 0; + + /** + * Return the number of 2-byte code units that this Latin1 string would require in UTF-16 format. + * + * Only the length of the Latin1 string is needed; its content is not read. + * + * @param length the length of the Latin1 string in bytes + * @return the number of char16_t code units required to encode the Latin1 string as UTF-16 + */ + simdutf_warn_unused virtual size_t utf16_length_from_latin1(size_t length) const noexcept = 0; + /** * Convert possibly broken UTF-32 string into UTF-16LE string. * @@ -2435,9 +2979,9 @@ class implementation { * This function is not BOM-aware. * * @param input the UTF-32 string to convert - * @param length the length of the string in 4-byte words (char32_t) + * @param length the length of the string in 4-byte code units (char32_t) * @param utf16_buffer the pointer to buffer that can hold conversion result - * @return number of written words; 0 if input is not a valid UTF-32 string + * @return number of written code units; 0 if input is not a valid UTF-32 string */ simdutf_warn_unused virtual size_t convert_utf32_to_utf16le(const char32_t * input, size_t length, char16_t* utf16_buffer) const noexcept = 0; @@ -2450,9 +2994,9 @@ class implementation { * This function is not BOM-aware. 
* * @param input the UTF-32 string to convert - * @param length the length of the string in 4-byte words (char32_t) + * @param length the length of the string in 4-byte code units (char32_t) * @param utf16_buffer the pointer to buffer that can hold conversion result - * @return number of written words; 0 if input is not a valid UTF-32 string + * @return number of written code units; 0 if input is not a valid UTF-32 string */ simdutf_warn_unused virtual size_t convert_utf32_to_utf16be(const char32_t * input, size_t length, char16_t* utf16_buffer) const noexcept = 0; @@ -2465,9 +3009,9 @@ class implementation { * This function is not BOM-aware. * * @param input the UTF-32 string to convert - * @param length the length of the string in 4-byte words (char32_t) + * @param length the length of the string in 4-byte code units (char32_t) * @param utf16_buffer the pointer to buffer that can hold conversion result - * @return a result pair struct with an error code and either the position of the error if any or the number of char16_t written if successful. + * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char16_t written if successful. */ simdutf_warn_unused virtual result convert_utf32_to_utf16le_with_errors(const char32_t * input, size_t length, char16_t* utf16_buffer) const noexcept = 0; @@ -2480,9 +3024,9 @@ class implementation { * This function is not BOM-aware. * * @param input the UTF-32 string to convert - * @param length the length of the string in 4-byte words (char32_t) + * @param length the length of the string in 4-byte code units (char32_t) * @param utf16_buffer the pointer to buffer that can hold conversion result - * @return a result pair struct with an error code and either the position of the error if any or the number of char16_t written if successful. 
+ * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char16_t written if successful. */ simdutf_warn_unused virtual result convert_utf32_to_utf16be_with_errors(const char32_t * input, size_t length, char16_t* utf16_buffer) const noexcept = 0; @@ -2494,9 +3038,9 @@ class implementation { * This function is not BOM-aware. * * @param input the UTF-32 string to convert - * @param length the length of the string in 4-byte words (char32_t) + * @param length the length of the string in 4-byte code units (char32_t) * @param utf16_buffer the pointer to buffer that can hold the conversion result - * @return number of written words; 0 if conversion is not possible + * @return number of written code units; 0 if conversion is not possible */ simdutf_warn_unused virtual size_t convert_valid_utf32_to_utf16le(const char32_t * input, size_t length, char16_t* utf16_buffer) const noexcept = 0; @@ -2508,9 +3052,9 @@ class implementation { * This function is not BOM-aware. * * @param input the UTF-32 string to convert - * @param length the length of the string in 4-byte words (char32_t) + * @param length the length of the string in 4-byte code units (char32_t) * @param utf16_buffer the pointer to buffer that can hold the conversion result - * @return number of written words; 0 if conversion is not possible + * @return number of written code units; 0 if conversion is not possible */ simdutf_warn_unused virtual size_t convert_valid_utf32_to_utf16be(const char32_t * input, size_t length, char16_t* utf16_buffer) const noexcept = 0; @@ -2523,33 +3067,88 @@ class implementation { * This function is not BOM-aware. 
* * @param input the UTF-16 string to process - * @param length the length of the string in 2-byte words (char16_t) + * @param length the length of the string in 2-byte code units (char16_t) * @param output the pointer to buffer that can hold the conversion result */ virtual void change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) const noexcept = 0; + /** + * Return the number of bytes that this Latin1 string would require in UTF-8 format. + * + * @param input the Latin1 string to convert + * @param length the length of the string in bytes + * @return the number of bytes required to encode the Latin1 string as UTF-8 + */ + simdutf_warn_unused virtual size_t utf8_length_from_latin1(const char * input, size_t length) const noexcept = 0; + /** * Compute the number of bytes that this UTF-32 string would require in UTF-8 format. * * This function does not validate the input. * * @param input the UTF-32 string to convert - * @param length the length of the string in 4-byte words (char32_t) + * @param length the length of the string in 4-byte code units (char32_t) * @return the number of bytes required to encode the UTF-32 string as UTF-8 */ simdutf_warn_unused virtual size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept = 0; /** - * Compute the number of two-byte words that this UTF-32 string would require in UTF-16 format. + * Compute the number of bytes that this UTF-32 string would require in Latin1 format. + * + * This function does not validate the input. + * + * @param length the length of the string in 4-byte code units (char32_t) + * @return the number of bytes required to encode the UTF-32 string as Latin1 + */ + simdutf_warn_unused virtual size_t latin1_length_from_utf32(size_t length) const noexcept = 0; + + /** + * Compute the number of bytes that this UTF-8 string would require in Latin1 format. + * + * This function does not validate the input. 
+ * + * @param input the UTF-8 string to convert + * @param length the length of the string in bytes + * @return the number of bytes required to encode the UTF-8 string as Latin1 + */ + simdutf_warn_unused virtual size_t latin1_length_from_utf8(const char * input, size_t length) const noexcept = 0; + + /** + * Compute the number of bytes that this UTF-16LE/BE string would require in Latin1 format. + * + * This function does not validate the input. + * Only the length of the string is needed; its content is not read. + * + * This function is not BOM-aware. + * + * @param length the length of the string in 2-byte code units (char16_t) + * @return the number of bytes required to encode the UTF-16LE/BE string as Latin1 + */ + simdutf_warn_unused virtual size_t latin1_length_from_utf16(size_t length) const noexcept = 0; + + /** + * Compute the number of two-byte code units that this UTF-32 string would require in UTF-16 format. * * This function does not validate the input. * * @param input the UTF-32 string to convert - * @param length the length of the string in 4-byte words (char32_t) + * @param length the length of the string in 4-byte code units (char32_t) * @return the number of bytes required to encode the UTF-32 string as UTF-16 */ simdutf_warn_unused virtual size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept = 0; + + /** + * Return the number of 4-byte code units that this Latin1 string would require in UTF-32 format. + * + * This function does not validate the input. + * Only the length of the Latin1 string is needed; its content is not read. + * + * @param length the length of the Latin1 string in bytes + * @return the number of char32_t code units required to encode the Latin1 string as UTF-32 + */ + simdutf_warn_unused virtual size_t utf32_length_from_latin1(size_t length) const noexcept = 0; + /* * Compute the number of bytes that this UTF-16LE string would require in UTF-32 format. * @@ -2560,7 +3159,7 @@ class implementation { * This function is not BOM-aware. 
* * @param input the UTF-16LE string to convert - * @param length the length of the string in 2-byte words (char16_t) + * @param length the length of the string in 2-byte code units (char16_t) * @return the number of bytes required to encode the UTF-16LE string as UTF-32 */ simdutf_warn_unused virtual size_t utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept = 0; @@ -2575,7 +3174,7 @@ class implementation { * This function is not BOM-aware. * * @param input the UTF-16BE string to convert - * @param length the length of the string in 2-byte words (char16_t) + * @param length the length of the string in 2-byte code units (char16_t) * @return the number of bytes required to encode the UTF-16BE string as UTF-32 */ simdutf_warn_unused virtual size_t utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept = 0; @@ -2589,7 +3188,7 @@ class implementation { * This function is not BOM-aware. * * @param input the UTF-16LE string to process - * @param length the length of the string in 2-byte words (char16_t) + * @param length the length of the string in 2-byte code units (char16_t) * @return number of code points */ simdutf_warn_unused virtual size_t count_utf16le(const char16_t * input, size_t length) const noexcept = 0; @@ -2603,7 +3202,7 @@ class implementation { * This function is not BOM-aware. 
* * @param input the UTF-16BE string to process - * @param length the length of the string in 2-byte words (char16_t) + * @param length the length of the string in 2-byte code units (char16_t) * @return number of code points */ simdutf_warn_unused virtual size_t count_utf16be(const char16_t * input, size_t length) const noexcept = 0; diff --git a/doc/contributing/maintaining/maintaining-dependencies.md b/doc/contributing/maintaining/maintaining-dependencies.md index 4fc7af4692a30e..2b52192c4acc2a 100644 --- a/doc/contributing/maintaining/maintaining-dependencies.md +++ b/doc/contributing/maintaining/maintaining-dependencies.md @@ -27,7 +27,7 @@ This a list of all the dependencies: * [npm 9.6.7][] * [openssl 3.0.8][] * [postject 1.0.0-alpha.6][] -* [simdutf 3.2.18][] +* [simdutf 4.0.0][] * [undici 5.26.3][] * [uvwasi 0.0.19][] * [V8 11.8.172.12][] @@ -286,7 +286,7 @@ See [maintaining-openssl][] for more informations. The [postject](https://github.com/nodejs/postject) dependency is used for the [Single Executable strategic initiative](https://github.com/nodejs/single-executable). -### simdutf 3.2.18 +### simdutf 4.0.0 The [simdutf](https://github.com/simdutf/simdutf) dependency is a C++ library for fast UTF-8 decoding and encoding. @@ -344,7 +344,7 @@ performance improvements not currently available in standard zlib. [npm 9.6.7]: #npm-967 [openssl 3.0.8]: #openssl-308 [postject 1.0.0-alpha.6]: #postject-100-alpha6 -[simdutf 3.2.18]: #simdutf-3218 +[simdutf 4.0.0]: #simdutf-400 [undici 5.26.3]: #undici-5263 [update-openssl-action]: ../../../.github/workflows/update-openssl.yml [uvwasi 0.0.19]: #uvwasi-0019