diff --git a/BUILD b/BUILD index 3c164641c2..4c8d8421d6 100644 --- a/BUILD +++ b/BUILD @@ -108,6 +108,7 @@ cc_library( "upb/decode.c", "upb/encode.c", "upb/internal/table.h", + "upb/internal/unicode.h", "upb/msg.c", "upb/msg_internal.h", "upb/status.c", @@ -138,6 +139,7 @@ cc_library( ":fastdecode", ":port", ":table_internal", + ":unicode_internal", ], ) @@ -448,6 +450,7 @@ cc_library( ":encode_internal", ":port", ":reflection", + ":unicode_internal", ":upb", ], ) @@ -825,6 +828,19 @@ cc_library( deps = [":port"], ) +cc_library( + name = "unicode_internal", + srcs = [ + "upb/internal/unicode.c", + ], + hdrs = [ + "upb/internal/unicode.h", + ], + copts = UPB_DEFAULT_COPTS, + visibility = ["//:__subpackages__"], + deps = [":port"], +) + # Amalgamation ################################################################# # begin:github_only diff --git a/upb/internal/unicode.c b/upb/internal/unicode.c new file mode 100644 index 0000000000..3e1d7eb365 --- /dev/null +++ b/upb/internal/unicode.c @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2009-2021, Google LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Google LLC nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
/*
 * Encodes the code point `cp` as UTF-8 into `out`.
 *
 * `out` must have room for the longest sequence produced here (4 bytes).
 * No NUL terminator is written.
 *
 * Returns the number of bytes written (1-4), or 0 if `cp` is greater than
 * 0x10ffff, in which case `out` is left untouched.
 *
 * Surrogate values (0xd800-0xdfff) are NOT rejected: they fall in the
 * three-byte range and are encoded like any other value there, so callers
 * that must not emit them have to filter them out beforehand.
 */
int upb_Unicode_ToUTF8(uint32_t cp, char* out) {
  /* Store through unsigned char: every lead/continuation byte is >= 0x80,
   * and converting such a value to a plain (possibly signed) char is
   * implementation-defined. */
  unsigned char* dst = (unsigned char*)out;

  if (cp <= 0x7f) {
    /* 0xxxxxxx */
    dst[0] = (unsigned char)cp;
    return 1;
  }
  if (cp <= 0x07ff) {
    /* 110xxxxx 10xxxxxx */
    dst[0] = (unsigned char)((cp >> 6) | 0xc0);
    dst[1] = (unsigned char)((cp & 0x3f) | 0x80);
    return 2;
  }
  if (cp <= 0xffff) {
    /* 1110xxxx 10xxxxxx 10xxxxxx */
    dst[0] = (unsigned char)((cp >> 12) | 0xe0);
    dst[1] = (unsigned char)(((cp >> 6) & 0x3f) | 0x80);
    dst[2] = (unsigned char)((cp & 0x3f) | 0x80);
    return 3;
  }
  if (cp <= 0x10ffff) {
    /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
    dst[0] = (unsigned char)((cp >> 18) | 0xf0);
    dst[1] = (unsigned char)(((cp >> 12) & 0x3f) | 0x80);
    dst[2] = (unsigned char)(((cp >> 6) & 0x3f) | 0x80);
    dst[3] = (unsigned char)((cp & 0x3f) | 0x80);
    return 4;
  }

  /* Out of range: nothing was written. */
  return 0;
}
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Google LLC nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL Google LLC BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef UPB_INTERNAL_UNICODE_H_ +#define UPB_INTERNAL_UNICODE_H_ + +// Must be last. +#include "upb/port_def.inc" + +#ifdef __cplusplus +extern "C" { +#endif + +// Returns true iff a codepoint is the value for a high surrogate. +UPB_INLINE bool upb_Unicode_IsHigh(uint32_t cp) { + return (cp >= 0xd800 && cp <= 0xdbff); +} + +// Returns true iff a codepoint is the value for a low surrogate. +UPB_INLINE bool upb_Unicode_IsLow(uint32_t cp) { + return (cp >= 0xdc00 && cp <= 0xdfff); +} + +// Returns the high 16-bit surrogate value for a supplementary codepoint. +// Does not sanity-check the input. +UPB_INLINE uint16_t upb_Unicode_ToHigh(uint32_t cp) { + return (cp >> 10) + 0xd7c0; +} + +// Returns the low 16-bit surrogate value for a supplementary codepoint. +// Does not sanity-check the input. 
+UPB_INLINE uint16_t upb_Unicode_ToLow(uint32_t cp) { + return (cp & 0x3ff) | 0xdc00; +} + +// Returns the 32-bit value corresponding to a pair of 16-bit surrogates. +// Does not sanity-check the input. +UPB_INLINE uint32_t upb_Unicode_FromPair(uint32_t high, uint32_t low) { + return ((high & 0x3ff) << 10) + (low & 0x3ff) + 0x10000; +} + +// Outputs a codepoint as UTF8. +// Returns the number of bytes written (1-4 on success, 0 on error). +// Does not sanity-check the input. Specifically does not check for surrogates. +int upb_Unicode_ToUTF8(uint32_t cp, char* out); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#include "upb/port_undef.inc" + +#endif /* UPB_INTERNAL_UNICODE_H_ */ diff --git a/upb/json_decode.c b/upb/json_decode.c index 4f79294ef8..4071fc3d95 100644 --- a/upb/json_decode.c +++ b/upb/json_decode.c @@ -32,14 +32,14 @@ #include #include #include -#include #include #include #include "upb/encode.h" +#include "upb/internal/unicode.h" #include "upb/reflection.h" -/* Special header, must be included last. */ +// Must be last. #include "upb/port_def.inc" typedef struct { @@ -377,44 +377,20 @@ static uint32_t jsondec_codepoint(jsondec* d) { /* Parses a \uXXXX unicode escape (possibly a surrogate pair). */ static size_t jsondec_unicode(jsondec* d, char* out) { uint32_t cp = jsondec_codepoint(d); - if (cp >= 0xd800 && cp <= 0xdbff) { + if (upb_Unicode_IsHigh(cp)) { /* Surrogate pair: two 16-bit codepoints become a 32-bit codepoint. 
*/ - uint32_t high = cp; - uint32_t low; jsondec_parselit(d, "\\u"); - low = jsondec_codepoint(d); - if (low < 0xdc00 || low > 0xdfff) { - jsondec_err(d, "Invalid low surrogate"); - } - cp = (high & 0x3ff) << 10; - cp |= (low & 0x3ff); - cp += 0x10000; - } else if (cp >= 0xdc00 && cp <= 0xdfff) { + uint32_t low = jsondec_codepoint(d); + if (!upb_Unicode_IsLow(low)) jsondec_err(d, "Invalid low surrogate"); + cp = upb_Unicode_FromPair(cp, low); + } else if (upb_Unicode_IsLow(cp)) { jsondec_err(d, "Unpaired low surrogate"); } /* Write to UTF-8 */ - if (cp <= 0x7f) { - out[0] = cp; - return 1; - } else if (cp <= 0x07FF) { - out[0] = ((cp >> 6) & 0x1F) | 0xC0; - out[1] = ((cp >> 0) & 0x3F) | 0x80; - return 2; - } else if (cp <= 0xFFFF) { - out[0] = ((cp >> 12) & 0x0F) | 0xE0; - out[1] = ((cp >> 6) & 0x3F) | 0x80; - out[2] = ((cp >> 0) & 0x3F) | 0x80; - return 3; - } else if (cp < 0x10FFFF) { - out[0] = ((cp >> 18) & 0x07) | 0xF0; - out[1] = ((cp >> 12) & 0x3f) | 0x80; - out[2] = ((cp >> 6) & 0x3f) | 0x80; - out[3] = ((cp >> 0) & 0x3f) | 0x80; - return 4; - } else { - jsondec_err(d, "Invalid codepoint"); - } + int bytes = upb_Unicode_ToUTF8(cp, out); + if (bytes == 0) jsondec_err(d, "Invalid codepoint"); + return bytes; } static void jsondec_resize(jsondec* d, char** buf, char** end, char** buf_end) { @@ -460,7 +436,7 @@ static upb_StringView jsondec_string(jsondec* d) { if (*d->ptr == 'u') { d->ptr++; if (buf_end - end < 4) { - /* Allow space for maximum-sized code point (4 bytes). */ + /* Allow space for maximum-sized codepoint (4 bytes). */ jsondec_resize(d, &buf, &end, &buf_end); } end += jsondec_unicode(d, end);