From 569d275f65f37434e209b1e3928d6d5176369fd0 Mon Sep 17 00:00:00 2001 From: Niels Lohmann Date: Mon, 11 Dec 2017 22:38:05 +0100 Subject: [PATCH 1/2] :boom: throwing an exception in case dump encounters a non-UTF-8 string #838 We had a lot of issues with failing roundtrips (i.e., parse errors from serializations) in case strings were stored in the library that were not UTF-8 encoded. This PR adds an exception in this case. --- README.md | 3 ++ src/json.hpp | 92 +++++++++++++++++++++++++++++++++-- test/src/unit-cbor.cpp | 6 +-- test/src/unit-convenience.cpp | 11 +++-- test/src/unit-msgpack.cpp | 6 +-- test/src/unit-regression.cpp | 13 ++++- 6 files changed, 116 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index c4d9f733b8..89ccb5a744 100644 --- a/README.md +++ b/README.md @@ -803,6 +803,9 @@ The above copyright notice and this permission notice shall be included in all c THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +* * * + +The class contains the UTF-8 Decoder from Bjoern Hoehrmann which is licensed under the [MIT License](http://opensource.org/licenses/MIT) (see above). Copyright © 2008-2009 [Bjoern Hoehrmann](http://bjoern.hoehrmann.de/) ## Contact diff --git a/src/json.hpp b/src/json.hpp index f353fddd9a..a80c1551a3 100644 --- a/src/json.hpp +++ b/src/json.hpp @@ -378,6 +378,7 @@ json.exception.type_error.312 | cannot use update() with string | The @ref updat json.exception.type_error.313 | invalid value to unflatten | The @ref unflatten function converts an object whose keys are JSON Pointers back into an arbitrary nested JSON value. 
The JSON Pointers must not overlap, because then the resulting value would not be well defined. json.exception.type_error.314 | only objects can be unflattened | The @ref unflatten function only works for an object whose keys are JSON Pointers. json.exception.type_error.315 | values in object must be primitive | The @ref unflatten function only works for an object whose keys are JSON Pointers and whose values are primitive. +json.exception.type_error.316 | invalid UTF-8 byte at index 10: 0x7E | The @ref dump function only works with UTF-8 encoded strings; that is, if you assign a `std::string` to a JSON value, make sure it is UTF-8 encoded. | @liveexample{The following code shows how a `type_error` exception can be caught.,type_error} @@ -4890,7 +4891,7 @@ class binary_reader default: // anything else (0xFF is handled inside the other types) { std::stringstream ss; - ss << std::setw(2) << std::setfill('0') << std::hex << current; + ss << std::setw(2) << std::uppercase << std::setfill('0') << std::hex << current; JSON_THROW(parse_error::create(112, chars_read, "error reading CBOR; last byte: 0x" + ss.str())); } } @@ -5214,7 +5215,7 @@ class binary_reader default: // anything else { std::stringstream ss; - ss << std::setw(2) << std::setfill('0') << std::hex << current; + ss << std::setw(2) << std::uppercase << std::setfill('0') << std::hex << current; JSON_THROW(parse_error::create(112, chars_read, "error reading MessagePack; last byte: 0x" + ss.str())); } @@ -5382,7 +5383,7 @@ class binary_reader default: { std::stringstream ss; - ss << std::setw(2) << std::setfill('0') << std::hex << current; + ss << std::setw(2) << std::uppercase << std::setfill('0') << std::hex << current; JSON_THROW(parse_error::create(113, chars_read, "expected a CBOR string; last byte: 0x" + ss.str())); } } @@ -5487,7 +5488,7 @@ class binary_reader default: { std::stringstream ss; - ss << std::setw(2) << std::setfill('0') << std::hex << current; + ss << std::setw(2) << std::uppercase << 
std::setfill('0') << std::hex << current; JSON_THROW(parse_error::create(113, chars_read, "expected a MessagePack string; last byte: 0x" + ss.str())); } @@ -6513,6 +6514,8 @@ class serializer */ void dump_escaped(const string_t& s, const bool ensure_ascii) const { + throw_if_invalid_utf8(s); + const auto space = extra_space(s, ensure_ascii); if (space == 0) { @@ -6764,6 +6767,87 @@ class serializer } } + /*! + @brief check whether a string is UTF-8 encoded + + The function checks each byte of a string whether it is UTF-8 encoded. The + result of the check is stored in the @a state parameter. The function must + be called initially with state 0 (accept). State 1 means the string must + be rejected, because the current byte is not allowed. If the string is + completely processed, but the state is non-zero, the string ended + prematurely; that is, the last byte indicated more bytes should have + followed. + + @param[in,out] state the state of the decoding + @param[in] byte next byte to decode + + @note The function has been edited: a std::array is used and the code + point is not calculated. 
+ + @copyright Copyright (c) 2008-2009 Bjoern Hoehrmann + @sa http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ + */ + static void decode(uint8_t& state, const uint8_t byte) + { + static const std::array utf8d = + { + { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 00..1f + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 20..3f + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 40..5f + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 60..7f + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, // 80..9f + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // a0..bf + 8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // c0..df + 0xa, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x4, 0x3, 0x3, // e0..ef + 0xb, 0x6, 0x6, 0x6, 0x5, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, // f0..ff + 0x0, 0x1, 0x2, 0x3, 0x5, 0x8, 0x7, 0x1, 0x1, 0x1, 0x4, 0x6, 0x1, 0x1, 0x1, 0x1, // s0..s0 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, // s1..s2 + 1, 2, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, // s3..s4 + 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, // s5..s6 + 1, 3, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 // s7..s8 + } + }; + + const uint8_t type = utf8d[byte]; + state = utf8d[256 + state * 16 + type]; + } + + /*! 
+ @brief throw an exception if a string is not UTF-8 encoded + + @param[in] str UTF-8 string to check + @throw type_error.316 if passed string is not UTF-8 encoded + + @since version 3.0.0 + */ + static void throw_if_invalid_utf8(const std::string& str) + { + // start with state 0 (= accept) + uint8_t state = 0; + + for (size_t i = 0; i < str.size(); ++i) + { + const auto byte = static_cast(str[i]); + decode(state, byte); + if (state == 1) + { + // state 1 means reject + std::stringstream ss; + ss << std::setw(2) << std::uppercase << std::setfill('0') << std::hex << static_cast(byte); + JSON_THROW(type_error::create(316, "invalid UTF-8 byte at index " + std::to_string(i) + ": 0x" + ss.str())); + } + } + + if (state != 0) + { + // we finish reading, but do not accept: string was incomplete + std::stringstream ss; + ss << std::setw(2) << std::uppercase << std::setfill('0') << std::hex << static_cast(static_cast(str.back())); + JSON_THROW(type_error::create(316, "incomplete UTF-8 string; last byte: 0x" + ss.str())); + } + } + private: /// the output of the serializer output_adapter_t o = nullptr; diff --git a/test/src/unit-cbor.cpp b/test/src/unit-cbor.cpp index 02f3d4823d..8d28f68675 100644 --- a/test/src/unit-cbor.cpp +++ b/test/src/unit-cbor.cpp @@ -1287,10 +1287,10 @@ TEST_CASE("CBOR") { CHECK_THROWS_AS(json::from_cbor(std::vector({0x1c})), json::parse_error&); CHECK_THROWS_WITH(json::from_cbor(std::vector({0x1c})), - "[json.exception.parse_error.112] parse error at 1: error reading CBOR; last byte: 0x1c"); + "[json.exception.parse_error.112] parse error at 1: error reading CBOR; last byte: 0x1C"); CHECK_THROWS_AS(json::from_cbor(std::vector({0xf8})), json::parse_error&); CHECK_THROWS_WITH(json::from_cbor(std::vector({0xf8})), - "[json.exception.parse_error.112] parse error at 1: error reading CBOR; last byte: 0xf8"); + "[json.exception.parse_error.112] parse error at 1: error reading CBOR; last byte: 0xF8"); } SECTION("all unsupported bytes") @@ -1348,7 +1348,7 
@@ TEST_CASE("CBOR") { CHECK_THROWS_AS(json::from_cbor(std::vector({0xa1, 0xff, 0x01})), json::parse_error&); CHECK_THROWS_WITH(json::from_cbor(std::vector({0xa1, 0xff, 0x01})), - "[json.exception.parse_error.113] parse error at 2: expected a CBOR string; last byte: 0xff"); + "[json.exception.parse_error.113] parse error at 2: expected a CBOR string; last byte: 0xFF"); } SECTION("strict mode") diff --git a/test/src/unit-convenience.cpp b/test/src/unit-convenience.cpp index 9a416a84e3..b9a1f2597d 100644 --- a/test/src/unit-convenience.cpp +++ b/test/src/unit-convenience.cpp @@ -32,7 +32,7 @@ SOFTWARE. #include "json.hpp" using nlohmann::json; -void check_escaped(const char* original, const char* escaped, const bool ensure_ascii = false); +void check_escaped(const char* original, const char* escaped = "", const bool ensure_ascii = false); void check_escaped(const char* original, const char* escaped, const bool ensure_ascii) { std::stringstream ss; @@ -99,7 +99,12 @@ TEST_CASE("convenience functions") check_escaped("\x1f", "\\u001f"); // invalid UTF-8 characters - check_escaped("ä\xA9ü", "ä\xA9ü"); - check_escaped("ä\xA9ü", "\\u00e4\xA9\\u00fc", true); + CHECK_THROWS_AS(check_escaped("ä\xA9ü"), json::type_error); + CHECK_THROWS_WITH(check_escaped("ä\xA9ü"), + "[json.exception.type_error.316] invalid UTF-8 byte at index 2: 0xA9"); + + CHECK_THROWS_AS(check_escaped("\xC2"), json::type_error); + CHECK_THROWS_WITH(check_escaped("\xC2"), + "[json.exception.type_error.316] incomplete UTF-8 string; last byte: 0xC2"); } } diff --git a/test/src/unit-msgpack.cpp b/test/src/unit-msgpack.cpp index ff90f353cc..148bb180ae 100644 --- a/test/src/unit-msgpack.cpp +++ b/test/src/unit-msgpack.cpp @@ -1077,10 +1077,10 @@ TEST_CASE("MessagePack") { CHECK_THROWS_AS(json::from_msgpack(std::vector({0xc1})), json::parse_error&); CHECK_THROWS_WITH(json::from_msgpack(std::vector({0xc1})), - "[json.exception.parse_error.112] parse error at 1: error reading MessagePack; last byte: 0xc1"); + 
"[json.exception.parse_error.112] parse error at 1: error reading MessagePack; last byte: 0xC1"); CHECK_THROWS_AS(json::from_msgpack(std::vector({0xc6})), json::parse_error&); CHECK_THROWS_WITH(json::from_msgpack(std::vector({0xc6})), - "[json.exception.parse_error.112] parse error at 1: error reading MessagePack; last byte: 0xc6"); + "[json.exception.parse_error.112] parse error at 1: error reading MessagePack; last byte: 0xC6"); } SECTION("all unsupported bytes") @@ -1106,7 +1106,7 @@ TEST_CASE("MessagePack") { CHECK_THROWS_AS(json::from_msgpack(std::vector({0x81, 0xff, 0x01})), json::parse_error&); CHECK_THROWS_WITH(json::from_msgpack(std::vector({0x81, 0xff, 0x01})), - "[json.exception.parse_error.113] parse error at 2: expected a MessagePack string; last byte: 0xff"); + "[json.exception.parse_error.113] parse error at 2: expected a MessagePack string; last byte: 0xFF"); } SECTION("strict mode") diff --git a/test/src/unit-regression.cpp b/test/src/unit-regression.cpp index 9afe6f3d0c..42edb8350c 100644 --- a/test/src/unit-regression.cpp +++ b/test/src/unit-regression.cpp @@ -975,7 +975,7 @@ TEST_CASE("regression tests") }; CHECK_THROWS_AS(json::from_cbor(vec1), json::parse_error&); CHECK_THROWS_WITH(json::from_cbor(vec1), - "[json.exception.parse_error.113] parse error at 13: expected a CBOR string; last byte: 0xb4"); + "[json.exception.parse_error.113] parse error at 13: expected a CBOR string; last byte: 0xB4"); // related test case: double-precision std::vector vec2 @@ -989,7 +989,7 @@ TEST_CASE("regression tests") }; CHECK_THROWS_AS(json::from_cbor(vec2), json::parse_error&); CHECK_THROWS_WITH(json::from_cbor(vec2), - "[json.exception.parse_error.113] parse error at 13: expected a CBOR string; last byte: 0xb4"); + "[json.exception.parse_error.113] parse error at 13: expected a CBOR string; last byte: 0xB4"); } SECTION("issue #452 - Heap-buffer-overflow (OSS-Fuzz issue 585)") @@ -1306,6 +1306,15 @@ TEST_CASE("regression tests") CHECK(j["nocopy"]["val"] == 
0); } + SECTION("issue #838 - incorrect parse error with binary data in keys") + { + uint8_t key1[] = { 103, 92, 117, 48, 48, 48, 55, 92, 114, 215, 126, 214, 95, 92, 34, 174, 40, 71, 38, 174, 40, 71, 38, 223, 134, 247, 127 }; + std::string key1_str(key1, key1 + sizeof(key1)/sizeof(key1[0])); + json j = key1_str; + CHECK_THROWS_AS(j.dump(), json::type_error); + CHECK_THROWS_WITH(j.dump(), "[json.exception.type_error.316] invalid UTF-8 byte at index 10: 0x7E"); + } + SECTION("issue #843 - converting to array not working") { json j; From 8419bfbbd22f27335fb6e3c6d0e59e75e3b3522a Mon Sep 17 00:00:00 2001 From: Niels Lohmann Date: Tue, 12 Dec 2017 20:44:57 +0100 Subject: [PATCH 2/2] :white_check_mark: improved test coverage As we now reject invalid UTF-8 up front, we do not need to cope with it later. --- src/json.hpp | 16 ++++------------ test/src/unit-convenience.cpp | 4 ++-- test/src/unit-regression.cpp | 2 +- 3 files changed, 7 insertions(+), 15 deletions(-) diff --git a/src/json.hpp b/src/json.hpp index a80c1551a3..b0c85dc1e8 100644 --- a/src/json.hpp +++ b/src/json.hpp @@ -6415,12 +6415,8 @@ class serializer if (ensure_ascii and (s[i] & 0x80 or s[i] == 0x7F)) { const auto bytes = bytes_following(static_cast(s[i])); - if (bytes == std::string::npos) - { - // invalid characters are treated as is, so no - // additional space will be used - break; - } + // invalid characters will be detected by throw_if_invalid_utf8 + assert (bytes != std::string::npos); if (bytes == 3) { @@ -6588,12 +6584,8 @@ class serializer (ensure_ascii and (s[i] & 0x80 or s[i] == 0x7F))) { const auto bytes = bytes_following(static_cast(s[i])); - if (bytes == std::string::npos) - { - // copy invalid character as is - result[pos++] = s[i]; - break; - } + // invalid characters will be detected by throw_if_invalid_utf8 + assert (bytes != std::string::npos); // check that the additional bytes are present assert(i + bytes < s.size()); diff --git a/test/src/unit-convenience.cpp 
b/test/src/unit-convenience.cpp index b9a1f2597d..3301a4e42a 100644 --- a/test/src/unit-convenience.cpp +++ b/test/src/unit-convenience.cpp @@ -101,10 +101,10 @@ TEST_CASE("convenience functions") // invalid UTF-8 characters CHECK_THROWS_AS(check_escaped("ä\xA9ü"), json::type_error); CHECK_THROWS_WITH(check_escaped("ä\xA9ü"), - "[json.exception.type_error.316] invalid UTF-8 byte at index 2: 0xA9"); + "[json.exception.type_error.316] invalid UTF-8 byte at index 2: 0xA9"); CHECK_THROWS_AS(check_escaped("\xC2"), json::type_error); CHECK_THROWS_WITH(check_escaped("\xC2"), - "[json.exception.type_error.316] incomplete UTF-8 string; last byte: 0xC2"); + "[json.exception.type_error.316] incomplete UTF-8 string; last byte: 0xC2"); } } diff --git a/test/src/unit-regression.cpp b/test/src/unit-regression.cpp index 42edb8350c..177281608b 100644 --- a/test/src/unit-regression.cpp +++ b/test/src/unit-regression.cpp @@ -1309,7 +1309,7 @@ TEST_CASE("regression tests") SECTION("issue #838 - incorrect parse error with binary data in keys") { uint8_t key1[] = { 103, 92, 117, 48, 48, 48, 55, 92, 114, 215, 126, 214, 95, 92, 34, 174, 40, 71, 38, 174, 40, 71, 38, 223, 134, 247, 127 }; - std::string key1_str(key1, key1 + sizeof(key1)/sizeof(key1[0])); + std::string key1_str(key1, key1 + sizeof(key1) / sizeof(key1[0])); json j = key1_str; CHECK_THROWS_AS(j.dump(), json::type_error); CHECK_THROWS_WITH(j.dump(), "[json.exception.type_error.316] invalid UTF-8 byte at index 10: 0x7E");