-
Notifications
You must be signed in to change notification settings - Fork 0
/
encodings_test.cpp
74 lines (56 loc) · 2.55 KB
/
encodings_test.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
#include <gtest/gtest.h>
#include <litehtml.h>
using namespace litehtml;
using namespace std;
// Builds a string from the string literal `lit`, taking the length from the
// literal's array size (sizeof minus the terminating NUL) instead of strlen,
// so bytes that follow an embedded "\x00" are preserved.
// Pass string literals / char arrays only: sizeof on a pointer would not give the text length.
#define S(lit) string{ lit, sizeof(lit) - 1 }
// One decoder test case: the bytes fed to the decoder and the bytes expected back.
// Kept as a plain aggregate so a table of cases can be written with brace initialisers.
struct test
{
	string input;   // raw bytes passed to decode()
	string output;  // bytes decode() is expected to produce
};
// Cases for decoding with encoding::utf_8: each row is { input bytes, expected output bytes }.
// Literals go through S() so that embedded "\x00" bytes are part of the string.
// Legend for the row comments:
//   A  - ASCII byte
//   C  - continuation byte
//   Ln - lead byte of an n-byte sequence
//   "\xEF\xBF\xBD" is U+FFFD, the replacement emitted for each rejected byte.
test utf8_tests[] =
{
	// ---------- VALID INPUTS ----------
	//  input                        output
	{ S("\x00"),                     S("\x00") },             // NUL
	{ S("A"),                        S("A") },                // printable ASCII
	{ S("\xC2\xA3"),                 S("\xC2\xA3") },         // L2 C
	{ S("\xE2\x82\xAC"),             S("\xE2\x82\xAC") },     // L3 C C
	{ S("\xF0\x90\x8D\x88"),         S("\xF0\x90\x8D\x88") }, // L4 C C C
	{ S("\xEF\xBB\xBF\xC2\xA3"),     S("\xC2\xA3") },         // UTF-8 BOM is stripped

	// ---------- KINDA INVALID INPUTS ----------
	// UTF-16LE BOM is stripped and the remaining bytes are decoded as UTF-16LE.
	{ S("\xFF\xFE\xC2\xA3"),         S("\xEA\x8F\x82") },
	// UTF-32 is not a valid HTML encoding, so a UTF-32LE BOM is not recognized as such:
	// its first two bytes are taken as a UTF-16LE BOM and stripped, and everything
	// after them is decoded as UTF-16LE.
	{ S("\xFF\xFE\x00\x00\xC2\xA3"), S("\x00\xEA\x8F\x82") },

	// ---------- INVALID INPUTS ----------
	// overlong sequence for NUL -> 2 U+FFFD
	{ S("\xC0\x80"),                 S("\xEF\xBF\xBD\xEF\xBF\xBD") },
	// overlong sequence for € -> 4 U+FFFD
	{ S("\xF0\x82\x82\xAC"),         S("\xEF\xBF\xBD\xEF\xBF\xBD\xEF\xBF\xBD\xEF\xBF\xBD") },
	// low surrogate U+DC00 -> 3 U+FFFD
	{ S("\xED\xB0\x80"),             S("\xEF\xBF\xBD\xEF\xBF\xBD\xEF\xBF\xBD") },
	// low surrogate U+DC00 + high surrogate U+D800 -> 6 U+FFFD
	{ S("\xED\xB0\x80\xED\xA0\x80"), S("\xEF\xBF\xBD\xEF\xBF\xBD\xEF\xBF\xBD\xEF\xBF\xBD\xEF\xBF\xBD\xEF\xBF\xBD") },
	// U+110000: invalid code point (must be <= 0x10FFFF)
	{ S("\xF4\x90\x80\x80"),         S("\xEF\xBF\xBD\xEF\xBF\xBD\xEF\xBF\xBD\xEF\xBF\xBD") },

	// Truncated or mis-ordered sequences.
	{ S("\x80"),                     S("\xEF\xBF\xBD") },                         // C
	{ S("\xC2\x41"),                 S("\xEF\xBF\xBD\x41") },                     // L2 A
	{ S("\xC2"),                     S("\xEF\xBF\xBD") },                         // L2
	{ S("\xE0\x80\x41"),             S("\xEF\xBF\xBD\xEF\xBF\xBD\x41") },         // L3 C A
	{ S("\xE0\x80"),                 S("\xEF\xBF\xBD\xEF\xBF\xBD") },             // L3 C
	{ S("\xF0\x80\x80"),             S("\xEF\xBF\xBD\xEF\xBF\xBD\xEF\xBF\xBD") }, // L4 C C
	{ S("\xC2\xE0"),                 S("\xEF\xBF\xBD\xEF\xBF\xBD") },             // L2 L3
	{ S("\xE0\xC2\x80"),             S("\xEF\xBF\xBD\xC2\x80") },                 // L3 L2 C

	// Single bytes that are expected to decode to one U+FFFD each.
	{ S("\xF5"),                     S("\xEF\xBF\xBD") },     // L4
	{ S("\xF8"),                     S("\xEF\xBF\xBD") },     // L5
	{ S("\xFC"),                     S("\xEF\xBF\xBD") },     // L6
	{ S("\xFE"),                     S("\xEF\xBF\xBD") },     // 0xFE: never valid in UTF-8
	{ S("\xFF"),                     S("\xEF\xBF\xBD") },     // 0xFF: never valid in UTF-8
};
// Runs every row of utf8_tests through decode() with encoding::utf_8 and checks
// that the decoded bytes equal the row's expected output.
TEST(encodings, utf8)
{
	// Bind each row by const reference: this avoids copying two strings per
	// iteration, and the loop variable no longer shadows the `test` struct name.
	for (const auto& tc : utf8_tests)
	{
		string output;
		decode(tc.input, encoding::utf_8, output);
		// Non-fatal EXPECT_EQ keeps checking the remaining rows after a mismatch;
		// the streamed message shows which input row produced it.
		EXPECT_EQ(output, tc.output) << "input: " << ::testing::PrintToString(tc.input);
	}
}