google · JimLarson · Jun 21, 2020 · Jun 19, 2020 · Jun 21, 2020
diff --git a/doc/langdef.md b/doc/langdef.md
@@ -277,6 +277,17 @@ CEL Literal   | Meaning
 `"\xFF"`      | String of "&yuml;" (code point 255)
 `b"\xFF"`     | Sequence of byte 255 (_not_ UTF-8 of &yuml;)
 
+While strings must be sequences of valid Unicode code points, no Unicode
+normalization is attempted on strings: there are several normal forms,
+conversion can be expensive, and CEL cannot know which is desired. If Unicode
+normalization is desired, it should be performed outside of CEL, or done as a
+custom extension function.
+
+Likewise, no advanced collation is attempted on strings, as this depends on the
+normalization and can be locale-dependent. Strings are simply treated as
+sequences of code points and are ordered with lexicographic ordering based on
+the numeric value of the code points.
+
 ### Aggregate Values
 
 Lists are ordered sequences of values.

diff --git a/tests/simple/testdata/comparisons.textproto b/tests/simple/testdata/comparisons.textproto
@@ -63,6 +63,18 @@ section {
     expr: "'a' == 'à'"
     value: { bool_value: false }
   }
+  test {
+    name: "no_string_normalization"
+    description: "Should not normalize Unicode."
+    expr: "'Am\\u00E9lie' == 'Ame\\u0301lie'"
+    value: { bool_value: false }
+  }
+  test {
+    name: "no_string_normalization_surrogate"
+    description: "Should not replace surrogate pairs."
+    expr: "'\\U0001F436' == '\\uD83D\\uDC36'"
+    value: { bool_value: false }
+  }
   test {
     name: "eq_null"
     expr: "null == null"
@@ -379,6 +391,12 @@ section {
     expr: "'a' < 'AB'"
     value: { bool_value: false }
   }
+  test {
+    name: "unicode_order_lexical"
+    description: "Compare the actual code points of the string, instead of decomposing ế into 'e' plus accent modifiers."
+    expr: "'f' < '\\u1EBF'"
+    value: { bool_value: true }
+  }
   test {
     name: "lt_bytes"
     expr: "b'a' < b'b'"