fix: handle utf8 characters in the dune files(#9728)

Signed-off-by: Alpha DIALLO <moyodiallo@gmail.com>
ocaml · Feb 22, 2024 · 920470b · 920470b
1 parent fb11897
commit 920470b
Show file tree

Hide file tree

Showing 10 changed files with 518 additions and 33 deletions.
diff --git a/otherlibs/stdune/src/bytes.ml b/otherlibs/stdune/src/bytes.ml
@@ -1 +1,186 @@
 include StdLabels.Bytes
+
+external unsafe_get_uint8 : bytes -> int -> int = "%bytes_unsafe_get"
+
+(* UTF codecs and validations *)
+
+let dec_invalid = Uchar.utf_decode_invalid
+let[@inline] dec_ret n u = Uchar.utf_decode n (Uchar.unsafe_of_int u)
+
+(* In case of decoding error, if we error on the first byte, we
+   consume the byte, otherwise we consume the [n] bytes preceding
+   the erroring byte.
+
+   This means that if a client uses decodes without caring about
+   validity it naturally replace bogus data with Uchar.rep according
+   to the WHATWG Encoding standard. Other schemes are possible by
+   consulting the number of used bytes on invalid decodes. For more
+   details see https://hsivonen.fi/broken-utf-8/
+
+   For this reason in [get_utf_8_uchar] we gradually check the next
+   byte is available rather than doing it immediately after the
+   first byte. Contrast with [is_valid_utf_8]. *)
+
+(* UTF-8 *)
+
+let[@inline] not_in_x80_to_xBF b = b lsr 6 <> 0b10
+let[@inline] not_in_xA0_to_xBF b = b lsr 5 <> 0b101
+let[@inline] not_in_x80_to_x9F b = b lsr 5 <> 0b100
+let[@inline] not_in_x90_to_xBF b = b < 0x90 || 0xBF < b
+let[@inline] not_in_x80_to_x8F b = b lsr 4 <> 0x8
+let[@inline] utf_8_uchar_2 b0 b1 = ((b0 land 0x1F) lsl 6) lor (b1 land 0x3F)
+
+let[@inline] utf_8_uchar_3 b0 b1 b2 =
+  ((b0 land 0x0F) lsl 12) lor ((b1 land 0x3F) lsl 6) lor (b2 land 0x3F)
+;;
+
+let[@inline] utf_8_uchar_4 b0 b1 b2 b3 =
+  ((b0 land 0x07) lsl 18)
+  lor ((b1 land 0x3F) lsl 12)
+  lor ((b2 land 0x3F) lsl 6)
+  lor (b3 land 0x3F)
+;;
+
+let get_utf_8_uchar b i =
+  let b0 = get_uint8 b i in
+  (* raises if [i] is not a valid index. *)
+  let get = unsafe_get_uint8 in
+  let max = length b - 1 in
+  match Char.unsafe_chr b0 with
+  (* See The Unicode Standard, Table 3.7 *)
+  | '\x00' .. '\x7F' -> dec_ret 1 b0
+  | '\xC2' .. '\xDF' ->
+    let i = i + 1 in
+    if i > max
+    then dec_invalid 1
+    else (
+      let b1 = get b i in
+      if not_in_x80_to_xBF b1 then dec_invalid 1 else dec_ret 2 (utf_8_uchar_2 b0 b1))
+  | '\xE0' ->
+    let i = i + 1 in
+    if i > max
+    then dec_invalid 1
+    else (
+      let b1 = get b i in
+      if not_in_xA0_to_xBF b1
+      then dec_invalid 1
+      else (
+        let i = i + 1 in
+        if i > max
+        then dec_invalid 2
+        else (
+          let b2 = get b i in
+          if not_in_x80_to_xBF b2
+          then dec_invalid 2
+          else dec_ret 3 (utf_8_uchar_3 b0 b1 b2))))
+  | '\xE1' .. '\xEC' | '\xEE' .. '\xEF' ->
+    let i = i + 1 in
+    if i > max
+    then dec_invalid 1
+    else (
+      let b1 = get b i in
+      if not_in_x80_to_xBF b1
+      then dec_invalid 1
+      else (
+        let i = i + 1 in
+        if i > max
+        then dec_invalid 2
+        else (
+          let b2 = get b i in
+          if not_in_x80_to_xBF b2
+          then dec_invalid 2
+          else dec_ret 3 (utf_8_uchar_3 b0 b1 b2))))
+  | '\xED' ->
+    let i = i + 1 in
+    if i > max
+    then dec_invalid 1
+    else (
+      let b1 = get b i in
+      if not_in_x80_to_x9F b1
+      then dec_invalid 1
+      else (
+        let i = i + 1 in
+        if i > max
+        then dec_invalid 2
+        else (
+          let b2 = get b i in
+          if not_in_x80_to_xBF b2
+          then dec_invalid 2
+          else dec_ret 3 (utf_8_uchar_3 b0 b1 b2))))
+  | '\xF0' ->
+    let i = i + 1 in
+    if i > max
+    then dec_invalid 1
+    else (
+      let b1 = get b i in
+      if not_in_x90_to_xBF b1
+      then dec_invalid 1
+      else (
+        let i = i + 1 in
+        if i > max
+        then dec_invalid 2
+        else (
+          let b2 = get b i in
+          if not_in_x80_to_xBF b2
+          then dec_invalid 2
+          else (
+            let i = i + 1 in
+            if i > max
+            then dec_invalid 3
+            else (
+              let b3 = get b i in
+              if not_in_x80_to_xBF b3
+              then dec_invalid 3
+              else dec_ret 4 (utf_8_uchar_4 b0 b1 b2 b3))))))
+  | '\xF1' .. '\xF3' ->
+    let i = i + 1 in
+    if i > max
+    then dec_invalid 1
+    else (
+      let b1 = get b i in
+      if not_in_x80_to_xBF b1
+      then dec_invalid 1
+      else (
+        let i = i + 1 in
+        if i > max
+        then dec_invalid 2
+        else (
+          let b2 = get b i in
+          if not_in_x80_to_xBF b2
+          then dec_invalid 2
+          else (
+            let i = i + 1 in
+            if i > max
+            then dec_invalid 3
+            else (
+              let b3 = get b i in
+              if not_in_x80_to_xBF b3
+              then dec_invalid 3
+              else dec_ret 4 (utf_8_uchar_4 b0 b1 b2 b3))))))
+  | '\xF4' ->
+    let i = i + 1 in
+    if i > max
+    then dec_invalid 1
+    else (
+      let b1 = get b i in
+      if not_in_x80_to_x8F b1
+      then dec_invalid 1
+      else (
+        let i = i + 1 in
+        if i > max
+        then dec_invalid 2
+        else (
+          let b2 = get b i in
+          if not_in_x80_to_xBF b2
+          then dec_invalid 2
+          else (
+            let i = i + 1 in
+            if i > max
+            then dec_invalid 3
+            else (
+              let b3 = get b i in
+              if not_in_x80_to_xBF b3
+              then dec_invalid 3
+              else dec_ret 4 (utf_8_uchar_4 b0 b1 b2 b3))))))
+  | _ -> dec_invalid 1
+;;
diff --git a/otherlibs/stdune/src/bytes.mli b/otherlibs/stdune/src/bytes.mli
@@ -1,3 +1,7 @@
 include module type of struct
   include StdLabels.Bytes
 end
+
+(** [get_utf_8_uchar b i] decodes an UTF-8 character at index [i] in
+    [b]. *)
+val get_utf_8_uchar : t -> int -> Uchar.utf_decode
diff --git a/otherlibs/stdune/src/stdune.ml b/otherlibs/stdune/src/stdune.ml
@@ -24,6 +24,10 @@ module Option = Option
 module Or_exn = Or_exn
 module Ordering = Ordering
 
+module Unicode = struct
+  module Uchar = Uchar
+end
+
 module Pp = struct
   include Pp
 

diff --git a/otherlibs/stdune/src/string.ml b/otherlibs/stdune/src/string.ml
@@ -26,6 +26,7 @@ let rindex = rindex_opt
 let rindex_from = rindex_from_opt
 let break s ~pos = sub s ~pos:0 ~len:pos, sub s ~pos ~len:(length s - pos)
 let is_empty s = length s = 0
+let get_utf_8_uchar s i = Bytes.get_utf_8_uchar (Bytes.unsafe_of_string s) i
 
 module Cased_functions (X : sig
     val normalize : char -> char

diff --git a/otherlibs/stdune/src/string.mli b/otherlibs/stdune/src/string.mli
@@ -21,6 +21,7 @@ val drop_prefix : t -> prefix:t -> t option
 val drop_prefix_if_exists : t -> prefix:t -> t
 val drop_suffix : t -> suffix:t -> t option
 val drop_suffix_if_exists : t -> suffix:t -> t
+val get_utf_8_uchar : t -> int -> Uchar.utf_decode
 
 (** [drop_prefix_and_suffix t ~prefix ~suffix] Will attempt to remove [prefix]
     from the prefix and [suffix] from the suffix of [t]. Return [Some _] only

diff --git a/otherlibs/stdune/src/uchar.ml b/otherlibs/stdune/src/uchar.ml
@@ -0,0 +1,88 @@
+(**************************************************************************)
+(*                                                                        *)
+(*                                 OCaml                                  *)
+(*                                                                        *)
+(*                           Daniel C. Buenzli                            *)
+(*                                                                        *)
+(*   Copyright 2014 Institut National de Recherche en Informatique et     *)
+(*     en Automatique.                                                    *)
+(*                                                                        *)
+(*   All rights reserved.  This file is distributed under the terms of    *)
+(*   the GNU Lesser General Public License version 2.1, with the          *)
+(*   special exception on linking described in the file LICENSE.          *)
+(*                                                                        *)
+(**************************************************************************)
+
+external format_int : string -> int -> string = "caml_format_int"
+
+let err_no_pred = "U+0000 has no predecessor"
+let err_no_succ = "U+10FFFF has no successor"
+let err_not_sv i = format_int "%X" i ^ " is not an Unicode scalar value"
+let err_not_latin1 u = "U+" ^ format_int "%04X" u ^ " is not a latin1 character"
+
+type t = int
+
+let min = 0x0000
+let max = 0x10FFFF
+let lo_bound = 0xD7FF
+let hi_bound = 0xE000
+let bom = 0xFEFF
+let rep = 0xFFFD
+
+let succ u =
+  if u = lo_bound then hi_bound else if u = max then invalid_arg err_no_succ else u + 1
+;;
+
+let pred u =
+  if u = hi_bound then lo_bound else if u = min then invalid_arg err_no_pred else u - 1
+;;
+
+let is_valid i = (min <= i && i <= lo_bound) || (hi_bound <= i && i <= max)
+let of_int i = if is_valid i then i else invalid_arg (err_not_sv i)
+
+external unsafe_of_int : int -> t = "%identity"
+external to_int : t -> int = "%identity"
+
+let is_char u = u < 256
+let of_char c = Char.code c
+let to_char u = if u > 255 then invalid_arg (err_not_latin1 u) else Char.unsafe_chr u
+let unsafe_to_char = Char.unsafe_chr
+let equal : int -> int -> bool = ( = )
+let compare : int -> int -> int = Stdlib.compare
+let hash = to_int
+
+(* UTF codecs tools *)
+
+type utf_decode = int
+(* This is an int [0xDUUUUUU] decomposed as follows:
+   - [D] is four bits for decode information, the highest bit is set if the
+     decode is valid. The three lower bits indicate the number of elements
+     from the source that were consumed by the decode.
+   - [UUUUUU] is the decoded Unicode character or the Unicode replacement
+     character U+FFFD if for invalid decodes. *)
+
+let valid_bit = 27
+let decode_bits = 24
+let[@inline] utf_decode_is_valid d = d lsr valid_bit = 1
+let[@inline] utf_decode_length d = (d lsr decode_bits) land 0b111
+let[@inline] utf_decode_uchar d = unsafe_of_int (d land 0xFFFFFF)
+let[@inline] utf_decode n u = ((8 lor n) lsl decode_bits) lor to_int u
+let[@inline] utf_decode_invalid n = (n lsl decode_bits) lor rep
+
+let utf_8_byte_length u =
+  match to_int u with
+  | u when u < 0 -> assert false
+  | u when u <= 0x007F -> 1
+  | u when u <= 0x07FF -> 2
+  | u when u <= 0xFFFF -> 3
+  | u when u <= 0x10FFFF -> 4
+  | _ -> assert false
+;;
+
+let utf_16_byte_length u =
+  match to_int u with
+  | u when u < 0 -> assert false
+  | u when u <= 0xFFFF -> 2
+  | u when u <= 0x10FFFF -> 4
+  | _ -> assert false
+;;