Skip to content

Commit

Permalink
fix: handle utf8 characters in the dune files(#9728)
Browse files Browse the repository at this point in the history
Signed-off-by: Alpha DIALLO <moyodiallo@gmail.com>
  • Loading branch information
moyodiallo committed Feb 22, 2024
1 parent fb11897 commit 920470b
Show file tree
Hide file tree
Showing 10 changed files with 518 additions and 33 deletions.
185 changes: 185 additions & 0 deletions otherlibs/stdune/src/bytes.ml
Original file line number Diff line number Diff line change
@@ -1 +1,186 @@
include StdLabels.Bytes

external unsafe_get_uint8 : bytes -> int -> int = "%bytes_unsafe_get"

(* UTF codecs and validations *)

let dec_invalid = Uchar.utf_decode_invalid
let[@inline] dec_ret n u = Uchar.utf_decode n (Uchar.unsafe_of_int u)

(* In case of decoding error, if we error on the first byte, we
consume the byte, otherwise we consume the [n] bytes preceding
the erroring byte.
This means that if a client uses decodes without caring about
validity it naturally replace bogus data with Uchar.rep according
to the WHATWG Encoding standard. Other schemes are possible by
consulting the number of used bytes on invalid decodes. For more
details see https://hsivonen.fi/broken-utf-8/
For this reason in [get_utf_8_uchar] we gradually check the next
byte is available rather than doing it immediately after the
first byte. Contrast with [is_valid_utf_8]. *)

(* UTF-8 *)

let[@inline] not_in_x80_to_xBF b = b lsr 6 <> 0b10
let[@inline] not_in_xA0_to_xBF b = b lsr 5 <> 0b101
let[@inline] not_in_x80_to_x9F b = b lsr 5 <> 0b100
let[@inline] not_in_x90_to_xBF b = b < 0x90 || 0xBF < b
let[@inline] not_in_x80_to_x8F b = b lsr 4 <> 0x8
let[@inline] utf_8_uchar_2 b0 b1 = ((b0 land 0x1F) lsl 6) lor (b1 land 0x3F)

let[@inline] utf_8_uchar_3 b0 b1 b2 =
((b0 land 0x0F) lsl 12) lor ((b1 land 0x3F) lsl 6) lor (b2 land 0x3F)
;;

let[@inline] utf_8_uchar_4 b0 b1 b2 b3 =
((b0 land 0x07) lsl 18)
lor ((b1 land 0x3F) lsl 12)
lor ((b2 land 0x3F) lsl 6)
lor (b3 land 0x3F)
;;

let get_utf_8_uchar b i =
let b0 = get_uint8 b i in
(* raises if [i] is not a valid index. *)
let get = unsafe_get_uint8 in
let max = length b - 1 in
match Char.unsafe_chr b0 with
(* See The Unicode Standard, Table 3.7 *)
| '\x00' .. '\x7F' -> dec_ret 1 b0
| '\xC2' .. '\xDF' ->
let i = i + 1 in
if i > max
then dec_invalid 1
else (
let b1 = get b i in
if not_in_x80_to_xBF b1 then dec_invalid 1 else dec_ret 2 (utf_8_uchar_2 b0 b1))
| '\xE0' ->
let i = i + 1 in
if i > max
then dec_invalid 1
else (
let b1 = get b i in
if not_in_xA0_to_xBF b1
then dec_invalid 1
else (
let i = i + 1 in
if i > max
then dec_invalid 2
else (
let b2 = get b i in
if not_in_x80_to_xBF b2
then dec_invalid 2
else dec_ret 3 (utf_8_uchar_3 b0 b1 b2))))
| '\xE1' .. '\xEC' | '\xEE' .. '\xEF' ->
let i = i + 1 in
if i > max
then dec_invalid 1
else (
let b1 = get b i in
if not_in_x80_to_xBF b1
then dec_invalid 1
else (
let i = i + 1 in
if i > max
then dec_invalid 2
else (
let b2 = get b i in
if not_in_x80_to_xBF b2
then dec_invalid 2
else dec_ret 3 (utf_8_uchar_3 b0 b1 b2))))
| '\xED' ->
let i = i + 1 in
if i > max
then dec_invalid 1
else (
let b1 = get b i in
if not_in_x80_to_x9F b1
then dec_invalid 1
else (
let i = i + 1 in
if i > max
then dec_invalid 2
else (
let b2 = get b i in
if not_in_x80_to_xBF b2
then dec_invalid 2
else dec_ret 3 (utf_8_uchar_3 b0 b1 b2))))
| '\xF0' ->
let i = i + 1 in
if i > max
then dec_invalid 1
else (
let b1 = get b i in
if not_in_x90_to_xBF b1
then dec_invalid 1
else (
let i = i + 1 in
if i > max
then dec_invalid 2
else (
let b2 = get b i in
if not_in_x80_to_xBF b2
then dec_invalid 2
else (
let i = i + 1 in
if i > max
then dec_invalid 3
else (
let b3 = get b i in
if not_in_x80_to_xBF b3
then dec_invalid 3
else dec_ret 4 (utf_8_uchar_4 b0 b1 b2 b3))))))
| '\xF1' .. '\xF3' ->
let i = i + 1 in
if i > max
then dec_invalid 1
else (
let b1 = get b i in
if not_in_x80_to_xBF b1
then dec_invalid 1
else (
let i = i + 1 in
if i > max
then dec_invalid 2
else (
let b2 = get b i in
if not_in_x80_to_xBF b2
then dec_invalid 2
else (
let i = i + 1 in
if i > max
then dec_invalid 3
else (
let b3 = get b i in
if not_in_x80_to_xBF b3
then dec_invalid 3
else dec_ret 4 (utf_8_uchar_4 b0 b1 b2 b3))))))
| '\xF4' ->
let i = i + 1 in
if i > max
then dec_invalid 1
else (
let b1 = get b i in
if not_in_x80_to_x8F b1
then dec_invalid 1
else (
let i = i + 1 in
if i > max
then dec_invalid 2
else (
let b2 = get b i in
if not_in_x80_to_xBF b2
then dec_invalid 2
else (
let i = i + 1 in
if i > max
then dec_invalid 3
else (
let b3 = get b i in
if not_in_x80_to_xBF b3
then dec_invalid 3
else dec_ret 4 (utf_8_uchar_4 b0 b1 b2 b3))))))
| _ -> dec_invalid 1
;;
4 changes: 4 additions & 0 deletions otherlibs/stdune/src/bytes.mli
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
include module type of struct
include StdLabels.Bytes
end

(** [get_utf_8_uchar b i] decodes an UTF-8 character at index [i] in
[b]. *)
val get_utf_8_uchar : t -> int -> Uchar.utf_decode
4 changes: 4 additions & 0 deletions otherlibs/stdune/src/stdune.ml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@ module Option = Option
module Or_exn = Or_exn
module Ordering = Ordering

module Unicode = struct
module Uchar = Uchar
end

module Pp = struct
include Pp

Expand Down
1 change: 1 addition & 0 deletions otherlibs/stdune/src/string.ml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ let rindex = rindex_opt
let rindex_from = rindex_from_opt
let break s ~pos = sub s ~pos:0 ~len:pos, sub s ~pos ~len:(length s - pos)
let is_empty s = length s = 0
let get_utf_8_uchar s i = Bytes.get_utf_8_uchar (Bytes.unsafe_of_string s) i

module Cased_functions (X : sig
val normalize : char -> char
Expand Down
1 change: 1 addition & 0 deletions otherlibs/stdune/src/string.mli
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ val drop_prefix : t -> prefix:t -> t option
val drop_prefix_if_exists : t -> prefix:t -> t
val drop_suffix : t -> suffix:t -> t option
val drop_suffix_if_exists : t -> suffix:t -> t
val get_utf_8_uchar : t -> int -> Uchar.utf_decode

(** [drop_prefix_and_suffix t ~prefix ~suffix] Will attempt to remove [prefix]
from the prefix and [suffix] from the suffix of [t]. Return [Some _] only
Expand Down
88 changes: 88 additions & 0 deletions otherlibs/stdune/src/uchar.ml
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
(**************************************************************************)
(* *)
(* OCaml *)
(* *)
(* Daniel C. Buenzli *)
(* *)
(* Copyright 2014 Institut National de Recherche en Informatique et *)
(* en Automatique. *)
(* *)
(* All rights reserved. This file is distributed under the terms of *)
(* the GNU Lesser General Public License version 2.1, with the *)
(* special exception on linking described in the file LICENSE. *)
(* *)
(**************************************************************************)

external format_int : string -> int -> string = "caml_format_int"

let err_no_pred = "U+0000 has no predecessor"
let err_no_succ = "U+10FFFF has no successor"
let err_not_sv i = format_int "%X" i ^ " is not an Unicode scalar value"
let err_not_latin1 u = "U+" ^ format_int "%04X" u ^ " is not a latin1 character"

type t = int

let min = 0x0000
let max = 0x10FFFF
let lo_bound = 0xD7FF
let hi_bound = 0xE000
let bom = 0xFEFF
let rep = 0xFFFD

let succ u =
if u = lo_bound then hi_bound else if u = max then invalid_arg err_no_succ else u + 1
;;

let pred u =
if u = hi_bound then lo_bound else if u = min then invalid_arg err_no_pred else u - 1
;;

let is_valid i = (min <= i && i <= lo_bound) || (hi_bound <= i && i <= max)
let of_int i = if is_valid i then i else invalid_arg (err_not_sv i)

external unsafe_of_int : int -> t = "%identity"
external to_int : t -> int = "%identity"

let is_char u = u < 256
let of_char c = Char.code c
let to_char u = if u > 255 then invalid_arg (err_not_latin1 u) else Char.unsafe_chr u
let unsafe_to_char = Char.unsafe_chr
let equal : int -> int -> bool = ( = )
let compare : int -> int -> int = Stdlib.compare
let hash = to_int

(* UTF codecs tools *)

type utf_decode = int
(* This is an int [0xDUUUUUU] decomposed as follows:
- [D] is four bits for decode information, the highest bit is set if the
decode is valid. The three lower bits indicate the number of elements
from the source that were consumed by the decode.
- [UUUUUU] is the decoded Unicode character or the Unicode replacement
character U+FFFD if for invalid decodes. *)

let valid_bit = 27
let decode_bits = 24
let[@inline] utf_decode_is_valid d = d lsr valid_bit = 1
let[@inline] utf_decode_length d = (d lsr decode_bits) land 0b111
let[@inline] utf_decode_uchar d = unsafe_of_int (d land 0xFFFFFF)
let[@inline] utf_decode n u = ((8 lor n) lsl decode_bits) lor to_int u
let[@inline] utf_decode_invalid n = (n lsl decode_bits) lor rep

let utf_8_byte_length u =
match to_int u with
| u when u < 0 -> assert false
| u when u <= 0x007F -> 1
| u when u <= 0x07FF -> 2
| u when u <= 0xFFFF -> 3
| u when u <= 0x10FFFF -> 4
| _ -> assert false
;;

let utf_16_byte_length u =
match to_int u with
| u when u < 0 -> assert false
| u when u <= 0xFFFF -> 2
| u when u <= 0x10FFFF -> 4
| _ -> assert false
;;
Loading

0 comments on commit 920470b

Please sign in to comment.