-
Notifications
You must be signed in to change notification settings - Fork 412
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
fix: handle utf8 characters in the dune files(#9728)
Signed-off-by: Alpha DIALLO <moyodiallo@gmail.com>
- Loading branch information
1 parent
fb11897
commit 920470b
Showing
10 changed files
with
518 additions
and
33 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,186 @@ | ||
include StdLabels.Bytes | ||
|
||
external unsafe_get_uint8 : bytes -> int -> int = "%bytes_unsafe_get" | ||
|
||
(* UTF codecs and validations *) | ||
|
||
let dec_invalid = Uchar.utf_decode_invalid | ||
let[@inline] dec_ret n u = Uchar.utf_decode n (Uchar.unsafe_of_int u) | ||
|
||
(* In case of decoding error, if we error on the first byte, we | ||
consume the byte, otherwise we consume the [n] bytes preceding | ||
the erroring byte. | ||
This means that if a client uses decodes without caring about | ||
validity it naturally replace bogus data with Uchar.rep according | ||
to the WHATWG Encoding standard. Other schemes are possible by | ||
consulting the number of used bytes on invalid decodes. For more | ||
details see https://hsivonen.fi/broken-utf-8/ | ||
For this reason in [get_utf_8_uchar] we gradually check the next | ||
byte is available rather than doing it immediately after the | ||
first byte. Contrast with [is_valid_utf_8]. *) | ||
|
||
(* UTF-8 *) | ||
|
||
let[@inline] not_in_x80_to_xBF b = b lsr 6 <> 0b10 | ||
let[@inline] not_in_xA0_to_xBF b = b lsr 5 <> 0b101 | ||
let[@inline] not_in_x80_to_x9F b = b lsr 5 <> 0b100 | ||
let[@inline] not_in_x90_to_xBF b = b < 0x90 || 0xBF < b | ||
let[@inline] not_in_x80_to_x8F b = b lsr 4 <> 0x8 | ||
let[@inline] utf_8_uchar_2 b0 b1 = ((b0 land 0x1F) lsl 6) lor (b1 land 0x3F) | ||
|
||
let[@inline] utf_8_uchar_3 b0 b1 b2 = | ||
((b0 land 0x0F) lsl 12) lor ((b1 land 0x3F) lsl 6) lor (b2 land 0x3F) | ||
;; | ||
|
||
let[@inline] utf_8_uchar_4 b0 b1 b2 b3 = | ||
((b0 land 0x07) lsl 18) | ||
lor ((b1 land 0x3F) lsl 12) | ||
lor ((b2 land 0x3F) lsl 6) | ||
lor (b3 land 0x3F) | ||
;; | ||
|
||
let get_utf_8_uchar b i = | ||
let b0 = get_uint8 b i in | ||
(* raises if [i] is not a valid index. *) | ||
let get = unsafe_get_uint8 in | ||
let max = length b - 1 in | ||
match Char.unsafe_chr b0 with | ||
(* See The Unicode Standard, Table 3.7 *) | ||
| '\x00' .. '\x7F' -> dec_ret 1 b0 | ||
| '\xC2' .. '\xDF' -> | ||
let i = i + 1 in | ||
if i > max | ||
then dec_invalid 1 | ||
else ( | ||
let b1 = get b i in | ||
if not_in_x80_to_xBF b1 then dec_invalid 1 else dec_ret 2 (utf_8_uchar_2 b0 b1)) | ||
| '\xE0' -> | ||
let i = i + 1 in | ||
if i > max | ||
then dec_invalid 1 | ||
else ( | ||
let b1 = get b i in | ||
if not_in_xA0_to_xBF b1 | ||
then dec_invalid 1 | ||
else ( | ||
let i = i + 1 in | ||
if i > max | ||
then dec_invalid 2 | ||
else ( | ||
let b2 = get b i in | ||
if not_in_x80_to_xBF b2 | ||
then dec_invalid 2 | ||
else dec_ret 3 (utf_8_uchar_3 b0 b1 b2)))) | ||
| '\xE1' .. '\xEC' | '\xEE' .. '\xEF' -> | ||
let i = i + 1 in | ||
if i > max | ||
then dec_invalid 1 | ||
else ( | ||
let b1 = get b i in | ||
if not_in_x80_to_xBF b1 | ||
then dec_invalid 1 | ||
else ( | ||
let i = i + 1 in | ||
if i > max | ||
then dec_invalid 2 | ||
else ( | ||
let b2 = get b i in | ||
if not_in_x80_to_xBF b2 | ||
then dec_invalid 2 | ||
else dec_ret 3 (utf_8_uchar_3 b0 b1 b2)))) | ||
| '\xED' -> | ||
let i = i + 1 in | ||
if i > max | ||
then dec_invalid 1 | ||
else ( | ||
let b1 = get b i in | ||
if not_in_x80_to_x9F b1 | ||
then dec_invalid 1 | ||
else ( | ||
let i = i + 1 in | ||
if i > max | ||
then dec_invalid 2 | ||
else ( | ||
let b2 = get b i in | ||
if not_in_x80_to_xBF b2 | ||
then dec_invalid 2 | ||
else dec_ret 3 (utf_8_uchar_3 b0 b1 b2)))) | ||
| '\xF0' -> | ||
let i = i + 1 in | ||
if i > max | ||
then dec_invalid 1 | ||
else ( | ||
let b1 = get b i in | ||
if not_in_x90_to_xBF b1 | ||
then dec_invalid 1 | ||
else ( | ||
let i = i + 1 in | ||
if i > max | ||
then dec_invalid 2 | ||
else ( | ||
let b2 = get b i in | ||
if not_in_x80_to_xBF b2 | ||
then dec_invalid 2 | ||
else ( | ||
let i = i + 1 in | ||
if i > max | ||
then dec_invalid 3 | ||
else ( | ||
let b3 = get b i in | ||
if not_in_x80_to_xBF b3 | ||
then dec_invalid 3 | ||
else dec_ret 4 (utf_8_uchar_4 b0 b1 b2 b3)))))) | ||
| '\xF1' .. '\xF3' -> | ||
let i = i + 1 in | ||
if i > max | ||
then dec_invalid 1 | ||
else ( | ||
let b1 = get b i in | ||
if not_in_x80_to_xBF b1 | ||
then dec_invalid 1 | ||
else ( | ||
let i = i + 1 in | ||
if i > max | ||
then dec_invalid 2 | ||
else ( | ||
let b2 = get b i in | ||
if not_in_x80_to_xBF b2 | ||
then dec_invalid 2 | ||
else ( | ||
let i = i + 1 in | ||
if i > max | ||
then dec_invalid 3 | ||
else ( | ||
let b3 = get b i in | ||
if not_in_x80_to_xBF b3 | ||
then dec_invalid 3 | ||
else dec_ret 4 (utf_8_uchar_4 b0 b1 b2 b3)))))) | ||
| '\xF4' -> | ||
let i = i + 1 in | ||
if i > max | ||
then dec_invalid 1 | ||
else ( | ||
let b1 = get b i in | ||
if not_in_x80_to_x8F b1 | ||
then dec_invalid 1 | ||
else ( | ||
let i = i + 1 in | ||
if i > max | ||
then dec_invalid 2 | ||
else ( | ||
let b2 = get b i in | ||
if not_in_x80_to_xBF b2 | ||
then dec_invalid 2 | ||
else ( | ||
let i = i + 1 in | ||
if i > max | ||
then dec_invalid 3 | ||
else ( | ||
let b3 = get b i in | ||
if not_in_x80_to_xBF b3 | ||
then dec_invalid 3 | ||
else dec_ret 4 (utf_8_uchar_4 b0 b1 b2 b3)))))) | ||
| _ -> dec_invalid 1 | ||
;; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,7 @@ | ||
include module type of struct | ||
include StdLabels.Bytes | ||
end | ||
|
||
(** [get_utf_8_uchar b i] decodes an UTF-8 character at index [i] in | ||
[b]. *) | ||
val get_utf_8_uchar : t -> int -> Uchar.utf_decode |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
(**************************************************************************) | ||
(* *) | ||
(* OCaml *) | ||
(* *) | ||
(* Daniel C. Buenzli *) | ||
(* *) | ||
(* Copyright 2014 Institut National de Recherche en Informatique et *) | ||
(* en Automatique. *) | ||
(* *) | ||
(* All rights reserved. This file is distributed under the terms of *) | ||
(* the GNU Lesser General Public License version 2.1, with the *) | ||
(* special exception on linking described in the file LICENSE. *) | ||
(* *) | ||
(**************************************************************************) | ||
|
||
external format_int : string -> int -> string = "caml_format_int" | ||
|
||
let err_no_pred = "U+0000 has no predecessor" | ||
let err_no_succ = "U+10FFFF has no successor" | ||
let err_not_sv i = format_int "%X" i ^ " is not an Unicode scalar value" | ||
let err_not_latin1 u = "U+" ^ format_int "%04X" u ^ " is not a latin1 character" | ||
|
||
type t = int | ||
|
||
let min = 0x0000 | ||
let max = 0x10FFFF | ||
let lo_bound = 0xD7FF | ||
let hi_bound = 0xE000 | ||
let bom = 0xFEFF | ||
let rep = 0xFFFD | ||
|
||
let succ u = | ||
if u = lo_bound then hi_bound else if u = max then invalid_arg err_no_succ else u + 1 | ||
;; | ||
|
||
let pred u = | ||
if u = hi_bound then lo_bound else if u = min then invalid_arg err_no_pred else u - 1 | ||
;; | ||
|
||
let is_valid i = (min <= i && i <= lo_bound) || (hi_bound <= i && i <= max) | ||
let of_int i = if is_valid i then i else invalid_arg (err_not_sv i) | ||
|
||
external unsafe_of_int : int -> t = "%identity" | ||
external to_int : t -> int = "%identity" | ||
|
||
let is_char u = u < 256 | ||
let of_char c = Char.code c | ||
let to_char u = if u > 255 then invalid_arg (err_not_latin1 u) else Char.unsafe_chr u | ||
let unsafe_to_char = Char.unsafe_chr | ||
let equal : int -> int -> bool = ( = ) | ||
let compare : int -> int -> int = Stdlib.compare | ||
let hash = to_int | ||
|
||
(* UTF codecs tools *) | ||
|
||
type utf_decode = int | ||
(* This is an int [0xDUUUUUU] decomposed as follows: | ||
- [D] is four bits for decode information, the highest bit is set if the | ||
decode is valid. The three lower bits indicate the number of elements | ||
from the source that were consumed by the decode. | ||
- [UUUUUU] is the decoded Unicode character or the Unicode replacement | ||
character U+FFFD if for invalid decodes. *) | ||
|
||
let valid_bit = 27 | ||
let decode_bits = 24 | ||
let[@inline] utf_decode_is_valid d = d lsr valid_bit = 1 | ||
let[@inline] utf_decode_length d = (d lsr decode_bits) land 0b111 | ||
let[@inline] utf_decode_uchar d = unsafe_of_int (d land 0xFFFFFF) | ||
let[@inline] utf_decode n u = ((8 lor n) lsl decode_bits) lor to_int u | ||
let[@inline] utf_decode_invalid n = (n lsl decode_bits) lor rep | ||
|
||
let utf_8_byte_length u = | ||
match to_int u with | ||
| u when u < 0 -> assert false | ||
| u when u <= 0x007F -> 1 | ||
| u when u <= 0x07FF -> 2 | ||
| u when u <= 0xFFFF -> 3 | ||
| u when u <= 0x10FFFF -> 4 | ||
| _ -> assert false | ||
;; | ||
|
||
let utf_16_byte_length u = | ||
match to_int u with | ||
| u when u < 0 -> assert false | ||
| u when u <= 0xFFFF -> 2 | ||
| u when u <= 0x10FFFF -> 4 | ||
| _ -> assert false | ||
;; |
Oops, something went wrong.