diff --git a/src/dune_sexp/dune b/src/dune_sexp/dune index 4b661198c6e3..d8508fcd1ab7 100644 --- a/src/dune_sexp/dune +++ b/src/dune_sexp/dune @@ -1,7 +1,7 @@ (library (name dune_sexp) (synopsis "[Internal] S-expression library") - (libraries stdune) + (libraries stdune dune_uutf) (instrumentation (backend bisect_ppx))) diff --git a/src/dune_sexp/escape.ml b/src/dune_sexp/escape.ml index 8eb26e4c71cf..0c3b5d54cc08 100644 --- a/src/dune_sexp/escape.ml +++ b/src/dune_sexp/escape.ml @@ -1,17 +1,37 @@ open! Stdune +let utf_8_byte_length u = + match Uchar.to_int u with + | u when u < 0 -> assert false + | u when u <= 0x007F -> 1 + | u when u <= 0x07FF -> 2 + | u when u <= 0xFFFF -> 3 + | u when u <= 0x10FFFF -> 4 + | _ -> assert false +;; + let quote_length s = let n = ref 0 in let len = String.length s in - for i = 0 to len - 1 do - n - := !n - + - match String.unsafe_get s i with - | '\"' | '\\' | '\n' | '\t' | '\r' | '\b' -> 2 - | '%' -> if i + 1 < len && s.[i + 1] = '{' then 2 else 1 - | ' ' .. '~' -> 1 - | _ -> 4 + let i = ref 0 in + while !i < len do + (n + := !n + + + match String.unsafe_get s !i with + | '\"' | '\\' | '\n' | '\t' | '\r' | '\b' -> 2 + | '%' -> if !i + 1 < len && s.[!i + 1] = '{' then 2 else 1 + | ' ' .. '~' -> 1 + | _ -> + let uchar = String.sub s ~pos:!i ~len:(min (len - !i) 4) in + (match Uutf.decoder ~encoding:`UTF_8 (`String uchar) |> Uutf.decode with + | `Uchar u -> + let uchar_len = utf_8_byte_length u in + assert (uchar_len > 1 && uchar_len < 5); + i := !i + uchar_len - 1; + uchar_len + | _ -> 4)); + incr i done; !n ;; @@ -19,8 +39,9 @@ let quote_length s = let escape_to s ~dst:s' ~ofs = let n = ref ofs in let len = String.length s in - for i = 0 to len - 1 do - (match String.unsafe_get s i with + let i = ref 0 in + while !i < len do + (match String.unsafe_get s !i with | ('\"' | '\\') as c -> Bytes.unsafe_set s' !n '\\'; incr n; @@ -41,21 +62,36 @@ let escape_to s ~dst:s' ~ofs = Bytes.unsafe_set s' !n '\\'; incr n; Bytes.unsafe_set s' !n 'b' - | '%' when i + 1 < len && s.[i + 1] = '{' -> + | '%' when !i + 1 < len && s.[!i + 1] = '{' -> Bytes.unsafe_set s' !n '\\'; incr n; Bytes.unsafe_set s' !n '%' | ' ' .. '~' as c -> Bytes.unsafe_set s' !n c | c -> - let a = Char.code c in - Bytes.unsafe_set s' !n '\\'; - incr n; - Bytes.unsafe_set s' !n (Char.unsafe_chr (48 + (a / 100))); - incr n; - Bytes.unsafe_set s' !n (Char.unsafe_chr (48 + (a / 10 mod 10))); - incr n; - Bytes.unsafe_set s' !n (Char.unsafe_chr (48 + (a mod 10)))); - incr n + let uchar = String.sub s ~pos:!i ~len:(min (len - !i) 4) in + (match Uutf.decoder ~encoding:`UTF_8 (`String uchar) |> Uutf.decode with + | `Uchar u -> + let uchar_len = utf_8_byte_length u in + assert (uchar_len > 1 && uchar_len < 5); + Bytes.unsafe_set s' !n (String.unsafe_get s !i); + Bytes.unsafe_set s' (!n + 1) (String.unsafe_get s (!i + 1)); + if uchar_len > 2 + then Bytes.unsafe_set s' (!n + 2) (String.unsafe_get s (!i + 2)); + if uchar_len > 3 + then Bytes.unsafe_set s' (!n + 3) (String.unsafe_get s (!i + 3)); + n := !n + uchar_len - 1; + i := !i + uchar_len - 1 + | _ -> + let a = Char.code c in + Bytes.unsafe_set s' !n '\\'; + incr n; + Bytes.unsafe_set s' !n (Char.unsafe_chr (48 + (a / 100))); + incr n; + Bytes.unsafe_set s' !n (Char.unsafe_chr (48 + (a / 10 mod 10))); + incr n; + Bytes.unsafe_set s' !n (Char.unsafe_chr (48 + (a mod 10))))); + incr n; + incr i done ;; diff --git a/src/dune_sexp/lexer.mll b/src/dune_sexp/lexer.mll index 46aec74b1d5c..83b86d7fee5f 100644 --- a/src/dune_sexp/lexer.mll +++ b/src/dune_sexp/lexer.mll @@ -16,11 +16,12 @@ end type t = with_comments:bool -> Lexing.lexbuf -> Token.t -let error ?(delta = 0) lexbuf message = +let error ?(delta = 0) ?(delta_stop = 0) lexbuf message = let start = Lexing.lexeme_start_p lexbuf in + let stop = Lexing.lexeme_end_p lexbuf in let loc = Loc.create ~start:{ start with pos_cnum = start.pos_cnum + delta } - ~stop:(Lexing.lexeme_end_p lexbuf) + ~stop:{ stop with pos_cnum = stop.pos_cnum + delta_stop } in User_error.raise ~loc [ Pp.text message ] @@ -144,6 +145,8 @@ let hexdigit = ['0'-'9' 'a'-'f' 'A'-'F'] let atom_char = [^ ';' '(' ')' '"' '\000'-'\032' '\127'-'\255'] let varname_char = atom_char # [ ':' '%' '{' '}' ] +let non_ascii = ['\128'-'\255'] + rule token with_comments = parse | newline { Lexing.new_line lexbuf; token with_comments lexbuf } @@ -352,13 +355,38 @@ and template_variable = parse } | '}' | eof { error lexbuf "%{...} forms cannot be empty" } - | (varname_char* as skip) (_ as other) - | (varname_char+ ':' ((':' | varname_char)*) as skip) (_ as other) + | (varname_char* as skip) (non_ascii* as maybe_utf) (_ as other) + | (varname_char+ ':' ((':' | varname_char)*) as skip) (non_ascii* as maybe_utf) (_ as other) { - error - ~delta:(String.length skip) - lexbuf - (Printf.sprintf "The character %C is not allowed inside %%{...} forms" other) + let utf_8_byte_length u = + match Uchar.to_int u with + | u when u < 0 -> assert false + | u when u <= 0x007F -> 1 + | u when u <= 0x07FF -> 2 + | u when u <= 0xFFFF -> 3 + | u when u <= 0x10FFFF -> 4 + | _ -> assert false + in + let utf_len = String.length maybe_utf in + let uchar = + if utf_len > 1 then + Some (Uutf.decoder ~encoding:`UTF_8 (`String maybe_utf) |> Uutf.decode) + else None + in + match uchar with + | Some (`Uchar u) -> + let uchar_len = utf_8_byte_length u in + let utf_char = String.sub maybe_utf ~pos:0 ~len:uchar_len in + error + ~delta:(String.length skip) + ~delta_stop:(-uchar_len) + lexbuf + (Printf.sprintf "The character %s is not allowed inside %%{...} forms" utf_char) + | _ -> + error + ~delta:(String.length skip) + lexbuf + (Printf.sprintf "The character %C is not allowed inside %%{...} forms" other) } { diff --git a/test/blackbox-tests/test-cases/formatting/non-ascii-characters.t b/test/blackbox-tests/test-cases/formatting/non-ascii-characters.t index c11fd94e4145..dcf44519b4e2 100644 --- a/test/blackbox-tests/test-cases/formatting/non-ascii-characters.t +++ b/test/blackbox-tests/test-cases/formatting/non-ascii-characters.t @@ -1,13 +1,30 @@ -How the non-ASCII characters are handled, this is also related to the issue #9728 +Utf8 characters are handled for now, this is also related to the issue #9728 $ dune format-dune-file < ("É") + > ("Éff ĎúÑȨ") > EOF - ("\195\137") + ("Éff ĎúÑȨ") $ dune format-dune-file < (run foo %{bin:é}) > EOF File "", line 1, characters 15-16: - Error: The character '\195' is not allowed inside %{...} forms + Error: The character é is not allowed inside %{...} forms + [1] + + $ dune format-dune-file < (echo "hÉllo") + > EOF + (echo "hÉllo") + + $ dune format-dune-file < (echo "É") + > EOF + (echo "É") + + $ dune format-dune-file < (Écho "hello") + > EOF + File "", line 1, characters 1-1: + Error: Invalid . file [1]