Skip to content

Commit

Permalink
fix: handle utf8 characters in the dune files(#9728)
Browse files Browse the repository at this point in the history
Signed-off-by: Alpha DIALLO <moyodiallo@gmail.com>
  • Loading branch information
moyodiallo committed Feb 24, 2024
1 parent 5100a06 commit 0907f5f
Show file tree
Hide file tree
Showing 4 changed files with 115 additions and 34 deletions.
2 changes: 1 addition & 1 deletion src/dune_sexp/dune
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
(library
(name dune_sexp)
(synopsis "[Internal] S-expression library")
(libraries stdune)
(libraries stdune dune_uutf)
(instrumentation
(backend bisect_ppx)))

Expand Down
78 changes: 57 additions & 21 deletions src/dune_sexp/escape.ml
Original file line number Diff line number Diff line change
@@ -1,26 +1,47 @@
open! Stdune

let utf_8_byte_length u =
match Uchar.to_int u with
| u when u < 0 -> assert false
| u when u <= 0x007F -> 1
| u when u <= 0x07FF -> 2
| u when u <= 0xFFFF -> 3
| u when u <= 0x10FFFF -> 4
| _ -> assert false
;;

let quote_length s =
let n = ref 0 in
let len = String.length s in
for i = 0 to len - 1 do
n
:= !n
+
match String.unsafe_get s i with
| '\"' | '\\' | '\n' | '\t' | '\r' | '\b' -> 2
| '%' -> if i + 1 < len && s.[i + 1] = '{' then 2 else 1
| ' ' .. '~' -> 1
| _ -> 4
let i = ref 0 in
while !i < len do
(n
:= !n
+
match String.unsafe_get s !i with
| '\"' | '\\' | '\n' | '\t' | '\r' | '\b' -> 2
| '%' -> if !i + 1 < len && s.[!i + 1] = '{' then 2 else 1
| ' ' .. '~' -> 1
| _ ->
let uchar = String.sub s ~pos:!i ~len:(min (len - !i) 4) in
(match Uutf.decoder ~encoding:`UTF_8 (`String uchar) |> Uutf.decode with
| `Uchar u ->
let uchar_len = utf_8_byte_length u in
assert (uchar_len > 1 && uchar_len < 5);
i := !i + uchar_len - 1;
uchar_len
| _ -> 4));
incr i
done;
!n
;;

let escape_to s ~dst:s' ~ofs =
let n = ref ofs in
let len = String.length s in
for i = 0 to len - 1 do
(match String.unsafe_get s i with
let i = ref 0 in
while !i < len do
(match String.unsafe_get s !i with
| ('\"' | '\\') as c ->
Bytes.unsafe_set s' !n '\\';
incr n;
Expand All @@ -41,21 +62,36 @@ let escape_to s ~dst:s' ~ofs =
Bytes.unsafe_set s' !n '\\';
incr n;
Bytes.unsafe_set s' !n 'b'
| '%' when i + 1 < len && s.[i + 1] = '{' ->
| '%' when !i + 1 < len && s.[!i + 1] = '{' ->
Bytes.unsafe_set s' !n '\\';
incr n;
Bytes.unsafe_set s' !n '%'
| ' ' .. '~' as c -> Bytes.unsafe_set s' !n c
| c ->
let a = Char.code c in
Bytes.unsafe_set s' !n '\\';
incr n;
Bytes.unsafe_set s' !n (Char.unsafe_chr (48 + (a / 100)));
incr n;
Bytes.unsafe_set s' !n (Char.unsafe_chr (48 + (a / 10 mod 10)));
incr n;
Bytes.unsafe_set s' !n (Char.unsafe_chr (48 + (a mod 10))));
incr n
let uchar = String.sub s ~pos:!i ~len:(min (len - !i) 4) in
(match Uutf.decoder ~encoding:`UTF_8 (`String uchar) |> Uutf.decode with
| `Uchar u ->
let uchar_len = utf_8_byte_length u in
assert (uchar_len > 1 && uchar_len < 5);
Bytes.unsafe_set s' !n (String.unsafe_get s !i);
Bytes.unsafe_set s' (!n + 1) (String.unsafe_get s (!i + 1));
if uchar_len > 2
then Bytes.unsafe_set s' (!n + 2) (String.unsafe_get s (!i + 2));
if uchar_len > 3
then Bytes.unsafe_set s' (!n + 3) (String.unsafe_get s (!i + 3));
n := !n + uchar_len - 1;
i := !i + uchar_len - 1
| _ ->
let a = Char.code c in
Bytes.unsafe_set s' !n '\\';
incr n;
Bytes.unsafe_set s' !n (Char.unsafe_chr (48 + (a / 100)));
incr n;
Bytes.unsafe_set s' !n (Char.unsafe_chr (48 + (a / 10 mod 10)));
incr n;
Bytes.unsafe_set s' !n (Char.unsafe_chr (48 + (a mod 10)))));
incr n;
incr i
done
;;

Expand Down
44 changes: 36 additions & 8 deletions src/dune_sexp/lexer.mll
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,12 @@ end

type t = with_comments:bool -> Lexing.lexbuf -> Token.t

let error ?(delta = 0) lexbuf message =
let error ?(delta = 0) ?(delta_stop = 0) lexbuf message =
let start = Lexing.lexeme_start_p lexbuf in
let stop = Lexing.lexeme_end_p lexbuf in
let loc =
Loc.create ~start:{ start with pos_cnum = start.pos_cnum + delta }
~stop:(Lexing.lexeme_end_p lexbuf)
~stop:{ stop with pos_cnum = stop.pos_cnum + delta_stop }
in
User_error.raise ~loc [ Pp.text message ]

Expand Down Expand Up @@ -144,6 +145,8 @@ let hexdigit = ['0'-'9' 'a'-'f' 'A'-'F']
let atom_char = [^ ';' '(' ')' '"' '\000'-'\032' '\127'-'\255']
let varname_char = atom_char # [ ':' '%' '{' '}' ]

let non_ascii = ['\128'-'\255']

rule token with_comments = parse
| newline
{ Lexing.new_line lexbuf; token with_comments lexbuf }
Expand Down Expand Up @@ -352,13 +355,38 @@ and template_variable = parse
}
| '}' | eof
{ error lexbuf "%{...} forms cannot be empty" }
| (varname_char* as skip) (_ as other)
| (varname_char+ ':' ((':' | varname_char)*) as skip) (_ as other)
| (varname_char* as skip) (non_ascii* as maybe_utf) (_ as other)
| (varname_char+ ':' ((':' | varname_char)*) as skip) (non_ascii* as maybe_utf) (_ as other)
{
error
~delta:(String.length skip)
lexbuf
(Printf.sprintf "The character %C is not allowed inside %%{...} forms" other)
let utf_8_byte_length u =
match Uchar.to_int u with
| u when u < 0 -> assert false
| u when u <= 0x007F -> 1
| u when u <= 0x07FF -> 2
| u when u <= 0xFFFF -> 3
| u when u <= 0x10FFFF -> 4
| _ -> assert false
in
let utf_len = String.length maybe_utf in
let uchar =
if utf_len > 1 then
Some (Uutf.decoder ~encoding:`UTF_8 (`String maybe_utf) |> Uutf.decode)
else None
in
match uchar with
| Some (`Uchar u) ->
let uchar_len = utf_8_byte_length u in
let utf_char = String.sub maybe_utf ~pos:0 ~len:uchar_len in
error
~delta:(String.length skip)
~delta_stop:(-uchar_len)
lexbuf
(Printf.sprintf "The character %s is not allowed inside %%{...} forms" utf_char)
| _ ->
error
~delta:(String.length skip)
lexbuf
(Printf.sprintf "The character %C is not allowed inside %%{...} forms" other)
}
{
Expand Down
25 changes: 21 additions & 4 deletions test/blackbox-tests/test-cases/formatting/non-ascii-characters.t
Original file line number Diff line number Diff line change
@@ -1,13 +1,30 @@
How the non-ASCII characters are handled, this is also related to the issue #9728
Utf8 characters are handled for now, this is also related to the issue #9728

$ dune format-dune-file <<EOF
> ("É")
> ("Éff ĎúÑȨ")
> EOF
("\195\137")
("Éff ĎúÑȨ")

$ dune format-dune-file <<EOF
> (run foo %{bin:é})
> EOF
File "", line 1, characters 15-16:
Error: The character '\195' is not allowed inside %{...} forms
Error: The character é is not allowed inside %{...} forms
[1]

$ dune format-dune-file <<EOF
> (echo "hÉllo")
> EOF
(echo "hÉllo")

$ dune format-dune-file <<EOF
> (echo "É")
> EOF
(echo "É")

$ dune format-dune-file <<EOF
> (Écho "hello")
> EOF
File "", line 1, characters 1-1:
Error: Invalid . file
[1]

0 comments on commit 0907f5f

Please sign in to comment.