Merge pull request #756 from ejgallego/input_help_unicode

[hover] Show input help for Unicode characters
ejgallego · Jun 5, 2024 · d46cfd6 · d46cfd6
2 parents 9bcc395 + 91c79ca
commit d46cfd6
Show file tree

Hide file tree

Showing 11 changed files with 288 additions and 138 deletions.
diff --git a/CHANGES.md b/CHANGES.md
@@ -19,6 +19,8 @@
    #754)
  - [vscode] Fix focus race when a Coq file is in column 2 (@ejgallego,
    #755, cc: #722, #725)
+ - [hover] Show input help for Unicode characters on hover
+   (@ejgallego, Léo Stefanesco, #756)
 
 # coq-lsp 0.1.10: Hasta el 40 de Mayo _en effect_...
 ----------------------------------------------------

diff --git a/controller/rq_common.ml b/controller/rq_common.ml
@@ -55,22 +55,44 @@ let validate_line ~(contents : Fleche.Contents.t) ~line =
     Some (Array.get contents.lines line)
   else None
 
-let validate_column char line =
+let validate_column ~get char line =
   let length = Lang.Utf.length_utf16 line in
   if char < length then
     let char = Lang.Utf.utf8_offset_of_utf16_offset ~line ~offset:char in
-    Some (String.get line char)
+    get line char
   else None
 
 (* This returns a byte-based char offset for the line *)
-let validate_position ~contents ~point =
+let validate_position ~get ~contents ~point =
   let line, char = point in
-  validate_line ~contents ~line |> fun l -> Option.bind l (validate_column char)
+  validate_line ~contents ~line |> fun l ->
+  Option.bind l (validate_column ~get char)
 
-let get_char_at_point ~contents ~point =
-  let line, char = point in
-  if char >= 1 then
-    let point = (line, char - 1) in
-    validate_position ~contents ~point
-  else (* Can't get previous char *)
-    None
+let get_char_at_point_gen ~prev ~get ~contents ~point =
+  if prev then
+    let line, char = point in
+    if char >= 1 then
+      let point = (line, char - 1) in
+      validate_position ~get ~contents ~point
+    else (* Can't get previous char *)
+      None
+  else validate_position ~get ~contents ~point
+
+let get_char_at_point ~prev ~contents ~point =
+  let get line utf8_offset = Some (String.get line utf8_offset) in
+  get_char_at_point_gen ~prev ~get ~contents ~point
+
+let get_uchar_at_point ~prev ~contents ~point =
+  let get line utf8_offset =
+    let decode =
+      Lang.Compat.OCaml4_14.String.get_utf_8_uchar line utf8_offset
+    in
+    if Lang.Compat.OCaml4_14.Uchar.utf_decode_is_valid decode then
+      let str =
+        String.sub line utf8_offset
+          (Lang.Compat.OCaml4_14.Uchar.utf_decode_length decode)
+      in
+      Some (Lang.Compat.OCaml4_14.Uchar.utf_decode_uchar decode, str)
+    else None
+  in
+  get_char_at_point_gen ~prev ~get ~contents ~point
diff --git a/controller/rq_common.mli b/controller/rq_common.mli
@@ -12,4 +12,11 @@ val get_id_at_point :
   contents:Fleche.Contents.t -> point:int * int -> string option
 
 val get_char_at_point :
-  contents:Fleche.Contents.t -> point:int * int -> char option
+  prev:bool -> contents:Fleche.Contents.t -> point:int * int -> char option
+
+(** Get both the uchar and its UTF-8 string representation *)
+val get_uchar_at_point :
+     prev:bool
+  -> contents:Fleche.Contents.t
+  -> point:int * int
+  -> (Uchar.t * string) option
diff --git a/controller/rq_completion.ml b/controller/rq_completion.ml
@@ -60,21 +60,15 @@ let mk_unicode_completion_item point (label, newText) =
   mk_completion ~label ~labelDetails ~textEdit ~commitCharacters ()
 
 let unicode_list point : Yojson.Safe.t list =
-  let ulist =
-    match !Fleche.Config.v.unicode_completion with
-    | Off -> []
-    | Internal_small -> Unicode_bindings.small
-    | Normal -> Unicode_bindings.normal
-    | Extended -> Unicode_bindings.extended
-  in
+  let ulist = Unicode_bindings.from_config () in
   (* Coq's CList.map is tail-recursive *)
   CList.map (mk_unicode_completion_item point) ulist
 
 let completion ~token:_ ~(doc : Fleche.Doc.t) ~point =
   (* Instead of get_char_at_point we should have a CompletionContext.t, to be
      addressed in further completion PRs *)
   let contents = doc.contents in
-  (match Rq_common.get_char_at_point ~contents ~point with
+  (match Rq_common.get_char_at_point ~prev:true ~contents ~point with
   | None ->
     let incomplete = true in
     let items = [] in

diff --git a/controller/rq_hover.ml b/controller/rq_hover.ml
@@ -223,6 +223,29 @@ module Notation : HoverProvider = struct
   let h = Handler.WithNode info_notation
 end
 
+module InputHelp : HoverProvider = struct
+  let mk_map map =
+    List.fold_left
+      (fun m (tex, uni) -> CString.Map.add uni tex m)
+      CString.Map.empty map
+
+  (* Hackish: forced once, so config changes after first use are ignored *)
+  let unimap =
+    Lazy.from_fun (fun () -> mk_map (Unicode_bindings.from_config ()))
+
+  let input_help ~token:_ ~contents ~point ~node:_ =
+    (* check if contents at point match *)
+    match Rq_common.get_uchar_at_point ~prev:false ~contents ~point with
+    | Some (uchar, uchar_str)
+      when Lang.Compat.OCaml4_14.Uchar.utf_8_byte_length uchar > 1 ->
+      Option.map
+        (fun tex -> Format.asprintf "Input %s with %s" uchar_str tex)
+        (CString.Map.find_opt uchar_str (Lazy.force unimap))
+    | Some _ | None -> None
+
+  let h = Handler.MaybeNode input_help
+end
+
 module Register = struct
   let handlers : Handler.t list ref = ref []
   let add fn = handlers := fn :: !handlers
@@ -240,7 +263,9 @@ module Register = struct
 end
 
 (* Register in-file hover plugins *)
-let () = List.iter Register.add [ Loc_info.h; Stats.h; Type.h; Notation.h ]
+let () =
+  List.iter Register.add
+    [ Loc_info.h; Stats.h; Type.h; Notation.h; InputHelp.h ]
 
 let hover ~token ~(doc : Fleche.Doc.t) ~point =
   let node = Info.LC.node ~doc ~point Exact in

diff --git a/controller/unicode_bindings.ml b/controller/unicode_bindings.ml
@@ -1697,3 +1697,10 @@ let extended =
   ; ("\\_v", "ᵥ")
   ; ("\\_x", "ₓ")
   ]
+
+let from_config () =
+  match !Fleche.Config.v.unicode_completion with
+  | Off -> []
+  | Internal_small -> small
+  | Normal -> normal
+  | Extended -> extended
diff --git a/controller/unicode_bindings.mli b/controller/unicode_bindings.mli
@@ -9,3 +9,6 @@ val normal : (string * string) list
 
 (** All the supported bindings for unicode characters in a table. *)
 val extended : (string * string) list
+
+(** Return the list selected in config *)
+val from_config : unit -> (string * string) list
diff --git a/lang/compat.ml b/lang/compat.ml
@@ -0,0 +1,176 @@
+(* OCaml compat *)
+
+(* The following is copied from OCaml's standard library Bytes and Uchar
+   modules. We use the public safe variant of various functions, so it may be
+   slower than the originals.
+
+   TODO: when our minimum supported OCaml version is >= 4.14 we should switch
+   to the standard library. *)
+module Uchar_ = Uchar
+
+module OCaml4_14 = struct
+  module Uchar = struct
+    type utf_decode = int
+
+    (* From Uchar.ml *)
+    let rep = 0xFFFD
+    let valid_bit = 27
+    let decode_bits = 24
+    let[@inline] utf_decode_is_valid d = d lsr valid_bit = 1
+    let[@inline] utf_decode_length d = (d lsr decode_bits) land 0b111
+    let[@inline] utf_decode_uchar d = Uchar.unsafe_of_int (d land 0xFFFFFF)
+    let[@inline] utf_decode n u = ((8 lor n) lsl decode_bits) lor Uchar.to_int u
+    let[@inline] utf_decode_invalid n = (n lsl decode_bits) lor rep
+
+    let utf_8_byte_length u =
+      match Uchar.to_int u with
+      | u when u < 0 -> assert false
+      | u when u <= 0x007F -> 1
+      | u when u <= 0x07FF -> 2
+      | u when u <= 0xFFFF -> 3
+      | u when u <= 0x10FFFF -> 4
+      | _ -> assert false
+
+    let utf_16_byte_length u =
+      match Uchar.to_int u with
+      | u when u < 0 -> assert false
+      | u when u <= 0xFFFF -> 2
+      | u when u <= 0x10FFFF -> 4
+      | _ -> assert false
+  end
+
+  module String = struct
+    let[@inline] not_in_x80_to_xBF b = b lsr 6 <> 0b10
+    let[@inline] not_in_xA0_to_xBF b = b lsr 5 <> 0b101
+    let[@inline] not_in_x80_to_x9F b = b lsr 5 <> 0b100
+    let[@inline] not_in_x90_to_xBF b = b < 0x90 || 0xBF < b
+    let[@inline] not_in_x80_to_x8F b = b lsr 4 <> 0x8
+    let[@inline] utf_8_uchar_2 b0 b1 = ((b0 land 0x1F) lsl 6) lor (b1 land 0x3F)
+
+    let[@inline] utf_8_uchar_3 b0 b1 b2 =
+      ((b0 land 0x0F) lsl 12) lor ((b1 land 0x3F) lsl 6) lor (b2 land 0x3F)
+
+    let[@inline] utf_8_uchar_4 b0 b1 b2 b3 =
+      ((b0 land 0x07) lsl 18)
+      lor ((b1 land 0x3F) lsl 12)
+      lor ((b2 land 0x3F) lsl 6)
+      lor (b3 land 0x3F)
+
+    let[@inline] dec_ret n u = Uchar.utf_decode n (Uchar_.unsafe_of_int u)
+    let dec_invalid = Uchar.utf_decode_invalid
+
+    let get_utf_8_uchar s i =
+      let b = Bytes.unsafe_of_string s in
+      let b0 = Bytes.get_uint8 b i in
+      (* raises if [i] is not a valid index. *)
+      let get = Bytes.get_uint8 in
+      let max = Bytes.length b - 1 in
+      match Char.unsafe_chr b0 with
+      (* See The Unicode Standard, Table 3.7 *)
+      | '\x00' .. '\x7F' -> dec_ret 1 b0
+      | '\xC2' .. '\xDF' ->
+        let i = i + 1 in
+        if i > max then dec_invalid 1
+        else
+          let b1 = get b i in
+          if not_in_x80_to_xBF b1 then dec_invalid 1
+          else dec_ret 2 (utf_8_uchar_2 b0 b1)
+      | '\xE0' ->
+        let i = i + 1 in
+        if i > max then dec_invalid 1
+        else
+          let b1 = get b i in
+          if not_in_xA0_to_xBF b1 then dec_invalid 1
+          else
+            let i = i + 1 in
+            if i > max then dec_invalid 2
+            else
+              let b2 = get b i in
+              if not_in_x80_to_xBF b2 then dec_invalid 2
+              else dec_ret 3 (utf_8_uchar_3 b0 b1 b2)
+      | '\xE1' .. '\xEC' | '\xEE' .. '\xEF' ->
+        let i = i + 1 in
+        if i > max then dec_invalid 1
+        else
+          let b1 = get b i in
+          if not_in_x80_to_xBF b1 then dec_invalid 1
+          else
+            let i = i + 1 in
+            if i > max then dec_invalid 2
+            else
+              let b2 = get b i in
+              if not_in_x80_to_xBF b2 then dec_invalid 2
+              else dec_ret 3 (utf_8_uchar_3 b0 b1 b2)
+      | '\xED' ->
+        let i = i + 1 in
+        if i > max then dec_invalid 1
+        else
+          let b1 = get b i in
+          if not_in_x80_to_x9F b1 then dec_invalid 1
+          else
+            let i = i + 1 in
+            if i > max then dec_invalid 2
+            else
+              let b2 = get b i in
+              if not_in_x80_to_xBF b2 then dec_invalid 2
+              else dec_ret 3 (utf_8_uchar_3 b0 b1 b2)
+      | '\xF0' ->
+        let i = i + 1 in
+        if i > max then dec_invalid 1
+        else
+          let b1 = get b i in
+          if not_in_x90_to_xBF b1 then dec_invalid 1
+          else
+            let i = i + 1 in
+            if i > max then dec_invalid 2
+            else
+              let b2 = get b i in
+              if not_in_x80_to_xBF b2 then dec_invalid 2
+              else
+                let i = i + 1 in
+                if i > max then dec_invalid 3
+                else
+                  let b3 = get b i in
+                  if not_in_x80_to_xBF b3 then dec_invalid 3
+                  else dec_ret 4 (utf_8_uchar_4 b0 b1 b2 b3)
+      | '\xF1' .. '\xF3' ->
+        let i = i + 1 in
+        if i > max then dec_invalid 1
+        else
+          let b1 = get b i in
+          if not_in_x80_to_xBF b1 then dec_invalid 1
+          else
+            let i = i + 1 in
+            if i > max then dec_invalid 2
+            else
+              let b2 = get b i in
+              if not_in_x80_to_xBF b2 then dec_invalid 2
+              else
+                let i = i + 1 in
+                if i > max then dec_invalid 3
+                else
+                  let b3 = get b i in
+                  if not_in_x80_to_xBF b3 then dec_invalid 3
+                  else dec_ret 4 (utf_8_uchar_4 b0 b1 b2 b3)
+      | '\xF4' ->
+        let i = i + 1 in
+        if i > max then dec_invalid 1
+        else
+          let b1 = get b i in
+          if not_in_x80_to_x8F b1 then dec_invalid 1
+          else
+            let i = i + 1 in
+            if i > max then dec_invalid 2
+            else
+              let b2 = get b i in
+              if not_in_x80_to_xBF b2 then dec_invalid 2
+              else
+                let i = i + 1 in
+                if i > max then dec_invalid 3
+                else
+                  let b3 = get b i in
+                  if not_in_x80_to_xBF b3 then dec_invalid 3
+                  else dec_ret 4 (utf_8_uchar_4 b0 b1 b2 b3)
+      | _ -> dec_invalid 1
+  end
+end
diff --git a/lang/compat.mli b/lang/compat.mli
@@ -0,0 +1,16 @@
+module OCaml4_14 : sig
+  module Uchar : sig
+    type utf_decode
+
+    val utf_decode_is_valid : utf_decode -> bool
+    val utf_decode_uchar : utf_decode -> Uchar.t
+    val utf_decode_length : utf_decode -> int
+    (** [utf_decode n u] is a valid decode result for [u] with length [n]. *)
+    val utf_decode : int -> Uchar.t -> utf_decode
+    val utf_8_byte_length : Uchar.t -> int
+    val utf_16_byte_length : Uchar.t -> int
+  end
+
+  module String : sig
+    val get_utf_8_uchar : string -> int -> Uchar.utf_decode
+  end
+end