From e436c57a28096c1bb02fd88431be9434fbb6cd54 Mon Sep 17 00:00:00 2001 From: Shon Feder Date: Fri, 2 Sep 2022 22:34:54 -0400 Subject: [PATCH 01/31] Add some documentation to cryptic Sub functions --- src/parser.ml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/parser.ml b/src/parser.ml index 6e05e28b..86c876a9 100644 --- a/src/parser.ml +++ b/src/parser.ml @@ -1,6 +1,7 @@ open Ast open Compat +(* Implementation of string slices over a base string via an offset *) module Sub : sig type t @@ -12,8 +13,12 @@ module Sub : sig val print : Format.formatter -> t -> unit val head : ?rev:unit -> t -> char option val tail : ?rev:unit -> t -> t + (** [head n s] is a list of the first [n] characters of [s] *) val heads : int -> t -> char list + + (** [tails n s] is [s] with the first [n] characters dropped *) val tails : int -> t -> t + val for_all : (char -> bool) -> t -> bool val exists : (char -> bool) -> t -> bool val is_empty : t -> bool From 3adc1da78ba9dfd51596c909882ef48e7fb68b0e Mon Sep 17 00:00:00 2001 From: Shon Feder Date: Fri, 2 Sep 2022 22:52:46 -0400 Subject: [PATCH 02/31] Factor out string slice access functions The `last` and `tail` functions where each two functions combined into one, differentiated by a `~rev` flag. This imposes unnecessary overhead when trying to read the code (IMO). This change replaces the flag-based usage with two declaritively named functions. We also rename the `ws` function to the more accurate `trim_ws`, since it trims white space, and make the flag a boolean, rather than an optional unit. --- src/parser.ml | 138 +++++++++++++++++++++++++++----------------------- 1 file changed, 76 insertions(+), 62 deletions(-) diff --git a/src/parser.ml b/src/parser.ml index 86c876a9..a8f138c9 100644 --- a/src/parser.ml +++ b/src/parser.ml @@ -11,13 +11,20 @@ module Sub : sig val lexbuf : t -> Lexing.lexbuf val contains : string -> t -> bool val print : Format.formatter -> t -> unit - val head : ?rev:unit -> t -> char option - val tail : ?rev:unit -> t -> t - (** [head n s] is a list of the first [n] characters of [s] *) + val head : t -> char option + val tail : t -> t + + val last : t -> char option + (** [last s] is the [Some c] if [c] is the last character of [s], or else [None] if [s] is empty *) + + val drop_last : t -> t + (** [drop_last s] is the [s] without its last character *) + val heads : int -> t -> char list + (** [head n s] is a list of the first [n] characters of [s] *) - (** [tails n s] is [s] with the first [n] characters dropped *) val tails : int -> t -> t + (** [tails n s] is [s] with the first [n] characters dropped *) val for_all : (char -> bool) -> t -> bool val exists : (char -> bool) -> t -> bool @@ -66,17 +73,21 @@ end = struct in loop off - let head ?rev s = - match (rev, s) with - | _, { len = 0; _ } -> None - | None, { base; off; _ } -> Some base.[off] - | Some (), { base; off; len } -> Some base.[off + len - 1] + let head = function + | { len = 0; _ } -> None + | { base; off; _ } -> Some base.[off] - let tail ?rev s = - match (rev, s) with - | _, { len = 0; _ } -> s - | None, { base; off; len } -> { base; off = succ off; len = pred len } - | Some (), { base; off; len } -> { base; off; len = pred len } + let last = function + | { len = 0; _ } -> None + | { base; off; len } -> Some base.[off + len - 1] + + let tail = function + | { len = 0; _ } as s -> s + | { base; off; len } -> { base; off = succ off; len = pred len } + + let drop_last = function + | { len = 0; _ } as s -> s + | { base; off; len } -> { base; off; len = pred len } let heads n s = if n < 0 then invalid_arg "heads"; @@ -87,6 +98,8 @@ end = struct in loop n s + (* TODO Length can become negative *) + (* TODO Should be named "drop" can become negative *) let tails n { base; off; len } = if n < 0 then invalid_arg "tails"; { base; off = off + n; len = len - n } @@ -264,12 +277,15 @@ let sp3 s = let ( ||| ) p1 p2 s = try p1 s with Fail -> p2 s -let rec ws ?rev s = - match Sub.head ?rev s with - | Some (' ' | '\t' | '\010' .. '\013') -> ws ?rev (Sub.tail ?rev s) +let rec trim_ws ?(rev = false) s = + let inspect, drop = + if rev then (Sub.last, Sub.drop_last) else (Sub.head, Sub.tail) + in + match inspect s with + | Some (' ' | '\t' | '\010' .. '\013') -> trim_ws ~rev (drop s) | None | Some _ -> s -let is_empty s = Sub.is_empty (ws s) +let is_empty s = Sub.is_empty (trim_ws s) let thematic_break s = match Sub.head s with @@ -293,7 +309,7 @@ let setext_heading s = match Sub.head s with | Some c1 when c = c1 -> loop (succ n) (Sub.tail s) | Some _ | None -> - if not (Sub.is_empty (ws s)) then raise Fail; + if not (Sub.is_empty (trim_ws s)) then raise Fail; if c = '-' && n = 1 then raise Fail; (* can be interpreted as an empty list item *) Lsetext_heading ((if c = '-' then 2 else 1), n) @@ -380,7 +396,7 @@ let attribute_string s = Buffer.add_char buf c; loop (Sub.tail s) in - let s', a = loop (ws s) in + let s', a = loop (trim_ws s) in (s', parse_attributes a) let atx_heading s = @@ -390,18 +406,16 @@ let atx_heading s = | Some '#' -> loop (succ n) (Sub.tail s) | Some (' ' | '\t' | '\010' .. '\013') -> let s, a = - match Sub.head ~rev:() s with - | Some '}' -> attribute_string s - | _ -> (s, []) + match Sub.last s with Some '}' -> attribute_string s | _ -> (s, []) in - let s = ws ~rev:() (ws s) in + let s = trim_ws ~rev:true (trim_ws s) in let rec loop t = - match Sub.head ~rev:() t with - | Some '#' -> loop (Sub.tail ~rev:() t) - | Some (' ' | '\t' | '\010' .. '\013') | None -> ws ~rev:() t + match Sub.last t with + | Some '#' -> loop (Sub.drop_last t) + | Some (' ' | '\t' | '\010' .. '\013') | None -> trim_ws ~rev:true t | Some _ -> s in - Latx_heading (n, Sub.to_string (ws (loop s)), a) + Latx_heading (n, Sub.to_string (trim_ws (loop s)), a) | Some _ -> raise Fail | None -> Latx_heading (n, Sub.to_string s, []) in @@ -469,15 +483,15 @@ let entity s = let info_string c s = let buf = Buffer.create 17 in let s, a = - match Sub.head ~rev:() s with Some '}' -> attribute_string s | _ -> (s, []) + match Sub.last s with Some '}' -> attribute_string s | _ -> (s, []) in - let s = ws ~rev:() (ws s) in + let s = trim_ws ~rev:true (trim_ws s) in let rec loop s = match Sub.head s with | Some (' ' | '\t' | '\010' .. '\013') | None -> if c = '`' && Sub.exists (function '`' -> true | _ -> false) s then raise Fail; - ((Buffer.contents buf, Sub.to_string (ws s)), a) + ((Buffer.contents buf, Sub.to_string (trim_ws s)), a) | Some '`' when c = '`' -> raise Fail | Some ('\\' as c) -> ( let s = Sub.tail s in @@ -501,7 +515,7 @@ let info_string c s = Buffer.add_char buf c; loop (Sub.tail s) in - loop (ws s) + loop (trim_ws s) let fenced_code ind s = match Sub.head s with @@ -645,7 +659,7 @@ let special_tag s = List.mem s special_tags let closing_tag s = - let s = ws s in + let s = trim_ws s in match Sub.head s with | Some '>' -> if not (is_empty (Sub.tail s)) then raise Fail; @@ -668,7 +682,7 @@ let known_tag tag s = let ws1 s = match Sub.head s with - | Some (' ' | '\t' | '\010' .. '\013') -> ws s + | Some (' ' | '\t' | '\010' .. '\013') -> trim_ws s | Some _ | None -> raise Fail let attribute_name s = @@ -709,10 +723,10 @@ let attribute_value s = let attribute s = let s = ws1 s in let s = attribute_name s in - let s = ws s in + let s = trim_ws s in match Sub.head s with | Some '=' -> - let s = ws (Sub.tail s) in + let s = trim_ws (Sub.tail s) in attribute_value s | Some _ | None -> s @@ -722,7 +736,7 @@ let attributes s = let open_tag s = let s = attributes s in - let s = ws s in + let s = trim_ws s in let n = match Sub.heads 2 s with | '/' :: '>' :: _ -> 2 @@ -754,9 +768,9 @@ let blank s = let tag_string s = let buf = Buffer.create 17 in let s, a = - match Sub.head ~rev:() s with Some '}' -> attribute_string s | _ -> (s, []) + match Sub.last s with Some '}' -> attribute_string s | _ -> (s, []) in - let s = ws ~rev:() (ws s) in + let s = trim_ws ~rev:true (trim_ws s) in let rec loop s = match Sub.head s with | Some (' ' | '\t' | '\010' .. '\013') | None -> (Buffer.contents buf, a) @@ -764,7 +778,7 @@ let tag_string s = Buffer.add_char buf c; loop (Sub.tail s) in - loop (ws s) + loop (trim_ws s) let def_list s = let s = Sub.tail s in @@ -928,34 +942,34 @@ module Pre = struct let is_emph_match n1 n2 = (* - *foo**bar**baz* - - *foo** -> the second delimiter ** is both an opening and closing delimiter. + + *foo** -> the second delimiter ** is both an opening and closing delimiter. The sum of the length of both delimiters is 3, so they can't be matched. - - **bar** -> they are both opening and closing delemiters. + + **bar** -> they are both opening and closing delemiters. Their sum is 4 which is not a multiple of 3 so they can be matched to produce bar - + The end result is: foobarbaz - + - *foo***bar**baz* - - *foo*** -> *** is both an opening and closing delimiter. + + *foo*** -> *** is both an opening and closing delimiter. Their sum is 4 so they can be matched to produce: foo** - - **bar** -> they are both opening and closing delemiters. + + **bar** -> they are both opening and closing delemiters. Their sum is 4 which is not a multiple of 3 so they can be matched to produce bar - + The end result is: foobarbaz* - + - ***foo***bar**baz* - - ***foo*** -> the second delimiter *** is both an opening and closing delimiter. + + ***foo*** -> the second delimiter *** is both an opening and closing delimiter. Their sum is 6 which is a multiple of 3. However, both lengths are multiples of 3 so they can be matched to produce: foo - + bar**baz* -> ** is both an opening and closing delimiter. Their sum is 3 so they can't be matched - + The end result is: foobar**baz* *) if (n1 + n2) mod 3 = 0 && n1 mod 3 != 0 && n2 mod 3 != 0 then false @@ -972,12 +986,12 @@ module Pre = struct (* The second delimiter (the closer) is also an opener, and both delimiters don't match together, according to the "mod 3" rule. In that case, we check if the next delimiter can match. - + *foo**bar**baz* The second delimiter that's both an opener/closer ( ** before bar) matches with the next delimiter ( ** after bar). They'll become bar. The end result will be: foobarbaz - - + + *foo**bar*baz* The second delimiter that's both an opener/closer ( ** before bar) doesn't match with the next delimiter ( * after bar). **bar will be considered as regular text. The end result will be: foo**barbaz* @@ -1011,12 +1025,12 @@ module Pre = struct (* This case happens when we encounter a second opener delimiter. We look ahead for the next closer, and if the next closer is of the same style, we can match them together. - + *foo _bar_ baz_ The second opener (_ before `bar`) is of the same style as the next closer (_ after `bar`). We can match them to produce bar The end result will be: *foo bar baz_ - - + + *foo _bar* baz_ The second opener (_ before `bar`) is not of the same style as the next closer ( * after `bar`). They can't be matched so we'll consider _bar as regular text. The end result will be: foo _bar baz_ From 5955a32ee179f42c4127b575dbdafcf4fa130c4f Mon Sep 17 00:00:00 2001 From: Shon Feder Date: Fri, 2 Sep 2022 23:10:51 -0400 Subject: [PATCH 03/31] Rename `Sub.tails` to `Sub.drop` Afaik, `drop` is the usual name for this function. Also adds validation logic to ensure the slice cannot exceed the bounds of the underlying string. --- src/parser.ml | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/src/parser.ml b/src/parser.ml index a8f138c9..b74a506d 100644 --- a/src/parser.ml +++ b/src/parser.ml @@ -23,8 +23,8 @@ module Sub : sig val heads : int -> t -> char list (** [head n s] is a list of the first [n] characters of [s] *) - val tails : int -> t -> t - (** [tails n s] is [s] with the first [n] characters dropped *) + val drop : int -> t -> t + (** [drop n s] is [s] with the first [n] characters dropped *) val for_all : (char -> bool) -> t -> bool val exists : (char -> bool) -> t -> bool @@ -98,11 +98,13 @@ end = struct in loop n s - (* TODO Length can become negative *) - (* TODO Should be named "drop" can become negative *) - let tails n { base; off; len } = + let drop n s = if n < 0 then invalid_arg "tails"; - { base; off = off + n; len = len - n } + (* len should not be reduced below 0, as strings cannot have a negative length *) + let len = max (s.len - n) 0 in + (* off should not exceed the length of the base string *) + let off = min (s.off + n) (String.length s.base) in + { s with off; len } let is_empty s = length s = 0 @@ -448,7 +450,7 @@ let entity s = ([ u ], Sub.tail s) | Some _ | None -> raise Fail in - loop 0 0 (Sub.tails 2 s) + loop 0 0 (Sub.drop 2 s) | '#' :: _ -> let rec loop m n s = if m > 7 then raise Fail; @@ -743,7 +745,7 @@ let open_tag s = | '>' :: _ -> 1 | _ -> raise Fail in - if not (is_empty (Sub.tails n s)) then raise Fail; + if not (is_empty (Sub.drop n s)) then raise Fail; Lhtml (false, Hblank) let raw_html s = @@ -754,10 +756,10 @@ let raw_html s = Lhtml (true, Hcontains [ "]]>" ]) | '<' :: '!' :: _ -> Lhtml (true, Hcontains [ ">" ]) | '<' :: '/' :: _ -> - let tag, s = tag_name (Sub.tails 2 s) in + let tag, s = tag_name (Sub.drop 2 s) in (known_tag tag ||| closing_tag) s | '<' :: _ -> - let tag, s = tag_name (Sub.tails 1 s) in + let tag, s = tag_name (Sub.drop 1 s) in (special_tag tag ||| known_tag tag ||| open_tag) s | _ -> raise Fail From 147b7d4c13717d39cb9cf8f1fae050f66bb4ab35 Mon Sep 17 00:00:00 2001 From: Shon Feder Date: Sun, 4 Sep 2022 17:19:08 -0400 Subject: [PATCH 04/31] Rename "heads" to "take" --- src/parser.ml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/parser.ml b/src/parser.ml index b74a506d..71c593cd 100644 --- a/src/parser.ml +++ b/src/parser.ml @@ -20,8 +20,8 @@ module Sub : sig val drop_last : t -> t (** [drop_last s] is the [s] without its last character *) - val heads : int -> t -> char list - (** [head n s] is a list of the first [n] characters of [s] *) + val take : int -> t -> char list + (** [take n s] is a list of the first [n] characters of [s] *) val drop : int -> t -> t (** [drop n s] is [s] with the first [n] characters dropped *) @@ -89,8 +89,8 @@ end = struct | { len = 0; _ } as s -> s | { base; off; len } -> { base; off; len = pred len } - let heads n s = - if n < 0 then invalid_arg "heads"; + let take n s = + if n < 0 then invalid_arg "take"; let rec loop n s = if n = 0 || length s = 0 then [] else @@ -424,7 +424,7 @@ let atx_heading s = loop 0 s let entity s = - match Sub.heads 2 s with + match Sub.take 2 s with | '#' :: ('x' | 'X') :: _ -> let rec loop m n s = if m > 6 then raise Fail; @@ -677,7 +677,7 @@ let special_tag tag s = let known_tag tag s = if not (known_tag tag) then raise Fail; - match Sub.heads 2 s with + match Sub.take 2 s with | (' ' | '\t' | '\010' .. '\013') :: _ | [] | '>' :: _ | '/' :: '>' :: _ -> Lhtml (true, Hblank) | _ -> raise Fail @@ -740,7 +740,7 @@ let open_tag s = let s = attributes s in let s = trim_ws s in let n = - match Sub.heads 2 s with + match Sub.take 2 s with | '/' :: '>' :: _ -> 2 | '>' :: _ -> 1 | _ -> raise Fail @@ -749,7 +749,7 @@ let open_tag s = Lhtml (false, Hblank) let raw_html s = - match Sub.heads 10 s with + match Sub.take 10 s with | '<' :: '?' :: _ -> Lhtml (true, Hcontains [ "?>" ]) | '<' :: '!' :: '-' :: '-' :: _ -> Lhtml (true, Hcontains [ "-->" ]) | '<' :: '!' :: '[' :: 'C' :: 'D' :: 'A' :: 'T' :: 'A' :: '[' :: _ -> From 153bd1317b9558c47334fa5c0dbab80e54f6cfcf Mon Sep 17 00:00:00 2001 From: Shon Feder Date: Sun, 4 Sep 2022 17:30:27 -0400 Subject: [PATCH 05/31] Move `Sub` module into strSlice module file Helps reduce cognitive overload in trying to graple with `parser.ml` and gives a more accurate name to the module. --- src/block.ml | 2 +- src/parser.ml | 122 +---------------------------------------------- src/strSlice.ml | 87 +++++++++++++++++++++++++++++++++ src/strSlice.mli | 31 ++++++++++++ 4 files changed, 120 insertions(+), 122 deletions(-) create mode 100644 src/strSlice.ml create mode 100644 src/strSlice.mli diff --git a/src/block.ml b/src/block.ml index 17ba3a2b..7b5489df 100644 --- a/src/block.ml +++ b/src/block.ml @@ -1,5 +1,5 @@ open Ast -module Sub = Parser.Sub +module Sub = StrSlice module Pre = struct type container = diff --git a/src/parser.ml b/src/parser.ml index 71c593cd..4fc44431 100644 --- a/src/parser.ml +++ b/src/parser.ml @@ -1,127 +1,7 @@ open Ast open Compat -(* Implementation of string slices over a base string via an offset *) -module Sub : sig - type t - - val of_string : ?off:int -> string -> t - val to_string : t -> string - val offset : int -> t -> t - val lexbuf : t -> Lexing.lexbuf - val contains : string -> t -> bool - val print : Format.formatter -> t -> unit - val head : t -> char option - val tail : t -> t - - val last : t -> char option - (** [last s] is the [Some c] if [c] is the last character of [s], or else [None] if [s] is empty *) - - val drop_last : t -> t - (** [drop_last s] is the [s] without its last character *) - - val take : int -> t -> char list - (** [take n s] is a list of the first [n] characters of [s] *) - - val drop : int -> t -> t - (** [drop n s] is [s] with the first [n] characters dropped *) - - val for_all : (char -> bool) -> t -> bool - val exists : (char -> bool) -> t -> bool - val is_empty : t -> bool - val get_offset : t -> int - val length : t -> int - val sub : len:int -> t -> t -end = struct - type t = - { base : string - ; off : int - ; len : int - } - - let of_string ?(off = 0) base = { base; off; len = String.length base - off } - let to_string { base; off; len } = String.sub base off len - let print ppf s = Format.fprintf ppf "%S" (to_string s) - let get_offset { off; _ } = off - let length { len; _ } = len - - let offset n { base; off; len } = - if n < 0 then invalid_arg "offset"; - let rec loop n base off len = - if n = 0 || len = 0 then { base; off; len } - else - match base.[off] with - | '\t' -> - let ts = ((off + 4) / 4 * 4) - off in - let b = Buffer.create len in - Buffer.add_substring b base 0 off; - for _ = 1 to ts do - Buffer.add_char b ' ' - done; - Buffer.add_substring b base (off + 1) (len - 1); - loop n (Buffer.contents b) off (len + ts - 1) - | _ -> loop (n - 1) base (off + 1) (len - 1) - in - loop n base off len - - let lexbuf s = Lexing.from_string (to_string s) - - let contains s1 { base; off; len } = - let rec loop off = - if off + String.length s1 > len then false - else s1 = String.sub base off (String.length s1) || loop (off + 1) - in - loop off - - let head = function - | { len = 0; _ } -> None - | { base; off; _ } -> Some base.[off] - - let last = function - | { len = 0; _ } -> None - | { base; off; len } -> Some base.[off + len - 1] - - let tail = function - | { len = 0; _ } as s -> s - | { base; off; len } -> { base; off = succ off; len = pred len } - - let drop_last = function - | { len = 0; _ } as s -> s - | { base; off; len } -> { base; off; len = pred len } - - let take n s = - if n < 0 then invalid_arg "take"; - let rec loop n s = - if n = 0 || length s = 0 then [] - else - match head s with Some c -> c :: loop (pred n) (tail s) | None -> [] - in - loop n s - - let drop n s = - if n < 0 then invalid_arg "tails"; - (* len should not be reduced below 0, as strings cannot have a negative length *) - let len = max (s.len - n) 0 in - (* off should not exceed the length of the base string *) - let off = min (s.off + n) (String.length s.base) in - { s with off; len } - - let is_empty s = length s = 0 - - let exists f s = - let rec loop s i = - if i >= s.len then false - else if f s.base.[s.off + i] then true - else loop s (succ i) - in - loop s 0 - - let for_all f s = not (exists (fun c -> not (f c)) s) - - let sub ~len s = - if len > s.len then invalid_arg "sub"; - { s with len } -end +module Sub = StrSlice exception Fail diff --git a/src/strSlice.ml b/src/strSlice.ml new file mode 100644 index 00000000..f2aa9cad --- /dev/null +++ b/src/strSlice.ml @@ -0,0 +1,87 @@ +type t = + { base : string + ; off : int + ; len : int + } + +let of_string ?(off = 0) base = { base; off; len = String.length base - off } +let to_string { base; off; len } = String.sub base off len +let print ppf s = Format.fprintf ppf "%S" (to_string s) +let get_offset { off; _ } = off +let length { len; _ } = len + +let offset n { base; off; len } = + if n < 0 then invalid_arg "offset"; + let rec loop n base off len = + if n = 0 || len = 0 then { base; off; len } + else + match base.[off] with + | '\t' -> + let ts = ((off + 4) / 4 * 4) - off in + let b = Buffer.create len in + Buffer.add_substring b base 0 off; + for _ = 1 to ts do + Buffer.add_char b ' ' + done; + Buffer.add_substring b base (off + 1) (len - 1); + loop n (Buffer.contents b) off (len + ts - 1) + | _ -> loop (n - 1) base (off + 1) (len - 1) + in + loop n base off len + +let lexbuf s = Lexing.from_string (to_string s) + +let contains s1 { base; off; len } = + let rec loop off = + if off + String.length s1 > len then false + else s1 = String.sub base off (String.length s1) || loop (off + 1) + in + loop off + +let head = function + | { len = 0; _ } -> None + | { base; off; _ } -> Some base.[off] + +let last = function + | { len = 0; _ } -> None + | { base; off; len } -> Some base.[off + len - 1] + +let tail = function + | { len = 0; _ } as s -> s + | { base; off; len } -> { base; off = succ off; len = pred len } + +let drop_last = function + | { len = 0; _ } as s -> s + | { base; off; len } -> { base; off; len = pred len } + +let take n s = + if n < 0 then invalid_arg "take"; + let rec loop n s = + if n = 0 || length s = 0 then [] + else match head s with Some c -> c :: loop (pred n) (tail s) | None -> [] + in + loop n s + +let drop n s = + if n < 0 then invalid_arg "drop"; + (* len should not be reduced below 0, as strings cannot have a negative length *) + let len = max (s.len - n) 0 in + (* off should not exceed the length of the base string *) + let off = min (s.off + n) (String.length s.base) in + { s with off; len } + +let is_empty s = length s = 0 + +let exists f s = + let rec loop s i = + if i >= s.len then false + else if f s.base.[s.off + i] then true + else loop s (succ i) + in + loop s 0 + +let for_all f s = not (exists (fun c -> not (f c)) s) + +let sub ~len s = + if len > s.len then invalid_arg "sub"; + { s with len } diff --git a/src/strSlice.mli b/src/strSlice.mli new file mode 100644 index 00000000..5874966c --- /dev/null +++ b/src/strSlice.mli @@ -0,0 +1,31 @@ +(* Implementation of string slices over a base string via an offset *) + +type t + +val of_string : ?off:int -> string -> t +val to_string : t -> string +val offset : int -> t -> t +val lexbuf : t -> Lexing.lexbuf +val contains : string -> t -> bool +val print : Format.formatter -> t -> unit +val head : t -> char option +val tail : t -> t + +val last : t -> char option +(** [last s] is the [Some c] if [c] is the last character of [s], or else [None] if [s] is empty *) + +val drop_last : t -> t +(** [drop_last s] is the [s] without its last character *) + +val take : int -> t -> char list +(** [take n s] is a list of the first [n] characters of [s] *) + +val drop : int -> t -> t +(** [drop n s] is [s] with the first [n] characters dropped *) + +val for_all : (char -> bool) -> t -> bool +val exists : (char -> bool) -> t -> bool +val is_empty : t -> bool +val get_offset : t -> int +val length : t -> int +val sub : len:int -> t -> t From 2aaddb32b5608556d022bee4929ff76b53a54825 Mon Sep 17 00:00:00 2001 From: Shon Feder Date: Sun, 4 Sep 2022 18:35:09 -0400 Subject: [PATCH 06/31] Add some comments and improve organization --- src/parser.ml | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/parser.ml b/src/parser.ml index 4fc44431..b1c6634a 100644 --- a/src/parser.ml +++ b/src/parser.ml @@ -1,10 +1,10 @@ open Ast open Compat - module Sub = StrSlice exception Fail +(** Stateful parser combinators *) module P : sig type state type 'a t = state -> 'a @@ -12,11 +12,16 @@ module P : sig val of_string : string -> state val peek : char option t val peek_exn : char t + val peek_before : char -> state -> char + val peek_after : char -> state -> char val pos : state -> int val range : state -> int -> int -> string val set_pos : state -> int -> unit val junk : unit t + val char : char -> unit t + (** [char c] accepts a [c] *) + val next : char t val ( ||| ) : 'a t -> 'a t -> 'a t val ws : unit t @@ -25,8 +30,6 @@ module P : sig val ( >>> ) : unit t -> 'a t -> 'a t val ( <<< ) : 'a t -> unit t -> 'a t val protect : 'a t -> 'a t - val peek_before : char -> state -> char - val peek_after : char -> state -> char val pair : 'a t -> 'b t -> ('a * 'b) t val on_sub : (Sub.t -> 'a * Sub.t) -> 'a t end = struct From f7bc5b06e4e2ffe8785e9e122d38e2c09ffb59ea Mon Sep 17 00:00:00 2001 From: Shon Feder Date: Sun, 4 Sep 2022 18:35:30 -0400 Subject: [PATCH 07/31] Factor out repeated length checks --- src/parser.ml | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/parser.ml b/src/parser.ml index b1c6634a..d0815889 100644 --- a/src/parser.ml +++ b/src/parser.ml @@ -42,20 +42,22 @@ end = struct type 'a t = state -> 'a + let ensure_chars_remain st = if st.pos >= String.length st.str then raise Fail + let char c st = - if st.pos >= String.length st.str then raise Fail; + ensure_chars_remain st; if st.str.[st.pos] <> c then raise Fail; st.pos <- st.pos + 1 let next st = - if st.pos >= String.length st.str then raise Fail - else - let c = st.str.[st.pos] in - st.pos <- st.pos + 1; - c + ensure_chars_remain st; + let c = st.str.[st.pos] in + st.pos <- st.pos + 1; + c let peek_exn st = - if st.pos >= String.length st.str then raise Fail else st.str.[st.pos] + ensure_chars_remain st; + st.str.[st.pos] let peek st = if st.pos >= String.length st.str then None else Some st.str.[st.pos] From 9b98a62fec32139dbd2a4d5253cb22732a0a2e02 Mon Sep 17 00:00:00 2001 From: Shon Feder Date: Sun, 4 Sep 2022 18:37:57 -0400 Subject: [PATCH 08/31] Define peek_exn via peek --- src/parser.ml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/parser.ml b/src/parser.ml index d0815889..1a2d1e95 100644 --- a/src/parser.ml +++ b/src/parser.ml @@ -55,13 +55,10 @@ end = struct st.pos <- st.pos + 1; c - let peek_exn st = - ensure_chars_remain st; - st.str.[st.pos] - let peek st = if st.pos >= String.length st.str then None else Some st.str.[st.pos] + let peek_exn st = match peek st with Some c -> c | None -> raise Fail let peek_before c st = if st.pos = 0 then c else st.str.[st.pos - 1] let peek_after c st = From 63a3709bc0d7c33c0227d74ab587975da3dec907 Mon Sep 17 00:00:00 2001 From: Shon Feder Date: Sun, 4 Sep 2022 21:45:40 -0400 Subject: [PATCH 09/31] Document the parser helpers --- src/parser.ml | 51 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 49 insertions(+), 2 deletions(-) diff --git a/src/parser.ml b/src/parser.ml index 1a2d1e95..4a009bfb 100644 --- a/src/parser.ml +++ b/src/parser.ml @@ -3,6 +3,8 @@ open Compat module Sub = StrSlice exception Fail +(** Raised when a parser fails, used for control flow rather than error + handling *) (** Stateful parser combinators *) module P : sig @@ -10,28 +12,73 @@ module P : sig type 'a t = state -> 'a val of_string : string -> state + val peek : char option t + (** [Some c] if [c] is the next character in the input, or [None] + if the input is exhausted. + + NOTE: Does not advance the state. *) + val peek_exn : char t + (** the next character in the input, or raises [Fail] if the + input is exhausted. + + NOTE: Does not advance the state. *) + val peek_before : char -> state -> char + (** the previous character in the input, or the next + character, if we are at the start of the input. + + NOTE: Does not advance the state. *) + val peek_after : char -> state -> char + (** the character after the next in the input, or the next + character, if we are at the end of the input. + + NOTE: Does not advance the state. *) + val pos : state -> int val range : state -> int -> int -> string val set_pos : state -> int -> unit + val junk : unit t + (** ignores the next character in the input *) val char : char -> unit t - (** [char c] accepts a [c] *) + (** accepts a [c] *) val next : char t - val ( ||| ) : 'a t -> 'a t -> 'a t + val ws : unit t + (** accepts 0 or more white space characters *) + val sp : unit t + (** accepts 0 or more spaces or tabs *) + val ws1 : unit t + (** accepts 1 or more spaces or tabs, fails if none is found *) + + val ( ||| ) : 'a t -> 'a t -> 'a t + (** [p ||| q] tries to accept [p], but in case [p] fails, it accepts [q] + (which can fail). *) + val ( >>> ) : unit t -> 'a t -> 'a t + (** [p >>> q] accepts [p] followed by [q], returning whatever [q] does *) + val ( <<< ) : 'a t -> unit t -> 'a t + (** [p >>> q] accepts [p] followed by [q], returning whatever [q] does *) + val protect : 'a t -> 'a t + (** run the given parser, resetting the state back to it's initial condition + if the parser fails *) + val pair : 'a t -> 'b t -> ('a * 'b) t + val on_sub : (Sub.t -> 'a * Sub.t) -> 'a t + (** Given a function [f] that takes a prefix of a string slice to a value [x] + of type ['a] and some remainder of the slice, [on_sub f] produces [x] from + the state, and advances the input the length of the slice that was + consumed by [f]. *) end = struct type state = { str : string From ed6f2aea5c3b6381902f491ebef36017828a9868 Mon Sep 17 00:00:00 2001 From: Shon Feder Date: Sun, 4 Sep 2022 22:33:25 -0400 Subject: [PATCH 10/31] Clean up sp3 --- src/parser.ml | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/src/parser.ml b/src/parser.ml index 4a009bfb..368b20ae 100644 --- a/src/parser.ml +++ b/src/parser.ml @@ -193,19 +193,15 @@ type t = | Lparagraph | Ldef_list of string +(* drop up to 3 spaces, returning the number of spaces dropped and the remainder of the string *) let sp3 s = - match Sub.head s with - | Some ' ' -> ( - let s = Sub.tail s in - match Sub.head s with - | Some ' ' -> ( - let s = Sub.tail s in - match Sub.head s with - | Some ' ' -> (3, Sub.tail s) - | Some _ | None -> (2, s)) - | Some _ | None -> (1, s)) - | Some _ | None -> (0, s) + match Sub.take 3 s with + | [ ' '; ' '; ' ' ] -> (3, Sub.drop 3 s) + | ' ' :: ' ' :: _ -> (2, Sub.drop 2 s) + | ' ' :: _ -> (1, Sub.drop 1 s) + | _ -> (0, s) +(** TODO Why is this here? Doesn't it exactly repeat the one in [P]? *) let ( ||| ) p1 p2 s = try p1 s with Fail -> p2 s let rec trim_ws ?(rev = false) s = From dabd83c8285b6ea261af10bb20921415558806ce Mon Sep 17 00:00:00 2001 From: Shon Feder Date: Sun, 4 Sep 2022 22:52:55 -0400 Subject: [PATCH 11/31] Remove (most) duplicates of ws testing logic The aim is to enable the reader to see what we are testing for at a glance, to reduce the cognitive load of reading each line. --- src/parser.ml | 60 ++++++++++++++++++++++++++++----------------------- 1 file changed, 33 insertions(+), 27 deletions(-) diff --git a/src/parser.ml b/src/parser.ml index 368b20ae..c39a4b08 100644 --- a/src/parser.ml +++ b/src/parser.ml @@ -2,6 +2,10 @@ open Ast open Compat module Sub = StrSlice +let is_whitespace = function + | ' ' | '\t' | '\010' .. '\013' -> true + | _ -> false + exception Fail (** Raised when a parser fails, used for control flow rather than error handling *) @@ -127,11 +131,9 @@ end = struct let ws st = let rec loop () = - match peek_exn st with - | ' ' | '\t' | '\010' .. '\013' -> - junk st; - loop () - | _ -> () + if is_whitespace (peek_exn st) then ( + junk st; + loop ()) in try loop () with Fail -> () @@ -146,11 +148,10 @@ end = struct try loop () with Fail -> () let ws1 st = - match peek_exn st with - | ' ' | '\t' | '\010' .. '\013' -> - junk st; - ws st - | _ -> raise Fail + if is_whitespace (peek_exn st) then ( + junk st; + ws st) + else raise Fail let ( >>> ) p q st = p st; @@ -209,7 +210,7 @@ let rec trim_ws ?(rev = false) s = if rev then (Sub.last, Sub.drop_last) else (Sub.head, Sub.tail) in match inspect s with - | Some (' ' | '\t' | '\010' .. '\013') -> trim_ws ~rev (drop s) + | Some w when is_whitespace w -> trim_ws ~rev (drop s) | None | Some _ -> s let is_empty s = Sub.is_empty (trim_ws s) @@ -220,7 +221,7 @@ let thematic_break s = let rec loop n s = match Sub.head s with | Some c1 when c = c1 -> loop (succ n) (Sub.tail s) - | Some (' ' | '\t' | '\010' .. '\013') -> loop n (Sub.tail s) + | Some w when is_whitespace w -> loop n (Sub.tail s) | Some _ -> raise Fail | None -> if n < 3 then raise Fail; @@ -331,7 +332,7 @@ let atx_heading s = if n > 6 then raise Fail; match Sub.head s with | Some '#' -> loop (succ n) (Sub.tail s) - | Some (' ' | '\t' | '\010' .. '\013') -> + | Some w when is_whitespace w -> let s, a = match Sub.last s with Some '}' -> attribute_string s | _ -> (s, []) in @@ -339,7 +340,8 @@ let atx_heading s = let rec loop t = match Sub.last t with | Some '#' -> loop (Sub.drop_last t) - | Some (' ' | '\t' | '\010' .. '\013') | None -> trim_ws ~rev:true t + | Some w when is_whitespace w -> trim_ws ~rev:true t + | None -> trim_ws ~rev:true t | Some _ -> s in Latx_heading (n, Sub.to_string (trim_ws (loop s)), a) @@ -415,6 +417,7 @@ let info_string c s = let s = trim_ws ~rev:true (trim_ws s) in let rec loop s = match Sub.head s with + (* TODO use is_whitespace *) | Some (' ' | '\t' | '\010' .. '\013') | None -> if c = '`' && Sub.exists (function '`' -> true | _ -> false) s then raise Fail; @@ -607,9 +610,10 @@ let known_tag tag s = Lhtml (true, Hblank) | _ -> raise Fail +(** TODO Why these repeated functions that look just like thos in [P]? *) let ws1 s = match Sub.head s with - | Some (' ' | '\t' | '\010' .. '\013') -> trim_ws s + | Some w when is_whitespace w -> trim_ws s | Some _ | None -> raise Fail let attribute_name s = @@ -700,6 +704,7 @@ let tag_string s = let s = trim_ws ~rev:true (trim_ws s) in let rec loop s = match Sub.head s with + (* TODO use is_whitespace *) | Some (' ' | '\t' | '\010' .. '\013') | None -> (Buffer.contents buf, a) | Some c -> Buffer.add_char buf c; @@ -710,7 +715,7 @@ let tag_string s = let def_list s = let s = Sub.tail s in match Sub.head s with - | Some (' ' | '\t' | '\010' .. '\013') -> + | Some w when is_whitespace w-> Ldef_list (String.trim (Sub.to_string s)) | _ -> raise Fail @@ -748,7 +753,7 @@ let is_empty st = try let rec loop () = match next st with - | ' ' | '\t' | '\010' .. '\013' -> loop () + | c when is_whitespace c -> loop () | _ -> set_pos st off; false @@ -1018,11 +1023,11 @@ let link_label allow_balanced_brackets st = junk st; Buffer.add_char buf c; loop (succ n) true - | (' ' | '\t' | '\010' .. '\013') as c -> + | w when is_whitespace w -> junk st; - Buffer.add_char buf c; + Buffer.add_char buf w; loop n nonempty - | _ as c -> + | c -> junk st; Buffer.add_char buf c; loop n true @@ -1100,8 +1105,8 @@ let tag_name st = let ws_buf buf st = let rec loop () = match peek st with - | Some ((' ' | '\t' | '\010' .. '\013') as c) -> - Buffer.add_char buf c; + | Some w when is_whitespace w -> + Buffer.add_char buf w; junk st; loop () | Some _ | None -> () @@ -1188,7 +1193,7 @@ let attribute_value_specification = ws >>> char '=' >>> ws >>> attribute_value let ws1_buf buf st = match peek st with - | Some (' ' | '\t' | '\010' .. '\013') -> ws_buf buf st + | Some w when is_whitespace w -> ws_buf buf st | Some _ | None -> raise Fail let attribute st = @@ -1323,7 +1328,7 @@ let declaration st = junk st; Buffer.add_char buf c; loop () - | ' ' | '\t' | '\010' .. '\013' -> + | w when is_whitespace w -> ws1_buf buf st; let rec loop () = match peek_exn st with @@ -1586,7 +1591,7 @@ let inline_pre buf acc st = junk st; gobble_body start (succ m) | _ when m = n -> finish () - | Some ((' ' | '\t' | '\010' .. '\013') as c) -> + | Some c when is_whitespace c -> if m > 0 then Buffer.add_string bufcode (String.make m '`'); Buffer.add_char bufcode (if c = '\010' then ' ' else c); junk st; @@ -1844,10 +1849,11 @@ let sp3 st = | exception Fail -> 0 let link_reference_definition st : attributes Ast.link_def = + (* TODO remove duplicated ws/ws1 functions? *) let ws st = let rec loop seen_nl = match peek st with - | Some (' ' | '\t' | '\011' .. '\013') -> + | Some w when is_whitespace w -> junk st; loop seen_nl | Some '\n' when not seen_nl -> @@ -1859,7 +1865,7 @@ let link_reference_definition st : attributes Ast.link_def = in let ws1 st = match next st with - | ' ' | '\t' | '\010' .. '\013' -> ws st + | w when is_whitespace w -> ws st | _ -> raise Fail in ignore (sp3 st); From 16747cd05a8b16a15ad384e4b9527adb428d6b57 Mon Sep 17 00:00:00 2001 From: Shon Feder Date: Sun, 4 Sep 2022 22:54:40 -0400 Subject: [PATCH 12/31] Move char preds together --- src/parser.ml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/parser.ml b/src/parser.ml index c39a4b08..80adb4b0 100644 --- a/src/parser.ml +++ b/src/parser.ml @@ -6,6 +6,13 @@ let is_whitespace = function | ' ' | '\t' | '\010' .. '\013' -> true | _ -> false +let is_punct = function + | '!' | '"' | '#' | '$' | '%' | '&' | '\'' | '(' | ')' | '*' | '+' | ',' | '-' + | '.' | '/' | ':' | ';' | '<' | '=' | '>' | '?' | '@' | '[' | '\\' | ']' | '^' + | '_' | '`' | '{' | '|' | '}' | '~' -> + true + | _ -> false + exception Fail (** Raised when a parser fails, used for control flow rather than error handling *) @@ -245,13 +252,6 @@ let setext_heading s = loop 1 (Sub.tail s) | Some _ | None -> raise Fail -let is_punct = function - | '!' | '"' | '#' | '$' | '%' | '&' | '\'' | '(' | ')' | '*' | '+' | ',' | '-' - | '.' | '/' | ':' | ';' | '<' | '=' | '>' | '?' | '@' | '[' | '\\' | ']' | '^' - | '_' | '`' | '{' | '|' | '}' | '~' -> - true - | _ -> false - let parse_attributes = function | None -> [] | Some s -> ( From e842b90a71a41eacc8d5ff8dbf2c7f48c1caf5b9 Mon Sep 17 00:00:00 2001 From: Shon Feder Date: Sun, 4 Sep 2022 23:21:53 -0400 Subject: [PATCH 13/31] Add drop_while and drop_last_while --- src/strSlice.ml | 17 +++++++++++++---- src/strSlice.mli | 12 ++++++++++++ 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/src/strSlice.ml b/src/strSlice.ml index f2aa9cad..d2b42198 100644 --- a/src/strSlice.ml +++ b/src/strSlice.ml @@ -9,6 +9,7 @@ let to_string { base; off; len } = String.sub base off len let print ppf s = Format.fprintf ppf "%S" (to_string s) let get_offset { off; _ } = off let length { len; _ } = len +let is_empty s = length s = 0 let offset n { base; off; len } = if n < 0 then invalid_arg "offset"; @@ -50,9 +51,7 @@ let tail = function | { len = 0; _ } as s -> s | { base; off; len } -> { base; off = succ off; len = pred len } -let drop_last = function - | { len = 0; _ } as s -> s - | { base; off; len } -> { base; off; len = pred len } +let uncons s = head s |> Option.map (fun hd -> (hd, tail s)) let take n s = if n < 0 then invalid_arg "take"; @@ -70,7 +69,17 @@ let drop n s = let off = min (s.off + n) (String.length s.base) in { s with off; len } -let is_empty s = length s = 0 +let drop_last = function + | { len = 0; _ } as s -> s + | { base; off; len } -> { base; off; len = pred len } + +let rec drop_while f s = + match uncons s with Some (x, s') when f x -> drop_while f s' | _ -> s + +let rec drop_last_while f s = + match last s with + | Some l when f l -> drop_last_while f (drop_last s) + | _ -> s let exists f s = let rec loop s i = diff --git a/src/strSlice.mli b/src/strSlice.mli index 5874966c..0e115f14 100644 --- a/src/strSlice.mli +++ b/src/strSlice.mli @@ -11,6 +11,10 @@ val print : Format.formatter -> t -> unit val head : t -> char option val tail : t -> t +val uncons : t -> (char * t) option +(** [uncons s] is [Some (h, t)] where [h] is [head s] and [t] is [tail s], + or [None] if [is_empty s] *) + val last : t -> char option (** [last s] is the [Some c] if [c] is the last character of [s], or else [None] if [s] is empty *) @@ -23,6 +27,14 @@ val take : int -> t -> char list val drop : int -> t -> t (** [drop n s] is [s] with the first [n] characters dropped *) +val drop_while : (char -> bool) -> t -> t +(** [drop_while f s] is [s] with the longest prefix for which [f] is true for + every character dropped *) + +val drop_last_while : (char -> bool) -> t -> t +(** [drop_last_while f s] is [s] with the longest suffix for which [f] is true for + every character dropped *) + val for_all : (char -> bool) -> t -> bool val exists : (char -> bool) -> t -> bool val is_empty : t -> bool From ec0e52db2893b93d60b2d09a9d39eaad6fd2a35f Mon Sep 17 00:00:00 2001 From: Shon Feder Date: Sun, 4 Sep 2022 23:22:03 -0400 Subject: [PATCH 14/31] Factor out white space trimming Replace logic with drop_while and separate into three distinct functions. --- src/parser.ml | 44 ++++++++++++++++++++------------------------ 1 file changed, 20 insertions(+), 24 deletions(-) diff --git a/src/parser.ml b/src/parser.ml index 80adb4b0..7099185c 100644 --- a/src/parser.ml +++ b/src/parser.ml @@ -212,15 +212,11 @@ let sp3 s = (** TODO Why is this here? Doesn't it exactly repeat the one in [P]? *) let ( ||| ) p1 p2 s = try p1 s with Fail -> p2 s -let rec trim_ws ?(rev = false) s = - let inspect, drop = - if rev then (Sub.last, Sub.drop_last) else (Sub.head, Sub.tail) - in - match inspect s with - | Some w when is_whitespace w -> trim_ws ~rev (drop s) - | None | Some _ -> s +let trim_leading_ws s = Sub.drop_while is_whitespace s +let trim_trailing_ws s = Sub.drop_last_while is_whitespace s +let trim_ws s = trim_leading_ws s |> trim_trailing_ws -let is_empty s = Sub.is_empty (trim_ws s) +let is_empty s = Sub.is_empty (trim_leading_ws s) let thematic_break s = match Sub.head s with @@ -244,7 +240,7 @@ let setext_heading s = match Sub.head s with | Some c1 when c = c1 -> loop (succ n) (Sub.tail s) | Some _ | None -> - if not (Sub.is_empty (trim_ws s)) then raise Fail; + if not (Sub.is_empty (trim_leading_ws s)) then raise Fail; if c = '-' && n = 1 then raise Fail; (* can be interpreted as an empty list item *) Lsetext_heading ((if c = '-' then 2 else 1), n) @@ -324,7 +320,7 @@ let attribute_string s = Buffer.add_char buf c; loop (Sub.tail s) in - let s', a = loop (trim_ws s) in + let s', a = loop (trim_leading_ws s) in (s', parse_attributes a) let atx_heading s = @@ -336,15 +332,15 @@ let atx_heading s = let s, a = match Sub.last s with Some '}' -> attribute_string s | _ -> (s, []) in - let s = trim_ws ~rev:true (trim_ws s) in + let s = (trim_ws s) in let rec loop t = match Sub.last t with | Some '#' -> loop (Sub.drop_last t) - | Some w when is_whitespace w -> trim_ws ~rev:true t - | None -> trim_ws ~rev:true t + | Some w when is_whitespace w -> trim_trailing_ws t + | None -> trim_trailing_ws t | Some _ -> s in - Latx_heading (n, Sub.to_string (trim_ws (loop s)), a) + Latx_heading (n, Sub.to_string (trim_leading_ws (loop s)), a) | Some _ -> raise Fail | None -> Latx_heading (n, Sub.to_string s, []) in @@ -414,14 +410,14 @@ let info_string c s = let s, a = match Sub.last s with Some '}' -> attribute_string s | _ -> (s, []) in - let s = trim_ws ~rev:true (trim_ws s) in + let s = (trim_ws s) in let rec loop s = match Sub.head s with (* TODO use is_whitespace *) | Some (' ' | '\t' | '\010' .. '\013') | None -> if c = '`' && Sub.exists (function '`' -> true | _ -> false) s then raise Fail; - ((Buffer.contents buf, Sub.to_string (trim_ws s)), a) + ((Buffer.contents buf, Sub.to_string (trim_leading_ws s)), a) | Some '`' when c = '`' -> raise Fail | Some ('\\' as c) -> ( let s = Sub.tail s in @@ -445,7 +441,7 @@ let info_string c s = Buffer.add_char buf c; loop (Sub.tail s) in - loop (trim_ws s) + loop (trim_leading_ws s) let fenced_code ind s = match Sub.head s with @@ -589,7 +585,7 @@ let special_tag s = List.mem s special_tags let closing_tag s = - let s = trim_ws s in + let s = trim_leading_ws s in match Sub.head s with | Some '>' -> if not (is_empty (Sub.tail s)) then raise Fail; @@ -613,7 +609,7 @@ let known_tag tag s = (** TODO Why these repeated functions that look just like thos in [P]? *) let ws1 s = match Sub.head s with - | Some w when is_whitespace w -> trim_ws s + | Some w when is_whitespace w -> trim_leading_ws s | Some _ | None -> raise Fail let attribute_name s = @@ -654,10 +650,10 @@ let attribute_value s = let attribute s = let s = ws1 s in let s = attribute_name s in - let s = trim_ws s in + let s = trim_leading_ws s in match Sub.head s with | Some '=' -> - let s = trim_ws (Sub.tail s) in + let s = trim_leading_ws (Sub.tail s) in attribute_value s | Some _ | None -> s @@ -667,7 +663,7 @@ let attributes s = let open_tag s = let s = attributes s in - let s = trim_ws s in + let s = trim_leading_ws s in let n = match Sub.take 2 s with | '/' :: '>' :: _ -> 2 @@ -701,7 +697,7 @@ let tag_string s = let s, a = match Sub.last s with Some '}' -> attribute_string s | _ -> (s, []) in - let s = trim_ws ~rev:true (trim_ws s) in + let s = (trim_ws s) in let rec loop s = match Sub.head s with (* TODO use is_whitespace *) @@ -710,7 +706,7 @@ let tag_string s = Buffer.add_char buf c; loop (Sub.tail s) in - loop (trim_ws s) + loop (trim_leading_ws s) let def_list s = let s = Sub.tail s in From fa6ae6c675324322788288bbb2089321331f27a3 Mon Sep 17 00:00:00 2001 From: Shon Feder Date: Sun, 4 Sep 2022 23:23:39 -0400 Subject: [PATCH 15/31] Clarify comment --- src/parser.ml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/parser.ml b/src/parser.ml index 7099185c..1a2237d6 100644 --- a/src/parser.ml +++ b/src/parser.ml @@ -209,7 +209,8 @@ let sp3 s = | ' ' :: _ -> (1, Sub.drop 1 s) | _ -> (0, s) -(** TODO Why is this here? Doesn't it exactly repeat the one in [P]? *) +(** TODO Why is this here? Doesn't it almost exactly repeat the one in [P], only with slices? + Why is this kind of repetition needed? *) let ( ||| ) p1 p2 s = try p1 s with Fail -> p2 s let trim_leading_ws s = Sub.drop_while is_whitespace s From 75805d83c86e55dfcc326b456dae33179cc69dbd Mon Sep 17 00:00:00 2001 From: Shon Feder Date: Sun, 4 Sep 2022 23:40:36 -0400 Subject: [PATCH 16/31] Refactor thematic_break parsing - Use uncons to simplify logic - Separate helper function def from case analysis - Simplify failure logic --- src/parser.ml | 46 +++++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/src/parser.ml b/src/parser.ml index 1a2237d6..a6bc7d9b 100644 --- a/src/parser.ml +++ b/src/parser.ml @@ -216,23 +216,26 @@ let ( ||| ) p1 p2 s = try p1 s with Fail -> p2 s let trim_leading_ws s = Sub.drop_while is_whitespace s let trim_trailing_ws s = Sub.drop_last_while is_whitespace s let trim_ws s = trim_leading_ws s |> trim_trailing_ws - let is_empty s = Sub.is_empty (trim_leading_ws s) -let thematic_break s = - match Sub.head s with - | Some (('*' | '_' | '-') as c) -> - let rec loop n s = - match Sub.head s with - | Some c1 when c = c1 -> loop (succ n) (Sub.tail s) - | Some w when is_whitespace w -> loop n (Sub.tail s) - | Some _ -> raise Fail - | None -> - if n < 3 then raise Fail; - Lthematic_break - in - loop 1 (Sub.tail s) - | Some _ | None -> raise Fail +(* See https://spec.commonmark.org/0.30/#thematic-breaks *) +let thematic_break = + let accept symb chars = + let rec loop n s = + match Sub.uncons s with + | Some (c, tl) when symb = c -> loop (succ n) tl + (* Themtic break chars can be separated by spaces *) + | Some (w, tl) when is_whitespace w -> loop n tl + (* Three or more of the same thematic break chars found *) + | None when n >= 3 -> Lthematic_break + | _ -> raise Fail + in + loop 1 chars + in + fun s -> + match Sub.uncons s with + | Some ((('*' | '_' | '-') as symb), rest) -> accept symb rest + | Some _ | None -> raise Fail let setext_heading s = match Sub.head s with @@ -333,7 +336,7 @@ let atx_heading s = let s, a = match Sub.last s with Some '}' -> attribute_string s | _ -> (s, []) in - let s = (trim_ws s) in + let s = trim_ws s in let rec loop t = match Sub.last t with | Some '#' -> loop (Sub.drop_last t) @@ -411,7 +414,7 @@ let info_string c s = let s, a = match Sub.last s with Some '}' -> attribute_string s | _ -> (s, []) in - let s = (trim_ws s) in + let s = trim_ws s in let rec loop s = match Sub.head s with (* TODO use is_whitespace *) @@ -698,7 +701,7 @@ let tag_string s = let s, a = match Sub.last s with Some '}' -> attribute_string s | _ -> (s, []) in - let s = (trim_ws s) in + let s = trim_ws s in let rec loop s = match Sub.head s with (* TODO use is_whitespace *) @@ -712,8 +715,7 @@ let tag_string s = let def_list s = let s = Sub.tail s in match Sub.head s with - | Some w when is_whitespace w-> - Ldef_list (String.trim (Sub.to_string s)) + | Some w when is_whitespace w -> Ldef_list (String.trim (Sub.to_string s)) | _ -> raise Fail let indented_code ind s = @@ -1861,9 +1863,7 @@ let link_reference_definition st : attributes Ast.link_def = loop false in let ws1 st = - match next st with - | w when is_whitespace w -> ws st - | _ -> raise Fail + match next st with w when is_whitespace w -> ws st | _ -> raise Fail in ignore (sp3 st); let label = link_label false st in From d3535dc4f64efab95c516880c749e80a7aefe0bf Mon Sep 17 00:00:00 2001 From: Shon Feder Date: Mon, 5 Sep 2022 11:23:23 -0400 Subject: [PATCH 17/31] Clean up setext_heading --- src/parser.ml | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/src/parser.ml b/src/parser.ml index a6bc7d9b..08bed7cc 100644 --- a/src/parser.ml +++ b/src/parser.ml @@ -194,6 +194,7 @@ type t = | Lthematic_break | Latx_heading of int * string * attributes | Lsetext_heading of int * int + (** the level of the heading and how long the underline marker is *) | Lfenced_code of int * int * code_block_kind * (string * string) * attributes | Lindented_code of Sub.t | Lhtml of bool * html_kind @@ -237,20 +238,28 @@ let thematic_break = | Some ((('*' | '_' | '-') as symb), rest) -> accept symb rest | Some _ | None -> raise Fail +(* See https://spec.commonmark.org/0.30/#setext-heading *) let setext_heading s = - match Sub.head s with - | Some (('-' | '=') as c) -> - let rec loop n s = - match Sub.head s with - | Some c1 when c = c1 -> loop (succ n) (Sub.tail s) - | Some _ | None -> - if not (Sub.is_empty (trim_leading_ws s)) then raise Fail; - if c = '-' && n = 1 then raise Fail; - (* can be interpreted as an empty list item *) - Lsetext_heading ((if c = '-' then 2 else 1), n) - in - loop 1 (Sub.tail s) - | Some _ | None -> raise Fail + (* The first char determines if possible setext heading and what level heading *) + let level, symb = + match Sub.head s with + | Some '=' -> (1, '=') + | Some '-' -> (2, '-') + | _ -> raise Fail + in + (* Trailing whitespace is ignored *) + let trimmed = trim_trailing_ws s in + let heading_length = Sub.length trimmed in + if heading_length = 1 && symb = '-' then + (* can be interpreted as an empty list item *) + raise Fail + else + (* setext headings consist of an unbroken sequence of consecutive header chars *) + let remaining = Sub.drop_while (Char.equal symb) trimmed in + if not (Sub.is_empty remaining) then + (* If other characters remain, it cannot be a setext heading *) + raise Fail + else Lsetext_heading (level, heading_length) let parse_attributes = function | None -> [] From 60f750b41dc431a1d5a0ddcf282f31fa5309a78d Mon Sep 17 00:00:00 2001 From: Shon Feder Date: Mon, 5 Sep 2022 12:59:09 -0400 Subject: [PATCH 18/31] Add stdcmpat dependency Allows us to use more recent stdlib methods while maintaining compat with older versions of the compiler. --- dune-project | 1 + omd.opam | 1 + src/dune | 2 +- src/strSlice.ml | 2 ++ 4 files changed, 5 insertions(+), 1 deletion(-) diff --git a/dune-project b/dune-project index a7f1f836..05cbb0c1 100644 --- a/dune-project +++ b/dune-project @@ -25,6 +25,7 @@ extension mechanism, and some other features. Note that the opam package installs both the OMD library and the command line tool `omd`.") (tags (org:ocamllabs org:mirage)) (depends (ocaml (>= 4.05)) + stdcompat uutf uucp uunf diff --git a/omd.opam b/omd.opam index f5509000..0fd4565a 100644 --- a/omd.opam +++ b/omd.opam @@ -23,6 +23,7 @@ bug-reports: "https://github.com/ocaml/omd/issues" depends: [ "dune" {>= "2.7"} "ocaml" {>= "4.05"} + "stdcompat" "uutf" "uucp" "uunf" diff --git a/src/dune b/src/dune index 7de9d124..42a7f6a2 100644 --- a/src/dune +++ b/src/dune @@ -1,7 +1,7 @@ (library (name omd) (public_name omd) - (libraries uutf uucp uunf) + (libraries uutf uucp uunf stdcompat) (flags :standard -w -30)) (rule diff --git a/src/strSlice.ml b/src/strSlice.ml index d2b42198..def61716 100644 --- a/src/strSlice.ml +++ b/src/strSlice.ml @@ -1,3 +1,5 @@ +module Option = Stdcompat.Option + type t = { base : string ; off : int From 28b38e531f22f4066e72ad50ebbf94190d891239 Mon Sep 17 00:00:00 2001 From: Shon Feder Date: Mon, 5 Sep 2022 13:40:00 -0400 Subject: [PATCH 19/31] Remove unneeded disabled warning --- src/dune | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/dune b/src/dune index 42a7f6a2..47c78257 100644 --- a/src/dune +++ b/src/dune @@ -1,8 +1,7 @@ (library (name omd) (public_name omd) - (libraries uutf uucp uunf stdcompat) - (flags :standard -w -30)) + (libraries uutf uucp uunf stdcompat)) (rule (with-stdout-to From 46ee44c2378fa5c719eea2b36516efe208290aea Mon Sep 17 00:00:00 2001 From: Shon Feder Date: Mon, 5 Sep 2022 21:48:53 -0400 Subject: [PATCH 20/31] Add index, split_at, and fold_left These will enable cleaner logic in the parsing --- src/strSlice.ml | 51 ++++++++++++++++++++++++++++++++++++++++++++++++ src/strSlice.mli | 30 +++++++++++++++++++++++++++- 2 files changed, 80 insertions(+), 1 deletion(-) diff --git a/src/strSlice.ml b/src/strSlice.ml index def61716..538983b8 100644 --- a/src/strSlice.ml +++ b/src/strSlice.ml @@ -83,6 +83,44 @@ let rec drop_last_while f s = | Some l when f l -> drop_last_while f (drop_last s) | _ -> s +let index f s = + let len = length s in + let rest = drop_while (fun c -> not (f c)) s in + let idx = len - length rest in + if idx = len then + None + else + Some idx + +(* Uncomment to test *) +(* TODO: rig up method to unit test our utilities *) +(* let () = *) +(* let index c = index (Char.equal c) in *) +(* let s = of_string "abcd" in *) +(* assert (index 'a' s = Some 0); *) +(* assert (index 'b' s = Some 1); *) +(* assert (index 'c' s = Some 2); *) +(* assert (index 'z' s = None) *) + +let split_at f s = + match index f s with + | None -> (s, offset (length s) s) + | Some idx -> ({s with len = idx} , offset idx s) + +(* Uncomment to test *) +(* TODO: rig up method to unit test our utilities *) +(* let () = *) +(* let f x = x = 'c' in *) +(* let before, rest = split_at f (of_string "abcdef") in *) +(* assert ("ab" = to_string before); *) +(* assert ("cdef" = to_string rest); *) +(* let before, rest = split_at f (of_string "cab") in *) +(* assert ("" = to_string before); *) +(* assert ("cab" = to_string rest); *) +(* let before, rest = split_at f (of_string "aaa") in *) +(* assert ("aaa" = to_string before); *) +(* assert ("" = to_string rest) *) + let exists f s = let rec loop s i = if i >= s.len then false @@ -96,3 +134,16 @@ let for_all f s = not (exists (fun c -> not (f c)) s) let sub ~len s = if len > s.len then invalid_arg "sub"; { s with len } + +let fold_left f init s = + let rec aux acc rest = + match uncons rest with + | None -> acc + | Some (x, xs) -> aux (f x acc) xs + in + aux init s + +(* let () = *) +(* let s = of_string "abcde" in *) +(* assert (fold_left (fun _ n -> n + 1) 0 s = 5); *) +(* assert (fold_left (fun c s -> String.make 2 c ^ s) "" s = "eeddccbbaa") *) diff --git a/src/strSlice.mli b/src/strSlice.mli index 0e115f14..47d072e8 100644 --- a/src/strSlice.mli +++ b/src/strSlice.mli @@ -7,6 +7,12 @@ val to_string : t -> string val offset : int -> t -> t val lexbuf : t -> Lexing.lexbuf val contains : string -> t -> bool +val length : t -> int + +val index : (char -> bool) -> t -> int option +(** [index c s] is [Some i] where [i] is the index of the character in [s] for + which [f] is first true, or [None] if [f] holds for no characters in [s]. *) + val print : Format.formatter -> t -> unit val head : t -> char option val tail : t -> t @@ -35,9 +41,31 @@ val drop_last_while : (char -> bool) -> t -> t (** [drop_last_while f s] is [s] with the longest suffix for which [f] is true for every character dropped *) +val split_at : (char -> bool) -> t -> t * t +(** [split_at f s] is [(taken, rest)] where [taken] is the prefix of [s] for + which [f] is [false] and [rest] is remainder, including the first character + for which [f] is [true]. + + E.g., + + {[ + let () = + let f x = x = 'c' in + let before, rest = split_at f (of_string "abcdef") in + assert ("ab" = to_string before); + assert ("cdef" = to_string rest); + let before, rest = split_at f (of_string "cab") in + assert ("" = to_string before); + assert ("cab" = to_string rest); + let before, rest = split_at f (of_string "aaa") in + assert ("aaa" = to_string before); + assert ("" = to_string rest) + ]} +*) + +val fold_left : (char -> 'a -> 'a) -> 'a -> t -> 'a val for_all : (char -> bool) -> t -> bool val exists : (char -> bool) -> t -> bool val is_empty : t -> bool val get_offset : t -> int -val length : t -> int val sub : len:int -> t -> t From 7f88563e32748b4682fba96924cfc2c7191f9003 Mon Sep 17 00:00:00 2001 From: Shon Feder Date: Mon, 5 Sep 2022 21:49:22 -0400 Subject: [PATCH 21/31] Further refactor thematic_break Using a fold we can further simplify the logic, and ditch the explicit loop. --- src/parser.ml | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/src/parser.ml b/src/parser.ml index 08bed7cc..d012cd10 100644 --- a/src/parser.ml +++ b/src/parser.ml @@ -221,21 +221,21 @@ let is_empty s = Sub.is_empty (trim_leading_ws s) (* See https://spec.commonmark.org/0.30/#thematic-breaks *) let thematic_break = - let accept symb chars = - let rec loop n s = - match Sub.uncons s with - | Some (c, tl) when symb = c -> loop (succ n) tl - (* Themtic break chars can be separated by spaces *) - | Some (w, tl) when is_whitespace w -> loop n tl - (* Three or more of the same thematic break chars found *) - | None when n >= 3 -> Lthematic_break - | _ -> raise Fail - in - loop 1 chars + (* Accepts thematic break chars or fail, counting how many chars we find *) + let f symb c count = + if Char.equal symb c then succ count + else if is_whitespace c then + (* Thematic break chars can be separated by spaces *) + count + else raise Fail in fun s -> - match Sub.uncons s with - | Some ((('*' | '_' | '-') as symb), rest) -> accept symb rest + match Sub.head s with + | Some (('*' | '_' | '-') as symb) -> + if Sub.fold_left (f symb) 0 s >= 3 then + (* Three or more of the same thematic break chars found *) + Lthematic_break + else raise Fail | Some _ | None -> raise Fail (* See https://spec.commonmark.org/0.30/#setext-heading *) From 4a17175c9a1a2e9ad63ab607c6f0f945c37e2d54 Mon Sep 17 00:00:00 2001 From: Shon Feder Date: Mon, 5 Sep 2022 21:59:11 -0400 Subject: [PATCH 22/31] Simplify setext_heading again --- src/parser.ml | 90 +++++++++++++++++++++++++++------------------------ 1 file changed, 48 insertions(+), 42 deletions(-) diff --git a/src/parser.ml b/src/parser.ml index d012cd10..b94b1f88 100644 --- a/src/parser.ml +++ b/src/parser.ml @@ -240,52 +240,54 @@ let thematic_break = (* See https://spec.commonmark.org/0.30/#setext-heading *) let setext_heading s = - (* The first char determines if possible setext heading and what level heading *) + (* The first char determines if possible setext and the level of the heading *) let level, symb = match Sub.head s with | Some '=' -> (1, '=') | Some '-' -> (2, '-') | _ -> raise Fail in - (* Trailing whitespace is ignored *) - let trimmed = trim_trailing_ws s in - let heading_length = Sub.length trimmed in - if heading_length = 1 && symb = '-' then + let (heading_chars, rest) = Sub.split_at (fun c -> not (Char.equal c symb)) s in + if Char.equal symb '-' && Sub.length heading_chars = 1 then (* can be interpreted as an empty list item *) raise Fail + else if not (Sub.for_all is_whitespace rest) then + (* if anything except whitespace is left, it can't be a setext heading underline *) + raise Fail else - (* setext headings consist of an unbroken sequence of consecutive header chars *) - let remaining = Sub.drop_while (Char.equal symb) trimmed in - if not (Sub.is_empty remaining) then - (* If other characters remain, it cannot be a setext heading *) - raise Fail - else Lsetext_heading (level, heading_length) - -let parse_attributes = function - | None -> [] - | Some s -> ( - let attributes = String.split_on_char ' ' s in - let f (id, classes, acc) s = - if s = "" then (id, classes, acc) - else - match s.[0] with - | '#' -> (Some (String.sub s 1 (String.length s - 1)), classes, acc) - | '.' -> (id, String.sub s 1 (String.length s - 1) :: classes, acc) - | _ -> ( - let attr = String.split_on_char '=' s in - match attr with - | [] -> (id, classes, acc) - | h :: t -> (id, classes, (h, String.concat "=" t) :: acc)) - in - let id, classes, acc = List.fold_left f (None, [], []) attributes in - let acc = List.rev acc in - let acc = - if classes <> [] then - ("class", String.concat " " (List.rev classes)) :: acc - else acc - in - match id with Some id -> ("id", id) :: acc | None -> acc) + Lsetext_heading (level, Sub.length heading_chars) + +(* Parses a string slice in pandoc-style into an association list + + See https://pandoc.org/MANUAL.html#extension-header_attributes *) +let parse_attributes s = + let attributes = String.split_on_char ' ' s in + let f (id, classes, acc) s = + if s = "" then (id, classes, acc) + else + match s.[0] with + | '#' -> (Some (String.sub s 1 (String.length s - 1)), classes, acc) + | '.' -> (id, String.sub s 1 (String.length s - 1) :: classes, acc) + | _ -> ( + let attr = String.split_on_char '=' s in + match attr with + | [] -> (id, classes, acc) + | h :: t -> (id, classes, (h, String.concat "=" t) :: acc)) + in + let id, classes, acc = List.fold_left f (None, [], []) attributes in + let acc = List.rev acc in + let acc = + match classes with + | [] -> acc + | _ :: _ -> + let classes = String.concat " " (List.rev classes) in + ("class", classes) :: acc + in + match id with Some id -> ("id", id) :: acc | None -> acc +(* Parses a string slice into an attribute list (possibly empty) and the non-attribute part of the string + + These are pandoc style attributes https://pandoc.org/MANUAL.html#extension-attributes *) let attribute_string s = let buf = Buffer.create 64 in let rec loop s = @@ -305,16 +307,19 @@ let attribute_string s = let rec loop' s = match Sub.head s with | Some '}' -> ( + (* Found a closing bracket not at the end of the line *) let s = Sub.tail s in match Sub.head s with + | None -> + (* At end of line, so we've finished parsing the attributes *) + ( Sub.of_string (Buffer.contents buf) + , Some (Buffer.contents buf') ) | Some _ -> + (* Not at end of line, so this can't be a set of attributes *) Buffer.add_char buf '{'; Buffer.add_buffer buf buf'; Buffer.add_char buf '}'; - loop s - | None -> - ( Sub.of_string (Buffer.contents buf) - , Some (Buffer.contents buf') )) + loop s) | None -> Buffer.add_char buf '{'; Buffer.add_buffer buf buf'; @@ -334,7 +339,8 @@ let attribute_string s = loop (Sub.tail s) in let s', a = loop (trim_leading_ws s) in - (s', parse_attributes a) + let attrs = Option.map parse_attributes a |> Option.value ~default:[] in + (s', attrs) let atx_heading s = let rec loop n s = @@ -795,7 +801,7 @@ let inline_attribute_string s = loop s (pos s) | _ -> None in - let attr = parse_attributes a in + let attr = Option.map parse_attributes a |> Option.value ~default:[] in if attr = [] then set_pos s ppos; attr From 4527c6aac66b7c40b0c6942bd25ea6d6cbc0a7b8 Mon Sep 17 00:00:00 2001 From: Shon Feder Date: Mon, 5 Sep 2022 22:20:06 -0400 Subject: [PATCH 23/31] Replace custom Compat module with Stdcompat --- src/compat.ml | 52 ------------------------------------------------- src/parser.ml | 2 +- src/strSlice.ml | 2 +- src/toc.ml | 2 +- 4 files changed, 3 insertions(+), 55 deletions(-) delete mode 100644 src/compat.ml diff --git a/src/compat.ml b/src/compat.ml deleted file mode 100644 index 7789f422..00000000 --- a/src/compat.ml +++ /dev/null @@ -1,52 +0,0 @@ -module Uchar = struct - include Uchar - - let rep : Uchar.t = of_int 0xFFFD -end - -module List = struct - include List - - let rec find_map f = function - | [] -> None - | x :: xs -> ( match f x with None -> find_map f xs | y -> y) - - let rec find_opt p = function - | [] -> None - | x :: l -> if p x then Some x else find_opt p l -end - -module Buffer = struct - include Buffer - - let add_utf_8_uchar b u = - match Uchar.to_int u with - | u when u < 0 -> assert false - | u when u <= 0x007F -> Buffer.add_char b (Char.unsafe_chr u) - | u when u <= 0x07FF -> - Buffer.add_char b (Char.unsafe_chr (0xC0 lor (u lsr 6))); - Buffer.add_char b (Char.unsafe_chr (0x80 lor (u land 0x3F))) - | u when u <= 0xFFFF -> - Buffer.add_char b (Char.unsafe_chr (0xE0 lor (u lsr 12))); - Buffer.add_char b (Char.unsafe_chr (0x80 lor ((u lsr 6) land 0x3F))); - Buffer.add_char b (Char.unsafe_chr (0x80 lor (u land 0x3F))) - | u when u <= 0x10FFFF -> - Buffer.add_char b (Char.unsafe_chr (0xF0 lor (u lsr 18))); - Buffer.add_char b (Char.unsafe_chr (0x80 lor ((u lsr 12) land 0x3F))); - Buffer.add_char b (Char.unsafe_chr (0x80 lor ((u lsr 6) land 0x3F))); - Buffer.add_char b (Char.unsafe_chr (0x80 lor (u land 0x3F))) - | _ -> assert false -end - -module String = struct - include String - - let for_all p s = - let n = length s in - let rec loop i = - if i = n then true - else if p (unsafe_get s i) then loop (succ i) - else false - in - loop 0 -end diff --git a/src/parser.ml b/src/parser.ml index b94b1f88..e1a9bd61 100644 --- a/src/parser.ml +++ b/src/parser.ml @@ -1,5 +1,5 @@ open Ast -open Compat +open Stdcompat module Sub = StrSlice let is_whitespace = function diff --git a/src/strSlice.ml b/src/strSlice.ml index 538983b8..c6ca58ac 100644 --- a/src/strSlice.ml +++ b/src/strSlice.ml @@ -1,4 +1,4 @@ -module Option = Stdcompat.Option +open Stdcompat type t = { base : string diff --git a/src/toc.ml b/src/toc.ml index 38e5a4b6..6f98099d 100644 --- a/src/toc.ml +++ b/src/toc.ml @@ -1,5 +1,5 @@ open Ast -open Compat +open Stdcompat let rec remove_links inline = match inline with From 3cafd16951c4da7d55956e2c96de4e7ab5b18df7 Mon Sep 17 00:00:00 2001 From: Shon Feder Date: Mon, 5 Sep 2022 22:40:06 -0400 Subject: [PATCH 24/31] Fix formatting --- src/parser.ml | 5 ++--- src/strSlice.ml | 11 +++-------- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/src/parser.ml b/src/parser.ml index e1a9bd61..e1010d04 100644 --- a/src/parser.ml +++ b/src/parser.ml @@ -247,15 +247,14 @@ let setext_heading s = | Some '-' -> (2, '-') | _ -> raise Fail in - let (heading_chars, rest) = Sub.split_at (fun c -> not (Char.equal c symb)) s in + let heading_chars, rest = Sub.split_at (fun c -> not (Char.equal c symb)) s in if Char.equal symb '-' && Sub.length heading_chars = 1 then (* can be interpreted as an empty list item *) raise Fail else if not (Sub.for_all is_whitespace rest) then (* if anything except whitespace is left, it can't be a setext heading underline *) raise Fail - else - Lsetext_heading (level, Sub.length heading_chars) + else Lsetext_heading (level, Sub.length heading_chars) (* Parses a string slice in pandoc-style into an association list diff --git a/src/strSlice.ml b/src/strSlice.ml index c6ca58ac..4fb3429e 100644 --- a/src/strSlice.ml +++ b/src/strSlice.ml @@ -87,10 +87,7 @@ let index f s = let len = length s in let rest = drop_while (fun c -> not (f c)) s in let idx = len - length rest in - if idx = len then - None - else - Some idx + if idx = len then None else Some idx (* Uncomment to test *) (* TODO: rig up method to unit test our utilities *) @@ -105,7 +102,7 @@ let index f s = let split_at f s = match index f s with | None -> (s, offset (length s) s) - | Some idx -> ({s with len = idx} , offset idx s) + | Some idx -> ({ s with len = idx }, offset idx s) (* Uncomment to test *) (* TODO: rig up method to unit test our utilities *) @@ -137,9 +134,7 @@ let sub ~len s = let fold_left f init s = let rec aux acc rest = - match uncons rest with - | None -> acc - | Some (x, xs) -> aux (f x acc) xs + match uncons rest with None -> acc | Some (x, xs) -> aux (f x acc) xs in aux init s From d2666cef875dd94334b7a5041a799c8327257394 Mon Sep 17 00:00:00 2001 From: Shon Feder Date: Wed, 7 Sep 2022 21:58:28 -0400 Subject: [PATCH 25/31] Use inline records for Lsetext_heading --- src/block.ml | 6 +++--- src/parser.ml | 11 +++++++---- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/src/block.ml b/src/block.ml index 7b5489df..0157f7b3 100644 --- a/src/block.ml +++ b/src/block.ml @@ -90,7 +90,7 @@ module Pre = struct | Rempty, Lblockquote s -> { blocks; next = Rblockquote (process empty s) } | Rempty, Lthematic_break -> { blocks = Thematic_break [] :: blocks; next = Rempty } - | Rempty, Lsetext_heading (2, n) when n >= 3 -> + | Rempty, Lsetext_heading { level = 2; len } when len >= 3 -> { blocks = Thematic_break [] :: blocks; next = Rempty } | Rempty, Latx_heading (level, text, attr) -> { blocks = Heading (attr, level, text) :: blocks; next = Rempty } @@ -117,7 +117,7 @@ module Pre = struct | Lfenced_code _ | Lhtml (true, _) ) ) -> process { blocks = close { blocks; next }; next = Rempty } s - | Rparagraph (_ :: _ as lines), Lsetext_heading (level, _) -> + | Rparagraph (_ :: _ as lines), Lsetext_heading { level; _ } -> let text = concat (List.map trim_left lines) in let defs, text = link_reference_definitions text in link_defs := defs @ !link_defs; @@ -221,7 +221,7 @@ module Pre = struct | Rparagraph (_ :: _ as lines) -> ( match classify_line s with | Parser.Lparagraph | Lindented_code _ - | Lsetext_heading (1, _) + | Lsetext_heading { level = 1; _ } | Lhtml (false, _) -> Some (Rparagraph (Sub.to_string s :: lines)) | _ -> None) diff --git a/src/parser.ml b/src/parser.ml index e1010d04..16359564 100644 --- a/src/parser.ml +++ b/src/parser.ml @@ -193,8 +193,10 @@ type t = | Lblockquote of Sub.t | Lthematic_break | Latx_heading of int * string * attributes - | Lsetext_heading of int * int - (** the level of the heading and how long the underline marker is *) + | Lsetext_heading of + { level : int + ; len : int + } (** the level of the heading and how long the underline marker is *) | Lfenced_code of int * int * code_block_kind * (string * string) * attributes | Lindented_code of Sub.t | Lhtml of bool * html_kind @@ -248,13 +250,14 @@ let setext_heading s = | _ -> raise Fail in let heading_chars, rest = Sub.split_at (fun c -> not (Char.equal c symb)) s in - if Char.equal symb '-' && Sub.length heading_chars = 1 then + let len = Sub.length heading_chars in + if Char.equal symb '-' && len = 1 then (* can be interpreted as an empty list item *) raise Fail else if not (Sub.for_all is_whitespace rest) then (* if anything except whitespace is left, it can't be a setext heading underline *) raise Fail - else Lsetext_heading (level, Sub.length heading_chars) + else Lsetext_heading { level; len } (* Parses a string slice in pandoc-style into an association list From 6611e256f7a5090a80f62ed5ea691a1afa9bb44d Mon Sep 17 00:00:00 2001 From: Shon Feder Date: Wed, 7 Sep 2022 22:07:18 -0400 Subject: [PATCH 26/31] Remove enusre_chars_remain --- src/parser.ml | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/src/parser.ml b/src/parser.ml index 16359564..d2ddc595 100644 --- a/src/parser.ml +++ b/src/parser.ml @@ -100,18 +100,17 @@ end = struct type 'a t = state -> 'a - let ensure_chars_remain st = if st.pos >= String.length st.str then raise Fail - let char c st = - ensure_chars_remain st; - if st.str.[st.pos] <> c then raise Fail; - st.pos <- st.pos + 1 + if st.pos >= String.length st.str then raise Fail + else if st.str.[st.pos] <> c then raise Fail + else st.pos <- st.pos + 1 let next st = - ensure_chars_remain st; - let c = st.str.[st.pos] in - st.pos <- st.pos + 1; - c + if st.pos >= String.length st.str then raise Fail + else + let c = st.str.[st.pos] in + st.pos <- st.pos + 1; + c let peek st = if st.pos >= String.length st.str then None else Some st.str.[st.pos] From ff1799d8db272b58269c7575cc5483e700cfc6d3 Mon Sep 17 00:00:00 2001 From: Shon Feder Date: Wed, 7 Sep 2022 22:14:49 -0400 Subject: [PATCH 27/31] Drop use of "Sub" alias for StrSlice --- src/block.ml | 37 ++++---- src/parser.ml | 246 ++++++++++++++++++++++++++------------------------ 2 files changed, 147 insertions(+), 136 deletions(-) diff --git a/src/block.ml b/src/block.ml index 0157f7b3..248c9e91 100644 --- a/src/block.ml +++ b/src/block.ml @@ -1,5 +1,4 @@ open Ast -module Sub = StrSlice module Pre = struct type container = @@ -98,19 +97,19 @@ module Pre = struct { blocks; next = Rfenced_code (ind, num, q, info, [], a) } | Rempty, Lhtml (_, kind) -> process { blocks; next = Rhtml (kind, []) } s | Rempty, Lindented_code s -> - { blocks; next = Rindented_code [ Sub.to_string s ] } + { blocks; next = Rindented_code [ StrSlice.to_string s ] } | Rempty, Llist_item (kind, indent, s) -> { blocks ; next = Rlist (kind, Tight, false, indent, [], process empty s) } | Rempty, (Lsetext_heading _ | Lparagraph | Ldef_list _) -> - { blocks; next = Rparagraph [ Sub.to_string s ] } + { blocks; next = Rparagraph [ StrSlice.to_string s ] } | Rparagraph [ h ], Ldef_list def -> { blocks; next = Rdef_list (h, [ def ]) } | Rdef_list (term, defs), Ldef_list def -> { blocks; next = Rdef_list (term, def :: defs) } | Rparagraph _, Llist_item ((Ordered (1, _) | Bullet _), _, s1) - when not (Parser.is_empty (Parser.P.of_string (Sub.to_string s1))) -> + when not (Parser.is_empty (Parser.P.of_string (StrSlice.to_string s1))) -> process { blocks = close { blocks; next }; next = Rempty } s | ( Rparagraph _ , ( Lempty | Lblockquote _ | Lthematic_break | Latx_heading _ @@ -130,44 +129,46 @@ module Pre = struct In that case, there's nothing to make as Heading. We can simply add `===` as Rparagraph *) - { blocks; next = Rparagraph [ Sub.to_string s ] } + { blocks; next = Rparagraph [ StrSlice.to_string s ] } else { blocks = Heading ([], level, text) :: blocks; next = Rempty } | Rparagraph lines, _ -> - { blocks; next = Rparagraph (Sub.to_string s :: lines) } + { blocks; next = Rparagraph (StrSlice.to_string s :: lines) } | Rfenced_code (_, num, q, _, _, _), Lfenced_code (_, num', q1, ("", _), _) when num' >= num && q = q1 -> { blocks = close { blocks; next }; next = Rempty } | Rfenced_code (ind, num, q, info, lines, a), _ -> let s = let ind = min (Parser.indent s) ind in - if ind > 0 then Sub.offset ind s else s + if ind > 0 then StrSlice.offset ind s else s in { blocks - ; next = Rfenced_code (ind, num, q, info, Sub.to_string s :: lines, a) + ; next = + Rfenced_code (ind, num, q, info, StrSlice.to_string s :: lines, a) } | Rdef_list (term, d :: defs), Lparagraph -> { blocks - ; next = Rdef_list (term, (d ^ "\n" ^ Sub.to_string s) :: defs) + ; next = Rdef_list (term, (d ^ "\n" ^ StrSlice.to_string s) :: defs) } | Rdef_list _, _ -> process { blocks = close { blocks; next }; next = Rempty } s | Rindented_code lines, Lindented_code s -> - { blocks; next = Rindented_code (Sub.to_string s :: lines) } + { blocks; next = Rindented_code (StrSlice.to_string s :: lines) } | Rindented_code lines, Lempty -> let n = min (Parser.indent s) 4 in - let s = Sub.offset n s in - { blocks; next = Rindented_code (Sub.to_string s :: lines) } + let s = StrSlice.offset n s in + { blocks; next = Rindented_code (StrSlice.to_string s :: lines) } | Rindented_code _, _ -> process { blocks = close { blocks; next }; next = Rempty } s | Rhtml ((Hcontains l as k), lines), _ - when List.exists (fun t -> Sub.contains t s) l -> - { blocks = close { blocks; next = Rhtml (k, Sub.to_string s :: lines) } + when List.exists (fun t -> StrSlice.contains t s) l -> + { blocks = + close { blocks; next = Rhtml (k, StrSlice.to_string s :: lines) } ; next = Rempty } | Rhtml (Hblank, _), Lempty -> { blocks = close { blocks; next }; next = Rempty } | Rhtml (k, lines), _ -> - { blocks; next = Rhtml (k, Sub.to_string s :: lines) } + { blocks; next = Rhtml (k, StrSlice.to_string s :: lines) } | Rblockquote state, Lblockquote s -> { blocks; next = Rblockquote (process state s) } | Rlist (kind, style, _, ind, items, state), Lempty -> @@ -179,7 +180,7 @@ module Pre = struct process { blocks = close { blocks; next }; next = Rempty } s | Rlist (kind, style, prev_empty, ind, items, state), _ when Parser.indent s >= ind -> - let s = Sub.offset ind s in + let s = StrSlice.offset ind s in let state = process state s in let style = let rec new_block = function @@ -223,7 +224,7 @@ module Pre = struct | Parser.Lparagraph | Lindented_code _ | Lsetext_heading { level = 1; _ } | Lhtml (false, _) -> - Some (Rparagraph (Sub.to_string s :: lines)) + Some (Rparagraph (StrSlice.to_string s :: lines)) | _ -> None) | _ -> None in @@ -231,7 +232,7 @@ module Pre = struct | Some next -> { blocks; next } | None -> process { blocks = close { blocks; next }; next = Rempty } s) - let process link_defs state s = process link_defs state (Sub.of_string s) + let process link_defs state s = process link_defs state (StrSlice.of_string s) let of_channel ic = let link_defs = ref [] in diff --git a/src/parser.ml b/src/parser.ml index d2ddc595..3081135a 100644 --- a/src/parser.ml +++ b/src/parser.ml @@ -1,6 +1,5 @@ open Ast open Stdcompat -module Sub = StrSlice let is_whitespace = function | ' ' | '\t' | '\010' .. '\013' -> true @@ -85,7 +84,7 @@ module P : sig val pair : 'a t -> 'b t -> ('a * 'b) t - val on_sub : (Sub.t -> 'a * Sub.t) -> 'a t + val on_sub : (StrSlice.t -> 'a * StrSlice.t) -> 'a t (** Given a function [f] that takes a prefix of a string slice to a value [x] of type ['a] and some remainder of the slice, [on_sub f] produces [x] from the state, and advances the input the length of the slice that was @@ -174,8 +173,8 @@ end = struct (x, y) let on_sub fn st = - let result, s = fn (Sub.of_string ~off:st.pos st.str) in - st.pos <- Sub.get_offset s; + let result, s = fn (StrSlice.of_string ~off:st.pos st.str) in + st.pos <- StrSlice.get_offset s; result end @@ -189,7 +188,7 @@ type code_block_kind = type t = | Lempty - | Lblockquote of Sub.t + | Lblockquote of StrSlice.t | Lthematic_break | Latx_heading of int * string * attributes | Lsetext_heading of @@ -197,28 +196,28 @@ type t = ; len : int } (** the level of the heading and how long the underline marker is *) | Lfenced_code of int * int * code_block_kind * (string * string) * attributes - | Lindented_code of Sub.t + | Lindented_code of StrSlice.t | Lhtml of bool * html_kind - | Llist_item of list_type * int * Sub.t + | Llist_item of list_type * int * StrSlice.t | Lparagraph | Ldef_list of string (* drop up to 3 spaces, returning the number of spaces dropped and the remainder of the string *) let sp3 s = - match Sub.take 3 s with - | [ ' '; ' '; ' ' ] -> (3, Sub.drop 3 s) - | ' ' :: ' ' :: _ -> (2, Sub.drop 2 s) - | ' ' :: _ -> (1, Sub.drop 1 s) + match StrSlice.take 3 s with + | [ ' '; ' '; ' ' ] -> (3, StrSlice.drop 3 s) + | ' ' :: ' ' :: _ -> (2, StrSlice.drop 2 s) + | ' ' :: _ -> (1, StrSlice.drop 1 s) | _ -> (0, s) (** TODO Why is this here? Doesn't it almost exactly repeat the one in [P], only with slices? Why is this kind of repetition needed? *) let ( ||| ) p1 p2 s = try p1 s with Fail -> p2 s -let trim_leading_ws s = Sub.drop_while is_whitespace s -let trim_trailing_ws s = Sub.drop_last_while is_whitespace s +let trim_leading_ws s = StrSlice.drop_while is_whitespace s +let trim_trailing_ws s = StrSlice.drop_last_while is_whitespace s let trim_ws s = trim_leading_ws s |> trim_trailing_ws -let is_empty s = Sub.is_empty (trim_leading_ws s) +let is_empty s = StrSlice.is_empty (trim_leading_ws s) (* See https://spec.commonmark.org/0.30/#thematic-breaks *) let thematic_break = @@ -231,9 +230,9 @@ let thematic_break = else raise Fail in fun s -> - match Sub.head s with + match StrSlice.head s with | Some (('*' | '_' | '-') as symb) -> - if Sub.fold_left (f symb) 0 s >= 3 then + if StrSlice.fold_left (f symb) 0 s >= 3 then (* Three or more of the same thematic break chars found *) Lthematic_break else raise Fail @@ -243,17 +242,19 @@ let thematic_break = let setext_heading s = (* The first char determines if possible setext and the level of the heading *) let level, symb = - match Sub.head s with + match StrSlice.head s with | Some '=' -> (1, '=') | Some '-' -> (2, '-') | _ -> raise Fail in - let heading_chars, rest = Sub.split_at (fun c -> not (Char.equal c symb)) s in - let len = Sub.length heading_chars in + let heading_chars, rest = + StrSlice.split_at (fun c -> not (Char.equal c symb)) s + in + let len = StrSlice.length heading_chars in if Char.equal symb '-' && len = 1 then (* can be interpreted as an empty list item *) raise Fail - else if not (Sub.for_all is_whitespace rest) then + else if not (StrSlice.for_all is_whitespace rest) then (* if anything except whitespace is left, it can't be a setext heading underline *) raise Fail else Lsetext_heading { level; len } @@ -292,28 +293,28 @@ let parse_attributes s = let attribute_string s = let buf = Buffer.create 64 in let rec loop s = - match Sub.head s with - | None -> (Sub.of_string (Buffer.contents buf), None) + match StrSlice.head s with + | None -> (StrSlice.of_string (Buffer.contents buf), None) | Some ('\\' as c) -> ( - let s = Sub.tail s in - match Sub.head s with + let s = StrSlice.tail s in + match StrSlice.head s with | Some c when is_punct c -> Buffer.add_char buf c; - loop (Sub.tail s) + loop (StrSlice.tail s) | Some _ | None -> Buffer.add_char buf c; loop s) | Some '{' -> let buf' = Buffer.create 64 in let rec loop' s = - match Sub.head s with + match StrSlice.head s with | Some '}' -> ( (* Found a closing bracket not at the end of the line *) - let s = Sub.tail s in - match Sub.head s with + let s = StrSlice.tail s in + match StrSlice.head s with | None -> (* At end of line, so we've finished parsing the attributes *) - ( Sub.of_string (Buffer.contents buf) + ( StrSlice.of_string (Buffer.contents buf) , Some (Buffer.contents buf') ) | Some _ -> (* Not at end of line, so this can't be a set of attributes *) @@ -324,20 +325,20 @@ let attribute_string s = | None -> Buffer.add_char buf '{'; Buffer.add_buffer buf buf'; - (Sub.of_string (Buffer.contents buf), None) + (StrSlice.of_string (Buffer.contents buf), None) | Some '{' -> Buffer.add_char buf '{'; Buffer.add_buffer buf buf'; Buffer.reset buf'; - loop' (Sub.tail s) + loop' (StrSlice.tail s) | Some c -> Buffer.add_char buf' c; - loop' (Sub.tail s) + loop' (StrSlice.tail s) in - loop' (Sub.tail s) + loop' (StrSlice.tail s) | Some c -> Buffer.add_char buf c; - loop (Sub.tail s) + loop (StrSlice.tail s) in let s', a = loop (trim_leading_ws s) in let attrs = Option.map parse_attributes a |> Option.value ~default:[] in @@ -346,110 +347,118 @@ let attribute_string s = let atx_heading s = let rec loop n s = if n > 6 then raise Fail; - match Sub.head s with - | Some '#' -> loop (succ n) (Sub.tail s) + match StrSlice.head s with + | Some '#' -> loop (succ n) (StrSlice.tail s) | Some w when is_whitespace w -> let s, a = - match Sub.last s with Some '}' -> attribute_string s | _ -> (s, []) + match StrSlice.last s with + | Some '}' -> attribute_string s + | _ -> (s, []) in let s = trim_ws s in let rec loop t = - match Sub.last t with - | Some '#' -> loop (Sub.drop_last t) + match StrSlice.last t with + | Some '#' -> loop (StrSlice.drop_last t) | Some w when is_whitespace w -> trim_trailing_ws t | None -> trim_trailing_ws t | Some _ -> s in - Latx_heading (n, Sub.to_string (trim_leading_ws (loop s)), a) + Latx_heading (n, StrSlice.to_string (trim_leading_ws (loop s)), a) | Some _ -> raise Fail - | None -> Latx_heading (n, Sub.to_string s, []) + | None -> Latx_heading (n, StrSlice.to_string s, []) in loop 0 s let entity s = - match Sub.take 2 s with + match StrSlice.take 2 s with | '#' :: ('x' | 'X') :: _ -> let rec loop m n s = if m > 6 then raise Fail; - match Sub.head s with + match StrSlice.head s with | Some ('a' .. 'f' as c) -> loop (succ m) ((n * 16) + Char.code c - Char.code 'a' + 10) - (Sub.tail s) + (StrSlice.tail s) | Some ('A' .. 'F' as c) -> loop (succ m) ((n * 16) + Char.code c - Char.code 'A' + 10) - (Sub.tail s) + (StrSlice.tail s) | Some ('0' .. '9' as c) -> - loop (succ m) ((n * 16) + Char.code c - Char.code '0') (Sub.tail s) + loop + (succ m) + ((n * 16) + Char.code c - Char.code '0') + (StrSlice.tail s) | Some ';' -> if m = 0 then raise Fail; let u = if n = 0 || not (Uchar.is_valid n) then Uchar.rep else Uchar.of_int n in - ([ u ], Sub.tail s) + ([ u ], StrSlice.tail s) | Some _ | None -> raise Fail in - loop 0 0 (Sub.drop 2 s) + loop 0 0 (StrSlice.drop 2 s) | '#' :: _ -> let rec loop m n s = if m > 7 then raise Fail; - match Sub.head s with + match StrSlice.head s with | Some ('0' .. '9' as c) -> - loop (succ m) ((n * 10) + Char.code c - Char.code '0') (Sub.tail s) + loop + (succ m) + ((n * 10) + Char.code c - Char.code '0') + (StrSlice.tail s) | Some ';' -> if m = 0 then raise Fail; let u = if n = 0 || not (Uchar.is_valid n) then Uchar.rep else Uchar.of_int n in - ([ u ], Sub.tail s) + ([ u ], StrSlice.tail s) | Some _ | None -> raise Fail in - loop 0 0 (Sub.tail s) + loop 0 0 (StrSlice.tail s) | ('a' .. 'z' | 'A' .. 'Z') :: _ -> let rec loop len t = - match Sub.head t with + match StrSlice.head t with | Some ('a' .. 'z' | 'A' .. 'Z' | '0' .. '9') -> - loop (succ len) (Sub.tail t) + loop (succ len) (StrSlice.tail t) | Some ';' -> ( - let name = Sub.to_string (Sub.sub ~len s) in + let name = StrSlice.to_string (StrSlice.sub ~len s) in match Entities.f name with | [] -> raise Fail - | cps -> (cps, Sub.tail t)) + | cps -> (cps, StrSlice.tail t)) | Some _ | None -> raise Fail in - loop 1 (Sub.tail s) + loop 1 (StrSlice.tail s) | _ -> raise Fail let info_string c s = let buf = Buffer.create 17 in let s, a = - match Sub.last s with Some '}' -> attribute_string s | _ -> (s, []) + match StrSlice.last s with Some '}' -> attribute_string s | _ -> (s, []) in let s = trim_ws s in let rec loop s = - match Sub.head s with + match StrSlice.head s with (* TODO use is_whitespace *) | Some (' ' | '\t' | '\010' .. '\013') | None -> - if c = '`' && Sub.exists (function '`' -> true | _ -> false) s then - raise Fail; - ((Buffer.contents buf, Sub.to_string (trim_leading_ws s)), a) + if c = '`' && StrSlice.exists (function '`' -> true | _ -> false) s + then raise Fail; + ((Buffer.contents buf, StrSlice.to_string (trim_leading_ws s)), a) | Some '`' when c = '`' -> raise Fail | Some ('\\' as c) -> ( - let s = Sub.tail s in - match Sub.head s with + let s = StrSlice.tail s in + match StrSlice.head s with | Some c when is_punct c -> Buffer.add_char buf c; - loop (Sub.tail s) + loop (StrSlice.tail s) | Some _ | None -> Buffer.add_char buf c; loop s) | Some ('&' as c) -> ( - let s = Sub.tail s in + let s = StrSlice.tail s in match entity s with | ul, s -> List.iter (Buffer.add_utf_8_uchar buf) ul; @@ -459,74 +468,74 @@ let info_string c s = loop s) | Some c -> Buffer.add_char buf c; - loop (Sub.tail s) + loop (StrSlice.tail s) in loop (trim_leading_ws s) let fenced_code ind s = - match Sub.head s with + match StrSlice.head s with | Some (('`' | '~') as c) -> let rec loop n s = - match Sub.head s with - | Some c1 when c = c1 -> loop (succ n) (Sub.tail s) + match StrSlice.head s with + | Some c1 when c = c1 -> loop (succ n) (StrSlice.tail s) | Some _ | None -> if n < 3 then raise Fail; let s, a = info_string c s in let c = if c = '`' then Backtick else Tilde in Lfenced_code (ind, n, c, s, a) in - loop 1 (Sub.tail s) + loop 1 (StrSlice.tail s) | Some _ | None -> raise Fail let indent s = let rec loop n s = - match Sub.head s with - | Some ' ' -> loop (n + 1) (Sub.tail s) - | Some '\t' -> loop (n + 4) (Sub.tail s) + match StrSlice.head s with + | Some ' ' -> loop (n + 1) (StrSlice.tail s) + | Some '\t' -> loop (n + 4) (StrSlice.tail s) | Some _ | None -> n in loop 0 s let unordered_list_item ind s = - match Sub.head s with + match StrSlice.head s with | Some (('+' | '-' | '*') as c) -> - let s = Sub.tail s in + let s = StrSlice.tail s in if is_empty s then Llist_item (Bullet c, 2 + ind, s) else let n = indent s in if n = 0 then raise Fail; let n = if n <= 4 then n else 1 in - Llist_item (Bullet c, n + 1 + ind, Sub.offset n s) + Llist_item (Bullet c, n + 1 + ind, StrSlice.offset n s) | Some _ | None -> raise Fail let ordered_list_item ind s = let rec loop n m s = - match Sub.head s with + match StrSlice.head s with | Some ('0' .. '9' as c) -> if n >= 9 then raise Fail; - loop (succ n) ((m * 10) + Char.code c - Char.code '0') (Sub.tail s) + loop (succ n) ((m * 10) + Char.code c - Char.code '0') (StrSlice.tail s) | Some (('.' | ')') as c) -> - let s = Sub.tail s in + let s = StrSlice.tail s in if is_empty s then Llist_item (Ordered (m, c), n + 1 + ind, s) else let ind' = indent s in if ind' = 0 then raise Fail; let ind' = if ind' <= 4 then ind' else 1 in - Llist_item (Ordered (m, c), n + ind + ind' + 1, Sub.offset ind' s) + Llist_item (Ordered (m, c), n + ind + ind' + 1, StrSlice.offset ind' s) | Some _ | None -> raise Fail in loop 0 0 s let tag_name s0 = - match Sub.head s0 with + match StrSlice.head s0 with | Some ('a' .. 'z' | 'A' .. 'Z') -> let rec loop len s = - match Sub.head s with + match StrSlice.head s with | Some ('a' .. 'z' | 'A' .. 'Z' | '0' .. '9' | '-') -> - loop (succ len) (Sub.tail s) - | Some _ | None -> (Sub.to_string (Sub.sub s0 ~len), s) + loop (succ len) (StrSlice.tail s) + | Some _ | None -> (StrSlice.to_string (StrSlice.sub s0 ~len), s) in - loop 1 (Sub.tail s0) + loop 1 (StrSlice.tail s0) | Some _ | None -> raise Fail let known_tags = @@ -606,63 +615,63 @@ let special_tag s = let closing_tag s = let s = trim_leading_ws s in - match Sub.head s with + match StrSlice.head s with | Some '>' -> - if not (is_empty (Sub.tail s)) then raise Fail; + if not (is_empty (StrSlice.tail s)) then raise Fail; Lhtml (false, Hblank) | Some _ | None -> raise Fail let special_tag tag s = if not (special_tag tag) then raise Fail; - match Sub.head s with + match StrSlice.head s with | Some (' ' | '\t' | '\010' .. '\013' | '>') | None -> Lhtml (true, Hcontains [ ""; ""; "" ]) | Some _ -> raise Fail let known_tag tag s = if not (known_tag tag) then raise Fail; - match Sub.take 2 s with + match StrSlice.take 2 s with | (' ' | '\t' | '\010' .. '\013') :: _ | [] | '>' :: _ | '/' :: '>' :: _ -> Lhtml (true, Hblank) | _ -> raise Fail (** TODO Why these repeated functions that look just like thos in [P]? *) let ws1 s = - match Sub.head s with + match StrSlice.head s with | Some w when is_whitespace w -> trim_leading_ws s | Some _ | None -> raise Fail let attribute_name s = - match Sub.head s with + match StrSlice.head s with | Some ('a' .. 'z' | 'A' .. 'Z' | '_' | ':') -> let rec loop s = - match Sub.head s with + match StrSlice.head s with | Some ('a' .. 'z' | 'A' .. 'Z' | '_' | '.' | ':' | '0' .. '9') -> - loop (Sub.tail s) + loop (StrSlice.tail s) | Some _ | None -> s in loop s | Some _ | None -> raise Fail let attribute_value s = - match Sub.head s with + match StrSlice.head s with | Some (('\'' | '"') as c) -> let rec loop s = - match Sub.head s with - | Some c1 when c = c1 -> Sub.tail s - | Some _ -> loop (Sub.tail s) + match StrSlice.head s with + | Some c1 when c = c1 -> StrSlice.tail s + | Some _ -> loop (StrSlice.tail s) | None -> raise Fail in - loop (Sub.tail s) + loop (StrSlice.tail s) | Some _ -> let rec loop first s = - match Sub.head s with + match StrSlice.head s with | Some (' ' | '\t' | '\010' .. '\013' | '"' | '\'' | '=' | '<' | '>' | '`') | None -> if first then raise Fail; s - | Some _ -> loop false (Sub.tail s) + | Some _ -> loop false (StrSlice.tail s) in loop true s | None -> raise Fail @@ -671,9 +680,9 @@ let attribute s = let s = ws1 s in let s = attribute_name s in let s = trim_leading_ws s in - match Sub.head s with + match StrSlice.head s with | Some '=' -> - let s = trim_leading_ws (Sub.tail s) in + let s = trim_leading_ws (StrSlice.tail s) in attribute_value s | Some _ | None -> s @@ -685,26 +694,26 @@ let open_tag s = let s = attributes s in let s = trim_leading_ws s in let n = - match Sub.take 2 s with + match StrSlice.take 2 s with | '/' :: '>' :: _ -> 2 | '>' :: _ -> 1 | _ -> raise Fail in - if not (is_empty (Sub.drop n s)) then raise Fail; + if not (is_empty (StrSlice.drop n s)) then raise Fail; Lhtml (false, Hblank) let raw_html s = - match Sub.take 10 s with + match StrSlice.take 10 s with | '<' :: '?' :: _ -> Lhtml (true, Hcontains [ "?>" ]) | '<' :: '!' :: '-' :: '-' :: _ -> Lhtml (true, Hcontains [ "-->" ]) | '<' :: '!' :: '[' :: 'C' :: 'D' :: 'A' :: 'T' :: 'A' :: '[' :: _ -> Lhtml (true, Hcontains [ "]]>" ]) | '<' :: '!' :: _ -> Lhtml (true, Hcontains [ ">" ]) | '<' :: '/' :: _ -> - let tag, s = tag_name (Sub.drop 2 s) in + let tag, s = tag_name (StrSlice.drop 2 s) in (known_tag tag ||| closing_tag) s | '<' :: _ -> - let tag, s = tag_name (Sub.drop 1 s) in + let tag, s = tag_name (StrSlice.drop 1 s) in (special_tag tag ||| known_tag tag ||| open_tag) s | _ -> raise Fail @@ -715,35 +724,36 @@ let blank s = let tag_string s = let buf = Buffer.create 17 in let s, a = - match Sub.last s with Some '}' -> attribute_string s | _ -> (s, []) + match StrSlice.last s with Some '}' -> attribute_string s | _ -> (s, []) in let s = trim_ws s in let rec loop s = - match Sub.head s with + match StrSlice.head s with (* TODO use is_whitespace *) | Some (' ' | '\t' | '\010' .. '\013') | None -> (Buffer.contents buf, a) | Some c -> Buffer.add_char buf c; - loop (Sub.tail s) + loop (StrSlice.tail s) in loop (trim_leading_ws s) let def_list s = - let s = Sub.tail s in - match Sub.head s with - | Some w when is_whitespace w -> Ldef_list (String.trim (Sub.to_string s)) + let s = StrSlice.tail s in + match StrSlice.head s with + | Some w when is_whitespace w -> + Ldef_list (String.trim (StrSlice.to_string s)) | _ -> raise Fail let indented_code ind s = if indent s + ind < 4 then raise Fail; - Lindented_code (Sub.offset (4 - ind) s) + Lindented_code (StrSlice.offset (4 - ind) s) let parse s0 = let ind, s = sp3 s0 in - match Sub.head s with + match StrSlice.head s with | Some '>' -> - let s = Sub.offset 1 s in - let s = if indent s > 0 then Sub.offset 1 s else s in + let s = StrSlice.offset 1 s in + let s = if indent s > 0 then StrSlice.offset 1 s else s in Lblockquote s | Some '=' -> setext_heading s | Some '-' -> From 7b1ea607d07d267038821e4a12517d42fb353849 Mon Sep 17 00:00:00 2001 From: Shon Feder Date: Wed, 7 Sep 2022 22:20:37 -0400 Subject: [PATCH 28/31] Add Compat module back One of our compat functions is necessary for backwards compatible functionality. --- src/compat.ml | 25 +++++++++++++++++++++++++ src/parser.ml | 2 +- src/strSlice.ml | 2 +- src/toc.ml | 2 +- 4 files changed, 28 insertions(+), 3 deletions(-) create mode 100644 src/compat.ml diff --git a/src/compat.ml b/src/compat.ml new file mode 100644 index 00000000..185ad9e3 --- /dev/null +++ b/src/compat.ml @@ -0,0 +1,25 @@ +(** Overlay on Stdcompat to refine/adjust some compatibility modules *) + +include Stdcompat + +module Buffer = struct + include Buffer + + let add_utf_8_uchar b u = + match Uchar.to_int u with + | u when u < 0 -> assert false + | u when u <= 0x007F -> Buffer.add_char b (Char.unsafe_chr u) + | u when u <= 0x07FF -> + Buffer.add_char b (Char.unsafe_chr (0xC0 lor (u lsr 6))); + Buffer.add_char b (Char.unsafe_chr (0x80 lor (u land 0x3F))) + | u when u <= 0xFFFF -> + Buffer.add_char b (Char.unsafe_chr (0xE0 lor (u lsr 12))); + Buffer.add_char b (Char.unsafe_chr (0x80 lor ((u lsr 6) land 0x3F))); + Buffer.add_char b (Char.unsafe_chr (0x80 lor (u land 0x3F))) + | u when u <= 0x10FFFF -> + Buffer.add_char b (Char.unsafe_chr (0xF0 lor (u lsr 18))); + Buffer.add_char b (Char.unsafe_chr (0x80 lor ((u lsr 12) land 0x3F))); + Buffer.add_char b (Char.unsafe_chr (0x80 lor ((u lsr 6) land 0x3F))); + Buffer.add_char b (Char.unsafe_chr (0x80 lor (u land 0x3F))) + | _ -> assert false +end diff --git a/src/parser.ml b/src/parser.ml index 3081135a..90ac161a 100644 --- a/src/parser.ml +++ b/src/parser.ml @@ -1,5 +1,5 @@ open Ast -open Stdcompat +open Compat let is_whitespace = function | ' ' | '\t' | '\010' .. '\013' -> true diff --git a/src/strSlice.ml b/src/strSlice.ml index 4fb3429e..0fbbb76b 100644 --- a/src/strSlice.ml +++ b/src/strSlice.ml @@ -1,4 +1,4 @@ -open Stdcompat +open Compat type t = { base : string diff --git a/src/toc.ml b/src/toc.ml index 6f98099d..38e5a4b6 100644 --- a/src/toc.ml +++ b/src/toc.ml @@ -1,5 +1,5 @@ open Ast -open Stdcompat +open Compat let rec remove_links inline = match inline with From 606e2c04d27464d106b1f42d7f632c2c8ded1b69 Mon Sep 17 00:00:00 2001 From: Shon Feder Date: Wed, 7 Sep 2022 22:44:10 -0400 Subject: [PATCH 29/31] Try to fix name clsah --- src/compat.ml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/compat.ml b/src/compat.ml index 185ad9e3..66ed9321 100644 --- a/src/compat.ml +++ b/src/compat.ml @@ -1,9 +1,9 @@ (** Overlay on Stdcompat to refine/adjust some compatibility modules *) -include Stdcompat +include (Stdcompat : module type of Stdcompat with module Buffer := Buffer) module Buffer = struct - include Buffer + include Stdcompat.Buffer let add_utf_8_uchar b u = match Uchar.to_int u with From e1f1f94834f4cec1fcb424113fc50d7c9266da40 Mon Sep 17 00:00:00 2001 From: Corentin Leruth Date: Fri, 9 Sep 2022 16:28:26 +0200 Subject: [PATCH 30/31] remove compat module --- src/compat.ml | 25 ------------------------- src/parser.ml | 6 +++--- src/strSlice.ml | 2 +- src/toc.ml | 2 +- 4 files changed, 5 insertions(+), 30 deletions(-) delete mode 100644 src/compat.ml diff --git a/src/compat.ml b/src/compat.ml deleted file mode 100644 index 66ed9321..00000000 --- a/src/compat.ml +++ /dev/null @@ -1,25 +0,0 @@ -(** Overlay on Stdcompat to refine/adjust some compatibility modules *) - -include (Stdcompat : module type of Stdcompat with module Buffer := Buffer) - -module Buffer = struct - include Stdcompat.Buffer - - let add_utf_8_uchar b u = - match Uchar.to_int u with - | u when u < 0 -> assert false - | u when u <= 0x007F -> Buffer.add_char b (Char.unsafe_chr u) - | u when u <= 0x07FF -> - Buffer.add_char b (Char.unsafe_chr (0xC0 lor (u lsr 6))); - Buffer.add_char b (Char.unsafe_chr (0x80 lor (u land 0x3F))) - | u when u <= 0xFFFF -> - Buffer.add_char b (Char.unsafe_chr (0xE0 lor (u lsr 12))); - Buffer.add_char b (Char.unsafe_chr (0x80 lor ((u lsr 6) land 0x3F))); - Buffer.add_char b (Char.unsafe_chr (0x80 lor (u land 0x3F))) - | u when u <= 0x10FFFF -> - Buffer.add_char b (Char.unsafe_chr (0xF0 lor (u lsr 18))); - Buffer.add_char b (Char.unsafe_chr (0x80 lor ((u lsr 12) land 0x3F))); - Buffer.add_char b (Char.unsafe_chr (0x80 lor ((u lsr 6) land 0x3F))); - Buffer.add_char b (Char.unsafe_chr (0x80 lor (u land 0x3F))) - | _ -> assert false -end diff --git a/src/parser.ml b/src/parser.ml index 90ac161a..e3f20e88 100644 --- a/src/parser.ml +++ b/src/parser.ml @@ -1,5 +1,5 @@ open Ast -open Compat +open Stdcompat let is_whitespace = function | ' ' | '\t' | '\010' .. '\013' -> true @@ -461,7 +461,7 @@ let info_string c s = let s = StrSlice.tail s in match entity s with | ul, s -> - List.iter (Buffer.add_utf_8_uchar buf) ul; + List.iter (Uutf.Buffer.add_utf_8 buf) ul; loop s | exception Fail -> Buffer.add_char buf c; @@ -819,7 +819,7 @@ let inline_attribute_string s = let entity buf st = junk st; match on_sub entity st with - | cs -> List.iter (Buffer.add_utf_8_uchar buf) cs + | cs -> List.iter (Uutf.Buffer.add_utf_8 buf) cs | exception Fail -> Buffer.add_char buf '&' module Pre = struct diff --git a/src/strSlice.ml b/src/strSlice.ml index 0fbbb76b..4fb3429e 100644 --- a/src/strSlice.ml +++ b/src/strSlice.ml @@ -1,4 +1,4 @@ -open Compat +open Stdcompat type t = { base : string diff --git a/src/toc.ml b/src/toc.ml index 38e5a4b6..6f98099d 100644 --- a/src/toc.ml +++ b/src/toc.ml @@ -1,5 +1,5 @@ open Ast -open Compat +open Stdcompat let rec remove_links inline = match inline with From 612f35ad2bbfde567d500759bd7d3d81eda7215f Mon Sep 17 00:00:00 2001 From: Shon Feder Date: Sun, 11 Sep 2022 20:25:33 -0400 Subject: [PATCH 31/31] Raise minimum ocaml version to 4.08 --- .github/workflows/workflow.yml | 4 +++- dune-project | 2 +- omd.opam | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/workflows/workflow.yml b/.github/workflows/workflow.yml index 4ea86554..327df7a5 100644 --- a/.github/workflows/workflow.yml +++ b/.github/workflows/workflow.yml @@ -17,7 +17,9 @@ jobs: - windows-latest ocaml-compiler: # Decision on version matrix informed by https://discuss.ocaml.org/t/which-ocaml-compiler-versions-should-we-run-against-in-ci/7933/2 - - 4.05.0 + # But has gradually inched up due to signs of bitrot on earlier versions + # such as https://github.com/thierry-martinez/stdcompat/issues/26 + - 4.08.0 - 4.14.x runs-on: ${{ matrix.os }} steps: diff --git a/dune-project b/dune-project index 05cbb0c1..f1ff65f0 100644 --- a/dune-project +++ b/dune-project @@ -24,7 +24,7 @@ Additionally, OMD implements a few Github markdown features, an extension mechanism, and some other features. Note that the opam package installs both the OMD library and the command line tool `omd`.") (tags (org:ocamllabs org:mirage)) - (depends (ocaml (>= 4.05)) + (depends (ocaml (>= 4.08)) stdcompat uutf uucp diff --git a/omd.opam b/omd.opam index 0fd4565a..3e6a58ca 100644 --- a/omd.opam +++ b/omd.opam @@ -22,7 +22,7 @@ homepage: "https://github.com/ocaml/omd" bug-reports: "https://github.com/ocaml/omd/issues" depends: [ "dune" {>= "2.7"} - "ocaml" {>= "4.05"} + "ocaml" {>= "4.08"} "stdcompat" "uutf" "uucp"