From 0d6a4007f15d9845f5bf97e15fe683e557aed2a2 Mon Sep 17 00:00:00 2001
From: pkoppstein <pkoppstein@gmail.com>
Date: Thu, 29 Jun 2023 02:04:44 -0400
Subject: [PATCH 1/5] revamp sub/3 to resolve most issues with gsub (and sub
 with "g"); add uniq(stream)

The primary purpose of this commit (which supersedes PR
https://github.com/jqlang/jq/pull/2624) is to rectify most problems
with `gsub` (and also `sub` with the "g" option), in particular #1425
('\b'), #2354 (lookahead), and #2532 (regex == "^(?!cd ).*$|^cd ";"").

This commit also partly resolves #2148 and #1206 in that `gsub` no
longer loops infinitely; however, because the new `gsub` depends
critically on match(_;"g"), the behavior when regex == "" is sometimes
non-standard. [*1]

Since the new sub/3 relies on uniq/1, that has been added as well [*2].

The documentation has been updated to reflect the fact that `sub` and
`gsub` are intended to be regular in the second argument. [*3]

Also, _nwise/1 has been tweaked to take advantage of TCO.

Footnotes:

[*1] Using the new gsub, '"a" | gsub( ""; "a")' emits "aa" rather than
"aaa" as would be standard.  This is nevertheless better than the
infinite loop behavior of jq 1.6 in this case.

With one exception (as explained in [*2]), the new gsub is implemented
as though match/2 behavior is correct.  That is, bugs in `gsub`
behavior will most likely have their origin in `match/2`.

[*2] `uniq/1` adopts the Unix/Linux name and semantics; it is needed for the following test case:

gsub("(?=u)"; "u")
"qux"
"quux"

Without this functionality:

Test #23: 'gsub("(?=u)"; "u")' at line number 100
*** Expected "quux", but got "quuux" for test at line number 102: gsub("(?=u)"; "u")

The root of the problem here is `match`: if `match` were fixed, then gsub would not need `uniq`.

The addition of `uniq` as a top-level function should be a non-issue
relative to general concern about builtin.jq bloat: the line count of
the new builtin.jq is significantly reduced overall, and the number of
defs is actually reduced by 1 (from 111 (ignoring a redundant def) to 110).

[*3] See e.g. https://github.com/jqlang/jq/issues/513#issuecomment-50834811
---
 docs/content/manual/manual.yml | 48 ++++++++++++++++++++-----
 src/builtin.jq                 | 66 ++++++++++++++--------------------
 tests/jq.test                  |  8 +++++
 tests/onig.test                | 65 +++++++++++++++++++++++++++++++++
 4 files changed, 139 insertions(+), 48 deletions(-)

diff --git a/docs/content/manual/manual.yml b/docs/content/manual/manual.yml
index 68cc091fef..ca08f89eaf 100644
--- a/docs/content/manual/manual.yml
+++ b/docs/content/manual/manual.yml
@@ -1428,6 +1428,30 @@ sections:
             input: '[{"foo":1, "bar":14}, {"foo":2, "bar":3}]'
             output: ['{"foo":2, "bar":3}']
 
+      - title: "`uniq(stream)`"
+        body: |
+
+          The `uniq` function produces a substream of the given stream
+          by emitting in turn the first item from each run within it.
+          No sorting takes place.
+          
+        examples:
+          - program: '[uniq(1,1,2,null,null,1)]'
+            input: 'null'
+            output: ['[1,2,null,1]']
+
+          - program: '[uniq(.[])]'
+            input: '[1,1,2,null,null,1]'
+            output: ['[1,2,null,1]']
+
+          - program: '[uniq(empty)]'
+            input: 'null'
+            output: ['[]']
+
+          - program: '[true, false | [uniq(1,1,2)]]'
+            input: null
+            output: ['[[1,2],[1,2]]']
+            
       - title: "`unique`, `unique_by(path_exp)`"
         body: |
 
@@ -2471,27 +2495,33 @@ sections:
             input: '("ab,cd", "ef, gh")'
             output: ['"ab"', '"cd"', '"ef"', '"gh"']
 
-      - title: "`sub(regex; tostring)`, `sub(regex; string; flags)`"
+      - title: "`sub(regex; tostring)`, `sub(regex; tostring; flags)`"
         body: |
 
-          Emit the string obtained by replacing the first match of regex in the
-          input string with `tostring`, after interpolation.  `tostring` should
-          be a jq string, and may contain references to named captures. The
-          named captures are, in effect, presented as a JSON object (as
-          constructed by `capture`) to `tostring`, so a reference to a captured
-          variable named "x" would take the form: `"\(.x)"`.
+          Emit the string obtained by replacing the first match of
+          regex in the input string with `tostring`, after
+          interpolation.  `tostring` should be a jq string or a stream
+          of such strings, each of which may contain references to
+          named captures. The named captures are, in effect, presented
+          as a JSON object (as constructed by `capture`) to
+          `tostring`, so a reference to a captured variable named "x"
+          would take the form: `"\(.x)"`.
 
         example:
           - program: 'sub("^[^a-z]*(?<x>[a-z]*).*")'
             input: '"123abc456"'
             output: '"ZabcZabc"'
 
+          - program: '[sub("(?<a>.)"; "\(.a|ascii_upcase)", "\(.a|ascii_downcase)")]'
+            input: '"aB"'
+            output: ['["AB","aB"]']
 
-      - title: "`gsub(regex; string)`, `gsub(regex; string; flags)`"
+      - title: "`gsub(regex; tostring)`, `gsub(regex; tostring; flags)`"
         body: |
 
           `gsub` is like `sub` but all the non-overlapping occurrences of the regex are
-          replaced by the string, after interpolation.
+          replaced by `tostring`, after interpolation. If the second argument is a stream
+          of jq strings, then `gsub` will produce a corresponding stream of JSON strings.
 
         example:
           - program: 'gsub("(?<x>.)[^a]*"; "+\(.x)-")'
diff --git a/src/builtin.jq b/src/builtin.jq
index a102fd51a0..7af5d696e5 100644
--- a/src/builtin.jq
+++ b/src/builtin.jq
@@ -99,8 +99,10 @@ def scan(re):
 #
 # If input is an array, then emit a stream of successive subarrays of length n (or less),
 # and similarly for strings.
-def _nwise(a; $n): if a|length <= $n then a else a[0:$n] , _nwise(a[$n:]; $n) end;
-def _nwise($n): _nwise(.; $n);
+def _nwise($n):
+  def n: if length <= $n then . else .[0:$n] , (.[$n:] | n) end;
+  n;
+def _nwise(a; $n): a | _nwise($n);
 #
 # splits/1 produces a stream; split/1 is retained for backward compatibility.
 def splits($re; flags): . as $s
@@ -114,47 +116,34 @@ def splits($re): splits($re; null);
 # split emits an array for backward compatibility
 def split($re; flags): [ splits($re; flags) ];
 #
-# If s contains capture variables, then create a capture object and pipe it to s
-def sub($re; s):
-  . as $in
-  | [match($re)]
-  | if length == 0 then $in
-    else .[0]
-    | . as $r
-#  # create the "capture" object:
-    | reduce ( $r | .captures | .[] | select(.name != null) | { (.name) : .string } ) as $pair
-        ({}; . + $pair)
-    | $in[0:$r.offset] + s + $in[$r.offset+$r.length:]
-    end ;
+# stream-oriented
+def uniq(s):
+  foreach s as $x (null;
+    if . and $x == .[0] then .[1] = false
+    else [$x, true]
+    end;
+    if .[1] then .[0] else empty end);
 #
-# If s contains capture variables, then create a capture object and pipe it to s
-def sub($re; s; flags):
-  def subg: [explode[] | select(. != 103)] | implode;
-  # "fla" should be flags with all occurrences of g removed; gs should be non-nil if flags has a g
-  def sub1(fla; gs):
-    def mysub:
-      . as $in
-      | [match($re; fla)]
-      | if length == 0 then $in
-        else .[0] as $edit
-        | ($edit | .offset + .length) as $len
-        # create the "capture" object:
-        | reduce ( $edit | .captures | .[] | select(.name != null) | { (.name) : .string } ) as $pair
-            ({}; . + $pair)
-        | $in[0:$edit.offset]
-          + s
-          + ($in[$len:] | if length > 0 and gs then mysub else . end)
-        end ;
-    mysub ;
-    (flags | index("g")) as $gs
-    | (flags | if $gs then subg else . end) as $fla
-    | sub1($fla; $gs);
+# If s contains capture variables, then create a capture object and pipe it to s, bearing
+# in mind that s could be a stream
+def sub($re; s; $flags):
+   . as $in
+   | (reduce uniq(match($re; $flags)) as $edit
+        ({result: [], previous: 0};
+            $in[ .previous: ($edit | .offset) ] as $gap
+            # create the "capture" objects (one per item in s)
+            | [reduce ( $edit | .captures | .[] | select(.name != null) | { (.name) : .string } ) as $pair
+                 ({}; . + $pair) | s ] as $inserts
+            | reduce range(0; $inserts|length) as $ix (.; .result[$ix] += $gap + $inserts[$ix])
+	    | .previous = ($edit | .offset + .length ) )
+          | .result[] + $in[.previous:] )
+      // $in;
 #
 def sub($re; s): sub($re; s; "");
-# repeated substitution of re (which may contain named captures)
+#
 def gsub($re; s; flags): sub($re; s; flags + "g");
 def gsub($re; s): sub($re; s; "g");
-
+#
 ########################################################################
 # generic iterator/generator
 def while(cond; update):
@@ -237,7 +226,6 @@ def tostream:
   getpath($p) |
   reduce path(.[]?) as $q ([$p, .]; [$p+$q]);
 
-
 # Assuming the input array is sorted, bsearch/1 returns
 # the index of the target if the target is in the input array; and otherwise
 #  (-1 - ix), where ix is the insertion point that would leave the array sorted.
diff --git a/tests/jq.test b/tests/jq.test
index ca8e27059f..83b19fb4e9 100644
--- a/tests/jq.test
+++ b/tests/jq.test
@@ -1731,3 +1731,11 @@ false
 . |= try . catch .
 1
 1
+
+[uniq(1,1,2,3,3,4)]
+null
+[1,2,3,4]
+
+[uniq(empty)]
+null
+[]
diff --git a/tests/onig.test b/tests/onig.test
index daacae9cd7..805efabaa6 100644
--- a/tests/onig.test
+++ b/tests/onig.test
@@ -75,6 +75,45 @@ gsub( "(.*)"; "";  "x")
 ""
 ""
 
+gsub( ""; "a";  "g")
+""
+"a"
+
+gsub( "^"; "";  "g")
+"a"
+"a"
+
+
+# The following is a regression test and should not be construed as a requirement other than that execution should terminate:
+gsub( ""; "a";  "g")
+"a"
+"aa"
+
+gsub( "$"; "a";  "g")
+"a"
+"aa"
+
+gsub( "^"; "a")
+""
+"a"
+
+gsub("(?=u)"; "u")
+"qux"
+"quux"
+
+gsub("^.*a"; "b")
+"aaa"
+"b"
+
+gsub("^.*?a"; "b")
+"aaa"
+"baa"
+
+# The following is for regression testing and should not be construed as a requirement:
+[gsub("a"; "b", "c")]
+"a"
+["b","c"]
+
 [.[] | scan(", ")]
 ["a,b, c, d, e,f",", a,b, c, d, e,f, "]
 [", ",", ",", ",", ",", ",", ",", ",", "]
@@ -92,7 +131,33 @@ gsub("(?<x>.)[^a]*"; "+\(.x)-")
 "Abcabc"
 "+A-+a-"
 
+gsub("(?<x>.)(?<y>[0-9])"; "\(.x|ascii_downcase)\(.y)")
+"A1 B2 CD"
+"a1 b2 CD"
+
+gsub("\\b(?<x>.)"; "\(.x|ascii_downcase)")
+"ABC DEF"
+"aBC dEF"
+
 # utf-8
 sub("(?<x>.)"; "\(.x)!")
 "’"
 "’!"
+
+[sub("a"; "b", "c")]
+"a"
+["b","c"]
+
+[sub("(?<a>.)"; "\(.a|ascii_upcase)", "\(.a|ascii_downcase)", "c")]
+"aB"
+["AB","aB","cB"]
+
+[gsub("(?<a>.)"; "\(.a|ascii_upcase)", "\(.a|ascii_downcase)", "c")]
+"aB"
+["AB","ab","cc"]
+
+# splits and _nwise
+[splits("")]
+"ab"
+["","a","b"]
+

From d8cd39eaadf33b2fd04d2c7f9e878a634b2ba0b4 Mon Sep 17 00:00:00 2001
From: pkoppstein <pkoppstein@gmail.com>
Date: Sun, 2 Jul 2023 00:29:46 -0400
Subject: [PATCH 2/5] builtin.c: fix zero-width bug, so uniq/1 is no longer
 needed

builtin.c: f_match: zero-width match: ensure '"qux" | match("(?=u)"; "g")' matches just once

rm uniq/1 as it is no longer needed

In manual.yml, replace nonsensical sub/1 example
---
 docs/content/manual/manual.yml | 31 +++----------------
 src/builtin.c                  |  3 +-
 src/builtin.jq                 | 10 +------
 tests/jq.test                  |  7 -----
 tests/man.test                 | 54 ++++++++++++++++++++++++++--------
 5 files changed, 49 insertions(+), 56 deletions(-)

diff --git a/docs/content/manual/manual.yml b/docs/content/manual/manual.yml
index ca08f89eaf..aa3a61f8be 100644
--- a/docs/content/manual/manual.yml
+++ b/docs/content/manual/manual.yml
@@ -1428,30 +1428,6 @@ sections:
             input: '[{"foo":1, "bar":14}, {"foo":2, "bar":3}]'
             output: ['{"foo":2, "bar":3}']
 
-      - title: "`uniq(stream)`"
-        body: |
-
-          The `uniq` function produces a substream of the given stream
-          by emitting in turn the first item from each run within it.
-          No sorting takes place.
-          
-        examples:
-          - program: '[uniq(1,1,2,null,null,1)]'
-            input: 'null'
-            output: ['[1,2,null,1]']
-
-          - program: '[uniq(.[])]'
-            input: '[1,1,2,null,null,1]'
-            output: ['[1,2,null,1]']
-
-          - program: '[uniq(empty)]'
-            input: 'null'
-            output: ['[]']
-
-          - program: '[true, false | [uniq(1,1,2)]]'
-            input: null
-            output: ['[[1,2],[1,2]]']
-            
       - title: "`unique`, `unique_by(path_exp)`"
         body: |
 
@@ -2508,9 +2484,10 @@ sections:
           would take the form: `"\(.x)"`.
 
         example:
-          - program: 'sub("^[^a-z]*(?<x>[a-z]*).*")'
-            input: '"123abc456"'
-            output: '"ZabcZabc"'
+
+          - program: 'sub("[^a-z]*(?<x>[a-z]*)"; "Z\(.x)"; "g")'
+            input: '"123abc456def"'
+            output: ['"ZabcZdef"']
 
           - program: '[sub("(?<a>.)"; "\(.a|ascii_upcase)", "\(.a|ascii_downcase)")]'
             input: '"aB"'
diff --git a/src/builtin.c b/src/builtin.c
index 9b2d9a23fc..9a26ea3ae3 100644
--- a/src/builtin.c
+++ b/src/builtin.c
@@ -930,7 +930,8 @@ static jv f_match(jq_state *jq, jv input, jv regex, jv modifiers, jv testmode) {
         match = jv_object_set(match, jv_string("string"), jv_string(""));
         match = jv_object_set(match, jv_string("captures"), jv_array());
         result = jv_array_append(result, match);
-        start += 1;
+	// ensure '"qux" | match("(?=u)"; "g")' matches just once
+	start = (const UChar*)(input_string+region->end[0]+1);
         continue;
       }
 
diff --git a/src/builtin.jq b/src/builtin.jq
index 7af5d696e5..47a0171bbe 100644
--- a/src/builtin.jq
+++ b/src/builtin.jq
@@ -116,19 +116,11 @@ def splits($re): splits($re; null);
 # split emits an array for backward compatibility
 def split($re; flags): [ splits($re; flags) ];
 #
-# stream-oriented
-def uniq(s):
-  foreach s as $x (null;
-    if . and $x == .[0] then .[1] = false
-    else [$x, true]
-    end;
-    if .[1] then .[0] else empty end);
-#
 # If s contains capture variables, then create a capture object and pipe it to s, bearing
 # in mind that s could be a stream
 def sub($re; s; $flags):
    . as $in
-   | (reduce uniq(match($re; $flags)) as $edit
+   | (reduce match($re; $flags) as $edit
         ({result: [], previous: 0};
             $in[ .previous: ($edit | .offset) ] as $gap
             # create the "capture" objects (one per item in s)
diff --git a/tests/jq.test b/tests/jq.test
index 83b19fb4e9..78c4017b14 100644
--- a/tests/jq.test
+++ b/tests/jq.test
@@ -1732,10 +1732,3 @@ false
 1
 1
 
-[uniq(1,1,2,3,3,4)]
-null
-[1,2,3,4]
-
-[uniq(empty)]
-null
-[]
diff --git a/tests/man.test b/tests/man.test
index 1c6ff67ed7..947ae61886 100644
--- a/tests/man.test
+++ b/tests/man.test
@@ -2,6 +2,18 @@
 "Hello, world!"
 "Hello, world!"
 
+. | tojson
+12345678909876543212345
+"12345678909876543212345"
+
+map([., . == 1]) | tojson
+[1, 1.000, 1.0, 100e-2]
+"[[1,true],[1.000,true],[1.0,true],[1.00,true]]"
+
+. as $big | [$big, $big + 1] | map(. > 10000000000000000000000000000000)
+10000000000000000000000000000001
+[true, false]
+
 .foo
 {"foo": 42, "bar": "less interesting data"}
 42
@@ -163,11 +175,12 @@ null
 -1
 
 .[] | length
-[[1,2], "string", {"a":2}, null]
+[[1,2], "string", {"a":2}, null, -5]
 2
 6
 1
 0
+5
 
 utf8bytelength
 "\u03bc"
@@ -343,12 +356,12 @@ flatten
 [{"foo": "bar"}, [{"foo": "baz"}]]
 [{"foo": "bar"}, {"foo": "baz"}]
 
-range(2;4)
+range(2; 4)
 null
 2
 3
 
-[range(2;4)]
+[range(2; 4)]
 null
 [2,3]
 
@@ -356,15 +369,15 @@ null
 null
 [0,1,2,3]
 
-[range(0;10;3)]
+[range(0; 10; 3)]
 null
 [0,3,6,9]
 
-[range(0;10;-1)]
+[range(0; 10; -1)]
 null
 []
 
-[range(0;-5;-1)]
+[range(0; -5; -1)]
 null
 [0,-1,-2,-3,-4]
 
@@ -551,6 +564,10 @@ join(" ")
 1
 [1,2,4,8,16,32,64]
 
+[repeat(.*2, error)?]
+1
+[2]
+
 [.,1]|until(.[0] < 1; [.[0] - 1, .[1] * .[0]])|.[1]
 4
 24
@@ -658,7 +675,7 @@ true
 false
 false
 
-if . == 0 then "zero" elif . == 1 then "one" else "many" end
+if . == 0 then   "zero" elif . == 1 then   "one" else   "many" end
 2
 "many"
 
@@ -841,8 +858,21 @@ true
 {"foo": 42}
 {"foo": 43}
 
-.[]|tonumber?
-["1", "hello", "3", 4]
-1
-3
-4
+.a = .b
+{"a": {"b": 10}, "b": 20}
+{"a":20,"b":20}
+
+.a |= .b
+{"a": {"b": 10}, "b": 20}
+{"a":10,"b":20}
+
+(.a, .b) = range(3)
+null
+{"a":0,"b":0}
+{"a":1,"b":1}
+{"a":2,"b":2}
+
+(.a, .b) |= range(3)
+null
+{"a":0,"b":0}
+

From 5373191e64d9763c14da46b1657828779b74f462 Mon Sep 17 00:00:00 2001
From: pkoppstein <pkoppstein@gmail.com>
Date: Sun, 2 Jul 2023 01:53:59 -0400
Subject: [PATCH 3/5] manual.yml: gsub examples

Correct one gsub example, and add another
---
 docs/content/manual/manual.yml | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/docs/content/manual/manual.yml b/docs/content/manual/manual.yml
index aa3a61f8be..d56c53a5e8 100644
--- a/docs/content/manual/manual.yml
+++ b/docs/content/manual/manual.yml
@@ -2503,7 +2503,11 @@ sections:
         example:
           - program: 'gsub("(?<x>.)[^a]*"; "+\(.x)-")'
             input: '"Abcabc"'
-            output: '"+A-+a-"'
+            output: ['"+A-+a-"']
+
+          - program: '[gsub("p"; "a", "b")]'
+            input: '"p"'
+            output: ['["a","b"]']
 
 
   - title: Advanced features

From 3015ec8d8ddc6ae9780d0881f87896437edee440 Mon Sep 17 00:00:00 2001
From: pkoppstein <pkoppstein@gmail.com>
Date: Sun, 2 Jul 2023 15:03:36 -0400
Subject: [PATCH 4/5] untabify builtin.jq

Remove two tab characters
---
 src/builtin.jq | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/builtin.jq b/src/builtin.jq
index 47a0171bbe..096635119f 100644
--- a/src/builtin.jq
+++ b/src/builtin.jq
@@ -127,7 +127,7 @@ def sub($re; s; $flags):
             | [reduce ( $edit | .captures | .[] | select(.name != null) | { (.name) : .string } ) as $pair
                  ({}; . + $pair) | s ] as $inserts
             | reduce range(0; $inserts|length) as $ix (.; .result[$ix] += $gap + $inserts[$ix])
-	    | .previous = ($edit | .offset + .length ) )
+            | .previous = ($edit | .offset + .length ) )
           | .result[] + $in[.previous:] )
       // $in;
 #
@@ -187,7 +187,7 @@ def transpose:
   | length as $length
   | reduce range(0; $max) as $j
       ([]; . + [reduce range(0;$length) as $i ([]; . + [ $in[$i][$j] ] )] )
-	        end;
+  end;
 def in(xs): . as $x | xs | has($x);
 def inside(xs): . as $x | xs | contains($x);
 def repeat(exp):

From 832734b5cdc49935a4a8530c8c492fd908da40b2 Mon Sep 17 00:00:00 2001
From: pkoppstein <pkoppstein@gmail.com>
Date: Sun, 2 Jul 2023 21:51:44 -0400
Subject: [PATCH 5/5] builtin.c: untabify

untabify
---
 src/builtin.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/builtin.c b/src/builtin.c
index 9a26ea3ae3..2d1156dc56 100644
--- a/src/builtin.c
+++ b/src/builtin.c
@@ -930,8 +930,8 @@ static jv f_match(jq_state *jq, jv input, jv regex, jv modifiers, jv testmode) {
         match = jv_object_set(match, jv_string("string"), jv_string(""));
         match = jv_object_set(match, jv_string("captures"), jv_array());
         result = jv_array_append(result, match);
-	// ensure '"qux" | match("(?=u)"; "g")' matches just once
-	start = (const UChar*)(input_string+region->end[0]+1);
+        // ensure '"qux" | match("(?=u)"; "g")' matches just once
+        start = (const UChar*)(input_string+region->end[0]+1);
         continue;
       }