From 0d6a4007f15d9845f5bf97e15fe683e557aed2a2 Mon Sep 17 00:00:00 2001 From: pkoppstein Date: Thu, 29 Jun 2023 02:04:44 -0400 Subject: [PATCH 1/5] revamp sub/3 to resolve most issues with gsub (and sub with "g"); add uniq(stream) The primary purpose of this commit (which supersedes PR https://github.com/jqlang/jq/pull/2624) is to rectify most problems with `gsub` (and also `sub` with the "g" option), in particular #1425 ('\b'), #2354 (lookahead), and #2532 (regex == "^(?!cd ).*$|^cd ";"")). This commit also partly resolves #2148 and #1206 in that `gsub` no longer loops infinitely; however, because the new `gsub` depends critically on match(_;"g"), the behavior when regex == "" is sometimes non-standard. [*1] Since the new sub/3 relies on uniq/1, that has been added as well [*2]. The documentation has been updated to reflect the fact that `sub` and `gsub` are intended to be regular in the second argument. [*3] Also, _nwise/1 has been tweaked to take advantage of TCO. Footnotes: [*1] Using the new gsub, '"a" | gsub( ""; "a")' emits "aa" rather than "aaa" as would be standard. This is nevertheless better than the infinite loop behavior of jq 1.6 in this case. With one exception (as explained in [*2]), the new gsub is implemented as though match/2 behavior is correct. That is, bugs in `gsub` behavior will most likely have their origin in `match/2`. [*2] `uniq/1` adopts the Unix/Linux name and semantics; it is needed for the following test case: gsub("(?=u)"; "u") "qux" "quux" Without this functionality: Test #23: 'gsub("(?=u)"; "u")' at line number 100 *** Expected "quux", but got "quuux" for test at line number 102: gsub("(?=u)"; "u") The root of the problem here is `match`: if `match` is fixed, then gsub would not need `uniq`. 
The addition of `uniq` as a top-level function should be a non-issue relative to general concern about builtins.jq bloat: the line count of the new builtin.jq is significantly reduced overall, and the number of defs is actually reduced by 1 (from 111 (ignoring a redundant def) to 110). [*3] See e.g. https://github.com/jqlang/jq/issues/513#issuecomment-50834811 --- docs/content/manual/manual.yml | 48 ++++++++++++++++++++----- src/builtin.jq | 66 ++++++++++++++-------------------- tests/jq.test | 8 +++++ tests/onig.test | 65 +++++++++++++++++++++++++++++++++ 4 files changed, 139 insertions(+), 48 deletions(-) diff --git a/docs/content/manual/manual.yml b/docs/content/manual/manual.yml index 68cc091fef..ca08f89eaf 100644 --- a/docs/content/manual/manual.yml +++ b/docs/content/manual/manual.yml @@ -1428,6 +1428,30 @@ sections: input: '[{"foo":1, "bar":14}, {"foo":2, "bar":3}]' output: ['{"foo":2, "bar":3}'] + - title: "`uniq(stream)`" + body: | + + The `uniq` function produces a substream of the given stream + by emitting in turn the first item from each run within it. + No sorting takes place. + + examples: + - program: '[uniq(1,1,2,null,null,1)]' + input: 'null' + output: ['[1,2,null,1]'] + + - program: '[uniq(.[])]' + input: '[1,1,2,null,null,1]' + output: ['[1,2,null,1]'] + + - program: '[uniq(empty)]' + input: 'null' + output: ['[]'] + + - program: '[true, false | [uniq(1,1,2)]]' + input: null + output: ['[[1,2],[1,2]]'] + - title: "`unique`, `unique_by(path_exp)`" body: | @@ -2471,27 +2495,33 @@ sections: input: '("ab,cd", "ef, gh")' output: ['"ab"', '"cd"', '"ef"', '"gh"'] - - title: "`sub(regex; tostring)`, `sub(regex; string; flags)`" + - title: "`sub(regex; tostring)`, `sub(regex; tostring; flags)`" body: | - Emit the string obtained by replacing the first match of regex in the - input string with `tostring`, after interpolation. `tostring` should - be a jq string, and may contain references to named captures. 
The - named captures are, in effect, presented as a JSON object (as - constructed by `capture`) to `tostring`, so a reference to a captured - variable named "x" would take the form: `"\(.x)"`. + Emit the string obtained by replacing the first match of + regex in the input string with `tostring`, after + interpolation. `tostring` should be a jq string or a stream + of such strings, each of which may contain references to + named captures. The named captures are, in effect, presented + as a JSON object (as constructed by `capture`) to + `tostring`, so a reference to a captured variable named "x" + would take the form: `"\(.x)"`. example: - program: 'sub("^[^a-z]*(?[a-z]*).*")' input: '"123abc456"' output: '"ZabcZabc"' + - program: '[sub("(?.)"; "\(.a|ascii_upcase)", "\(.a|ascii_downcase)")]' + input: '"aB"' + output: ['["AB","aB"]'] - - title: "`gsub(regex; string)`, `gsub(regex; string; flags)`" + - title: "`gsub(regex; tostring)`, `gsub(regex; tostring; flags)`" body: | `gsub` is like `sub` but all the non-overlapping occurrences of the regex are - replaced by the string, after interpolation. + replaced by `tostring`, after interpolation. If the second argument is a stream + of jq strings, then `gsub` will produce a corresponding stream of JSON strings. example: - program: 'gsub("(?.)[^a]*"; "+\(.x)-")' diff --git a/src/builtin.jq b/src/builtin.jq index a102fd51a0..7af5d696e5 100644 --- a/src/builtin.jq +++ b/src/builtin.jq @@ -99,8 +99,10 @@ def scan(re): # # If input is an array, then emit a stream of successive subarrays of length n (or less), # and similarly for strings. -def _nwise(a; $n): if a|length <= $n then a else a[0:$n] , _nwise(a[$n:]; $n) end; -def _nwise($n): _nwise(.; $n); +def _nwise($n): + def n: if length <= $n then . else .[0:$n] , (.[$n:] | n) end; + n; +def _nwise(a; $n): a | _nwise($n); # # splits/1 produces a stream; split/1 is retained for backward compatibility. def splits($re; flags): . 
as $s @@ -114,47 +116,34 @@ def splits($re): splits($re; null); # split emits an array for backward compatibility def split($re; flags): [ splits($re; flags) ]; # -# If s contains capture variables, then create a capture object and pipe it to s -def sub($re; s): - . as $in - | [match($re)] - | if length == 0 then $in - else .[0] - | . as $r -# # create the "capture" object: - | reduce ( $r | .captures | .[] | select(.name != null) | { (.name) : .string } ) as $pair - ({}; . + $pair) - | $in[0:$r.offset] + s + $in[$r.offset+$r.length:] - end ; +# stream-oriented +def uniq(s): + foreach s as $x (null; + if . and $x == .[0] then .[1] = false + else [$x, true] + end; + if .[1] then .[0] else empty end); # -# If s contains capture variables, then create a capture object and pipe it to s -def sub($re; s; flags): - def subg: [explode[] | select(. != 103)] | implode; - # "fla" should be flags with all occurrences of g removed; gs should be non-nil if flags has a g - def sub1(fla; gs): - def mysub: - . as $in - | [match($re; fla)] - | if length == 0 then $in - else .[0] as $edit - | ($edit | .offset + .length) as $len - # create the "capture" object: - | reduce ( $edit | .captures | .[] | select(.name != null) | { (.name) : .string } ) as $pair - ({}; . + $pair) - | $in[0:$edit.offset] - + s - + ($in[$len:] | if length > 0 and gs then mysub else . end) - end ; - mysub ; - (flags | index("g")) as $gs - | (flags | if $gs then subg else . end) as $fla - | sub1($fla; $gs); +# If s contains capture variables, then create a capture object and pipe it to s, bearing +# in mind that s could be a stream +def sub($re; s; $flags): + . as $in + | (reduce uniq(match($re; $flags)) as $edit + ({result: [], previous: 0}; + $in[ .previous: ($edit | .offset) ] as $gap + # create the "capture" objects (one per item in s) + | [reduce ( $edit | .captures | .[] | select(.name != null) | { (.name) : .string } ) as $pair + ({}; . 
+ $pair) | s ] as $inserts + | reduce range(0; $inserts|length) as $ix (.; .result[$ix] += $gap + $inserts[$ix]) + | .previous = ($edit | .offset + .length ) ) + | .result[] + $in[.previous:] ) + // $in; # def sub($re; s): sub($re; s; ""); -# repeated substitution of re (which may contain named captures) +# def gsub($re; s; flags): sub($re; s; flags + "g"); def gsub($re; s): sub($re; s; "g"); - +# ######################################################################## # generic iterator/generator def while(cond; update): @@ -237,7 +226,6 @@ def tostream: getpath($p) | reduce path(.[]?) as $q ([$p, .]; [$p+$q]); - # Assuming the input array is sorted, bsearch/1 returns # the index of the target if the target is in the input array; and otherwise # (-1 - ix), where ix is the insertion point that would leave the array sorted. diff --git a/tests/jq.test b/tests/jq.test index ca8e27059f..83b19fb4e9 100644 --- a/tests/jq.test +++ b/tests/jq.test @@ -1731,3 +1731,11 @@ false . |= try . catch . 
1 1 + +[uniq(1,1,2,3,3,4)] +null +[1,2,3,4] + +[uniq(empty)] +null +[] diff --git a/tests/onig.test b/tests/onig.test index daacae9cd7..805efabaa6 100644 --- a/tests/onig.test +++ b/tests/onig.test @@ -75,6 +75,45 @@ gsub( "(.*)"; ""; "x") "" "" +gsub( ""; "a"; "g") +"" +"a" + +gsub( "^"; ""; "g") +"a" +"a" + + +# The following is a regression test and should not be construed as a requirement other than that execution should terminate: +gsub( ""; "a"; "g") +"a" +"aa" + +gsub( "$"; "a"; "g") +"a" +"aa" + +gsub( "^"; "a") +"" +"a" + +gsub("(?=u)"; "u") +"qux" +"quux" + +gsub("^.*a"; "b") +"aaa" +"b" + +gsub("^.*?a"; "b") +"aaa" +"baa" + +# The following is for regression testing and should not be construed as a requirement: +[gsub("a"; "b", "c")] +"a" +["b","c"] + [.[] | scan(", ")] ["a,b, c, d, e,f",", a,b, c, d, e,f, "] [", ",", ",", ",", ",", ",", ",", ",", "] @@ -92,7 +131,33 @@ gsub("(?.)[^a]*"; "+\(.x)-") "Abcabc" "+A-+a-" +gsub("(?.)(?[0-9])"; "\(.x|ascii_downcase)\(.y)") +"A1 B2 CD" +"a1 b2 CD" + +gsub("\\b(?.)"; "\(.x|ascii_downcase)") +"ABC DEF" +"aBC dEF" + # utf-8 sub("(?.)"; "\(.x)!") "’" "’!" 
+ +[sub("a"; "b", "c")] +"a" +["b","c"] + +[sub("(?.)"; "\(.a|ascii_upcase)", "\(.a|ascii_downcase)", "c")] +"aB" +["AB","aB","cB"] + +[gsub("(?.)"; "\(.a|ascii_upcase)", "\(.a|ascii_downcase)", "c")] +"aB" +["AB","ab","cc"] + +# splits and _nwise +[splits("")] +"ab" +["","a","b"] + From d8cd39eaadf33b2fd04d2c7f9e878a634b2ba0b4 Mon Sep 17 00:00:00 2001 From: pkoppstein Date: Sun, 2 Jul 2023 00:29:46 -0400 Subject: [PATCH 2/5] builtin.c: fix zero-width bug, so uniq/1 is no longer needed builtin.c: f_match: Zero-width match : ensure '"qux" | match("(?=u)"; "g")' matches just once rm uniq/1 as it is no longer needed In manual.yml, replace nonsensical sub/1 example --- docs/content/manual/manual.yml | 31 +++---------------- src/builtin.c | 3 +- src/builtin.jq | 10 +------ tests/jq.test | 7 ----- tests/man.test | 54 ++++++++++++++++++++++++++-------- 5 files changed, 49 insertions(+), 56 deletions(-) diff --git a/docs/content/manual/manual.yml b/docs/content/manual/manual.yml index ca08f89eaf..aa3a61f8be 100644 --- a/docs/content/manual/manual.yml +++ b/docs/content/manual/manual.yml @@ -1428,30 +1428,6 @@ sections: input: '[{"foo":1, "bar":14}, {"foo":2, "bar":3}]' output: ['{"foo":2, "bar":3}'] - - title: "`uniq(stream)`" - body: | - - The `uniq` function produces a substream of the given stream - by emitting in turn the first item from each run within it. - No sorting takes place. - - examples: - - program: '[uniq(1,1,2,null,null,1)]' - input: 'null' - output: ['[1,2,null,1]'] - - - program: '[uniq(.[])]' - input: '[1,1,2,null,null,1]' - output: ['[1,2,null,1]'] - - - program: '[uniq(empty)]' - input: 'null' - output: ['[]'] - - - program: '[true, false | [uniq(1,1,2)]]' - input: null - output: ['[[1,2],[1,2]]'] - - title: "`unique`, `unique_by(path_exp)`" body: | @@ -2508,9 +2484,10 @@ sections: would take the form: `"\(.x)"`. 
example: - - program: 'sub("^[^a-z]*(?[a-z]*).*")' - input: '"123abc456"' - output: '"ZabcZabc"' + + - program: 'sub("[^a-z]*(?[a-z]*)"; "Z\(.x)"; "g")' + input: '"123abc456def"' + output: ['"ZabcZdef"'] - program: '[sub("(?.)"; "\(.a|ascii_upcase)", "\(.a|ascii_downcase)")]' input: '"aB"' diff --git a/src/builtin.c b/src/builtin.c index 9b2d9a23fc..9a26ea3ae3 100644 --- a/src/builtin.c +++ b/src/builtin.c @@ -930,7 +930,8 @@ static jv f_match(jq_state *jq, jv input, jv regex, jv modifiers, jv testmode) { match = jv_object_set(match, jv_string("string"), jv_string("")); match = jv_object_set(match, jv_string("captures"), jv_array()); result = jv_array_append(result, match); - start += 1; + // ensure '"qux" | match("(?=u)"; "g")' matches just once + start = (const UChar*)(input_string+region->end[0]+1); continue; } diff --git a/src/builtin.jq b/src/builtin.jq index 7af5d696e5..47a0171bbe 100644 --- a/src/builtin.jq +++ b/src/builtin.jq @@ -116,19 +116,11 @@ def splits($re): splits($re; null); # split emits an array for backward compatibility def split($re; flags): [ splits($re; flags) ]; # -# stream-oriented -def uniq(s): - foreach s as $x (null; - if . and $x == .[0] then .[1] = false - else [$x, true] - end; - if .[1] then .[0] else empty end); -# # If s contains capture variables, then create a capture object and pipe it to s, bearing # in mind that s could be a stream def sub($re; s; $flags): . 
as $in - | (reduce uniq(match($re; $flags)) as $edit + | (reduce match($re; $flags) as $edit ({result: [], previous: 0}; $in[ .previous: ($edit | .offset) ] as $gap # create the "capture" objects (one per item in s) diff --git a/tests/jq.test b/tests/jq.test index 83b19fb4e9..78c4017b14 100644 --- a/tests/jq.test +++ b/tests/jq.test @@ -1732,10 +1732,3 @@ false 1 1 -[uniq(1,1,2,3,3,4)] -null -[1,2,3,4] - -[uniq(empty)] -null -[] diff --git a/tests/man.test b/tests/man.test index 1c6ff67ed7..947ae61886 100644 --- a/tests/man.test +++ b/tests/man.test @@ -2,6 +2,18 @@ "Hello, world!" "Hello, world!" +. | tojson +12345678909876543212345 +"12345678909876543212345" + +map([., . == 1]) | tojson +[1, 1.000, 1.0, 100e-2] +"[[1,true],[1.000,true],[1.0,true],[1.00,true]]" + +. as $big | [$big, $big + 1] | map(. > 10000000000000000000000000000000) +10000000000000000000000000000001 +[true, false] + .foo {"foo": 42, "bar": "less interesting data"} 42 @@ -163,11 +175,12 @@ null -1 .[] | length -[[1,2], "string", {"a":2}, null] +[[1,2], "string", {"a":2}, null, -5] 2 6 1 0 +5 utf8bytelength "\u03bc" @@ -343,12 +356,12 @@ flatten [{"foo": "bar"}, [{"foo": "baz"}]] [{"foo": "bar"}, {"foo": "baz"}] -range(2;4) +range(2; 4) null 2 3 -[range(2;4)] +[range(2; 4)] null [2,3] @@ -356,15 +369,15 @@ null null [0,1,2,3] -[range(0;10;3)] +[range(0; 10; 3)] null [0,3,6,9] -[range(0;10;-1)] +[range(0; 10; -1)] null [] -[range(0;-5;-1)] +[range(0; -5; -1)] null [0,-1,-2,-3,-4] @@ -551,6 +564,10 @@ join(" ") 1 [1,2,4,8,16,32,64] +[repeat(.*2, error)?] +1 +[2] + [.,1]|until(.[0] < 1; [.[0] - 1, .[1] * .[0]])|.[1] 4 24 @@ -658,7 +675,7 @@ true false false -if . == 0 then "zero" elif . == 1 then "one" else "many" end +if . == 0 then "zero" elif . == 1 then "one" else "many" end 2 "many" @@ -841,8 +858,21 @@ true {"foo": 42} {"foo": 43} -.[]|tonumber? 
-["1", "hello", "3", 4] -1 -3 -4 +.a = .b +{"a": {"b": 10}, "b": 20} +{"a":20,"b":20} + +.a |= .b +{"a": {"b": 10}, "b": 20} +{"a":10,"b":20} + +(.a, .b) = range(3) +null +{"a":0,"b":0} +{"a":1,"b":1} +{"a":2,"b":2} + +(.a, .b) |= range(3) +null +{"a":0,"b":0} + From 5373191e64d9763c14da46b1657828779b74f462 Mon Sep 17 00:00:00 2001 From: pkoppstein Date: Sun, 2 Jul 2023 01:53:59 -0400 Subject: [PATCH 3/5] manual.yml: gsub examples Correct one gsub example, and add another --- docs/content/manual/manual.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/docs/content/manual/manual.yml b/docs/content/manual/manual.yml index aa3a61f8be..d56c53a5e8 100644 --- a/docs/content/manual/manual.yml +++ b/docs/content/manual/manual.yml @@ -2503,7 +2503,11 @@ sections: example: - program: 'gsub("(?.)[^a]*"; "+\(.x)-")' input: '"Abcabc"' - output: '"+A-+a-"' + output: ['"+A-+a-"'] + + - program: '[gsub("p"; "a", "b")]' + input: '"p"' + output: ['["a","b"]'] - title: Advanced features From 3015ec8d8ddc6ae9780d0881f87896437edee440 Mon Sep 17 00:00:00 2001 From: pkoppstein Date: Sun, 2 Jul 2023 15:03:36 -0400 Subject: [PATCH 4/5] untabify builtin.jq Remove two tab characters --- src/builtin.jq | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/builtin.jq b/src/builtin.jq index 47a0171bbe..096635119f 100644 --- a/src/builtin.jq +++ b/src/builtin.jq @@ -127,7 +127,7 @@ def sub($re; s; $flags): | [reduce ( $edit | .captures | .[] | select(.name != null) | { (.name) : .string } ) as $pair ({}; . + $pair) | s ] as $inserts | reduce range(0; $inserts|length) as $ix (.; .result[$ix] += $gap + $inserts[$ix]) - | .previous = ($edit | .offset + .length ) ) + | .previous = ($edit | .offset + .length ) ) | .result[] + $in[.previous:] ) // $in; # @@ -187,7 +187,7 @@ def transpose: | length as $length | reduce range(0; $max) as $j ([]; . + [reduce range(0;$length) as $i ([]; . + [ $in[$i][$j] ] )] ) - end; + end; def in(xs): . 
as $x | xs | has($x); def inside(xs): . as $x | xs | contains($x); def repeat(exp): From 832734b5cdc49935a4a8530c8c492fd908da40b2 Mon Sep 17 00:00:00 2001 From: pkoppstein Date: Sun, 2 Jul 2023 21:51:44 -0400 Subject: [PATCH 5/5] builtin.c: untabify untabify --- src/builtin.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/builtin.c b/src/builtin.c index 9a26ea3ae3..2d1156dc56 100644 --- a/src/builtin.c +++ b/src/builtin.c @@ -930,8 +930,8 @@ static jv f_match(jq_state *jq, jv input, jv regex, jv modifiers, jv testmode) { match = jv_object_set(match, jv_string("string"), jv_string("")); match = jv_object_set(match, jv_string("captures"), jv_array()); result = jv_array_append(result, match); - // ensure '"qux" | match("(?=u)"; "g")' matches just once - start = (const UChar*)(input_string+region->end[0]+1); + // ensure '"qux" | match("(?=u)"; "g")' matches just once + start = (const UChar*)(input_string+region->end[0]+1); continue; }