diff --git a/docs/content/manual/manual.yml b/docs/content/manual/manual.yml index 2f4bd3c014..5f334f49c4 100644 --- a/docs/content/manual/manual.yml +++ b/docs/content/manual/manual.yml @@ -772,6 +772,9 @@ sections: `null` can be added to any value, and returns the other value unchanged. + A numeric byte value between 0 and 255, inclusive, can be + added to a binary string value. + examples: - program: '.a + 1' input: '{"a": 7}' @@ -1414,6 +1417,41 @@ sections: input: '[1, "1", [1]]' output: ['"1"', '"1"', '"[1]"'] + - title: "`tobinary`" + body: | + + The `tobinary` function is like `tostring`, but its output + will be a string which when output to jq's output stream + will be base64-encoded, and which if added with other + strings will produce a binary string value. + + Internally the binary string may be represented efficiently, + and may not be encoded until it is output or until it is + passed to `tostring`. Adding a byte value (integer value + between 0 and 255, inclusive) to a binary string is allowed, + and will append that byte to it. + + - title: "`tobinary_bytearray`" + body: | + + The `tobinary_bytearray` function is like `tobinary`, but + when output by jq it will be represented as an array of + small non-negative byte value integers. + + - title: "`tobinary_utf8`" + body: | + + The `tobinary_utf8` function is like `tobinary`, but when + output by jq it will be converted to UTF-8 with bad + character replacements. + + - title: "`tobinary(bytes)`" + body: | + + This function constructs a binary string value like + `tobinary` but consisting of the byte values output by + `bytes`. + - title: "`type`" body: | @@ -1421,6 +1459,13 @@ sections: string, which is one of null, boolean, number, string, array or object. + - title: "`stringtype`" + body: | + + Strings can be UTF-8 strings or binary strings. The + `stringtype` builtin outputs `"UTF-8"` or `"binary"` when + given a string as input. 
+ examples: - program: 'map(type)' input: '[0, false, [], {}, null, "hello"]' @@ -2038,7 +2083,9 @@ sections: * `@base64d`: The inverse of `@base64`, input is decoded as specified by RFC 4648. - Note\: If the decoded string is not UTF-8, the results are undefined. + The result will be a binary string as if `tobinary_utf8` + was used, meaning that on output bad characters will be + replaced. This syntax can be combined with string interpolation in a useful way. You can follow a `@foo` token with a string diff --git a/jq.1.prebuilt b/jq.1.prebuilt index c4f99dd603..91e902cdcb 100644 --- a/jq.1.prebuilt +++ b/jq.1.prebuilt @@ -742,6 +742,9 @@ The operator \fB+\fR takes two filters, applies them both to the same input, and .P \fBnull\fR can be added to any value, and returns the other value unchanged\. . +.P +A numeric byte value between 0 and 255, inclusive, can be added to a binary string value\. +. .IP "" 4 . .nf @@ -1483,9 +1486,27 @@ jq \'\.[] | tostring\' . .IP "" 0 . +.SS "tobinary" +The \fBtobinary\fR function is like \fBtostring\fR, but its output will be a string which when output to jq\'s output stream will be base64\-encoded, and which if added with other strings will produce a binary string value\. +. +.P +Internally the binary string may be represented efficiently, and may not be encoded until it is output or until it is passed to \fBtostring\fR\. Adding a byte value (integer value between 0 and 255, inclusive) to a binary string is allowed, and will append that byte to it\. +. +.SS "tobinary_bytearray" +The \fBtobinary_bytearray\fR function is like \fBtobinary\fR, but when output by jq it will be represented as an array of small non\-negative byte value integers\. +. +.SS "tobinary_utf8" +The \fBtobinary_utf8\fR function is like \fBtobinary\fR, but when output by jq it will be converted to UTF\-8 with bad character replacements\. +. 
+.SS "tobinary(bytes)" +This function constructs a binary string value like \fBtobinary\fR but consisting of the byte values output by \fBbytes\fR\. +. .SS "type" The \fBtype\fR function returns the type of its argument as a string, which is one of null, boolean, number, string, array or object\. . +.SS "stringtype" +Strings can be UTF\-8 strings or binary strings\. The \fBstringtype\fR builtin outputs \fB"UTF\-8"\fR or \fB"binary"\fR when given a string as input\. +. .IP "" 4 . .nf @@ -2216,7 +2237,7 @@ The input is converted to base64 as specified by RFC 4648\. \fB@base64d\fR: . .IP -The inverse of \fB@base64\fR, input is decoded as specified by RFC 4648\. Note\e: If the decoded string is not UTF\-8, the results are undefined\. +The inverse of \fB@base64\fR, input is decoded as specified by RFC 4648\. The result will be a binary string as if \fBtobinary_utf8\fR was used, meaning that on output bad characters will be replaced\. . .P This syntax can be combined with string interpolation in a useful way\. You can follow a \fB@foo\fR token with a string literal\. The contents of the string literal will \fInot\fR be escaped\. However, all interpolations made inside that string literal will be escaped\. 
For instance, diff --git a/src/builtin.c b/src/builtin.c index b38d4c2f4f..37ed57df56 100644 --- a/src/builtin.c +++ b/src/builtin.c @@ -102,6 +102,15 @@ static jv f_plus(jq_state *jq, jv input, jv a, jv b) { return jv_array_concat(a, b); } else if (jv_get_kind(a) == JV_KIND_OBJECT && jv_get_kind(b) == JV_KIND_OBJECT) { return jv_object_merge(a, b); + } else if (jv_get_kind(a) == JV_KIND_STRING && + jv_get_string_kind(a) != JV_STRING_KIND_UTF8 && + jv_get_kind(b) == JV_KIND_NUMBER) { + double c = jv_number_value(b); /* range-check as double: converting an out-of-range double to an integer type is undefined */ + if (!(c > -1 && c < 256)) + return type_error2(a, b, "cannot be added because the latter is not a valid byte value"); + unsigned char uc = (unsigned char)c; + jv_free(b); + return jv_binary_append_buf(a, &uc, 1); } else { return type_error2(a, b, "cannot be added"); } @@ -511,7 +519,12 @@ static jv f_length(jq_state *jq, jv input) { static jv f_tostring(jq_state *jq, jv input) { if (jv_get_kind(input) == JV_KIND_STRING) { - return input; + switch (jv_get_string_kind(input)) { + case JV_STRING_KIND_UTF8: + return input; + default: /* binary flavours: rebuild as a plain string via jv_string_sized() */ + return jv_string_from_binary(input); + } } else { return jv_dump_string(input, 0); } @@ -525,24 +541,6 @@ static jv f_utf8bytelength(jq_state *jq, jv input) { #define CHARS_ALPHANUM "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789" -static const unsigned char BASE64_ENCODE_TABLE[64 + 1] = CHARS_ALPHANUM "+/"; -static const unsigned char BASE64_INVALID_ENTRY = 0xFF; -static const unsigned char BASE64_DECODE_TABLE[255] = { - 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - 62, // + - 0xFF, 0xFF, 0xFF, - 63, // / - 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, // 0-9 - 0xFF, 0xFF, 0xFF, - 99, // = - 0xFF, 0xFF, 0xFF, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, // A-Z - 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, // a-z - 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF -}; - - static jv escape_string(jv input, const char* escapings) { assert(jv_get_kind(input) == JV_KIND_STRING); @@ -560,7 +558,7 @@ static jv escape_string(jv input, const char* escapings) { const char* i = jv_string_value(input); const char* end = i + jv_string_length_bytes(jv_copy(input)); const char* cstart; - int c = 0; + uint32_t c = 0; while ((i = jvp_utf8_next((cstart = i), end, &c))) { if (c < 128 && lookup[c]) { ret = jv_string_append_str(ret, lookup[c]); @@ -693,70 +691,10 @@ static jv f_format(jq_state *jq, jv input, jv fmt) { return line; } else if (!strcmp(fmt_s, "base64")) { jv_free(fmt); - input = f_tostring(jq, input); - jv line = jv_string(""); - const unsigned char* data = (const unsigned char*)jv_string_value(input); - int len = jv_string_length_bytes(jv_copy(input)); - for (int i=0; i= 3 ? 3 : len-i; - for (int j=0; j<3; j++) { - code <<= 8; - code |= j < n ? 
(unsigned)data[i+j] : 0; - } - char buf[4]; - for (int j=0; j<4; j++) { - buf[j] = BASE64_ENCODE_TABLE[(code >> (18 - j*6)) & 0x3f]; - } - if (n < 3) buf[3] = '='; - if (n < 2) buf[2] = '='; - line = jv_string_append_buf(line, buf, sizeof(buf)); - } - jv_free(input); - return line; + return jv_binary_to_base64(f_tostring(jq, input)); } else if (!strcmp(fmt_s, "base64d")) { jv_free(fmt); - input = f_tostring(jq, input); - const unsigned char* data = (const unsigned char*)jv_string_value(input); - int len = jv_string_length_bytes(jv_copy(input)); - size_t decoded_len = (3 * len) / 4; // 3 usable bytes for every 4 bytes of input - char *result = jv_mem_calloc(decoded_len, sizeof(char)); - memset(result, 0, decoded_len * sizeof(char)); - uint32_t ri = 0; - int input_bytes_read=0; - uint32_t code = 0; - for (int i=0; i> 16) & 0xFF; - result[ri++] = (code >> 8) & 0xFF; - result[ri++] = code & 0xFF; - input_bytes_read = 0; - code = 0; - } - } - if (input_bytes_read == 3) { - result[ri++] = (code >> 10) & 0xFF; - result[ri++] = (code >> 2) & 0xFF; - } else if (input_bytes_read == 2) { - result[ri++] = (code >> 4) & 0xFF; - } else if (input_bytes_read == 1) { - free(result); - return type_error(input, "trailing base64 byte found"); - } - - jv line = jv_string_sized(result, ri); - jv_free(input); - free(result); - return line; + return jv_binary_from_base64(f_tostring(jq, input)); } else { jv_free(input); return jv_invalid_with_msg(jv_string_concat(fmt, jv_string(" is not a valid format"))); @@ -1603,13 +1541,24 @@ static jv f_strftime(jq_state *jq, jv a, jv b) { const char *fmt = jv_string_value(b); size_t alloced = strlen(fmt) + 100; - char *buf = alloca(alloced); + char *buf; + if (alloced > 2048) + buf = jv_mem_alloc(alloced); + else + buf = alloca(alloced); size_t n = strftime(buf, alloced, fmt, &tm); jv_free(b); /* POSIX doesn't provide errno values for strftime() failures; weird */ - if (n == 0 || n > alloced) + if (n == 0 || n > alloced) { + if (alloced > 2048) + 
jv_mem_free(buf); return jv_invalid_with_msg(jv_string("strftime/1: unknown system failure")); - return jv_string(buf); + } + if (alloced <= 2048) /* buf is heap-allocated only when alloced > 2048; an alloca() buffer must not be freed */ + return jv_string(buf); + b = jv_string(buf); + jv_mem_free(buf); + return b; } #else static jv f_strftime(jq_state *jq, jv a, jv b) { @@ -1678,6 +1627,56 @@ static jv f_current_line(jq_state *jq, jv a) { return jq_util_input_get_current_line(jq); } +static jv f_tobinary(jq_state *jq, jv a) { + switch (jv_get_kind(a)) { + case JV_KIND_STRING: + a.subkind = JV_STRING_KIND_BINARY; + return a; + case JV_KIND_ARRAY: { /* braces: a declaration may not directly follow a case label before C23 */ + int len = jv_array_length(jv_copy(a)); + unsigned char *b = jv_mem_alloc(len); + jv_array_foreach(a, i, x) { + if (jv_get_kind(x) != JV_KIND_NUMBER) { + char errbuf[15]; + jv_free(a); + jv_mem_free(b); + return jv_invalid_with_msg(jv_string_fmt("Not a byte value at array index %d: %s", i, + jv_dump_string_trunc(x, errbuf, sizeof(errbuf)))); /* hands x over instead of leaking it */ + } + /* XXX No validation that `x' is an integer... */ + b[i] = jv_number_value(x); + jv_free(x); + } + jv_free(a); + a = jv_binary_sized(b, len); + jv_mem_free(b); + return a; + } + default: + return ret_error(a, jv_string("Only strings and arrays of byte values can be converted to binary")); + } +} + +static jv f_tobinary_bytearray(jq_state *jq, jv a) { + a = f_tobinary(jq, a); + a.subkind = JV_STRING_KIND_BINARY_BYTEARRAY; + return a; +} + +static jv f_tobinary_utf8(jq_state *jq, jv a) { + a = f_tobinary(jq, a); + a.subkind = JV_STRING_KIND_BINARY_UTF8; + return a; +} + +static jv f_stringtype(jq_state *jq, jv a) { + if (jv_get_kind(a) != JV_KIND_STRING) + return type_error(a, "is not a string"); + jv b = jv_string(jv_string_kind_name(jv_get_string_kind(a))); + jv_free(a); + return b; +} + #define LIBM_DD(name) \ {(cfunction_ptr)f_ ## name, #name, 1}, #define LIBM_DD_NO(name) @@ -1769,6 +1766,10 @@ static const struct cfunction function_list[] = { {(cfunction_ptr)f_now, "now", 1}, {(cfunction_ptr)f_current_filename, "input_filename", 1}, {(cfunction_ptr)f_current_line, "input_line_number",
1}, + {(cfunction_ptr)f_tobinary, "tobinary", 1}, + {(cfunction_ptr)f_tobinary_bytearray, "tobinary_bytearray", 1}, + {(cfunction_ptr)f_tobinary_utf8, "tobinary_utf8", 1}, + {(cfunction_ptr)f_stringtype, "stringtype", 1}, }; #undef LIBM_DDDD_NO #undef LIBM_DDD_NO diff --git a/src/builtin.jq b/src/builtin.jq index a13d7845bf..b6341efb8a 100644 --- a/src/builtin.jq +++ b/src/builtin.jq @@ -281,3 +281,5 @@ def JOIN($idx; stream; idx_expr; join_expr): stream | [., $idx[idx_expr]] | join_expr; def IN(s): any(s == .; .); def IN(src; s): any(src == s; .); + +def tobinary(bytes): reduce bytes as $byte (""|tobinary; . + $byte); diff --git a/src/execute.c b/src/execute.c index adf3773799..557841123a 100644 --- a/src/execute.c +++ b/src/execute.c @@ -11,6 +11,7 @@ #include "jv_alloc.h" #include "jq_parser.h" +#include "jv_unicode.h" #include "locfile.h" #include "jv.h" #include "jq.h" @@ -673,20 +674,50 @@ jv jq_next(jq_state *jq) { case INDEX_OPT: { jv t = stack_pop(jq); jv k = stack_pop(jq); - // detect invalid path expression like path(reverse | .a) - if (!path_intact(jq, jv_copy(t))) { - char keybuf[15]; - char objbuf[30]; - jv msg = jv_string_fmt( - "Invalid path expression near attempt to access element %s of %s", - jv_dump_string_trunc(k, keybuf, sizeof(keybuf)), - jv_dump_string_trunc(t, objbuf, sizeof(objbuf))); - set_error(jq, jv_invalid_with_msg(msg)); - goto do_backtrack; + jv v; + if (jv_get_kind(t) == JV_KIND_STRING && jv_get_kind(k) == JV_KIND_NUMBER) { + switch (jv_get_string_kind(t)) { + case JV_STRING_KIND_UTF8: + v = jv_string_append_codepoint(jv_string(""), jv_string_index(t, jv_number_value(k))); + break; + case JV_STRING_KIND_BINARY: + case JV_STRING_KIND_BINARY_BYTEARRAY: + case JV_STRING_KIND_BINARY_UTF8: + const char *s = jv_string_value(t); + int len = jv_string_length_bytes(jv_copy(t)); + int idx = jv_number_value(k); + + if (idx < 0) + idx += len; /* a negative index counts back from the end of the string */ + if (idx < 0 || idx >= len) { + jv_free(t); /* t and k were popped above and are still owned here */ + jv_free(k); + goto do_backtrack; + } + v = jv_number(((unsigned char *)s)[idx]); +
jv_free(t); + break; + default: + set_error(jq, jv_invalid_with_msg(jv_string("Internal error: unknown string sub-type"))); + goto do_backtrack; + } + } else { + // detect invalid path expression like path(reverse | .a) + if (!path_intact(jq, jv_copy(t))) { + char keybuf[15]; + char objbuf[30]; + jv msg = jv_string_fmt( + "Invalid path expression near attempt to access element %s of %s", + jv_dump_string_trunc(k, keybuf, sizeof(keybuf)), + jv_dump_string_trunc(t, objbuf, sizeof(objbuf))); + set_error(jq, jv_invalid_with_msg(msg)); + goto do_backtrack; + } + v = jv_get(t, jv_copy(k)); + if (jv_is_valid(v)) + path_append(jq, k, jv_copy(v)); } - jv v = jv_get(t, jv_copy(k)); if (jv_is_valid(v)) { - path_append(jq, k, jv_copy(v)); stack_push(jq, v); } else { jv_free(k); @@ -721,7 +749,8 @@ jv jq_next(jq_state *jq) { case EACH_OPT: { jv container = stack_pop(jq); // detect invalid path expression like path(reverse | .[]) - if (!path_intact(jq, jv_copy(container))) { + if (jv_get_kind(container) != JV_KIND_STRING && + !path_intact(jq, jv_copy(container))) { char errbuf[30]; jv msg = jv_string_fmt( "Invalid path expression near attempt to iterate through %s", @@ -758,6 +787,46 @@ jv jq_next(jq_state *jq) { key = jv_object_iter_key(container, idx); value = jv_object_iter_value(container, idx); } + } else if (jv_get_kind(container) == JV_KIND_STRING) { + switch (jv_get_string_kind(container)) { + case JV_STRING_KIND_UTF8: { + const char *s = jv_string_value(container); + const char *next = s; + int len = jv_string_length_bytes(jv_copy(container)); + const char *end = s + len; + uint32_t c = 0; + if (opcode == EACH || opcode == EACH_OPT) { + idx = 0; + } else { + next = s + idx; + } + keep_going = idx < len; + if (keep_going) { /* at idx == len (e.g. an empty string) jvp_utf8_next() returns 0, which must not be used as a pointer */ + next = jvp_utf8_next(next, end, &c); + idx = next - s; + value = jv_string_append_codepoint(jv_string(""), c); + is_last = (next == end); /* jvp_utf8_next() returns 0 exactly when its input pointer equals end */ + } + break; + } + case JV_STRING_KIND_BINARY: + case JV_STRING_KIND_BINARY_BYTEARRAY: + case JV_STRING_KIND_BINARY_UTF8: +
const unsigned char *s = (const unsigned char *)jv_string_value(container); + int len = jv_string_length_bytes(jv_copy(container)); + if (opcode == EACH || opcode == EACH_OPT) { + idx = 0; + } else { + idx++; + } + keep_going = idx < len; + value = jv_string_append_codepoint(jv_string(""), s[idx]); + is_last = idx == len -1; + break; + default: + set_error(jq, jv_invalid_with_msg(jv_string("Internal error: unknown string sub-type"))); + goto do_backtrack; + } } else { assert(opcode == EACH || opcode == EACH_OPT); if (opcode == EACH) { @@ -777,15 +844,17 @@ jv jq_next(jq_state *jq) { goto do_backtrack; } else if (is_last) { // we don't need to make a backtrack point - jv_free(container); - path_append(jq, key, jv_copy(value)); + if (jv_get_kind(container) != JV_KIND_STRING) + path_append(jq, key, jv_copy(value)); stack_push(jq, value); + jv_free(container); } else { struct stack_pos spos = stack_get_pos(jq); stack_push(jq, container); stack_push(jq, jv_number(idx)); stack_save(jq, pc - 1, spos); - path_append(jq, key, jv_copy(value)); + if (jv_get_kind(container) != JV_KIND_STRING) + path_append(jq, key, jv_copy(value)); stack_push(jq, value); } break; diff --git a/src/jq.h b/src/jq.h index 5269de3ff8..9c6b1592f9 100644 --- a/src/jq.h +++ b/src/jq.h @@ -54,9 +54,15 @@ jv jq_get_attr(jq_state *, jv); */ typedef struct jq_util_input_state jq_util_input_state; typedef void (*jq_util_msg_cb)(void *, const char *); +typedef enum { + JQ_UTIL_PARSE_SLURP = 1, + JQ_UTIL_PARSE_BINARY = 2, +} jq_util_parser_enum; + + jq_util_input_state *jq_util_input_init(jq_util_msg_cb, void *); -void jq_util_input_set_parser(jq_util_input_state *, jv_parser *, int); +void jq_util_input_set_parser(jq_util_input_state *, jv_parser *, jq_util_parser_enum); void jq_util_input_free(jq_util_input_state **); void jq_util_input_add_input(jq_util_input_state *, const char *); int jq_util_input_errors(jq_util_input_state *); diff --git a/src/jv.c b/src/jv.c index 498a14149d..5ffd5c4988 100644 --- 
a/src/jv.c +++ b/src/jv.c @@ -144,6 +144,24 @@ jv jv_bool(int x) { return x ? JV_TRUE : JV_FALSE; } +jv_string_kind jv_get_string_kind(jv v) { + assert(jv_get_kind(v) == JV_KIND_STRING); + return v.subkind; +} + +const char* jv_string_kind_name(jv_string_kind k) { + switch (k) { + case JV_STRING_KIND_UTF8: + return "UTF-8"; + case JV_STRING_KIND_BINARY: + case JV_STRING_KIND_BINARY_BYTEARRAY: + case JV_STRING_KIND_BINARY_UTF8: + return "binary"; + default: + return ""; + } +} + /* * Invalid objects, with optional error messages */ @@ -1094,10 +1112,10 @@ static jv jvp_string_copy_replace_bad(const char* data, uint32_t length) { uint32_t maxlength = length * 3 + 1; // worst case: all bad bytes, each becomes a 3-byte U+FFFD jvp_string* s = jvp_string_alloc(maxlength); char* out = s->data; - int c = 0; + uint32_t c = 0; while ((i = jvp_utf8_next((cstart = i), end, &c))) { - if (c == -1) { + if (c == (uint32_t)-1) { c = 0xFFFD; // U+FFFD REPLACEMENT CHARACTER } out += jvp_utf8_encode(c, out); @@ -1167,8 +1185,8 @@ static jv jvp_string_append(jv string, const char* data, uint32_t len) { memcpy(news->data, s->data, currlen); memcpy(news->data + currlen, data, len); news->data[currlen + len] = 0; + jv r = {JVP_FLAGS_STRING, string.subkind, 0, 0, {&news->refcnt}}; jvp_string_free(string); - jv r = {JVP_FLAGS_STRING, 0, 0, 0, {&news->refcnt}}; return r; } } @@ -1245,6 +1263,148 @@ static int jvp_string_equal(jv a, jv b) { return memcmp(stra->data, strb->data, jvp_string_length(stra)) == 0; } +/* + * Binary strings (public API) + */ + +jv jv_binary_sized(const unsigned char *str, int len) { + jv b = jvp_string_new((const char *)str, len); + b.subkind = JV_STRING_KIND_BINARY; + return b; +} + +jv jv_binary(const unsigned char *str) { + /* The input is NUL-terminated, but otherwise binary */ + return jv_binary_sized(str, strlen((const char *)str)); +} + +int jv_binary_length(jv j) { + assert(JVP_HAS_KIND(j, JV_KIND_STRING)); + int r = jvp_string_length(jvp_string_ptr(j)); + 
jv_free(j); + return r; +} + +jv jv_binary_slice(jv j, int start, int end) { + assert(JVP_HAS_KIND(j, JV_KIND_STRING)); + const unsigned char *s = (const unsigned char *)jv_string_value(j); + int len = jv_string_length_bytes(jv_copy(j)); + jv res; + + jvp_clamp_slice_params(len, &start, &end); + assert(0 <= start && start <= end && end <= len); + + /* See note in jv_string_slice() */ + res = jv_binary_sized(s + start, end - start); + jv_free(j); + return res; +} + +jv jv_binary_append_buf(jv a, const unsigned char *buf, int len) { + return jvp_string_append(a, (const char *)buf, len); +} + +jv jv_binary_from_string(jv j) { + j.subkind = JV_STRING_KIND_BINARY; + return j; +} + +#define CHARS_ALPHANUM "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789" + +static const unsigned char BASE64_ENCODE_TABLE[64 + 1] = CHARS_ALPHANUM "+/"; +static const unsigned char BASE64_INVALID_ENTRY = 0xFF; +static const unsigned char BASE64_DECODE_TABLE[255] = { + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 62, // + + 0xFF, 0xFF, 0xFF, + 63, // / + 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, // 0-9 + 0xFF, 0xFF, 0xFF, + 99, // = + 0xFF, 0xFF, 0xFF, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, // A-Z + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, // a-z + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF +}; + +jv jv_binary_to_base64(jv input) { + const unsigned char* data = (const unsigned char*)jv_string_value(input); + int len = jv_string_length_bytes(jv_copy(input)); + jv out = jv_string(""); + for (int i=0; i= 3 ? 3 : len-i; + for (int j=0; j<3; j++) { + code <<= 8; + code |= j < n ? (unsigned)data[i+j] : 0; + } + char buf[4]; + for (int j=0; j<4; j++) { + buf[j] = BASE64_ENCODE_TABLE[(code >> (18 - j*6)) & 0x3f]; + } + if (n < 3) buf[3] = '='; + if (n < 2) buf[2] = '='; + out = jv_string_append_buf(out, buf, sizeof(buf)); + } + jv_free(input); + return out; +} + +jv jv_binary_from_base64(jv input) { + const unsigned char* data = (const unsigned char*)jv_string_value(input); + int len = jv_string_length_bytes(jv_copy(input)); + size_t decoded_len = (3 * len) / 4; // 3 usable bytes for every 4 bytes of input + unsigned char *result = jv_mem_calloc(decoded_len, sizeof(char)); + memset(result, 0, decoded_len * sizeof(char)); + uint32_t ri = 0; + int input_bytes_read=0; + uint32_t code = 0; + for (int i=0; i> 16) & 0xFF; + result[ri++] = (code >> 8) & 0xFF; + result[ri++] = code & 0xFF; + input_bytes_read = 0; + code = 0; + } + } + if (input_bytes_read == 3) { + result[ri++] = (code >> 10) & 0xFF; + result[ri++] = (code >> 2) & 0xFF; + } else if (input_bytes_read == 2) { + result[ri++] = (code >> 4) & 0xFF; + } else if (input_bytes_read == 1) { + char errbuf[15]; + + jv err = jv_invalid_with_msg(jv_string_fmt("Invalid base64 data (trailing base64 byte found) (%s)", + jv_dump_string_trunc(jv_copy(input), errbuf, sizeof(errbuf)))); + jv_free(input); + 
free(result); + return err; + } + + jv line = jv_binary_sized(result, ri); + line.subkind = JV_STRING_KIND_BINARY_UTF8; + jv_free(input); + free(result); + return line; +} + /* * Strings (public API) */ @@ -1264,6 +1424,12 @@ jv jv_string(const char* str) { return jv_string_sized(str, strlen(str)); } +jv jv_string_from_binary(jv j) { + jv r = jv_string_sized(jv_string_value(j), jv_string_length_bytes(jv_copy(j))); + jv_free(j); + return r; +} + int jv_string_length_bytes(jv j) { assert(JVP_HAS_KIND(j, JV_KIND_STRING)); int r = jvp_string_length(jvp_string_ptr(j)); @@ -1275,12 +1441,49 @@ int jv_string_length_codepoints(jv j) { assert(JVP_HAS_KIND(j, JV_KIND_STRING)); const char* i = jv_string_value(j); const char* end = i + jv_string_length_bytes(jv_copy(j)); - int c = 0, len = 0; + uint32_t c = 0; + int len = 0; while ((i = jvp_utf8_next(i, end, &c))) len++; jv_free(j); return len; } +uint32_t jv_string_index(jv j, int idx) { + assert(JVP_HAS_KIND(j, JV_KIND_STRING)); + const char* i = jv_string_value(j); + const char* end = i + jv_string_length_bytes(jv_copy(j)); + uint32_t c = 0; + switch (jv_get_string_kind(j)) { + case JV_STRING_KIND_UTF8: + if (idx < 0) { + idx += jv_string_length_codepoints(jv_copy(j)); + if (idx < 0) + break; /* out of range: c is still 0; leave via the common exit so j is released */ + } + while (i < end && idx >= 0) { + i = jvp_utf8_next(i, end, &c); + idx--; + } + if (i == end && idx != -1) + c = 0; + break; + case JV_STRING_KIND_BINARY: + case JV_STRING_KIND_BINARY_BYTEARRAY: + case JV_STRING_KIND_BINARY_UTF8: + if (idx < 0) + idx += end - i; + if (idx < 0) + break; /* out of range: c is still 0 */ + if (idx >= end - i) /* compare offsets rather than forming a pointer past the buffer */ + break; + c = ((const unsigned char *)i)[idx]; + break; + default: + break; + } + jv_free(j); + return c; +} jv jv_string_indexes(jv j, jv k) { assert(JVP_HAS_KIND(j, JV_KIND_STRING)); @@ -1317,18 +1520,27 @@ jv jv_string_split(jv j, jv sep) { assert(jv_get_refcnt(a) == 1); if (seplen == 0) { - int c; + uint32_t c; while ((jstr = jvp_utf8_next(jstr, jend, &c))) - a = jv_array_append(a,
jv_string_append_codepoint(jv_string(""), c)); + a = jv_array_append(a, jv_string_append_codepoint(j.subkind == JV_STRING_KIND_UTF8 ? + jv_string("") : + jv_binary((const unsigned char *)""), + c)); } else { for (p = jstr; p < jend; p = s + seplen) { s = _jq_memmem(p, jend - p, sepstr, seplen); if (s == NULL) s = jend; - a = jv_array_append(a, jv_string_sized(p, s - p)); - // Add an empty string to denote that j ends on a sep + a = jv_array_append(a, + j.subkind == JV_STRING_KIND_UTF8 ? + jv_string_sized(p, s - p) : + jv_binary_sized((const unsigned char *)p, s - p)); + // Add an empty string to denote that j ends on a sep if (s + seplen == jend && seplen != 0) - a = jv_array_append(a, jv_string("")); + a = jv_array_append(a, + j.subkind == JV_STRING_KIND_UTF8 ? + jv_string("") : + jv_binary((const unsigned char *)"")); } } jv_free(j); @@ -1342,7 +1554,7 @@ jv jv_string_explode(jv j) { int len = jv_string_length_bytes(jv_copy(j)); const char* end = i + len; jv a = jv_array_sized(len); - int c; + uint32_t c; while ((i = jvp_utf8_next(i, end, &c))) a = jv_array_append(a, jv_number(c)); jv_free(j); @@ -1385,11 +1597,13 @@ const char* jv_string_value(jv j) { jv jv_string_slice(jv j, int start, int end) { assert(JVP_HAS_KIND(j, JV_KIND_STRING)); + if (j.subkind != JV_STRING_KIND_UTF8) + return jv_binary_slice(j, start, end); const char *s = jv_string_value(j); int len = jv_string_length_bytes(jv_copy(j)); int i; const char *p, *e; - int c; + uint32_t c; jv res; jvp_clamp_slice_params(len, &start, &end); @@ -1402,7 +1616,7 @@ jv jv_string_slice(jv j, int start, int end) { jv_free(j); return jv_string_empty(16); } - if (c == -1) { + if (c == (uint32_t)-1) { jv_free(j); return jv_invalid_with_msg(jv_string("Invalid UTF-8 string")); } @@ -1414,7 +1628,7 @@ jv jv_string_slice(jv j, int start, int end) { e = s + len; break; } - if (c == -1) { + if (c == (uint32_t)-1) { jv_free(j); return jv_invalid_with_msg(jv_string("Invalid UTF-8 string")); } @@ -1433,27 +1647,31 @@ jv
jv_string_slice(jv j, int start, int end) { } jv jv_string_concat(jv a, jv b) { + jv_string_kind subkind = a.subkind != JV_STRING_KIND_UTF8 ? a.subkind : b.subkind; /* binary wins over UTF-8; the left operand's flavour wins between two binaries (OR-ing the enum ordinals would turn BINARY|BINARY_BYTEARRAY into BINARY_UTF8) */ a = jvp_string_append(a, jv_string_value(b), jvp_string_length(jvp_string_ptr(b))); jv_free(b); + a.subkind = subkind; return a; } jv jv_string_append_buf(jv a, const char* buf, int len) { - if (jvp_utf8_is_valid(buf, buf+len)) { - a = jvp_string_append(a, buf, len); - } else { - jv b = jvp_string_copy_replace_bad(buf, len); - a = jv_string_concat(a, b); - } - return a; + if (a.subkind != JV_STRING_KIND_UTF8) + return jvp_string_append(a, buf, len); + if (jvp_utf8_is_valid(buf, buf+len)) + return jvp_string_append(a, buf, len); + jv b = jvp_string_copy_replace_bad(buf, len); + return jv_string_concat(a, b); } jv jv_string_append_codepoint(jv a, uint32_t c) { + if (a.subkind != JV_STRING_KIND_UTF8 && c < 256) { + unsigned char uc = c; + return jvp_string_append(a, (char *)&uc, 1); + } char buf[5]; int len = jvp_utf8_encode(c, buf); - a = jvp_string_append(a, buf, len); - return a; + return jvp_string_append(a, buf, len); } jv jv_string_append_str(jv a, const char* str) { diff --git a/src/jv.h b/src/jv.h index 8c96f822f0..ef764f2fcb 100644 --- a/src/jv.h +++ b/src/jv.h @@ -16,13 +16,26 @@ typedef enum { JV_KIND_OBJECT } jv_kind; +typedef enum { + /* String subtypes */ + JV_STRING_KIND_UTF8, + JV_STRING_KIND_BINARY, /* prints as base64 */ + JV_STRING_KIND_BINARY_BYTEARRAY, /* prints as array of byte values */ + JV_STRING_KIND_BINARY_UTF8, /* prints as UTF-8 with bad character substitutions */ + /* Maybe add empty and 1-element arrays as array subtypes to avoid allocations? */ + /* + * XXX TODO MAYBE: merge with JVP_FLAGS concept, move to using kind_flags field, + * and/or combine the old pad_ and kind/kind_flags fields. + */ +} jv_string_kind; + struct jv_refcnt; /* All of the fields of this struct are private. Really. Do not play with them.
*/ typedef struct { unsigned char kind_flags; - unsigned char pad_; + unsigned char subkind; unsigned short offset; /* array offsets */ int size; union { @@ -40,6 +53,9 @@ jv_kind jv_get_kind(jv); const char* jv_kind_name(jv_kind); static int jv_is_valid(jv x) { return jv_get_kind(x) != JV_KIND_INVALID; } +jv_string_kind jv_get_string_kind(jv); +const char* jv_string_kind_name(jv_string_kind); + jv jv_copy(jv); void jv_free(jv); @@ -105,13 +121,24 @@ jv jv_array_indexes(jv, jv); #endif +jv jv_binary_sized(const unsigned char *, int); +jv jv_binary(const unsigned char *); +int jv_binary_length(jv); +jv jv_binary_slice(jv, int, int); +jv jv_binary_append_buf(jv, const unsigned char *, int); +jv jv_binary_to_base64(jv); +jv jv_binary_from_base64(jv); +jv jv_binary_from_string(jv); + jv jv_string(const char*); jv jv_string_sized(const char*, int); +jv jv_string_from_binary(jv); jv jv_string_empty(int len); int jv_string_length_bytes(jv); int jv_string_length_codepoints(jv); unsigned long jv_string_hash(jv); const char* jv_string_value(jv); +uint32_t jv_string_index(jv, int); jv jv_string_indexes(jv j, jv k); jv jv_string_slice(jv j, int start, int end); jv jv_string_concat(jv, jv); diff --git a/src/jv_print.c b/src/jv_print.c index d1db88aa89..49e14cd52c 100644 --- a/src/jv_print.c +++ b/src/jv_print.c @@ -113,16 +113,40 @@ static void put_indent(int n, int flags, FILE* fout, jv* strout, int T) { } } -static void jvp_dump_string(jv str, int ascii_only, FILE* F, jv* S, int T) { +static void jvp_dump_string(struct dtoa_context *C, jv str, int ascii_only, FILE* F, jv* S, int T) { assert(jv_get_kind(str) == JV_KIND_STRING); + if (jv_get_string_kind(str) == JV_STRING_KIND_BINARY_BYTEARRAY) { + const unsigned char *s = (const unsigned char *)jv_string_value(str); + char buf[JVP_DTOA_FMT_MAX_LEN]; + int i, len = jv_string_length_bytes(jv_copy(str)); + + put_char('[', F, S, T); + for (i = 0; i < len; i++) { + /* XXX This is way too slow */ + /* XXX Need to do indentation */ + 
put_str(jvp_dtoa_fmt(C, buf, s[i]), F, S, 0 /* XXX flags */); + if (i < len - 1) + put_char(',', F, S, T); + } + put_char(']', F, S, T); + return; + } + if (jv_get_string_kind(str) == JV_STRING_KIND_BINARY) { + // TODO: Add several different ways to represent binary. + // We should have: base64, hex, array of bytes, and + // maybe even a just-8bit representation + str = jv_binary_to_base64(str); + } else if (jv_get_string_kind(str) == JV_STRING_KIND_BINARY_UTF8) { + str = jv_string_from_binary(str); + } const char* i = jv_string_value(str); const char* end = i + jv_string_length_bytes(jv_copy(str)); const char* cstart; - int c = 0; + uint32_t c = 0; char buf[32]; put_char('"', F, S, T); while ((i = jvp_utf8_next((cstart = i), end, &c))) { - assert(c != -1); + assert(c != (uint32_t)-1); int unicode_escape = 0; if (0x20 <= c && c <= 0x7E) { // printable ASCII @@ -176,7 +200,7 @@ static void jvp_dump_string(jv str, int ascii_only, FILE* F, jv* S, int T) { put_str(buf, F, S, T); } } - assert(c != -1); + assert(c != (uint32_t)-1); put_char('"', F, S, T); } @@ -205,7 +229,7 @@ static void jv_dump_term(struct dtoa_context* C, jv x, int flags, int indent, FI jv msg = jv_invalid_get_msg(jv_copy(x)); if (jv_get_kind(msg) == JV_KIND_STRING) { put_str("<invalid:", F, S, flags & JV_PRINT_ISATTY); - jvp_dump_string(msg, flags | JV_PRINT_ASCII, F, S, flags & JV_PRINT_ISATTY); + jvp_dump_string(C, msg, flags | JV_PRINT_ASCII, F, S, flags & JV_PRINT_ISATTY); put_str(">", F, S, flags & JV_PRINT_ISATTY); } else { put_str("<invalid>", F, S, flags & JV_PRINT_ISATTY); @@ -250,7 +274,7 @@ static void jv_dump_term(struct dtoa_context* C, jv x, int flags, int indent, FI break; } case JV_KIND_STRING: - jvp_dump_string(x, flags & JV_PRINT_ASCII, F, S, flags & JV_PRINT_ISATTY); + jvp_dump_string(C, x, flags & JV_PRINT_ASCII, F, S, flags & JV_PRINT_ISATTY); if (flags & JV_PRINT_REFCOUNT) put_refcnt(C, refcnt, F, S, flags & JV_PRINT_ISATTY); break; @@ -337,7 +361,7 @@ static void jv_dump_term(struct dtoa_context* C, jv x, int flags, int indent, FI first = 0; if (color) put_str(FIELD_COLOR, F, S, flags & JV_PRINT_ISATTY); - jvp_dump_string(key, flags & JV_PRINT_ASCII, F, S, flags & JV_PRINT_ISATTY); +
jvp_dump_string(C, key, flags & JV_PRINT_ASCII, F, S, flags & JV_PRINT_ISATTY); jv_free(key); if (color) put_str(COLRESET, F, S, flags & JV_PRINT_ISATTY); diff --git a/src/jv_unicode.c b/src/jv_unicode.c index d197349f48..275547a0d9 100644 --- a/src/jv_unicode.c +++ b/src/jv_unicode.c @@ -26,12 +26,12 @@ const char* jvp_utf8_backtrack(const char* start, const char* min, int *missing_ return start; } -const char* jvp_utf8_next(const char* in, const char* end, int* codepoint_ret) { +const char* jvp_utf8_next(const char* in, const char* end, uint32_t *codepoint_ret) { assert(in <= end); if (in == end) { return 0; } - int codepoint = -1; + uint32_t codepoint = 0xffffffff; unsigned char first = (unsigned char)in[0]; int length = utf8_coding_length[first]; if ((first & 0x80) == 0) { @@ -50,7 +50,7 @@ const char* jvp_utf8_next(const char* in, const char* end, int* codepoint_ret) { unsigned ch = (unsigned char)in[i]; if (utf8_coding_length[ch] != UTF8_CONTINUATION_BYTE){ /* Invalid UTF8 sequence - not followed by the right number of continuation bytes */ - codepoint = -1; + codepoint = 0xffffffff; length = i; break; } @@ -75,9 +75,9 @@ const char* jvp_utf8_next(const char* in, const char* end, int* codepoint_ret) { } int jvp_utf8_is_valid(const char* in, const char* end) { - int codepoint; + uint32_t codepoint; while ((in = jvp_utf8_next(in, end, &codepoint))) { - if (codepoint == -1) return 0; + if (codepoint == 0xffffffff) return 0; } return 1; } @@ -90,15 +90,16 @@ int jvp_utf8_decode_length(char startchar) { else return 4; // 1111 ____ } -int jvp_utf8_encode_length(int codepoint) { +int jvp_utf8_encode_length(uint32_t codepoint) { + assert(codepoint <= 0x10FFFF); if (codepoint <= 0x7F) return 1; else if (codepoint <= 0x7FF) return 2; else if (codepoint <= 0xFFFF) return 3; else return 4; } -int jvp_utf8_encode(int codepoint, char* out) { - assert(codepoint >= 0 && codepoint <= 0x10FFFF); +int jvp_utf8_encode(uint32_t codepoint, char* out) { + assert(codepoint <= 
0x10FFFF); char* start = out; if (codepoint <= 0x7F) { *out++ = codepoint; diff --git a/src/jv_unicode.h b/src/jv_unicode.h index 558721a8fd..ce511ee08a 100644 --- a/src/jv_unicode.h +++ b/src/jv_unicode.h @@ -1,12 +1,14 @@ #ifndef JV_UNICODE_H #define JV_UNICODE_H +#include <stdint.h> + const char* jvp_utf8_backtrack(const char* start, const char* min, int *missing_bytes); -const char* jvp_utf8_next(const char* in, const char* end, int* codepoint); +const char* jvp_utf8_next(const char* in, const char* end, uint32_t *codepoint); int jvp_utf8_is_valid(const char* in, const char* end); int jvp_utf8_decode_length(char startchar); -int jvp_utf8_encode_length(int codepoint); -int jvp_utf8_encode(int codepoint, char* out); +int jvp_utf8_encode_length(uint32_t codepoint); +int jvp_utf8_encode(uint32_t codepoint, char* out); #endif diff --git a/src/jv_utf8_tables.h b/src/jv_utf8_tables.h index f1a4252fce..544f6be78a 100644 --- a/src/jv_utf8_tables.h +++ b/src/jv_utf8_tables.h @@ -33,5 +33,5 @@ static const unsigned char utf8_coding_bits[] = 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x07, 0x07, 0x07, 0x07, 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}; -static const int utf8_first_codepoint[] = +static const uint32_t utf8_first_codepoint[] = {0x00, 0x00, 0x80, 0x800, 0x10000}; diff --git a/src/util.c b/src/util.c index 250bdf75f4..f87f144b76 100644 --- a/src/util.c +++ b/src/util.c @@ -41,6 +41,7 @@ void *alloca (size_t); #include "util.h" #include "jq.h" #include "jv_alloc.h" +#include "jv_unicode.h" #ifdef WIN32 FILE *fopen(const char *fname, const char *mode) { @@ -189,6 +190,7 @@ struct jq_util_input_state { size_t buf_valid_len; jv current_filename; size_t current_line; + jq_util_parser_enum flags; }; static void fprinter(void *data, const char *fname) { @@ -210,13 +212,16 @@ jq_util_input_state
*jq_util_input_init(jq_util_msg_cb err_cb, void *err_cb_data return new_state; } -void jq_util_input_set_parser(jq_util_input_state *state, jv_parser *parser, int slurp) { +void jq_util_input_set_parser(jq_util_input_state *state, + jv_parser *parser, + jq_util_parser_enum flags) { assert(!jv_is_valid(state->slurped)); state->parser = parser; + state->flags = flags; - if (parser == NULL && slurp) + if (parser == NULL && (flags & JQ_UTIL_PARSE_SLURP)) state->slurped = jv_string(""); - else if (slurp) + else if ((flags & JQ_UTIL_PARSE_SLURP)) state->slurped = jv_array(); else state->slurped = jv_invalid(); @@ -279,7 +284,9 @@ static int jq_util_input_read_more(jq_util_input_state *state) { state->current_filename = jv_string("<stdin>"); } else { state->current_input = fopen(f, "r"); - state->current_filename = jv_string(f); + state->current_filename = jvp_utf8_is_valid(f, f + strlen(f)) ? + jv_string(f) : + jv_binary((const unsigned char *)f); if (!state->current_input) { state->err_cb(state->err_cb_data, f); state->failures++; @@ -406,7 +413,10 @@ jv jq_util_input_next_input(jq_util_input_state *state) { continue; if (jv_is_valid(state->slurped)) { // Slurped raw input - state->slurped = jv_string_concat(state->slurped, jv_string_sized(state->buf, state->buf_valid_len)); + if (state->flags & JQ_UTIL_PARSE_BINARY) + state->slurped = jv_string_concat(state->slurped, jv_binary_sized((const unsigned char *)state->buf, state->buf_valid_len)); + else + state->slurped = jv_string_concat(state->slurped, jv_string_sized(state->buf, state->buf_valid_len)); } else { if (!jv_is_valid(value)) value = jv_string(""); diff --git a/tests/base64.test b/tests/base64.test index 0f82b0b71d..80e3927e9b 100644 --- a/tests/base64.test +++ b/tests/base64.test @@ -27,9 +27,9 @@ # invalid base64 characters (whitespace) . | try @base64d catch . "Not base64 data" -"string (\"Not base64...)
is not valid base64 data" +"Invalid base64 data (\"Not base64...)" # invalid base64 (too many bytes, QUJD = "ABCD" . | try @base64d catch . "QUJDa" -"string (\"QUJDa\") trailing base64 byte found" +"Invalid base64 data (trailing base64 byte found) (\"QUJDa\")"