diff --git a/pkg/pattern/tokenization/replacer.go b/pkg/pattern/tokenization/replacer.go index 35d9259236be..cc725cb5b987 100644 --- a/pkg/pattern/tokenization/replacer.go +++ b/pkg/pattern/tokenization/replacer.go @@ -314,6 +314,10 @@ restore: // should be faster than a defer return false } +// 'b' and 'B' are not present here because of the way we check for byte size +// units below. If they were present, then suffixes like 'Bb', 'bb', etc. would +// be considered valid byte sizes. Also, only whole numbers are accepted as byte +// sizes in plain bytes (no prefix), so those are handled by special cases instead. var byteSizes = [256]bool{'k': true, 'K': true, 'm': true, 'M': true, 'g': true, 'G': true, 't': true, 'T': true, 'p': true, 'P': true} // Only moves the head forward if it successfully matches a duration @@ -339,6 +343,22 @@ func (r *replacer) advanceBytesize(c1 byte) (matched bool) { return false } +func (r *replacer) advanceSpacedBytesize(canBeBytes bool) (matched bool) { + // Read the byte after the space. With canBeBytes, a lone 'b'/'B' followed by a boundary matches; otherwise advanceBytesize decides. On a mismatch backtrack() is called (presumably un-reading that byte) before returning false. + c1, hasNext := r.advance() + if !hasNext { + return false + } + if canBeBytes && (c1 == 'b' || c1 == 'B') && r.peekNextIsBoundary() { + return true + } + if r.advanceBytesize(c1) { + return true + } + r.backtrack() + return false +} + func (r *replacer) advance() (c byte, advanced bool) { if r.head >= len(r.source) { return 0, false @@ -394,6 +414,14 @@ func (r *replacer) handleHexOrUnit(hasMinusPrefix bool, n1, l1 uint, c1 byte) (e c1 = r.peekFirstNonInt() } + // Special case, this might be a byte size + if (c1 == 'b' || c1 == 'B') && r.peekNextIsBoundary() { + // We do not subsume a minus sign - byte sizes are unlikely to be + // negative, it's more likely this is a dash as a part of a range + r.emit(hasMinusPrefix, placeholderBytesize) + return true + } + // Maybe we are at the start of a hex string, either something like // "[0-9]+[a-f]", "[0-9]+[A-F]", or "0x". 
We support both lower and upper // case letters, but to avoid false positives, we want hex replacements to @@ -489,6 +517,14 @@ func (r *replacer) handleNumberWithDecimal(hasMinusPrefix bool, n1 uint, l1 uint return r.handlePotentialUnitWithDecimal(hasMinusPrefix, b2) } + // This can be a byte size with a space, e.g. "3.14 GiB" + if b2 == ' ' && r.advanceSpacedBytesize(false) { + // We do not subsume a minus sign - byte sizes are unlikely to be + // negative, it's more likely this is a dash as a part of a range + r.emit(hasMinusPrefix, placeholderBytesize) + return true + } + // We have a decimal number followed by a non-dot boundary, so this is not // an IP or a version number or anything like that. if b2 != '.' { @@ -633,6 +669,11 @@ func (r *replacer) handleNumberStart(hasMinusPrefix bool) (endsWithBoundary bool case n1 <= maxYear && l1 <= 4 && (b1 == '-' || b1 == '/'): return r.handleSaneTimestamp(hasMinusPrefix, n1, b1) + // This might be a byte size with a space, e.g. "2 b", "3 GiB" + case b1 == ' ' && r.advanceSpacedBytesize(true): + r.emit(hasMinusPrefix, placeholderBytesize) + return true + // Weird RFC822 dates like "02 Jan 06 15:04 MST" case n1 <= 31 && l1 <= 2 && b1 == ' ': if r.advanceMonthName() && r.advanceChar(' ') && r.advanceYear() && r.advanceChar(' ') && r.advanceTime(true) && r.advanceStringOrNumericTimeZone(false) { diff --git a/pkg/pattern/tokenization/tokenization_test.go b/pkg/pattern/tokenization/tokenization_test.go index 9e9e4c75fb02..dde2814a2fe9 100644 --- a/pkg/pattern/tokenization/tokenization_test.go +++ b/pkg/pattern/tokenization/tokenization_test.go @@ -145,8 +145,11 @@ var tokenizationCornerTestCases = []tokenizationTestCase{ []string{".", "3h121m3.", "1h0.", "100usa", "0.12msa"}, }, { - "2Mib 0.12KB-5GB 3.12kb 123Gbps 124mbit:512Tbit", - []string{"", "-", "", "", ":"}, + // We only consider integers to be valid bytesizes in bytes (0.2B doesn't make sense) + "2Mib 0.12KB-5GB 3.12kb 123Gbps 124mbit:512Tbit 5 B;124.1 KB/3b - 2b or 
2 BeNot 13.37 b 3 b", + []string{ + "", "-", "", "", ":", + ";/", "-", "", "or", "", "BeNot", "", "b", ""}, }, { `status=123 status_code:500 status 200 status="-1" status_code:"404" httpStatus=200`, @@ -175,6 +178,13 @@ var tokenizationRealisticTestCases = []tokenizationTestCase{ "level=debug", "ts=", "caller=shard_resolver.go:", "bytes=", "chunks=", "streams=", "entries=", `msg="queried index"`, "type=single", `matchers="{stream=\"stdout\", pod=\"loki-canary-v75j4\"}"`, "duration=", "from=", "through=", "length=", }, }, + // tricky loki distributor message: + { + `level=debug ts=2024-07-12T12:25:06.175464934Z caller=push.go:146 org_id=29 traceID=7af4f918eab1c80f msg="push request parsed" path=/loki/api/v1/push contentType=application/x-protobuf contentEncoding= bodySize="8.8 kB" streams=11 entries=43 streamLabelsSize="3.4 kB" entriesSize="19 kB" structuredMetadataSize="71 B" totalSize="22 kB" mostRecentLagMs=167 adaptiveLogsDroppedLines=10 adaptiveLogsDroppedSize=4965 adaptiveLogsMatchedLines=37`, + []string{ + "level=debug", "ts=", "caller=push.go:", "org_id=", "traceID=", `msg="push request parsed"`, "path=/loki/api/v1/push", "contentType=application/x-protobuf", "contentEncoding=", `bodySize=""`, "streams=", "entries=", `streamLabelsSize=""`, `entriesSize=""`, `structuredMetadataSize=""`, `totalSize=""`, "mostRecentLagMs=", "adaptiveLogsDroppedLines=", "adaptiveLogsDroppedSize=", "adaptiveLogsMatchedLines=", + }, + }, // random JSON logs { `{"timestamp": "2022-12-23T12:34:56Z", "level": "debug", "message": "Server starting", "server_id": "abcdefghij", "start_time": "2022-12-23T12:30:00Z"}`,