From 133b0fa9909871accea7382197357cc463f3d6c7 Mon Sep 17 00:00:00 2001 From: "mergify[bot]" <37929162+mergify[bot]@users.noreply.github.com> Date: Mon, 18 Jul 2022 10:25:20 +0200 Subject: [PATCH] [winlogbeat] Clean up rune handling (#32198) (#32356) * Take into account RuneErrors when decoding XML codes * Add more unit tests * Parse multibyte utf8 encodings at the end of backing array (cherry picked from commit 157c3279a92b89b2bb8381cb865808f369cab6bd) Co-authored-by: Marc Guasch --- libbeat/common/encoding/xml/safe_reader.go | 14 +- .../common/encoding/xml/safe_reader_test.go | 139 ++++++++++++++++++ 2 files changed, 151 insertions(+), 2 deletions(-) create mode 100644 libbeat/common/encoding/xml/safe_reader_test.go diff --git a/libbeat/common/encoding/xml/safe_reader.go b/libbeat/common/encoding/xml/safe_reader.go index 9c7c95c73c6..e49d8dc2b6c 100644 --- a/libbeat/common/encoding/xml/safe_reader.go +++ b/libbeat/common/encoding/xml/safe_reader.go @@ -53,12 +53,22 @@ func (r *SafeReader) Read(d []byte) (n int, err error) { return output(n) } if len(r.buf) == 0 { - n, _ = r.inner.Read(r.backing[:]) + n, _ := r.inner.Read(r.backing[:]) r.buf = r.backing[:n] } for i := 0; i < len(r.buf); { + if !utf8.FullRune(r.buf[i:]) { + n = copy(d, r.buf[:i]) + rn := copy(r.backing[:], r.buf[i:]) + nn, err := r.inner.Read(r.backing[rn:]) + if err != nil && err != io.EOF { + return 0, err + } + r.buf = r.backing[:rn+nn] + return output(n) + } code, size := utf8.DecodeRune(r.buf[i:]) - if !unicode.IsSpace(code) && unicode.IsControl(code) { + if code == utf8.RuneError || (!unicode.IsSpace(code) && unicode.IsControl(code)) { n = copy(d, r.buf[:i]) r.buf = r.buf[n+1:] r.code = []byte(fmt.Sprintf("\\u%04x", code)) diff --git a/libbeat/common/encoding/xml/safe_reader_test.go b/libbeat/common/encoding/xml/safe_reader_test.go new file mode 100644 index 00000000000..18a93ec1a32 --- /dev/null +++ b/libbeat/common/encoding/xml/safe_reader_test.go @@ -0,0 +1,139 @@ +// Licensed to Elasticsearch B.V. under one or more contributor +// license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright +// ownership. Elasticsearch B.V. licenses this file to you under +// the Apache License, Version 2.0 (the "License"); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//go:build !integration +// +build !integration + +package xml + +import ( + "encoding/xml" + "strings" + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestInvalidXMLIsSanitized(t *testing.T) { + tests := []struct { + name string + input []byte + want map[string]interface{} + err error + }{ + { + name: "control space", + input: []byte(`John` + "\t" + ` +`), + want: map[string]interface{}{ + "person": map[string]interface{}{ + "Name": map[string]interface{}{ + "#text": "John", + "ID": "123", + }, + }, + }, + }, + { + name: "crlf", + input: []byte(`John` + "\r\n" + ``), + want: map[string]interface{}{ + "person": map[string]interface{}{ + "Name": map[string]interface{}{ + "#text": "John", + "ID": "123", + }, + }, + }, + }, + { + name: "single invalid", + input: []byte(`John` + "\x80" + ``), + want: map[string]interface{}{ + "person": map[string]interface{}{ + "Name": map[string]interface{}{ + "#text": "John\\ufffd", + "ID": "123", + }, + }, + }, + }, + { + name: "double invalid", + input: []byte(`` + "\x80" + `John` + "\x80" + ``), + want: map[string]interface{}{ + "person": map[string]interface{}{ + "Name": map[string]interface{}{ + "#text": "\\ufffdJohn\\ufffd", + "ID": "123", + }, + }, + }, + }, + { + name: "happy single invalid", + input: []byte(`😊John` + "\x80" + ``), + want: map[string]interface{}{ + "person": map[string]interface{}{ + "Name": map[string]interface{}{ + "#text": "😊John\\ufffd", + "ID": "123", + }, + }, + }, + }, + { + name: "invalid tag", + input: []byte(`John`), + want: nil, + err: &xml.SyntaxError{Msg: "attribute name without = in element", Line: 1}, + }, + { + name: "invalid tag value", + input: []byte(`John`), + want: map[string]interface{}{ + "person": map[string]interface{}{ + "Name": map[string]interface{}{ + "#text": "John", + "ID": "\\ufffd123", + }, + }, + }, + }, + { + name: "unhappy", + input: []byte(`John is` + strings.Repeat(" ", 223) + ` 😞`), + want: map[string]interface{}{ + "person": map[string]interface{}{ + "Name": map[string]interface{}{ + "#text": "John is" + strings.Repeat(" ", 223) + " 😞", + "ID": "123", + }, + }, + }, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + d := NewDecoder(NewSafeReader(test.input)) + out, err := d.Decode() + assert.Equal(t, test.err, err) + assert.Equal(t, test.want, out) + }) + } +}