From 47335207992eec9cf1148b6eac75bd2edd2c4a88 Mon Sep 17 00:00:00 2001
From: Kir Kolyshkin <kolyshkin@gmail.com>
Date: Thu, 25 Jul 2024 19:27:56 -0700
Subject: [PATCH] mountinfo: simplify/speedup unescape
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It has been pointed out that unescape implementation is unnecessarily
complex, and I tend to agree. Plus, we never expect any errors during
unescaping, and if there are any, we can always return the string as is.

So, drop the error return, and simplify the algorithm. Modify the test
case and the benchmark accordingly.

NOTE this also fixes the issue of trying to convert escape sequences
where the character code is >= utf8.RuneSelf (128 decimal, 0200 octal).
Adding those to the output may result in invalid utf-8 strings. The new
code doesn't try to unescape those (and the tests are amended to check
that).

Finally, add some more test cases.

Here's benchstat output:
name               old time/op    new time/op    delta
Unescape-20           851ns ± 1%     358ns ± 3%  -57.94%  (p=0.008 n=5+5)

name               old alloc/op   new alloc/op   delta
Unescape-20            640B ± 0%      255B ± 0%  -60.16%  (p=0.000 n=5+4)

name               old allocs/op  new allocs/op  delta
Unescape-20            31.0 ± 0%      21.0 ± 0%  -32.26%  (p=0.008 n=5+5)

Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
---
 mountinfo/mountinfo_linux.go      | 88 ++++++++++---------------------
 mountinfo/mountinfo_linux_test.go | 58 ++++++++++----------
 2 files changed, 57 insertions(+), 89 deletions(-)

diff --git a/mountinfo/mountinfo_linux.go b/mountinfo/mountinfo_linux.go
index 43ced2c..7b41b91 100644
--- a/mountinfo/mountinfo_linux.go
+++ b/mountinfo/mountinfo_linux.go
@@ -24,8 +24,6 @@ func GetMountsFromReader(r io.Reader, filter FilterFunc) ([]*Info, error) {
 	s := bufio.NewScanner(r)
 	out := []*Info{}
 	for s.Scan() {
-		var err error
-
 		/*
 		   See http://man7.org/linux/man-pages/man5/proc.5.html
 
@@ -85,29 +83,15 @@ func GetMountsFromReader(r io.Reader, filter FilterFunc) ([]*Info, error) {
 			Parent:     toInt(fields[1]),
 			Major:      toInt(major),
 			Minor:      toInt(minor),
+			Root:       unescape(fields[3]),
+			Mountpoint: unescape(fields[4]),
 			Options:    fields[5],
 			Optional:   strings.Join(fields[6:sepIdx], " "), // zero or more optional fields
+			FSType:     unescape(fields[sepIdx+1]),
+			Source:     unescape(fields[sepIdx+2]),
 			VFSOptions: fields[sepIdx+3],
 		}
 
-		p.Mountpoint, err = unescape(fields[4])
-		if err != nil {
-			return nil, fmt.Errorf("parsing '%s' failed: mount point: %w", fields[4], err)
-		}
-		p.FSType, err = unescape(fields[sepIdx+1])
-		if err != nil {
-			return nil, fmt.Errorf("parsing '%s' failed: fstype: %w", fields[sepIdx+1], err)
-		}
-		p.Source, err = unescape(fields[sepIdx+2])
-		if err != nil {
-			return nil, fmt.Errorf("parsing '%s' failed: source: %w", fields[sepIdx+2], err)
-		}
-
-		p.Root, err = unescape(fields[3])
-		if err != nil {
-			return nil, fmt.Errorf("parsing '%s' failed: root: %w", fields[3], err)
-		}
-
 		// Run the filter after parsing all fields.
 		var skip, stop bool
 		if filter != nil {
@@ -188,62 +172,48 @@ func PidMountInfo(pid int) ([]*Info, error) {
 	return GetMountsFromReader(f, nil)
 }
 
-// A few specific characters in mountinfo path entries (root and mountpoint)
-// are escaped using a backslash followed by a character's ascii code in octal.
+// Some characters in some mountinfo fields may be escaped using a backslash
+// followed by a three octal digits of the character's ASCII code \NNN, where
+// N is 0-7, for example:
 //
 //	space              -- as \040
 //	tab (aka \t)       -- as \011
 //	newline (aka \n)   -- as \012
 //	backslash (aka \\) -- as \134
+//	hash (aka #)       -- as \043
 //
-// This function converts path from mountinfo back, i.e. it unescapes the above sequences.
-func unescape(path string) (string, error) {
-	// try to avoid copying
+// This function converts all such escape sequences back to ASCII, and returns
+// the unescaped string.
+func unescape(path string) string {
+	// Try to avoid copying.
 	if strings.IndexByte(path, '\\') == -1 {
-		return path, nil
+		return path
 	}
 
 	// The following code is UTF-8 transparent as it only looks for some
-	// specific characters (backslash and 0..7) with values < utf8.RuneSelf,
-	// and everything else is passed through as is.
+	// specific characters (backslash and 0..7) with values less than
+	// utf8.RuneSelf, and everything else is passed through as is.
 	buf := make([]byte, len(path))
 	bufLen := 0
 	for i := 0; i < len(path); i++ {
-		if path[i] != '\\' {
-			buf[bufLen] = path[i]
-			bufLen++
-			continue
-		}
-		s := path[i:]
-		if len(s) < 4 {
-			// too short
-			return "", fmt.Errorf("bad escape sequence %q: too short", s)
-		}
-		c := s[1]
-		switch c {
-		case '0', '1', '2', '3', '4', '5', '6', '7':
-			v := c - '0'
-			for j := 2; j < 4; j++ { // one digit already; two more
-				if s[j] < '0' || s[j] > '7' {
-					return "", fmt.Errorf("bad escape sequence %q: not a digit", s[:3])
-				}
-				x := s[j] - '0'
-				v = (v << 3) | x
-			}
-			if v > 255 {
-				return "", fmt.Errorf("bad escape sequence %q: out of range" + s[:3])
-			}
-			buf[bufLen] = v
-			bufLen++
+		c := path[i]
+		// Looking for \\ followed by three octal digits,
+		// maximum value is 177 (less than utf8.RuneSelf).
+		if c == '\\' && i+3 < len(path) &&
+			(path[i+1] == '0' || path[i+1] == '1') &&
+			(path[i+2] >= '0' && path[i+2] <= '7') &&
+			(path[i+3] >= '0' && path[i+3] <= '7') {
+			c1 := path[i+1] - '0'
+			c2 := path[i+2] - '0'
+			c3 := path[i+3] - '0'
+			c = c1<<6 | c2<<3 | c3
 			i += 3
-			continue
-		default:
-			return "", fmt.Errorf("bad escape sequence %q: not a digit" + s[:3])
-
 		}
+		buf[bufLen] = c
+		bufLen++
 	}
 
-	return string(buf[:bufLen]), nil
+	return string(buf[:bufLen])
 }
 
 // toInt converts a string to an int, and ignores any numbers parsing errors,
diff --git a/mountinfo/mountinfo_linux_test.go b/mountinfo/mountinfo_linux_test.go
index e78ee3f..683840f 100644
--- a/mountinfo/mountinfo_linux_test.go
+++ b/mountinfo/mountinfo_linux_test.go
@@ -733,42 +733,40 @@ func TestParseMountinfoExtraCases(t *testing.T) {
 func TestUnescape(t *testing.T) {
 	testCases := []struct {
 		input, output string
-		isErr         bool
 	}{
-		{"", "", false},
-		{"/", "/", false},
-		{"/some/longer/path", "/some/longer/path", false},
-		{"/path\\040with\\040spaces", "/path with spaces", false},
-		{"/path/with\\134backslash", "/path/with\\backslash", false},
-		{"/tab\\011in/path", "/tab\tin/path", false},
-		{`/path/"with'quotes`, `/path/"with'quotes`, false},
-		{`/path/"with'quotes,\040space,\011tab`, `/path/"with'quotes, space,	tab`, false},
-		{`\12`, "", true},
-		{`\134`, `\`, false},
-		{`"'"'"'`, `"'"'"'`, false},
-		{`/\1345`, `/\5`, false},
-		{`/\12x`, "", true},
-		{`\0`, "", true},
-		{`\x`, "", true},
-		{"\\\\", "", true},
+		// NOTE Go interprets \NNN inside double quotes exactly
+		// the same way as the function being tested, so:
+		//  - for input strings, we must either escape all backslash
+		//    characters ("\\") or use backticks `\`;
+		//  - for output strings, we can write something like "\040",
+		//    which is identical to " ".
+		{"", ""},
+		{"/", "/"},
+		{"/some/longer/path", "/some/longer/path"},
+		{`/path\040with\040spaces`, "/path\040with\040spaces"},
+		{"/path/with\\134backslash", "/path/with\\backslash"},
+		{"/tab\\011in/path", "/tab\011in/path"},
+		{`/path/"with'quotes`, `/path/"with'quotes`},
+		{`/path/"with'quotes,\040space,\011tab`, `/path/"with'quotes, space,	tab`},
+		{`\12`, `\12`}, // Not enough digits.
+		{`\134`, `\`},  // Backslash.
+		{`"'"'"'`, `"'"'"'`},
+		{`/\1345`, `/\5`}, // Backslash with extra digit.
+		{`/\12x`, `/\12x`},
+		{`\0`, `\0`},             // Not enough digits.
+		{`\000\000`, "\000\000"}, // NUL (min allowed ASCII value).
+		{`\x`, `\x`},
+		{"\\\\", "\\\\"},
+		{`\177`, "\177"}, // Max allowed ASCII value.
+		{`\222`, `\222`}, // Too large value.
+		{`Это\040камон\040какой-то`, "Это камон какой-то"}, // Some UTF-8.
 	}
 
 	for _, tc := range testCases {
-		res, err := unescape(tc.input)
-		if tc.isErr == true {
-			if err == nil {
-				t.Errorf("Input %q, want error, got nil", tc.input)
-			}
-			// no more checks
-			continue
-		}
+		res := unescape(tc.input)
 		if res != tc.output {
 			t.Errorf("Input %q, want %q, got %q", tc.input, tc.output, res)
 		}
-		if err != nil {
-			t.Errorf("Input %q, want nil, got error %v", tc.input, err)
-			continue
-		}
 	}
 }
 
@@ -796,7 +794,7 @@ func BenchmarkUnescape(b *testing.B) {
 	b.ReportAllocs()
 	for i := 0; i < b.N; i++ {
 		for x := 0; x < len(testCases); x++ {
-			_, _ = unescape(testCases[x])
+			_ = unescape(testCases[x])
 		}
 	}
 }