mountinfo: simplify/speedup unescape

It has been pointed out that unescape implementation is unnecessarily complex, and I tend to agree. Plus, we never expect any errors during unescaping, and if there are any, we can always return the string as is. So, drop the error return, and simplify the algorithm. Modify the test case and the benchmark accordingly. NOTE this also fixes the issue of trying to convert escape sequences where the character code is >= utf8.RuneSelf (128 decimal, 0200 octal). Adding those to the output may result in invalid utf-8 strings. The new code doesn't try to unescape those (and the tests are amended to check that). Finally, add some more test cases. Here's benchstat output: name old time/op new time/op delta Unescape-20 851ns ± 1% 358ns ± 3% -57.94% (p=0.008 n=5+5) name old alloc/op new alloc/op delta Unescape-20 640B ± 0% 255B ± 0% -60.16% (p=0.000 n=5+4) name old allocs/op new allocs/op delta Unescape-20 31.0 ± 0% 21.0 ± 0% -32.26% (p=0.008 n=5+5) Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
kolyshkin · Jul 26, 2024 · 4733520 · 4733520
1 parent 0015f62
commit 4733520
Show file tree

Hide file tree

Showing 2 changed files with 57 additions and 89 deletions.
diff --git a/mountinfo/mountinfo_linux.go b/mountinfo/mountinfo_linux.go
@@ -24,8 +24,6 @@ func GetMountsFromReader(r io.Reader, filter FilterFunc) ([]*Info, error) {
 	s := bufio.NewScanner(r)
 	out := []*Info{}
 	for s.Scan() {
-		var err error
-
 		/*
 		   See http://man7.org/linux/man-pages/man5/proc.5.html
 
@@ -85,29 +83,15 @@ func GetMountsFromReader(r io.Reader, filter FilterFunc) ([]*Info, error) {
 			Parent:     toInt(fields[1]),
 			Major:      toInt(major),
 			Minor:      toInt(minor),
+			Root:       unescape(fields[3]),
+			Mountpoint: unescape(fields[4]),
 			Options:    fields[5],
 			Optional:   strings.Join(fields[6:sepIdx], " "), // zero or more optional fields
+			FSType:     unescape(fields[sepIdx+1]),
+			Source:     unescape(fields[sepIdx+2]),
 			VFSOptions: fields[sepIdx+3],
 		}
 
-		p.Mountpoint, err = unescape(fields[4])
-		if err != nil {
-			return nil, fmt.Errorf("parsing '%s' failed: mount point: %w", fields[4], err)
-		}
-		p.FSType, err = unescape(fields[sepIdx+1])
-		if err != nil {
-			return nil, fmt.Errorf("parsing '%s' failed: fstype: %w", fields[sepIdx+1], err)
-		}
-		p.Source, err = unescape(fields[sepIdx+2])
-		if err != nil {
-			return nil, fmt.Errorf("parsing '%s' failed: source: %w", fields[sepIdx+2], err)
-		}
-
-		p.Root, err = unescape(fields[3])
-		if err != nil {
-			return nil, fmt.Errorf("parsing '%s' failed: root: %w", fields[3], err)
-		}
-
 		// Run the filter after parsing all fields.
 		var skip, stop bool
 		if filter != nil {
@@ -188,62 +172,48 @@ func PidMountInfo(pid int) ([]*Info, error) {
 	return GetMountsFromReader(f, nil)
 }
 
-// A few specific characters in mountinfo path entries (root and mountpoint)
-// are escaped using a backslash followed by a character's ascii code in octal.
+// Some characters in some mountinfo fields may be escaped using a backslash
+// followed by a three octal digits of the character's ASCII code \NNN, where
+// N is 0-7, for example:
 //
 //	space              -- as \040
 //	tab (aka \t)       -- as \011
 //	newline (aka \n)   -- as \012
 //	backslash (aka \\) -- as \134
+//	hash (aka #)       -- as \043
 //
-// This function converts path from mountinfo back, i.e. it unescapes the above sequences.
-func unescape(path string) (string, error) {
-	// try to avoid copying
+// This function converts all such escape sequences back to ASCII, and returns
+// the unescaped string.
+func unescape(path string) string {
+	// Try to avoid copying.
 	if strings.IndexByte(path, '\\') == -1 {
-		return path, nil
+		return path
 	}
 
 	// The following code is UTF-8 transparent as it only looks for some
-	// specific characters (backslash and 0..7) with values < utf8.RuneSelf,
-	// and everything else is passed through as is.
+	// specific characters (backslash and 0..7) with values less than
+	// utf8.RuneSelf, and everything else is passed through as is.
 	buf := make([]byte, len(path))
 	bufLen := 0
 	for i := 0; i < len(path); i++ {
-		if path[i] != '\\' {
-			buf[bufLen] = path[i]
-			bufLen++
-			continue
-		}
-		s := path[i:]
-		if len(s) < 4 {
-			// too short
-			return "", fmt.Errorf("bad escape sequence %q: too short", s)
-		}
-		c := s[1]
-		switch c {
-		case '0', '1', '2', '3', '4', '5', '6', '7':
-			v := c - '0'
-			for j := 2; j < 4; j++ { // one digit already; two more
-				if s[j] < '0' || s[j] > '7' {
-					return "", fmt.Errorf("bad escape sequence %q: not a digit", s[:3])
-				}
-				x := s[j] - '0'
-				v = (v << 3) | x
-			}
-			if v > 255 {
-				return "", fmt.Errorf("bad escape sequence %q: out of range" + s[:3])
-			}
-			buf[bufLen] = v
-			bufLen++
+		c := path[i]
+		// Looking for \\ followed by three octal digits,
+		// maximum value is 177 (less than utf8.RuneSelf).
+		if c == '\\' && i+3 < len(path) &&
+			(path[i+1] == '0' || path[i+1] == '1') &&
+			(path[i+2] >= '0' && path[i+2] <= '7') &&
+			(path[i+3] >= '0' && path[i+3] <= '7') {
+			c1 := path[i+1] - '0'
+			c2 := path[i+2] - '0'
+			c3 := path[i+3] - '0'
+			c = c1<<6 | c2<<3 | c3
 			i += 3
-			continue
-		default:
-			return "", fmt.Errorf("bad escape sequence %q: not a digit" + s[:3])
-
 		}
+		buf[bufLen] = c
+		bufLen++
 	}
 
-	return string(buf[:bufLen]), nil
+	return string(buf[:bufLen])
 }
 
 // toInt converts a string to an int, and ignores any numbers parsing errors,

diff --git a/mountinfo/mountinfo_linux_test.go b/mountinfo/mountinfo_linux_test.go
@@ -733,42 +733,40 @@ func TestParseMountinfoExtraCases(t *testing.T) {
 func TestUnescape(t *testing.T) {
 	testCases := []struct {
 		input, output string
-		isErr         bool
 	}{
-		{"", "", false},
-		{"/", "/", false},
-		{"/some/longer/path", "/some/longer/path", false},
-		{"/path\\040with\\040spaces", "/path with spaces", false},
-		{"/path/with\\134backslash", "/path/with\\backslash", false},
-		{"/tab\\011in/path", "/tab\tin/path", false},
-		{`/path/"with'quotes`, `/path/"with'quotes`, false},
-		{`/path/"with'quotes,\040space,\011tab`, `/path/"with'quotes, space,	tab`, false},
-		{`\12`, "", true},
-		{`\134`, `\`, false},
-		{`"'"'"'`, `"'"'"'`, false},
-		{`/\1345`, `/\5`, false},
-		{`/\12x`, "", true},
-		{`\0`, "", true},
-		{`\x`, "", true},
-		{"\\\\", "", true},
+		// NOTE Go interprets \NNN inside double quotes exactly
+		// the same way as the function being tested, so:
+		//  - for input strings, we must either escape all backslash
+		//    characters ("\\") or use backticks `\`;
+		//  - for output strings, we can write something like "\040",
+		//    which is identical to " ".
+		{"", ""},
+		{"/", "/"},
+		{"/some/longer/path", "/some/longer/path"},
+		{`/path\040with\040spaces`, "/path\040with\040spaces"},
+		{"/path/with\\134backslash", "/path/with\\backslash"},
+		{"/tab\\011in/path", "/tab\011in/path"},
+		{`/path/"with'quotes`, `/path/"with'quotes`},
+		{`/path/"with'quotes,\040space,\011tab`, `/path/"with'quotes, space,	tab`},
+		{`\12`, `\12`}, // Not enough digits.
+		{`\134`, `\`},  // Backslash.
+		{`"'"'"'`, `"'"'"'`},
+		{`/\1345`, `/\5`}, // Backslash with extra digit.
+		{`/\12x`, `/\12x`},
+		{`\0`, `\0`},             // Not enough digits.
+		{`\000\000`, "\000\000"}, // NUL (min allowed ASCII value).
+		{`\x`, `\x`},
+		{"\\\\", "\\\\"},
+		{`\177`, "\177"}, // Max allowed ASCII value.
+		{`\222`, `\222`}, // Too large value.
+		{`Это\040камон\040какой-то`, "Это камон какой-то"}, // Some UTF-8.
 	}
 
 	for _, tc := range testCases {
-		res, err := unescape(tc.input)
-		if tc.isErr == true {
-			if err == nil {
-				t.Errorf("Input %q, want error, got nil", tc.input)
-			}
-			// no more checks
-			continue
-		}
+		res := unescape(tc.input)
 		if res != tc.output {
 			t.Errorf("Input %q, want %q, got %q", tc.input, tc.output, res)
 		}
-		if err != nil {
-			t.Errorf("Input %q, want nil, got error %v", tc.input, err)
-			continue
-		}
 	}
 }
 
@@ -796,7 +794,7 @@ func BenchmarkUnescape(b *testing.B) {
 	b.ReportAllocs()
 	for i := 0; i < b.N; i++ {
 		for x := 0; x < len(testCases); x++ {
-			_, _ = unescape(testCases[x])
+			_ = unescape(testCases[x])
 		}
 	}
 }