From 47335207992eec9cf1148b6eac75bd2edd2c4a88 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Thu, 25 Jul 2024 19:27:56 -0700 Subject: [PATCH] mountinfo: simplify/speedup unescape MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It has been pointed out that unescape implementation is unnecessarily complex, and I tend to agree. Plus, we never expect any errors during unescaping, and if there are any, we can always return the string as is. So, drop the error return, and simplify the algorithm. Modify the test case and the benchmark accordingly. NOTE this also fixes the issue of trying to convert escape sequences where the character code is >= utf8.RuneSelf (128 decimal, 0200 octal). Adding those to the output may result in invalid utf-8 strings. The new code doesn't try to unescape those (and the tests are amended to check that). Finally, add some more test cases. Here's benchstat output: name old time/op new time/op delta Unescape-20 851ns ± 1% 358ns ± 3% -57.94% (p=0.008 n=5+5) name old alloc/op new alloc/op delta Unescape-20 640B ± 0% 255B ± 0% -60.16% (p=0.000 n=5+4) name old allocs/op new allocs/op delta Unescape-20 31.0 ± 0% 21.0 ± 0% -32.26% (p=0.008 n=5+5) Signed-off-by: Kir Kolyshkin --- mountinfo/mountinfo_linux.go | 88 ++++++++++--------------------- mountinfo/mountinfo_linux_test.go | 58 ++++++++++---------- 2 files changed, 57 insertions(+), 89 deletions(-) diff --git a/mountinfo/mountinfo_linux.go b/mountinfo/mountinfo_linux.go index 43ced2c..7b41b91 100644 --- a/mountinfo/mountinfo_linux.go +++ b/mountinfo/mountinfo_linux.go @@ -24,8 +24,6 @@ func GetMountsFromReader(r io.Reader, filter FilterFunc) ([]*Info, error) { s := bufio.NewScanner(r) out := []*Info{} for s.Scan() { - var err error - /* See http://man7.org/linux/man-pages/man5/proc.5.html @@ -85,29 +83,15 @@ func GetMountsFromReader(r io.Reader, filter FilterFunc) ([]*Info, error) { Parent: toInt(fields[1]), Major: toInt(major), Minor: toInt(minor), + Root: unescape(fields[3]), + Mountpoint: unescape(fields[4]), Options: fields[5], Optional: strings.Join(fields[6:sepIdx], " "), // zero or more optional fields + FSType: unescape(fields[sepIdx+1]), + Source: unescape(fields[sepIdx+2]), VFSOptions: fields[sepIdx+3], } - p.Mountpoint, err = unescape(fields[4]) - if err != nil { - return nil, fmt.Errorf("parsing '%s' failed: mount point: %w", fields[4], err) - } - p.FSType, err = unescape(fields[sepIdx+1]) - if err != nil { - return nil, fmt.Errorf("parsing '%s' failed: fstype: %w", fields[sepIdx+1], err) - } - p.Source, err = unescape(fields[sepIdx+2]) - if err != nil { - return nil, fmt.Errorf("parsing '%s' failed: source: %w", fields[sepIdx+2], err) - } - - p.Root, err = unescape(fields[3]) - if err != nil { - return nil, fmt.Errorf("parsing '%s' failed: root: %w", fields[3], err) - } - // Run the filter after parsing all fields. var skip, stop bool if filter != nil { @@ -188,62 +172,48 @@ func PidMountInfo(pid int) ([]*Info, error) { return GetMountsFromReader(f, nil) } -// A few specific characters in mountinfo path entries (root and mountpoint) -// are escaped using a backslash followed by a character's ascii code in octal. +// Some characters in some mountinfo fields may be escaped using a backslash +// followed by a three octal digits of the character's ASCII code \NNN, where +// N is 0-7, for example: // // space -- as \040 // tab (aka \t) -- as \011 // newline (aka \n) -- as \012 // backslash (aka \\) -- as \134 +// hash (aka #) -- as \043 // -// This function converts path from mountinfo back, i.e. it unescapes the above sequences. -func unescape(path string) (string, error) { - // try to avoid copying +// This function converts all such escape sequences back to ASCII, and returns +// the unescaped string. +func unescape(path string) string { + // Try to avoid copying. if strings.IndexByte(path, '\\') == -1 { - return path, nil + return path } // The following code is UTF-8 transparent as it only looks for some - // specific characters (backslash and 0..7) with values < utf8.RuneSelf, - // and everything else is passed through as is. + // specific characters (backslash and 0..7) with values less than + // utf8.RuneSelf, and everything else is passed through as is. buf := make([]byte, len(path)) bufLen := 0 for i := 0; i < len(path); i++ { - if path[i] != '\\' { - buf[bufLen] = path[i] - bufLen++ - continue - } - s := path[i:] - if len(s) < 4 { - // too short - return "", fmt.Errorf("bad escape sequence %q: too short", s) - } - c := s[1] - switch c { - case '0', '1', '2', '3', '4', '5', '6', '7': - v := c - '0' - for j := 2; j < 4; j++ { // one digit already; two more - if s[j] < '0' || s[j] > '7' { - return "", fmt.Errorf("bad escape sequence %q: not a digit", s[:3]) - } - x := s[j] - '0' - v = (v << 3) | x - } - if v > 255 { - return "", fmt.Errorf("bad escape sequence %q: out of range" + s[:3]) - } - buf[bufLen] = v - bufLen++ + c := path[i] + // Looking for \\ followed by three octal digits, + // maximum value is 177 (less than utf8.RuneSelf). + if c == '\\' && i+3 < len(path) && + (path[i+1] == '0' || path[i+1] == '1') && + (path[i+2] >= '0' && path[i+2] <= '7') && + (path[i+3] >= '0' && path[i+3] <= '7') { + c1 := path[i+1] - '0' + c2 := path[i+2] - '0' + c3 := path[i+3] - '0' + c = c1<<6 | c2<<3 | c3 i += 3 - continue - default: - return "", fmt.Errorf("bad escape sequence %q: not a digit" + s[:3]) - } + buf[bufLen] = c + bufLen++ } - return string(buf[:bufLen]), nil + return string(buf[:bufLen]) } // toInt converts a string to an int, and ignores any numbers parsing errors, diff --git a/mountinfo/mountinfo_linux_test.go b/mountinfo/mountinfo_linux_test.go index e78ee3f..683840f 100644 --- a/mountinfo/mountinfo_linux_test.go +++ b/mountinfo/mountinfo_linux_test.go @@ -733,42 +733,40 @@ func TestParseMountinfoExtraCases(t *testing.T) { func TestUnescape(t *testing.T) { testCases := []struct { input, output string - isErr bool }{ - {"", "", false}, - {"/", "/", false}, - {"/some/longer/path", "/some/longer/path", false}, - {"/path\\040with\\040spaces", "/path with spaces", false}, - {"/path/with\\134backslash", "/path/with\\backslash", false}, - {"/tab\\011in/path", "/tab\tin/path", false}, - {`/path/"with'quotes`, `/path/"with'quotes`, false}, - {`/path/"with'quotes,\040space,\011tab`, `/path/"with'quotes, space, tab`, false}, - {`\12`, "", true}, - {`\134`, `\`, false}, - {`"'"'"'`, `"'"'"'`, false}, - {`/\1345`, `/\5`, false}, - {`/\12x`, "", true}, - {`\0`, "", true}, - {`\x`, "", true}, - {"\\\\", "", true}, + // NOTE Go interprets \NNN inside double quotes exactly + // the same way as the function being tested, so: + // - for input strings, we must either escape all backslash + // characters ("\\") or use backticks `\`; + // - for output strings, we can write something like "\040", + // which is identical to " ". + {"", ""}, + {"/", "/"}, + {"/some/longer/path", "/some/longer/path"}, + {`/path\040with\040spaces`, "/path\040with\040spaces"}, + {"/path/with\\134backslash", "/path/with\\backslash"}, + {"/tab\\011in/path", "/tab\011in/path"}, + {`/path/"with'quotes`, `/path/"with'quotes`}, + {`/path/"with'quotes,\040space,\011tab`, `/path/"with'quotes, space, tab`}, + {`\12`, `\12`}, // Not enough digits. + {`\134`, `\`}, // Backslash. + {`"'"'"'`, `"'"'"'`}, + {`/\1345`, `/\5`}, // Backslash with extra digit. + {`/\12x`, `/\12x`}, + {`\0`, `\0`}, // Not enough digits. + {`\000\000`, "\000\000"}, // NUL (min allowed ASCII value). + {`\x`, `\x`}, + {"\\\\", "\\\\"}, + {`\177`, "\177"}, // Max allowed ASCII value. + {`\222`, `\222`}, // Too large value. + {`Это\040камон\040какой-то`, "Это камон какой-то"}, // Some UTF-8. } for _, tc := range testCases { - res, err := unescape(tc.input) - if tc.isErr == true { - if err == nil { - t.Errorf("Input %q, want error, got nil", tc.input) - } - // no more checks - continue - } + res := unescape(tc.input) if res != tc.output { t.Errorf("Input %q, want %q, got %q", tc.input, tc.output, res) } - if err != nil { - t.Errorf("Input %q, want nil, got error %v", tc.input, err) - continue - } } } @@ -796,7 +794,7 @@ func BenchmarkUnescape(b *testing.B) { b.ReportAllocs() for i := 0; i < b.N; i++ { for x := 0; x < len(testCases); x++ { - _, _ = unescape(testCases[x]) + _ = unescape(testCases[x]) } } }