Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

mountinfo: simpify, speedup, and fix unescape #144

Merged
merged 2 commits into from
Sep 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 32 additions & 59 deletions mountinfo/mountinfo_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,6 @@ func GetMountsFromReader(r io.Reader, filter FilterFunc) ([]*Info, error) {
s := bufio.NewScanner(r)
out := []*Info{}
for s.Scan() {
var err error

/*
See http://man7.org/linux/man-pages/man5/proc.5.html

Expand Down Expand Up @@ -85,29 +83,15 @@ func GetMountsFromReader(r io.Reader, filter FilterFunc) ([]*Info, error) {
Parent: toInt(fields[1]),
Major: toInt(major),
Minor: toInt(minor),
Root: unescape(fields[3]),
Mountpoint: unescape(fields[4]),
Options: fields[5],
Optional: strings.Join(fields[6:sepIdx], " "), // zero or more optional fields
FSType: unescape(fields[sepIdx+1]),
Source: unescape(fields[sepIdx+2]),
VFSOptions: fields[sepIdx+3],
}

p.Mountpoint, err = unescape(fields[4])
if err != nil {
return nil, fmt.Errorf("parsing '%s' failed: mount point: %w", fields[4], err)
}
p.FSType, err = unescape(fields[sepIdx+1])
if err != nil {
return nil, fmt.Errorf("parsing '%s' failed: fstype: %w", fields[sepIdx+1], err)
}
p.Source, err = unescape(fields[sepIdx+2])
if err != nil {
return nil, fmt.Errorf("parsing '%s' failed: source: %w", fields[sepIdx+2], err)
}

p.Root, err = unescape(fields[3])
if err != nil {
return nil, fmt.Errorf("parsing '%s' failed: root: %w", fields[3], err)
}

// Run the filter after parsing all fields.
var skip, stop bool
if filter != nil {
Expand Down Expand Up @@ -188,62 +172,51 @@ func PidMountInfo(pid int) ([]*Info, error) {
return GetMountsFromReader(f, nil)
}

// A few specific characters in mountinfo path entries (root and mountpoint)
// are escaped using a backslash followed by a character's ascii code in octal.
// Some characters in some mountinfo fields may be escaped using a backslash
// followed by a three octal digits of the character's ASCII code \NNN, where
// N is 0-7, for example:
//
// space -- as \040
// tab (aka \t) -- as \011
// newline (aka \n) -- as \012
// backslash (aka \\) -- as \134
// hash (aka #) -- as \043
//
// This function converts path from mountinfo back, i.e. it unescapes the above sequences.
func unescape(path string) (string, error) {
// try to avoid copying
// This function converts all such escape sequences back to ASCII, and returns
// the unescaped string.
func unescape(path string) string {
// Try to avoid copying.
if strings.IndexByte(path, '\\') == -1 {
return path, nil
return path
}

// The following code is UTF-8 transparent as it only looks for some
// specific characters (backslash and 0..7) with values < utf8.RuneSelf,
// and everything else is passed through as is.
// specific characters (backslash and 0..7) with values less than
// utf8.RuneSelf, and everything else is passed through as is.
buf := make([]byte, len(path))
bufLen := 0
for i := 0; i < len(path); i++ {
if path[i] != '\\' {
buf[bufLen] = path[i]
bufLen++
continue
}
s := path[i:]
if len(s) < 4 {
// too short
return "", fmt.Errorf("bad escape sequence %q: too short", s)
}
c := s[1]
switch c {
case '0', '1', '2', '3', '4', '5', '6', '7':
v := c - '0'
for j := 2; j < 4; j++ { // one digit already; two more
if s[j] < '0' || s[j] > '7' {
return "", fmt.Errorf("bad escape sequence %q: not a digit", s[:3])
}
x := s[j] - '0'
v = (v << 3) | x
}
if v > 255 {
return "", fmt.Errorf("bad escape sequence %q: out of range" + s[:3])
}
buf[bufLen] = v
bufLen++
c := path[i]
// Look for \NNN, i.e. a backslash followed by three octal
// digits. Maximum value is 177 (equals utf8.RuneSelf-1).
if c == '\\' && i+3 < len(path) &&
(path[i+1] == '0' || path[i+1] == '1') &&
(path[i+2] >= '0' && path[i+2] <= '7') &&
(path[i+3] >= '0' && path[i+3] <= '7') {
// Convert from ASCII to numeric values.
c1 := path[i+1] - '0'
c2 := path[i+2] - '0'
c3 := path[i+3] - '0'
// Each octal digit is three bits, thus the shift value.
c = c1<<6 | c2<<3 | c3
// We read three extra bytes of input.
i += 3
continue
default:
return "", fmt.Errorf("bad escape sequence %q: not a digit" + s[:3])

}
buf[bufLen] = c
bufLen++
}

return string(buf[:bufLen]), nil
return string(buf[:bufLen])
}

// toInt converts a string to an int, and ignores any numbers parsing errors,
Expand Down
83 changes: 55 additions & 28 deletions mountinfo/mountinfo_linux_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -731,43 +731,70 @@ func TestParseMountinfoExtraCases(t *testing.T) {
}

func TestUnescape(t *testing.T) {
// When adding test cases below, be aware that Go interprets \NNN
// inside strings enclosed in double quotes in the same way as the
// function being tested, so:
// - for input: either escape every backslash character (i.e. \\), or
// enclose the whole string in `backticks` so \NNN is passed as-is;
// - for output: write it like "\040", which is identical to " ".
testCases := []struct {
input, output string
isErr bool
}{
{"", "", false},
{"/", "/", false},
{"/some/longer/path", "/some/longer/path", false},
{"/path\\040with\\040spaces", "/path with spaces", false},
{"/path/with\\134backslash", "/path/with\\backslash", false},
{"/tab\\011in/path", "/tab\tin/path", false},
{`/path/"with'quotes`, `/path/"with'quotes`, false},
{`/path/"with'quotes,\040space,\011tab`, `/path/"with'quotes, space, tab`, false},
{`\12`, "", true},
{`\134`, `\`, false},
{`"'"'"'`, `"'"'"'`, false},
{`/\1345`, `/\5`, false},
{`/\12x`, "", true},
{`\0`, "", true},
{`\x`, "", true},
{"\\\\", "", true},
{"", ""},
{"/", "/"},
{"/some/longer/path", "/some/longer/path"},
{`/path\040with\040spaces`, "/path\040with\040spaces"},
{"/path/with\\134backslash", "/path/with\\backslash"},
{"/tab\\011in/path", "/tab\011in/path"},
{`/path/"with'quotes`, `/path/"with'quotes`},
{`/path/"with'quotes,\040space,\011tab`, `/path/"with'quotes, space, tab`},
{`\12`, `\12`}, // Not enough digits.
{`\134`, `\`}, // Backslash.
{`"'"'"'`, `"'"'"'`},
Comment on lines +750 to +753
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wondering; as we're updating all of these, should we also use keys here? I know it would be a lot more verbose, but it's generally "best practice" (I think?).

We could even consider adding a doc field to describe test-cases (instead of adding a comment), but I realise most don't have a description if they have a specific purpose 🤔

Happy to hear what you think though; if it's too verbose?

	// NOTE Go interprets \NNN inside double quotes exactly
	// the same way as the function being tested, so:
	//  - for input strings, we must either escape all backslash
	//    characters ("\\") or use backticks `\`;
	//  - for output strings, we can write something like "\040",
	//    which is identical to " ".
	testCases := []struct {
		doc, input, output string
	}{
		{
			input:  "",
			output: "",
		},
		{
			input:  "/",
			output: "/",
		},
		{
			input:  "/some/longer/path",
			output: "/some/longer/path",
		},
		{
			input:  `/path\040with\040spaces`,
			output: "/path\040with\040spaces",
		},
		{
			input:  "/path/with\\134backslash",
			output: "/path/with\\backslash",
		},
		{
			input:  "/tab\\011in/path",
			output: "/tab\011in/path",
		},
		{
			input:  `/path/"with'quotes`,
			output: `/path/"with'quotes`,
		},
		{
			input:  `/path/"with'quotes,\040space,\011tab`,
			output: `/path/"with'quotes, space,	tab`,
		},
		{
			doc:    "Not enough digits",
			input:  `\12`,
			output: `\12`,
		},
		{
			doc:    "Backslash",
			input:  `\134`,
			output: `\`,
		},
		{
			input:  `"'"'"'`,
			output: `"'"'"'`,
		},
		{
			doc:    "Backslash with extra digit",
			input:  `/\1345`,
			output: `/\5`,
		},
		{
			input:  `/\12x`,
			output: `/\12x`,
		},
		{
			doc:    "Not enough digits",
			input:  `\0`,
			output: `\0`,
		},
		{
			doc:    "NUL (min allowed ASCII value)",
			input:  `\000\000`,
			output: "\000\000",
		},
		{
			input:  `\x`,
			output: `\x`,
		},
		{
			input:  "\\\\",
			output: "\\\\",
		},
		{
			doc:    "Max allowed ASCII value",
			input:  `\177`,
			output: "\177",
		},
		{
			doc:    "Too large value",
			input:  `\222`,
			output: `\222`,
		},
		{
			doc:    "Some UTF-8",
			input:  `Это\040камон\040какой-то`,
			output: "Это камон какой-то",
		},
	}

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, with two fields only I think this will be overly verbose and taking too much vertical space.

{`/\1345`, `/\5`}, // Backslash with extra digit.
{`/\12x`, `/\12x`},
{`\0`, `\0`}, // Not enough digits.
{`\000\000`, "\000\000"}, // NUL (min allowed ASCII value).
{`\x`, `\x`},
{"\\\\", "\\\\"},
{`\177`, "\177"}, // Max allowed ASCII value.
{`\222`, `\222`}, // Too large value -- not unescaped.
{`Это\040комон\040какой-то`, "Это комон какой-то"}, // Some UTF-8 -- not unescaped.
}

for _, tc := range testCases {
res, err := unescape(tc.input)
if tc.isErr == true {
if err == nil {
t.Errorf("Input %q, want error, got nil", tc.input)
}
// no more checks
continue
}
res := unescape(tc.input)
if res != tc.output {
t.Errorf("Input %q, want %q, got %q", tc.input, tc.output, res)
}
if err != nil {
t.Errorf("Input %q, want nil, got error %v", tc.input, err)
continue
}
}

func BenchmarkUnescape(b *testing.B) {
testCases := []string{
Comment on lines +773 to +774
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FWIW; I copied these test cases from the other test. We could possibly consider making those test-cases a package-variable, so that we can use the same ones here as part of the benchmark (if we don't want to repeat the cases).

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I thought about it, but:

  1. When we change that variable, we change benchmark's input, making it somewhat harder to compare before/after.
  2. Ultimately a test case and a benchmark should have different inputs. For test cases, we look for various ways to break things. For benchmarks, we look for either most typical or most complex (time consuming) cases, or both.

"",
"/",
"/some/longer/path",
"/path\\040with\\040spaces",
"/path/with\\134backslash",
"/tab\\011in/path",
`/path/"with'quotes`,
`/path/"with'quotes,\040space,\011tab`,
`\12`,
`\134`,
`"'"'"'`,
`/\1345`,
`/\12x`,
`\0`,
`\x`,
"\\\\",
}

b.ResetTimer()
b.ReportAllocs()
for i := 0; i < b.N; i++ {
for x := 0; x < len(testCases); x++ {
_ = unescape(testCases[x])
}
}
}
Loading