Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

perf: walk workspace directories in parallel #1893

Merged
merged 3 commits into from
Sep 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions walk/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ go_library(
"//flag",
"//rule",
"@com_github_bmatcuk_doublestar_v4//:doublestar",
"@org_golang_x_sync//errgroup",
],
)

Expand Down
3 changes: 0 additions & 3 deletions walk/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,6 @@ func getWalkConfig(c *config.Config) *walkConfig {
}

func (wc *walkConfig) isExcluded(rel, base string) bool {
if base == ".git" {
return true
}
return matchAnyGlob(wc.excludes, path.Join(rel, base))
}

Expand Down
92 changes: 77 additions & 15 deletions walk/walk.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,12 @@ import (
"os"
"path"
"path/filepath"
"sort"
"strings"

"github.com/bazelbuild/bazel-gazelle/config"
"github.com/bazelbuild/bazel-gazelle/rule"
"golang.org/x/sync/errgroup"
)

// Mode determines which directories Walk visits and which directories
Expand Down Expand Up @@ -122,25 +124,26 @@ func Walk(c *config.Config, cexts []config.Configurer, dirs []string, mode Mode,
log.Printf("error loading .bazelignore: %v", err)
}

visit(c, cexts, isBazelIgnored, knownDirectives, updateRels, wf, c.RepoRoot, "", false)
}

func visit(c *config.Config, cexts []config.Configurer, isBazelIgnored isIgnoredFunc, knownDirectives map[string]bool, updateRels *UpdateFilter, wf WalkFunc, dir, rel string, updateParent bool) {
if isBazelIgnored(rel) {
return
trie, err := buildTrie(c, isBazelIgnored)
if err != nil {
log.Fatalf("error walking the file system: %v\n", err)
}

visit(c, cexts, knownDirectives, updateRels, trie, wf, c.RepoRoot, "", false)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we make parallel walk configurable? It would be good to not enable by default, and allow users to opt in/out

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry I didn't see this and then this was merged. We can probably do this fairly easily.

Lets just make sure we do this (disable it by default) instead of reverting if any issues come up now that there has been a release.

}

func visit(c *config.Config, cexts []config.Configurer, knownDirectives map[string]bool, updateRels *UpdateFilter, trie *pathTrie, wf WalkFunc, dir, rel string, updateParent bool) {
haveError := false

// TODO: OPT: ReadDir stats all the files, which is slow. We just care about
// names and modes, so we should use something like
// golang.org/x/tools/internal/fastwalk to speed this up.
ents, err := os.ReadDir(dir)
if err != nil {
log.Print(err)
return
ents := make([]fs.DirEntry, 0, len(trie.children))
jbedard marked this conversation as resolved.
Show resolved Hide resolved
for _, node := range trie.children {
ents = append(ents, *node.entry)
}

sort.Slice(ents, func(i, j int) bool {
return ents[i].Name() < ents[j].Name()
})

f, err := loadBuildFile(c, rel, dir, ents)
if err != nil {
log.Print(err)
Expand All @@ -162,7 +165,7 @@ func visit(c *config.Config, cexts []config.Configurer, isBazelIgnored isIgnored
var subdirs, regularFiles []string
for _, ent := range ents {
base := ent.Name()
if isBazelIgnored(path.Join(rel, base)) || wc.isExcluded(rel, base) {
if wc.isExcluded(rel, base) {
continue
}
ent := resolveFileInfo(wc, dir, rel, ent)
Expand All @@ -179,7 +182,7 @@ func visit(c *config.Config, cexts []config.Configurer, isBazelIgnored isIgnored
shouldUpdate := updateRels.shouldUpdate(rel, updateParent)
for _, sub := range subdirs {
if subRel := path.Join(rel, sub); updateRels.shouldVisit(subRel, shouldUpdate) {
visit(c, cexts, isBazelIgnored, knownDirectives, updateRels, wf, filepath.Join(dir, sub), subRel, shouldUpdate)
visit(c, cexts, knownDirectives, updateRels, trie.children[sub], wf, path.Join(dir, sub), subRel, shouldUpdate)
}
}

Expand Down Expand Up @@ -356,3 +359,62 @@ func resolveFileInfo(wc *walkConfig, dir, rel string, ent fs.DirEntry) fs.DirEnt
}
return fs.FileInfoToDirEntry(fi)
}

type pathTrie struct {
children map[string]*pathTrie
entry *fs.DirEntry
}

// Basic factory method to ensure the entry is properly copied
func newTrie(entry fs.DirEntry) *pathTrie {
return &pathTrie{entry: &entry}
}

func buildTrie(c *config.Config, isIgnored isIgnoredFunc) (*pathTrie, error) {
trie := &pathTrie{
children: map[string]*pathTrie{},
}

// A channel to limit the number of concurrent goroutines
limitCh := make(chan struct{}, 100)
fmeum marked this conversation as resolved.
Show resolved Hide resolved
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

100 feels pretty hardcoded. Can we check the number of CPUs or max processes we can use? https://pkg.go.dev/runtime#NumCPU

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The operations are likely I/O bound, so a limit based on the number of CPUs would likely be far too low. Eventually we may want to make this configurable, but any reasonably high number should give a speedup. In the end, the Go runtime would know best how to schedule I/O-blocked goroutines.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Today it's all I/O so I figured hardcoding something is fine and number of CPUs isn't really relevant. However that might change soon if we start putting stuff like BUILD-parsing into these goroutines.


// An error group to handle error propagation
eg := errgroup.Group{}
eg.Go(func() error {
return walkDir(c.RepoRoot, "", &eg, limitCh, isIgnored, trie)
})

return trie, eg.Wait()
}

// walkDir recursively and concurrently descends into the 'rel' directory and builds a trie
func walkDir(root, rel string, eg *errgroup.Group, limitCh chan struct{}, isIgnored isIgnoredFunc, trie *pathTrie) error {
limitCh <- struct{}{}
defer (func() { <-limitCh })()

entries, err := os.ReadDir(filepath.Join(root, rel))
if err != nil {
return err
}

for _, entry := range entries {
entryName := entry.Name()
entryPath := path.Join(rel, entryName)

// Ignore .git, empty names and ignored paths
jbedard marked this conversation as resolved.
Show resolved Hide resolved
if entryName == "" || entryName == ".git" || isIgnored(entryPath) {
continue
}

entryTrie := newTrie(entry)
trie.children[entry.Name()] = entryTrie

if entry.IsDir() {
entryTrie.children = map[string]*pathTrie{}
eg.Go(func() error {
return walkDir(root, entryPath, eg, limitCh, isIgnored, entryTrie)
})
}
}
return nil
}