diff --git a/walk/BUILD.bazel b/walk/BUILD.bazel index 29368adb9..a2b5272c5 100644 --- a/walk/BUILD.bazel +++ b/walk/BUILD.bazel @@ -13,6 +13,7 @@ go_library( "//flag", "//rule", "@com_github_bmatcuk_doublestar_v4//:doublestar", + "@org_golang_x_sync//errgroup", ], ) diff --git a/walk/config.go b/walk/config.go index 044b59a79..bc617b746 100644 --- a/walk/config.go +++ b/walk/config.go @@ -50,9 +50,6 @@ func getWalkConfig(c *config.Config) *walkConfig { } func (wc *walkConfig) isExcluded(rel, base string) bool { - if base == ".git" { - return true - } return matchAnyGlob(wc.excludes, path.Join(rel, base)) } diff --git a/walk/walk.go b/walk/walk.go index cfc088c98..85f32baa7 100644 --- a/walk/walk.go +++ b/walk/walk.go @@ -23,10 +23,12 @@ import ( "os" "path" "path/filepath" + "sort" "strings" "github.com/bazelbuild/bazel-gazelle/config" "github.com/bazelbuild/bazel-gazelle/rule" + "golang.org/x/sync/errgroup" ) // Mode determines which directories Walk visits and which directories @@ -122,25 +124,26 @@ func Walk(c *config.Config, cexts []config.Configurer, dirs []string, mode Mode, log.Printf("error loading .bazelignore: %v", err) } - visit(c, cexts, isBazelIgnored, knownDirectives, updateRels, wf, c.RepoRoot, "", false) -} - -func visit(c *config.Config, cexts []config.Configurer, isBazelIgnored isIgnoredFunc, knownDirectives map[string]bool, updateRels *UpdateFilter, wf WalkFunc, dir, rel string, updateParent bool) { - if isBazelIgnored(rel) { - return + trie, err := buildTrie(c, isBazelIgnored) + if err != nil { + log.Fatalf("error walking the file system: %v\n", err) } + visit(c, cexts, knownDirectives, updateRels, trie, wf, c.RepoRoot, "", false) +} + +func visit(c *config.Config, cexts []config.Configurer, knownDirectives map[string]bool, updateRels *UpdateFilter, trie *pathTrie, wf WalkFunc, dir, rel string, updateParent bool) { haveError := false - // TODO: OPT: ReadDir stats all the files, which is slow. We just care about - // names and modes, so we should use something like - // golang.org/x/tools/internal/fastwalk to speed this up. - ents, err := os.ReadDir(dir) - if err != nil { - log.Print(err) - return + ents := make([]fs.DirEntry, 0, len(trie.children)) + for _, node := range trie.children { + ents = append(ents, *node.entry) } + sort.Slice(ents, func(i, j int) bool { + return ents[i].Name() < ents[j].Name() + }) + f, err := loadBuildFile(c, rel, dir, ents) if err != nil { log.Print(err) @@ -162,7 +165,7 @@ func visit(c *config.Config, cexts []config.Configurer, isBazelIgnored isIgnored var subdirs, regularFiles []string for _, ent := range ents { base := ent.Name() - if isBazelIgnored(path.Join(rel, base)) || wc.isExcluded(rel, base) { + if wc.isExcluded(rel, base) { continue } ent := resolveFileInfo(wc, dir, rel, ent) @@ -179,7 +182,7 @@ func visit(c *config.Config, cexts []config.Configurer, isBazelIgnored isIgnored shouldUpdate := updateRels.shouldUpdate(rel, updateParent) for _, sub := range subdirs { if subRel := path.Join(rel, sub); updateRels.shouldVisit(subRel, shouldUpdate) { - visit(c, cexts, isBazelIgnored, knownDirectives, updateRels, wf, filepath.Join(dir, sub), subRel, shouldUpdate) + visit(c, cexts, knownDirectives, updateRels, trie.children[sub], wf, path.Join(dir, sub), subRel, shouldUpdate) } } @@ -356,3 +359,62 @@ func resolveFileInfo(wc *walkConfig, dir, rel string, ent fs.DirEntry) fs.DirEnt } return fs.FileInfoToDirEntry(fi) } + +type pathTrie struct { + children map[string]*pathTrie + entry *fs.DirEntry +} + +// Basic factory method to ensure the entry is properly copied +func newTrie(entry fs.DirEntry) *pathTrie { + return &pathTrie{entry: &entry} +} + +func buildTrie(c *config.Config, isIgnored isIgnoredFunc) (*pathTrie, error) { + trie := &pathTrie{ + children: map[string]*pathTrie{}, + } + + // A channel to limit the number of concurrent goroutines + limitCh := make(chan struct{}, 100) + + // An error group to handle error propagation + eg := errgroup.Group{} + eg.Go(func() error { + return walkDir(c.RepoRoot, "", &eg, limitCh, isIgnored, trie) + }) + + return trie, eg.Wait() +} + +// walkDir recursively and concurrently descends into the 'rel' directory and builds a trie +func walkDir(root, rel string, eg *errgroup.Group, limitCh chan struct{}, isIgnored isIgnoredFunc, trie *pathTrie) error { + limitCh <- struct{}{} + defer (func() { <-limitCh })() + + entries, err := os.ReadDir(filepath.Join(root, rel)) + if err != nil { + return err + } + + for _, entry := range entries { + entryName := entry.Name() + entryPath := path.Join(rel, entryName) + + // Ignore .git, empty names and ignored paths + if entryName == "" || entryName == ".git" || isIgnored(entryPath) { + continue + } + + entryTrie := newTrie(entry) + trie.children[entry.Name()] = entryTrie + + if entry.IsDir() { + entryTrie.children = map[string]*pathTrie{} + eg.Go(func() error { + return walkDir(root, entryPath, eg, limitCh, isIgnored, entryTrie) + }) + } + } + return nil +}