diff --git a/modules/git/repo_attribute.go b/modules/git/repo_attribute.go
index aa5e4c10e70d..40c368215efc 100644
--- a/modules/git/repo_attribute.go
+++ b/modules/git/repo_attribute.go
@@ -6,7 +6,11 @@ package git
 
 import (
 	"bytes"
+	"context"
 	"fmt"
+	"io"
+	"strings"
+	"time"
 )
 
 // CheckAttributeOpts represents the possible options to CheckAttribute
@@ -15,6 +19,7 @@ type CheckAttributeOpts struct {
 	AllAttributes bool
 	Attributes    []string
 	Filenames     []string
+	IndexFile     string
 }
 
 // CheckAttribute return the Blame object of file
@@ -54,7 +59,12 @@ func (repo *Repository) CheckAttribute(opts CheckAttributeOpts) (map[string]map[
 
 	cmd := NewCommand(cmdArgs...)
 
-	if err := cmd.RunInDirPipeline(repo.Path, stdOut, stdErr); err != nil {
+	env := make([]string, 0, 1)
+	if len(opts.IndexFile) > 0 {
+		env = append(env, "GIT_INDEX_FILE="+opts.IndexFile)
+	}
+
+	if err := cmd.RunInDirTimeoutEnvFullPipeline(env, -1, repo.Path, stdOut, stdErr, nil); err != nil {
 		return nil, fmt.Errorf("Failed to run check-attr: %v\n%s\n%s", err, stdOut.String(), stdErr.String())
 	}
 
@@ -80,3 +90,135 @@ func (repo *Repository) CheckAttribute(opts CheckAttributeOpts) (map[string]map[
 
 	return name2attribute2info, nil
 }
+
+// AttrChecker feeds paths to one long-running "git check-attr --stdin" process
+// and reads back the requested attributes for each of them.
+type AttrChecker struct {
+	// params
+	RequestAttrs []string
+	Repo         *Repository
+	IndexFile    string
+
+	stdinReader *io.PipeReader
+	stdinWriter *io.PipeWriter
+	stdOut      *lineWriter
+	cmd         *Command
+	env         []string
+}
+
+// Init prepares the check-attr command and its pipes. It must be called before
+// Run or CheckAttrs and panics when RequestAttrs is empty.
+func (c *AttrChecker) Init() {
+	if len(c.RequestAttrs) == 0 {
+		panic("Should have RequestAttrs!")
+	}
+
+	cmdArgs := []string{"check-attr"}
+	cmdArgs = append(cmdArgs, c.RequestAttrs...)
+	if len(c.IndexFile) > 0 {
+		// Look up .gitattributes in the given index instead of the work tree.
+		cmdArgs = append(cmdArgs, "--cached")
+		c.env = []string{"GIT_INDEX_FILE=" + c.IndexFile}
+	}
+	cmdArgs = append(cmdArgs, "--stdin")
+	c.cmd = NewCommand(cmdArgs...)
+	c.stdinReader, c.stdinWriter = io.Pipe()
+	c.stdOut = newLineWriter()
+}
+
+// Run executes check-attr and blocks until the command returns. Paths are
+// supplied concurrently through CheckAttrs; Close ends the input.
+func (c *AttrChecker) Run() error {
+	// Once the command has returned nothing reads stdin any more. Closing the
+	// read side makes later CheckAttrs writes fail instead of blocking forever.
+	defer c.stdinReader.Close()
+
+	stdErr := new(bytes.Buffer)
+	err := c.cmd.RunInDirTimeoutEnvFullPipeline(c.env, -1, c.Repo.Path, c.stdOut, stdErr, c.stdinReader)
+	if err != nil {
+		return fmt.Errorf("failed to run attr-check. Error: %w\nStderr: %s", err, stdErr.String())
+	}
+
+	return nil
+}
+
+// CheckAttrs writes path to the check-attr process and returns the reported
+// info for each requested attribute, keyed by attribute name.
+func (c *AttrChecker) CheckAttrs(path string) (map[string]string, error) {
+	_, err := c.stdinWriter.Write([]byte(path + "\n"))
+	if err != nil {
+		return nil, err
+	}
+
+	rs := make(map[string]string, len(c.RequestAttrs))
+	for range c.RequestAttrs {
+		line, err := c.stdOut.ReadLine(DefaultCommandExecutionTimeout)
+		if err != nil {
+			return nil, err
+		}
+		// A line reads "path: attribute: info". Split from the right so that
+		// a path containing ": " cannot shift the two trailing fields.
+		infoAt := strings.LastIndex(line, ": ")
+		if infoAt < 0 {
+			continue
+		}
+		attrAt := strings.LastIndex(line[:infoAt], ": ")
+		if attrAt < 0 {
+			continue
+		}
+		rs[line[attrAt+2:infoAt]] = line[infoAt+2:]
+	}
+	return rs, nil
+}
+
+// Close closes the stdin pipe, signalling that no more paths will be sent.
+func (c *AttrChecker) Close() {
+	c.stdinWriter.Close()
+}
+
+// lineWriter is an io.Writer that cuts its input into lines and hands them
+// to ReadLine through a channel.
+type lineWriter struct {
+	tmp   []byte      // bytes of a line whose '\n' has not arrived yet
+	lines chan string // complete lines, without the trailing '\n'
+}
+
+// newLineWriter creates the line channel up front, so that a ReadLine which
+// starts before the first Write still waits on the channel Write sends to.
+func newLineWriter() *lineWriter {
+	return &lineWriter{lines: make(chan string, 5)}
+}
+
+// Write appends p to the pending bytes, queues every complete line and keeps
+// the unterminated rest for the next call. It blocks while the queue is full
+// and always reports len(p) bytes as written.
+func (wr *lineWriter) Write(p []byte) (n int, err error) {
+	// Copy p: an io.Writer must not retain the caller's slice. Queued lines are
+	// cut off below, so a finished line is never replayed into the next call.
+	wr.tmp = append(wr.tmp, p...)
+
+	for {
+		endl := bytes.IndexByte(wr.tmp, '\n')
+		if endl < 0 {
+			break
+		}
+		wr.lines <- string(wr.tmp[:endl])
+		wr.tmp = wr.tmp[endl+1:]
+	}
+
+	return len(p), nil
+}
+
+// ReadLine returns the next queued line, or the context error when none
+// arrives within timeOut.
+func (wr *lineWriter) ReadLine(timeOut time.Duration) (string, error) {
+	ctx, cancel := context.WithTimeout(context.Background(), timeOut)
+	defer cancel()
+
+	select {
+	case rs := <-wr.lines:
+		return rs, nil
+	case <-ctx.Done():
+		return "", ctx.Err()
+	}
+}
diff --git a/modules/git/repo_language_stats_gogit.go b/modules/git/repo_language_stats_gogit.go
index b5a235921c8a..36fd51aee417 100644
--- a/modules/git/repo_language_stats_gogit.go
+++ b/modules/git/repo_language_stats_gogit.go
@@ -20,7 +20,9 @@ import (
 )
 
 // GetLanguageStats calculates language stats for git repository at specified commit
-func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, error) {
+// preCheck, if not nil, is asked about a file before its language is detected:
+// a true bool skips the file, a non-empty string is used as its language.
+func (repo *Repository) GetLanguageStats(commitID string, preCheck func(path string) (string, bool)) (map[string]int64, error) {
 	r, err := git.PlainOpen(repo.Path)
 	if err != nil {
 		return nil, err
@@ -57,9 +59,18 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
 			return nil
 		}
 
-		// TODO: Use .gitattributes file for linguist overrides
+		language := ""
+		skip := false
+		if preCheck != nil {
+			language, skip = preCheck(f.Name)
+			if skip {
+				return nil
+			}
+		}
 
-		language := analyze.GetCodeLanguage(f.Name, content)
+		if len(language) == 0 {
+			language = analyze.GetCodeLanguage(f.Name, content)
+		}
 		if language == enry.OtherLanguage || language == "" {
 			return nil
 		}
diff --git a/modules/git/repo_language_stats_nogogit.go b/modules/git/repo_language_stats_nogogit.go
index 4c6f07f0fba5..b5320d2c82ec 100644
--- a/modules/git/repo_language_stats_nogogit.go
+++ b/modules/git/repo_language_stats_nogogit.go
@@ -19,7 +19,9 @@ import (
 )
 
 // GetLanguageStats calculates language stats for git repository at specified commit
-func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, error) {
+// preCheck, if not nil, is asked about a file before its language is detected:
+// a true bool skips the file, a non-empty string is used as its language.
+func (repo *Repository) GetLanguageStats(commitID string, preCheck func(path string) (string, bool)) (map[string]int64, error) {
 	// We will feed the commit IDs in order into cat-file --batch, followed by blobs as necessary.
 	// so let's create a batch stdin and stdout
 
@@ -128,10 +130,22 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
 			continue
 		}
 
-		// TODO: Use .gitattributes file for linguist overrides
-		// FIXME: Why can't we split this and the IsGenerated tests to avoid reading the blob unless absolutely necessary?
-		// - eg. do the all the detection tests using filename first before reading content.
-		language := analyze.GetCodeLanguage(f.Name(), content)
+		// Use .gitattributes file for linguist overrides
+		language := ""
+		skip := false
+		if preCheck != nil {
+			language, skip = preCheck(f.Name())
+			if skip {
+				continue
+			}
+		}
+
+		if len(language) == 0 {
+			// FIXME: Why can't we split this and the IsGenerated tests to avoid reading the blob unless absolutely necessary?
+			// - eg. do the all the detection tests using filename first before reading content.
+			language = analyze.GetCodeLanguage(f.Name(), content)
+		}
+
 		if language == enry.OtherLanguage || language == "" {
 			continue
 		}
diff --git a/modules/indexer/stats/db.go b/modules/indexer/stats/db.go
index bc3fbc13d893..30f3f721280b 100644
--- a/modules/indexer/stats/db.go
+++ b/modules/indexer/stats/db.go
@@ -5,9 +5,14 @@
 package stats
 
 import (
+	"io/ioutil"
+	"os"
+	"sync"
+
 	"code.gitea.io/gitea/models"
 	"code.gitea.io/gitea/modules/git"
 	"code.gitea.io/gitea/modules/log"
+	"code.gitea.io/gitea/modules/util"
 )
 
 // DBIndexer implements Indexer interface to use database's like search
@@ -47,8 +52,105 @@ func (db *DBIndexer) Index(id int64) error {
 		return nil
 	}
 
+	var tmpIndex *os.File
+	if git.CheckGitVersionAtLeast("1.7.8") == nil {
+		tmpIndex, err = ioutil.TempFile("", "index")
+		if err != nil {
+			return err
+		}
+		defer func() {
+			err := util.Remove(tmpIndex.Name())
+			if err != nil {
+				log.Error("failed to remove tmp index file: %v", err)
+			}
+		}()
+		// git only needs the path, so release our own handle on the file now.
+		if err = tmpIndex.Close(); err != nil {
+			return err
+		}
+
+		// Fill the temporary index from commitID. AttrChecker runs check-attr
+		// with --cached against this index when IndexFile is set.
+		_, err = git.NewCommand("read-tree", commitID).
+			RunInDirWithEnv(gitRepo.Path, []string{"GIT_INDEX_FILE=" + tmpIndex.Name()})
+		if err != nil {
+			return err
+		}
+
+		checker := &git.AttrChecker{
+			RequestAttrs: []string{"linguist-vendored", "linguist-language"},
+			Repo:         gitRepo,
+			IndexFile:    tmpIndex.Name(),
+		}
+
+		checker.Init()
+
+		wg := new(sync.WaitGroup)
+		wg.Add(2)
+
+		// Buffered with one slot per goroutine: both may send before wg.Done and
+		// nothing receives until after wg.Wait, so an unbuffered send would hang.
+		errCh := make(chan error, 2)
+
+		// run cmd
+		go func() {
+			defer wg.Done()
+			if err := checker.Run(); err != nil {
+				errCh <- err
+			}
+			// The command has returned, so fail any further CheckAttrs write
+			// instead of letting it block on a pipe that is no longer read.
+			checker.Close()
+		}()
+
+		stats := make(map[string]int64)
+
+		go func() {
+			defer wg.Done()
+			// Closing stdin tells check-attr there are no more paths to come.
+			defer checker.Close()
+
+			var err error
+			stats, err = gitRepo.GetLanguageStats(commitID, func(path string) (string, bool) {
+				// Apply the linguist overrides from .gitattributes:
+				//   linguist-vendored        leaves the path out of the stats
+				//   linguist-language=LANG   forces the language of the path
+				rs, err := checker.CheckAttrs(path)
+				if err != nil {
+					log.Error("git.CheckAttrs: %v", err)
+					return "", false
+				}
+
+				if rs["linguist-vendored"] == "set" {
+					return "", true
+				}
+
+				switch lang := rs["linguist-language"]; lang {
+				case "unspecified", "set", "unset":
+					// No usable value was assigned; fall back to detection.
+					return "", false
+				default:
+					return lang, false
+				}
+			})
+			if err != nil {
+				errCh <- err
+			}
+		}()
+
+		wg.Wait()
+
+		select {
+		case err := <-errCh:
+			log.Error("Unable to get language stats for ID %s for defaultbranch %s in %s. Error: %v", commitID, repo.DefaultBranch, repo.RepoPath(), err)
+			return err
+		default:
+			return repo.UpdateLanguageStats(commitID, stats)
+		}
+	}
+
 	// Calculate and save language statistics to database
-	stats, err := gitRepo.GetLanguageStats(commitID)
+	stats, err := gitRepo.GetLanguageStats(commitID, nil)
 	if err != nil {
 		log.Error("Unable to get language stats for ID %s for defaultbranch %s in %s. Error: %v", commitID, repo.DefaultBranch, repo.RepoPath(), err)
 		return err