Skip to content

Commit

Permalink
✨ Increase PyPI parsing flexibility (ossf#3423)
Browse files Browse the repository at this point in the history
* Make PyPI parsing more flexible to find any GitHub or GitLab URL, and hope it's unique

Signed-off-by: Josh Cogan <joshgc@google.com>

* Refactor the addRepo to not pass around a mutable object. Tweak a test to support gitlab better

Signed-off-by: Josh Cogan <joshgc@google.com>

* Ignore users called sponsors for github repos. Remove the set and just check there is a single valid url

Signed-off-by: Josh Cogan <joshgc@google.com>

* Remove unneeded variables and code

Signed-off-by: Josh Cogan <joshgc@google.com>

* Reducing indentation

Signed-off-by: Josh Cogan <joshgc@google.com>

* Make github url path parts case insensitive and use more explicit suffix filter to remove .git

Signed-off-by: Josh Cogan <joshgc@google.com>

* Appease the linter--may its wisdom never wane.

Signed-off-by: Josh Cogan <joshgc@google.com>

* CamelCase -> camelCase to prevent export

Signed-off-by: Josh Cogan <joshgc@google.com>

* Add test and allowance for gitlab to also be case insensitive

Signed-off-by: Josh Cogan <joshgc@google.com>

* hub vs lab typo

Signed-off-by: Josh Cogan <joshgc@google.com>

---------

Signed-off-by: Josh Cogan <joshgc@google.com>
Signed-off-by: Allen Shearin <allen.p.shearin@gmail.com>
  • Loading branch information
joshgc authored and ashearin committed Nov 13, 2023
1 parent be40c99 commit e79e486
Show file tree
Hide file tree
Showing 2 changed files with 221 additions and 127 deletions.
89 changes: 76 additions & 13 deletions cmd/package_managers.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,53 @@ package cmd
import (
"encoding/json"
"fmt"
"io"
"regexp"
"strings"

ngt "github.com/ossf/scorecard/v4/cmd/internal/nuget"
pmc "github.com/ossf/scorecard/v4/cmd/internal/packagemanager"
sce "github.com/ossf/scorecard/v4/errors"
)

// Patterns used by pypiMatchers to recognise repository URLs. Each one is
// anchored at the start of the string and accepts both http and https.
var (
// githubDomainRegexp matches github.com URLs and captures the first two path
// segments (submatch 1 and 2). Anything after the second segment is ignored.
githubDomainRegexp = regexp.MustCompile(`^https?://github[.]com/([^/]+)/([^/]+)`)
// githubSubdomainRegexp matches <subdomain>.github.io URLs and captures the
// subdomain (submatch 1) and the first path segment (submatch 2).
githubSubdomainRegexp = regexp.MustCompile(`^https?://([^.]+)[.]github[.]io/([^/]+).*`)
// gitlabDomainRegexp matches gitlab.com URLs and captures the first two path
// segments (submatch 1 and 2).
gitlabDomainRegexp = regexp.MustCompile(`^https?://gitlab[.]com/([^/]+)/([^/]+)`)
)

// makeGithubRepo turns a regexp submatch slice into a canonical GitHub
// repository URL of the form https://github.com/<owner>/<repo>.
//
// urlAndPathParts is expected to be the result of FindStringSubmatch: index 0
// is the whole match, index 1 the user or organisation, and index 2 the
// repository segment. Both parts are lowercased. Any "?query" or "#fragment"
// that was captured with the repository segment is dropped, and a trailing
// ".git" is removed.
//
// It returns "" when the slice is too short (no match), when the owner is
// "sponsors" (such URLs do not name a repository), or when nothing is left of
// the repository name after cleaning.
func makeGithubRepo(urlAndPathParts []string) string {
	if len(urlAndPathParts) < 3 {
		return ""
	}

	userOrOrg := strings.ToLower(urlAndPathParts[1])
	if userOrOrg == "sponsors" {
		return ""
	}

	repoName := strings.ToLower(urlAndPathParts[2])
	// The capturing group only stops at "/", so links such as
	// ".../repo#readme" or ".../repo?tab=readme" would otherwise leak the
	// suffix into the repository name.
	if i := strings.IndexAny(repoName, "?#"); i >= 0 {
		repoName = repoName[:i]
	}
	repoName = strings.TrimSuffix(repoName, ".git")
	if repoName == "" {
		return ""
	}

	return fmt.Sprintf("https://github.com/%s/%s", userOrOrg, repoName)
}

// Both GitHub and GitLab are case insensitive (and thus we lowercase those URLS)
// however generic URLs are indeed case sensitive!
var pypiMatchers = []func(string) string{
func(url string) string {
return makeGithubRepo(githubDomainRegexp.FindStringSubmatch(url))
},

func(url string) string {
return makeGithubRepo(githubSubdomainRegexp.FindStringSubmatch(url))
},

func(url string) string {
match := gitlabDomainRegexp.FindStringSubmatch(url)
if len(match) >= 3 {
return strings.ToLower(fmt.Sprintf("https://gitlab.com/%s/%s", match[1], match[2]))
}
return ""
},
}

type packageMangerResponse struct {
associatedRepo string
exists bool
Expand Down Expand Up @@ -77,9 +118,8 @@ type npmSearchResults struct {

// pypiSearchResults holds the fields decoded from the "info" object of a PyPI
// package JSON document.
//
// NOTE(review): ProjectUrls and ProjectURLs both carry the json tag
// "project_urls". This looks like a diff view showing the old field
// (ProjectUrls) next to its replacement (ProjectURLs). If both really exist in
// the file, encoding/json is expected to ignore fields with conflicting
// names — confirm that only one of them is present.
type pypiSearchResults struct {
Info struct {
// ProjectUrls reads only the "Source" entry of "project_urls".
ProjectUrls struct {
Source string `json:"Source"`
} `json:"project_urls"`
// ProjectURLs reads every entry of "project_urls" as a name -> URL map.
ProjectURLs map[string]string `json:"project_urls"`
// ProjectURL reads the single "project_url" string.
ProjectURL string `json:"project_url"`
} `json:"info"`
}

Expand Down Expand Up @@ -108,6 +148,38 @@ func fetchGitRepositoryFromNPM(packageName string, packageManager pmc.Client) (s
return v.Objects[0].Package.Links.Repository, nil
}

// findGitRepositoryInPYPIResponse decodes a PyPI package JSON document from
// response and returns the single source repository it points at.
//
// Every value in info.project_urls, plus info.project_url, is run through
// each entry of pypiMatchers. All URLs that match must normalise to the same
// repository.
//
// It returns an ErrScorecardInternal-wrapped error when the JSON cannot be
// decoded, when no URL is recognised, or when the URLs resolve to more than
// one distinct repository. packageName is used only in error messages.
func findGitRepositoryInPYPIResponse(packageName string, response io.Reader) (string, error) {
	v := &pypiSearchResults{}
	if err := json.NewDecoder(response).Decode(v); err != nil {
		return "", sce.WithMessage(sce.ErrScorecardInternal, fmt.Sprintf("failed to parse pypi package json: %v", err))
	}

	// Collect candidates in a fresh slice instead of inserting into
	// ProjectURLs: that map is nil when "project_urls" is null or missing,
	// and writing to a nil map panics.
	candidates := make([]string, 0, len(v.Info.ProjectURLs)+1)
	for _, u := range v.Info.ProjectURLs {
		candidates = append(candidates, u)
	}
	candidates = append(candidates, v.Info.ProjectURL)

	// Map iteration order is random, but the outcome does not depend on it:
	// either every match agrees on one repository or an error is returned.
	var validURL string
	for _, candidate := range candidates {
		for _, matcher := range pypiMatchers {
			repo := matcher(candidate)
			if repo == "" {
				continue
			}
			if validURL == "" {
				validURL = repo
				continue
			}
			if validURL != repo {
				return "", sce.WithMessage(sce.ErrScorecardInternal,
					fmt.Sprintf("found too many possible source repos for pypi package: %s", packageName))
			}
		}
	}

	if validURL == "" {
		return "", sce.WithMessage(sce.ErrScorecardInternal,
			fmt.Sprintf("could not find source repo for pypi package: %s", packageName))
	}
	return validURL, nil
}

// Gets the GitHub repository URL for the pypi package.
func fetchGitRepositoryFromPYPI(packageName string, manager pmc.Client) (string, error) {
pypiSearchURL := "https://pypi.org/pypi/%s/json"
Expand All @@ -117,16 +189,7 @@ func fetchGitRepositoryFromPYPI(packageName string, manager pmc.Client) (string,
}

defer resp.Body.Close()
v := &pypiSearchResults{}
err = json.NewDecoder(resp.Body).Decode(v)
if err != nil {
return "", sce.WithMessage(sce.ErrScorecardInternal, fmt.Sprintf("failed to parse pypi package json: %v", err))
}
if v.Info.ProjectUrls.Source == "" {
return "", sce.WithMessage(sce.ErrScorecardInternal,
fmt.Sprintf("could not find source repo for pypi package: %s", packageName))
}
return v.Info.ProjectUrls.Source, nil
return findGitRepositoryInPYPIResponse(packageName, resp.Body)
}

// Gets the GitHub repository URL for the rubygems package.
Expand Down
Loading

0 comments on commit e79e486

Please sign in to comment.