From e79e48650579256cf43fad322e6679193d9e07cf Mon Sep 17 00:00:00 2001 From: joshgc Date: Fri, 25 Aug 2023 17:45:20 -0700 Subject: [PATCH] :sparkles: Increase PyPI parsing flexibility (#3423) * Make PyPI parsing more flexible to find any github or gitlab url, and hope it's unique Signed-off-by: Josh Cogan * Refactor the addRepo to not pass around a mutable object. Tweak a test to support gitlab better Signed-off-by: Josh Cogan * Ignore users called sponsors for github repos. Remove the set and just check there is a single valid url Signed-off-by: Josh Cogan * Remove unneeded variables and code Signed-off-by: Josh Cogan * Reducing indentation Signed-off-by: Josh Cogan * Make github url path parts case insensitive and use more explicit suffix filter to remove .git Signed-off-by: Josh Cogan * Appease the linter--may its wisdom never wane. Signed-off-by: Josh Cogan * CamelCase -> camelCase to prevent export Signed-off-by: Josh Cogan * Add test and allowance for gitlab to also be case insensitive Signed-off-by: Josh Cogan * hub vs lab typo Signed-off-by: Josh Cogan --------- Signed-off-by: Josh Cogan Signed-off-by: Allen Shearin --- cmd/package_managers.go | 89 ++++++++++-- cmd/package_managers_test.go | 259 ++++++++++++++++++++--------------- 2 files changed, 221 insertions(+), 127 deletions(-) diff --git a/cmd/package_managers.go b/cmd/package_managers.go index 67b4ab888bf..878235ceeec 100644 --- a/cmd/package_managers.go +++ b/cmd/package_managers.go @@ -18,12 +18,53 @@ package cmd import ( "encoding/json" "fmt" + "io" + "regexp" + "strings" ngt "github.com/ossf/scorecard/v4/cmd/internal/nuget" pmc "github.com/ossf/scorecard/v4/cmd/internal/packagemanager" sce "github.com/ossf/scorecard/v4/errors" ) +var ( + githubDomainRegexp = regexp.MustCompile(`^https?://github[.]com/([^/]+)/([^/]+)`) + githubSubdomainRegexp = regexp.MustCompile(`^https?://([^.]+)[.]github[.]io/([^/]+).*`) + gitlabDomainRegexp = regexp.MustCompile(`^https?://gitlab[.]com/([^/]+)/([^/]+)`) +) + 
+func makeGithubRepo(urlAndPathParts []string) string { + if len(urlAndPathParts) < 3 { + return "" + } + userOrOrg := strings.ToLower(urlAndPathParts[1]) + repoName := strings.TrimSuffix(strings.ToLower(urlAndPathParts[2]), ".git") + if userOrOrg == "sponsors" { + return "" + } + return fmt.Sprintf("https://github.com/%s/%s", userOrOrg, repoName) +} + +// Both GitHub and GitLab are case insensitive (and thus we lowercase those URLS) +// however generic URLs are indeed case sensitive! +var pypiMatchers = []func(string) string{ + func(url string) string { + return makeGithubRepo(githubDomainRegexp.FindStringSubmatch(url)) + }, + + func(url string) string { + return makeGithubRepo(githubSubdomainRegexp.FindStringSubmatch(url)) + }, + + func(url string) string { + match := gitlabDomainRegexp.FindStringSubmatch(url) + if len(match) >= 3 { + return strings.ToLower(fmt.Sprintf("https://gitlab.com/%s/%s", match[1], match[2])) + } + return "" + }, +} + type packageMangerResponse struct { associatedRepo string exists bool @@ -77,9 +118,8 @@ type npmSearchResults struct { type pypiSearchResults struct { Info struct { - ProjectUrls struct { - Source string `json:"Source"` - } `json:"project_urls"` + ProjectURLs map[string]string `json:"project_urls"` + ProjectURL string `json:"project_url"` } `json:"info"` } @@ -108,6 +148,38 @@ func fetchGitRepositoryFromNPM(packageName string, packageManager pmc.Client) (s return v.Objects[0].Package.Links.Repository, nil } +func findGitRepositoryInPYPIResponse(packageName string, response io.Reader) (string, error) { + v := &pypiSearchResults{} + err := json.NewDecoder(response).Decode(v) + if err != nil { + return "", sce.WithMessage(sce.ErrScorecardInternal, fmt.Sprintf("failed to parse pypi package json: %v", err)) + } + + v.Info.ProjectURLs["key_not_used_and_very_unlikely_to_be_present_already"] = v.Info.ProjectURL + var validURL string + for _, url := range v.Info.ProjectURLs { + for _, matcher := range pypiMatchers { + repo := 
matcher(url) + if repo == "" { + continue + } + if validURL == "" { + validURL = repo + } else if validURL != repo { + return "", sce.WithMessage(sce.ErrScorecardInternal, + fmt.Sprintf("found too many possible source repos for pypi package: %s", packageName)) + } + } + } + + if validURL == "" { + return "", sce.WithMessage(sce.ErrScorecardInternal, + fmt.Sprintf("could not find source repo for pypi package: %s", packageName)) + } else { + return validURL, nil + } +} + // Gets the GitHub repository URL for the pypi package. func fetchGitRepositoryFromPYPI(packageName string, manager pmc.Client) (string, error) { pypiSearchURL := "https://pypi.org/pypi/%s/json" @@ -117,16 +189,7 @@ func fetchGitRepositoryFromPYPI(packageName string, manager pmc.Client) (string, } defer resp.Body.Close() - v := &pypiSearchResults{} - err = json.NewDecoder(resp.Body).Decode(v) - if err != nil { - return "", sce.WithMessage(sce.ErrScorecardInternal, fmt.Sprintf("failed to parse pypi package json: %v", err)) - } - if v.Info.ProjectUrls.Source == "" { - return "", sce.WithMessage(sce.ErrScorecardInternal, - fmt.Sprintf("could not find source repo for pypi package: %s", packageName)) - } - return v.Info.ProjectUrls.Source, nil + return findGitRepositoryInPYPIResponse(packageName, resp.Body) } // Gets the GitHub repository URL for the rubygems package. 
diff --git a/cmd/package_managers_test.go b/cmd/package_managers_test.go index fe5ff0bf493..edb64630745 100644 --- a/cmd/package_managers_test.go +++ b/cmd/package_managers_test.go @@ -20,6 +20,7 @@ import ( "errors" "io" "net/http" + "strings" "testing" "github.com/golang/mock/gomock" @@ -161,6 +162,144 @@ func Test_fetchGitRepositoryFromNPM(t *testing.T) { } } +func Test_findGitRepositoryInPYPIResponse(t *testing.T) { + t.Parallel() + tests := []struct { + name string + partialPYPIResponse string + want string + wantErrStr string + }{ + { + name: "findGitRepositoryInPYPIResponse_none", + partialPYPIResponse: ` + { + "info": { + "platform": "UNKNOWN", + "not_a_project_url": "https://github.com/htaslan/color", + "project_urls": { + "Homepage": "http://git_NOT_VALID_hub.com/htaslan/color" + } + } +} +`, + want: "", + wantErrStr: "could not find source repo for pypi package: somePackage", + }, + { + name: "findGitRepositoryInPYPIResponse_project_url", + partialPYPIResponse: ` + { + "info": { + "platform": "UNKNOWN", + "project_url": "https://github.com/htaslan/color/", + "project_urls": { + "Homepage": "http://git_NOT_VALID_hub.com/htaslan/color" + } + } +} +`, + want: "https://github.com/htaslan/color", + wantErrStr: "", + }, + { + name: "findGitRepositoryInPYPIResponse_project_urls", + partialPYPIResponse: ` + { + "info": { + "platform": "UNKNOWN", + + "project_url": "http://git_NOT_VALID_hub.com/htaslan/color", + "project_urls": { + "RandomKey": "https://github.com/htaslan/color/", + "SponsorsIgnored": "https://github.com/sponsors/htaslan", + "AnotherRandomKey": "http://git_NOT_VALID_hub.com/htaslan/color" + } + } +} +`, + want: "https://github.com/htaslan/color", + wantErrStr: "", + }, + { + name: "findGitRepositoryInPYPIResponse_dedup", + partialPYPIResponse: ` + { + "info": { + "platform": "UNKNOWN", + "project_url": "foo", + "project_urls": { + "RandomKey": "https://github.com/htaslan/color/", + "AnotherRandomKey": "http://htaslan.github.io/color", + 
"CapsTestKey": "http://HTASLAN.github.io/cOLOr", + "TrailingGit": "https://github.com/htaslan/color.git" + } + } +} +`, + want: "https://github.com/htaslan/color", + wantErrStr: "", + }, + { + name: "findGitRepositoryInPYPIResponse_dedup_gitlab", + partialPYPIResponse: ` + { + "info": { + "platform": "UNKNOWN", + "project_url": "foo", + "project_urls": { + "RandomKey": "https://gitlab.com/htaslan/color/", + "raNdoMkEY": "https://gitlab.com/hTASLan/color" + } + } +} +`, + want: "https://gitlab.com/htaslan/color", + wantErrStr: "", + }, + { + name: "findGitRepositoryInPYPIResponse_toomany", + partialPYPIResponse: ` + { + "info": { + "platform": "UNKNOWN", + "project_url": "foo", + "project_urls": { + "RandomKey": "https://github.com/htaslan/color/", + "AnotherRandomKey": "https://gitlab.com/htaslan/color" + } + } +} +`, + want: "", + wantErrStr: "found too many possible source repos for pypi package: somePackage", + }, + } + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + got, err := findGitRepositoryInPYPIResponse("somePackage", strings.NewReader(tt.partialPYPIResponse)) + if err != nil && (!strings.Contains(err.Error(), tt.wantErrStr) || tt.wantErrStr == "") { + t.Errorf("findGitRepositoryInPYPIResponse() error = \"%v\" did not contain "+ + "wantErrStr = \"%v\" testcase name %v", + err, tt.wantErrStr, tt.name) + return + } + if err == nil && tt.wantErrStr != "" { + t.Errorf("findGitRepositoryInPYPIResponse() had nil error, but wanted "+ + "wantErrStr = \"%v\" testcase name %v", + tt.wantErrStr, tt.name) + return + } + + if got != tt.want { + t.Errorf("findGitRepositoryInPYPIResponse() = %v, want %v", got, tt.want) + } + }) + } +} + func Test_fetchGitRepositoryFromPYPI(t *testing.T) { t.Parallel() type args struct { @@ -177,7 +316,7 @@ func Test_fetchGitRepositoryFromPYPI(t *testing.T) { name: "fetchGitRepositoryFromPYPI", //nolint args: args{ - packageName: "npm-package", + packageName: "some-package", //nolint result: ` 
{ @@ -279,137 +418,29 @@ func Test_fetchGitRepositoryFromPYPI(t *testing.T) { `, }, - want: "foo", + want: "https://github.com/htaslan/color", wantErr: false, }, { - name: "fetchGitRepositoryFromNPM_error", + name: "fetchGitRepositoryFromPYPI_error", args: args{ - packageName: "npm-package", + packageName: "pypi-package", result: "", }, want: "", wantErr: true, }, { - name: "fetchGitRepositoryFromNPM_error", + name: "fetchGitRepositoryFromPYPI_error", args: args{ - packageName: "npm-package", + packageName: "pypi-package", result: "foo", }, want: "", wantErr: true, }, - { - name: "empty project url", - //nolint - args: args{ - packageName: "npm-package", - //nolint - result: ` -{ - "info": { - "author": "Hüseyin Tekinaslan", - "author_email": "htaslan@bil.omu.edu.tr", - "bugtrack_url": null, - "classifiers": [ - "Development Status :: 5 - Production/Stable", - "License :: OSI Approved :: MIT License", - "Programming Language :: Python", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.2", - "Programming Language :: Python :: 3.3", - "Programming Language :: Python :: 3.4", - "Programming Language :: Python :: 3.5", - "Programming Language :: Python :: Implementation :: CPython", - "Topic :: Software Development :: Libraries :: Python Modules" - ], - "description": "UNKNOWN", - "description_content_type": null, - "docs_url": null, - "downoad_url": null, - "downloads": { - "last_day": -1, - "last_month": -1, - "last_week": -1 - }, - "home_page": "http://github.com/htaslan/color", - "keywords": "colorize pycolorize color pycolor", - "license": "MIT", - "maintainer": null, - "maintainer_email": null, - "name": "color", - "package_url": "https://pypi.org/project/color/", - "platform": "UNKNOWN", - "project_url": "https://pypi.org/project/color/", - "project_urls": { - "Homepage": "http://github.com/htaslan/color", - "Source": "" - }, - "release_url": "https://pypi.org/project/color/0.1/", - "requires_dist": null, - "requires_python": null, 
- "summary": "python module for colorize string", - "version": "0.1", - "yanked": false, - "yanked_reason": null - }, - "last_serial": 2041956, - "releases": { - "0.1": [ - { - "comment_text": "a python module of colorize string", - "digests": { - "md5": "1a4577069c636b28d85052db9a384b95", - "sha256": "de5b51fea834cb067631beaa1ec11d7753f1e3615e836e2e4c34dcf2b343eac2" - }, - "downloads": -1, - "filename": "color-0.1.1.tar.gz", - "has_sig": false, - "md5_digest": "1a4577069c636b28d85052db9a384b95", - "packagetype": "sdist", - "python_version": "source", - "requires_python": null, - "size": 3568, - "upload_time": "2016-04-01T13:23:25", - "upload_time_iso_8601": "2016-04-01T13:23:25.284973Z", - "url": "https://files.pythonhosted.org/packages/88/04/0defd6f424e5bafb5abc75510cbe119a85d80b5505f1de5cd9a16d89ba8c/color-0.1.1.tar.gz", - "yanked": false, - "yanked_reason": null - } - ] - }, - "urls": [ - { - "comment_text": "a python module of colorize string", - "digests": { - "md5": "1a4577069c636b28d85052db9a384b95", - "sha256": "de5b51fea834cb067631beaa1ec11d7753f1e3615e836e2e4c34dcf2b343eac2" - }, - "downloads": -1, - "filename": "color-0.1.1.tar.gz", - "has_sig": false, - "md5_digest": "1a4577069c636b28d85052db9a384b95", - "packagetype": "sdist", - "python_version": "source", - "requires_python": null, - "size": 3568, - "upload_time": "2016-04-01T13:23:25", - "upload_time_iso_8601": "2016-04-01T13:23:25.284973Z", - "url": "https://files.pythonhosted.org/packages/88/04/0defd6f424e5bafb5abc75510cbe119a85d80b5505f1de5cd9a16d89ba8c/color-0.1.1.tar.gz", - "yanked": false, - "yanked_reason": null - } - ], - "vulnerabilities": [] -} - `, - }, - want: "", - wantErr: true, - }, } for _, tt := range tests { tt := tt