Skip to content

Commit

Permalink
License crawler for third party golang libraries (#2393)
Browse files Browse the repository at this point in the history
* license file crawler tools

* Add get-github-repo cli tool to resolve github repo for golang libraries

* improve get_github_repo message

* Add get_github_license_info.py script and related documentation, it fetches license info from github api

* Add license files

* Add concatenate_license.py and update other CLI tools + documentation

* add license for parse_toml_dep.py
  • Loading branch information
Bobgy authored and k8s-ci-robot committed Oct 25, 2019
1 parent 0b8d2e1 commit 3bda9e8
Showing 15 changed files with 1,902 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -39,6 +39,7 @@ frontend/test/ui/visual-regression/screenshots/screen

*.pyc
.DS_Store
build

.ipynb_checkpoints
*.egg-info
@@ -64,3 +65,4 @@ _artifacts

# Generated Python SDK documentation
docs/_build

100 changes: 100 additions & 0 deletions third_party/argo/license-intermediate-data/dep.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
github.com/argoproj/argo
cloud.google.com/go
contrib.go.opencensus.io/exporter/ocagent
github.com/Azure/go-autorest
github.com/Knetic/govaluate
github.com/PuerkitoBio/purell
github.com/PuerkitoBio/urlesc
github.com/argoproj/pkg
github.com/aws/aws-sdk-go
github.com/beorn7/perks
github.com/census-instrumentation/opencensus-proto
github.com/colinmarc/hdfs
github.com/davecgh/go-spew
github.com/dgrijalva/jwt-go
github.com/docker/spdystream
github.com/dustin/go-humanize
github.com/emicklei/go-restful
github.com/emirpasic/gods
github.com/evanphx/json-patch
github.com/go-openapi/jsonpointer
github.com/go-openapi/jsonreference
github.com/go-openapi/spec
github.com/go-openapi/swag
github.com/go-sql-driver/mysql
github.com/gogo/protobuf
github.com/golang/protobuf
github.com/google/gofuzz
github.com/googleapis/gnostic
github.com/gorilla/websocket
github.com/grpc-ecosystem/grpc-gateway
github.com/hashicorp/go-uuid
github.com/hashicorp/golang-lru
github.com/imdario/mergo
github.com/inconshreveable/mousetrap
github.com/jbenet/go-context
github.com/jcmturner/gofork
github.com/jmespath/go-jmespath
github.com/json-iterator/go
github.com/kevinburke/ssh_config
github.com/konsorten/go-windows-terminal-sequences
github.com/lib/pq
github.com/mailru/easyjson
github.com/matttproud/golang_protobuf_extensions
github.com/minio/minio-go
github.com/mitchellh/go-homedir
github.com/mitchellh/go-ps
github.com/modern-go/concurrent
github.com/modern-go/reflect2
github.com/pkg/errors
github.com/pmezard/go-difflib
github.com/prometheus/client_golang
github.com/prometheus/client_model
github.com/prometheus/common
github.com/prometheus/procfs
github.com/sergi/go-diff
github.com/sirupsen/logrus
github.com/spf13/cobra
github.com/spf13/pflag
github.com/src-d/gcfg
github.com/stretchr/objx
github.com/stretchr/testify
github.com/tidwall/gjson
github.com/tidwall/match
github.com/tidwall/pretty
github.com/valyala/bytebufferpool
github.com/valyala/fasttemplate
github.com/xanzy/ssh-agent
go.opencensus.io
golang.org/x/crypto
golang.org/x/net
golang.org/x/oauth2
golang.org/x/sync
golang.org/x/sys
golang.org/x/text
golang.org/x/time
golang.org/x/tools
google.golang.org/api
google.golang.org/appengine
google.golang.org/genproto
google.golang.org/grpc
gopkg.in/inf.v0
gopkg.in/ini.v1
gopkg.in/jcmturner/aescts.v1
gopkg.in/jcmturner/dnsutils.v1
gopkg.in/jcmturner/gokrb5.v5
gopkg.in/jcmturner/rpc.v0
gopkg.in/src-d/go-billy.v4
gopkg.in/src-d/go-git.v4
gopkg.in/warnings.v0
gopkg.in/yaml.v2
k8s.io/api
k8s.io/apimachinery
k8s.io/client-go
k8s.io/code-generator
k8s.io/gengo
k8s.io/klog
k8s.io/kube-openapi
k8s.io/utils
sigs.k8s.io/yaml
upper.io/db.v3
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
upper.io/db.v3,upper/db
99 changes: 99 additions & 0 deletions third_party/argo/license-intermediate-data/license_info.csv

Large diffs are not rendered by default.

100 changes: 100 additions & 0 deletions third_party/argo/license-intermediate-data/repo.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
argoproj/argo
GoogleCloudPlatform/gcloud-golang
census-ecosystem/opencensus-go-exporter-ocagent
Azure/go-autorest
Knetic/govaluate
PuerkitoBio/purell
PuerkitoBio/urlesc
argoproj/pkg
aws/aws-sdk-go
beorn7/perks
census-instrumentation/opencensus-proto
colinmarc/hdfs
davecgh/go-spew
dgrijalva/jwt-go
docker/spdystream
dustin/go-humanize
emicklei/go-restful
emirpasic/gods
evanphx/json-patch
go-openapi/jsonpointer
go-openapi/jsonreference
go-openapi/spec
go-openapi/swag
go-sql-driver/mysql
gogo/protobuf
golang/protobuf
google/gofuzz
googleapis/gnostic
gorilla/websocket
grpc-ecosystem/grpc-gateway
hashicorp/go-uuid
hashicorp/golang-lru
imdario/mergo
inconshreveable/mousetrap
jbenet/go-context
jcmturner/gofork
jmespath/go-jmespath
json-iterator/go
kevinburke/ssh_config
konsorten/go-windows-terminal-sequences
lib/pq
mailru/easyjson
matttproud/golang_protobuf_extensions
minio/minio-go
mitchellh/go-homedir
mitchellh/go-ps
modern-go/concurrent
modern-go/reflect2
pkg/errors
pmezard/go-difflib
prometheus/client_golang
prometheus/client_model
prometheus/common
prometheus/procfs
sergi/go-diff
sirupsen/logrus
spf13/cobra
spf13/pflag
src-d/gcfg
stretchr/objx
stretchr/testify
tidwall/gjson
tidwall/match
tidwall/pretty
valyala/bytebufferpool
valyala/fasttemplate
xanzy/ssh-agent
census-instrumentation/opencensus-go
golang/crypto
golang/net
golang/oauth2
golang/sync
golang/sys
golang/text
golang/time
golang/tools
google/google-api-go-client
golang/appengine
google/go-genproto
grpc/grpc-go
go-inf/inf
go-ini/ini
jcmturner/aescts
jcmturner/dnsutils
jcmturner/gokrb5
jcmturner/rpc
src-d/go-billy
src-d/go-git
go-warnings/warnings
go-yaml/yaml
kubernetes/api
kubernetes/apimachinery
kubernetes/client-go
kubernetes/code-generator
kubernetes/gengo
kubernetes/klog
kubernetes/kube-openapi
kubernetes/utils
kubernetes-sigs/yaml
upper/db
47 changes: 47 additions & 0 deletions third_party/cli/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# CLI tools to fetch license info

## Why do we need this?

When we release third party images (which can be considered redistributing
third party binaries), we must comply with their licenses: not just each
library's own license, but also the licenses of its dependencies and
transitive dependencies.

We need to do the following to be compliant:
* Put license declarations in the image for all licenses.
* Mirror source code in the image for code with MPL, EPL, GPL or CDDL licenses.

Getting the licenses of all (transitive) dependencies of a go library is not
an easy task. Thus, we need these tools to automate it.

## How to get all dependencies with license and source code?

1. Install CLI tools here: `python setup.py install`
1. Collect dependencies + transitive dependencies in a go library. Put them together in a text file called `dep.txt`. Format: each line has a library name. The library name should be a valid golang import module name.

Example ways to get it:
* argo uses gopkg for package management. It has a [Gopkg.lock file](https://github.com/argoproj/argo/blob/master/Gopkg.lock)
with all of its dependencies and transitive dependencies. The `name` fields in this file are what we need. You can run `parse-toml-dep` to parse it.
* minio uses [official go modules](https://blog.golang.org/using-go-modules), there's a [go.mod file](https://github.com/minio/minio/blob/master/go.mod) describing its direct dependencies. Run command `go list -m all` to get final versions that will be used in a build for all direct and indirect dependencies, [reference](https://github.com/golang/go/wiki/Modules#daily-workflow). Parse its output to make a file we need.

Reminder: don't forget to put the library itself into `dep.txt`.
1. Run `get-github-repo` to resolve the github repos of golang imports. The tool
cannot resolve every import; fewer than 2% of libraries need manual help.

For a library we cannot resolve, manually put it in `dep-repo-mapping.manual.csv`, so the tool knows how to find its github repo the next time.

Defaults to read dependencies from `dep.txt` and writes to `repo.txt`.
1. Run `get-github-license-info` to crawl github license info of these libraries. (Not all repos have a license that github can recognize; fewer than 2% of libraries need manual help.)

By default, it reads repos from `repo.txt` and writes to `license_info.csv`. You
need to configure a github personal access token because it sends a lot of
requests to github. Follow the instructions in `get-github-license-info -h`.

When fetching a repo's license fails, it's usually because the github repo
doesn't have a license file that github can recognize. Check its readme and
put the correct info into `license_info.csv`. (Usually, you can use its README file, which mentions the license.)
1. Edit license info file. Manually check the license file for all repos with a license categorized as "Other" by github. Figure out their true license names.
1. Run `concatenate-license` to crawl full text license files for all dependencies and concat them into one file.

By default, it reads license info from `license_info.csv` and writes to `license.txt`.
Put `license.txt` at `third_party/library/license.txt`, where it is read when building docker images.
1. Manually update the list of dependencies that require source code, and put it into `third_party/library/repo-MPL.txt`.
84 changes: 84 additions & 0 deletions third_party/cli/concatenate_license.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import requests
import sys
import traceback

parser = argparse.ArgumentParser(
description='Generate dependencies json from license.csv file.')
parser.add_argument(
'license_info_file',
nargs='?',
default='license_info.csv',
help=
'CSV file with license info fetched from github using get-github-license-info CLI tool. (default: %(default)s)',
)
parser.add_argument(
'-o',
'--output',
dest='output_file',
nargs='?',
default='license.txt',
help=
'Concatenated license file path this command generates. (default: %(default)s)'
)
args = parser.parse_args()


def fetch_license_text(download_link, timeout=30):
    """Download the license text behind ``download_link``.

    Args:
        download_link: URL the license text is fetched from with HTTP GET.
        timeout: Seconds to wait for the server before giving up; passed to
            ``requests.get``. Without a timeout a single stalled server
            would hang the whole run.

    Returns:
        The response body as text.

    Raises:
        AssertionError: If the server answers with an unsuccessful status.
    """
    response = requests.get(download_link, timeout=timeout)
    # Raise explicitly instead of using an `assert` statement: asserts are
    # stripped under `python -O`, which would silently return error pages as
    # license text. The exception type and message are kept unchanged.
    if not response.ok:
        raise AssertionError('Fetching {} failed with {} {}'.format(
            download_link, response.status_code, response.reason))
    return response.text


def _write_license_entry(output_file, repo, license_name, license_link,
                         license_text):
    """Write one repo's header banner followed by its license text."""
    print(
        '--------------------------------------------------------------------------------',
        file=output_file,
    )
    print('{} {} {}'.format(repo, license_name, license_link),
          file=output_file)
    print(
        '--------------------------------------------------------------------------------',
        file=output_file,
    )
    print(license_text, file=output_file)


def main():
    """Concatenate the license texts of all listed repos into one file.

    Reads ``args.license_info_file`` line by line. Each non-blank line must
    hold four comma separated fields: repo, license link, license name and
    license download link. For every line the license text is downloaded and
    appended to ``args.output_file`` below a banner naming the repo.

    Progress and errors are reported on stderr. A repo whose license cannot
    be processed, or a line that does not have four fields, is recorded and
    listed in a summary at the end instead of aborting the run.
    """
    repo_failed = []
    # Explicit UTF-8: license texts routinely contain non-ASCII characters
    # (e.g. copyright signs or author names), which the platform default
    # encoding may not be able to write.
    with open(args.license_info_file, 'r',
              encoding='utf-8') as license_info_file, \
            open(args.output_file, 'w', encoding='utf-8') as output_file:
        for line in license_info_file:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline left by manual
                # editing) carry no repo; skip them.
                continue
            fields = line.split(',')
            if len(fields) != 4:
                # Report a malformed line like any other failure rather than
                # crashing on unpacking and losing the rest of the run.
                print('[failed] Expected 4 comma separated fields but got {}: {}'
                      .format(len(fields), line),
                      file=sys.stderr)
                repo_failed.append(line)
                continue
            repo, license_link, license_name, license_download_link = fields
            try:
                print('Repo {} has license download link {}'.format(
                    repo, license_download_link),
                      file=sys.stderr)
                # Fetch the full text before writing anything, so a failed
                # download leaves no dangling banner in the output.
                license_text = fetch_license_text(license_download_link)
                _write_license_entry(output_file, repo, license_name,
                                     license_link, license_text)
            except Exception as e:
                # Best effort per repo: log, remember the repo, keep going.
                print('[failed]', e, file=sys.stderr)
                traceback.print_exc(file=sys.stderr)
                repo_failed.append(repo)
        print('Failed to download license file for {} repos.'.format(
            len(repo_failed)),
              file=sys.stderr)
        for repo in repo_failed:
            print(repo, file=sys.stderr)


# Script entry point. There is no `__name__ == '__main__'` guard, so this
# call also runs whenever the module is imported.
main()
Loading

0 comments on commit 3bda9e8

Please sign in to comment.