From 9e947c1c22853ad14b08c152f0bce53b76eb198b Mon Sep 17 00:00:00 2001 From: Spencer Schrock Date: Wed, 9 Nov 2022 15:43:46 -0800 Subject: [PATCH] =?UTF-8?q?=F0=9F=8C=B1=20cron:=20support=20reading=20pref?= =?UTF-8?q?ix=20from=20file=20for=20controller=20input=20files=20(7/n)=20(?= =?UTF-8?q?#2445)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add prefix marker file to config Signed-off-by: Spencer Schrock * Read the new config values, if they exist. Signed-off-by: Spencer Schrock * Add function to fetch prefix file config value. Signed-off-by: Spencer Schrock * Read prefix file if prefix not set. Signed-off-by: Spencer Schrock * Add tests to verify how List works with various prefixes Signed-off-by: Spencer Schrock * Add tests for getPrefix Signed-off-by: Spencer Schrock * Remove panics from iterator helper functions Signed-off-by: Spencer Schrock Signed-off-by: Spencer Schrock --- cron/config/config.go | 37 ++++++-- cron/config/config.yaml | 10 +- cron/config/config_test.go | 61 ++++++++++-- cron/data/blob_test.go | 17 +++- .../testdata/blob_test/subdir/nested/key5.txt | 0 cron/internal/controller/bucket.go | 92 ++++++++++++++++++ cron/internal/controller/bucket_test.go | 95 +++++++++++++++++++ cron/internal/controller/main.go | 55 +++-------- .../controller/testdata/getPrefix/marker | 1 + 9 files changed, 305 insertions(+), 63 deletions(-) create mode 100644 cron/data/testdata/blob_test/subdir/nested/key5.txt create mode 100644 cron/internal/controller/bucket.go create mode 100644 cron/internal/controller/bucket_test.go create mode 100644 cron/internal/controller/testdata/getPrefix/marker diff --git a/cron/config/config.go b/cron/config/config.go index 34a501338bf..cbc47293313 100644 --- a/cron/config/config.go +++ b/cron/config/config.go @@ -37,9 +37,10 @@ const ( // TransferStatusFilename file identifies if shard transfer to BigQuery is completed. TransferStatusFilename string = ".transfer_complete" - configFlag string = "config" - configDefault string = "" - configUsage string = "Location of config file. Required" + configFlag string = "config" + configDefault string = "" + configUsage string = "Location of config file. Required" + inputBucketParams string = "input-bucket" projectID string = "SCORECARD_PROJECT_ID" requestTopicURL string = "SCORECARD_REQUEST_TOPIC_URL" @@ -300,16 +301,36 @@ func GetAPIResultsBucketURL() (string, error) { // GetInputBucketURL() returns the bucket URL for input files. func GetInputBucketURL() (string, error) { - return getStringConfigValue(inputBucketURL, configYAML, "InputBucketURL", "input-bucket-url") + bucketParams, err := GetAdditionalParams(inputBucketParams) + bURL, ok := bucketParams["url"] + if err != nil || !ok { + // TODO temporarily falling back to old variables until changes propagate to production + return getStringConfigValue(inputBucketURL, configYAML, "InputBucketURL", "input-bucket-url") + } + return bURL, nil } // GetInputBucketPrefix() returns the prefix used when fetching files from a bucket. func GetInputBucketPrefix() (string, error) { - prefix, err := getStringConfigValue(inputBucketPrefix, configYAML, "InputBucketPrefix", "input-bucket-prefix") - if err != nil && !errors.Is(err, ErrorEmptyConfigValue) { - return "", err + bucketParams, err := GetAdditionalParams(inputBucketParams) + if err != nil { + // TODO temporarily falling back to old variables until changes propagate to production + prefix, err := getStringConfigValue(inputBucketPrefix, configYAML, "InputBucketPrefix", "input-bucket-prefix") + if err != nil && !errors.Is(err, ErrorEmptyConfigValue) { + return "", err + } + return prefix, nil + } + return bucketParams["prefix"], nil +} + +// GetInputBucketPrefixFile() returns the file whose contents specify the prefix to use. +func GetInputBucketPrefixFile() (string, error) { + bucketParams, err := GetAdditionalParams(inputBucketParams) + if err != nil { + return "", fmt.Errorf("getting config for %s: %w", inputBucketParams, err) } - return prefix, nil + return bucketParams["prefix-file"], nil } func GetAdditionalParams(subMapName string) (map[string]string, error) { diff --git a/cron/config/config.yaml b/cron/config/config.yaml index d0846c1d1ad..e9d387e8da4 100644 --- a/cron/config/config.yaml +++ b/cron/config/config.yaml @@ -22,12 +22,20 @@ shard-size: 10 webhook-url: metric-exporter: stackdriver result-data-bucket-url: gs://ossf-scorecard-data2 + +# TODO temporarily leaving old variables until changes propagate to production input-bucket-url: gs://ossf-scorecard-input-projects # Can be used to specify directories within a bucket. Can be empty. input-bucket-prefix: additional-params: - criticality: + input-bucket: + url: gs://ossf-scorecard-input-projects + # Optional prefix to limit files used as input files within a bucket (e.g. a specific file or directory) + prefix: + # Optional file to read a prefix from, instead of statically defining prefix above (note: prefix must be blank to use this option) + # This is good in situations where the prefix changes frequently (e.g. always using the most recent folder in a bucket) + prefix-file: scorecard: # API results bucket diff --git a/cron/config/config_test.go b/cron/config/config_test.go index 96260c5e804..3f9851d470b 100644 --- a/cron/config/config_test.go +++ b/cron/config/config_test.go @@ -38,14 +38,20 @@ const ( prodShardSize int = 10 prodMetricExporter string = "stackdriver" // Raw results. - prodRawBucket = "gs://ossf-scorecard-rawdata" - prodRawBigQueryTable = "scorecard-rawdata" - prodAPIBucketURL = "gs://ossf-scorecard-cron-results" - prodInputBucketURL = "gs://ossf-scorecard-input-projects" - prodInputBucketPrefix = "" + prodRawBucket = "gs://ossf-scorecard-rawdata" + prodRawBigQueryTable = "scorecard-rawdata" + prodAPIBucketURL = "gs://ossf-scorecard-cron-results" + prodInputBucketURL = "gs://ossf-scorecard-input-projects" + prodInputBucketPrefix = "" + prodInputBucketPrefixFile = "" ) var ( + prodInputBucketParams = map[string]string{ + "url": prodInputBucketURL, + "prefix": prodInputBucketPrefix, + "prefix-file": prodInputBucketPrefixFile, + } prodScorecardParams = map[string]string{ "api-results-bucket-url": prodAPIBucketURL, "blacklisted-checks": prodBlacklistedChecks, @@ -53,10 +59,9 @@ var ( "raw-bigquery-table": prodRawBigQueryTable, "raw-result-data-bucket-url": prodRawBucket, } - prodCriticalityParams map[string]string = nil - prodAdditionalParams = map[string]map[string]string{ - "scorecard": prodScorecardParams, - "criticality": prodCriticalityParams, + prodAdditionalParams = map[string]map[string]string{ + "input-bucket": prodInputBucketParams, + "scorecard": prodScorecardParams, } ) @@ -474,3 +479,41 @@ func TestEnvVarName(t *testing.T) { }) } } + +func TestGetAdditionalParams(t *testing.T) { + t.Parallel() + //nolint:govet + tests := []struct { + name string + mapName string + want map[string]string + wantErr bool + }{ + { + name: "scorecard values", + mapName: "scorecard", + want: prodScorecardParams, + wantErr: false, + }, + { + name: "nonexistant value", + mapName: "this-value-should-never-exist", + want: map[string]string{}, + wantErr: true, + }, + } + for _, testcase := range tests { + testcase := testcase + t.Run(testcase.name, func(t *testing.T) { + t.Parallel() + got, err := GetAdditionalParams(testcase.mapName) + if testcase.wantErr != (err != nil) { + t.Fatalf("unexpected error value for GetAdditionalParams: %v", err) + } + if !cmp.Equal(got, testcase.want) { + diff := cmp.Diff(got, testcase.want) + t.Errorf("test failed: expected - %v, got - %v. \n%s", testcase.want, got, diff) + } + }) + } +} diff --git a/cron/data/blob_test.go b/cron/data/blob_test.go index 205c7b47ffb..2c2121a5c50 100644 --- a/cron/data/blob_test.go +++ b/cron/data/blob_test.go @@ -128,11 +128,26 @@ func TestBlobKeysPrefix(t *testing.T) { { name: "no prefix", prefix: "", - want: []string{"key1.txt", "key2.txt", "key3.txt", "subdir/key4.txt"}, + want: []string{"key1.txt", "key2.txt", "key3.txt", "subdir/key4.txt", "subdir/nested/key5.txt"}, }, { name: "subdir prefix", prefix: "subdir/", + want: []string{"subdir/key4.txt", "subdir/nested/key5.txt"}, + }, + { + name: "subdir prefix no terminal slash", + prefix: "subdir", + want: []string{"subdir/key4.txt", "subdir/nested/key5.txt"}, + }, + { + name: "nested prefix", + prefix: "subdir/nested/", + want: []string{"subdir/nested/key5.txt"}, + }, + { + name: "file prefix", + prefix: "subdir/key4.txt", want: []string{"subdir/key4.txt"}, }, } diff --git a/cron/data/testdata/blob_test/subdir/nested/key5.txt b/cron/data/testdata/blob_test/subdir/nested/key5.txt new file mode 100644 index 00000000000..e69de29bb2d diff --git a/cron/internal/controller/bucket.go b/cron/internal/controller/bucket.go new file mode 100644 index 00000000000..0cf7bbfdf80 --- /dev/null +++ b/cron/internal/controller/bucket.go @@ -0,0 +1,92 @@ +// Copyright 2022 Security Scorecard Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package main implements the PubSub controller. +package main + +import ( + "bytes" + "context" + "fmt" + "strings" + + "github.com/ossf/scorecard/v4/cron/config" + "github.com/ossf/scorecard/v4/cron/data" +) + +// getPrefix returns the prefix used when reading input files from a bucket. +// If "prefix" is set, the value is used irrespective of the value of "prefix-file". +// Otherwise, the contents of "prefix-file" (if set) are used. +func getPrefix(ctx context.Context, bucket string) (string, error) { + prefix, err := config.GetInputBucketPrefix() + if err != nil { + return "", fmt.Errorf("config.GetInputBucketPrefix: %w", err) + } + if prefix != "" { + // prioritize prefix if set + return prefix, nil + } + + prefixFile, err := config.GetInputBucketPrefixFile() + if err != nil { + return "", fmt.Errorf("config.GetInputBucketPrefixFile: %w", err) + } + if prefixFile == "" { + // cant read a file which doesnt exist, but the value is optional so no error + return "", nil + } + + b, err := data.GetBlobContent(ctx, bucket, prefixFile) + if err != nil { + return "", fmt.Errorf("fetching contents of prefix-file: %w", err) + } + s := string(b) + return strings.TrimSpace(s), nil +} + +func bucketFiles(ctx context.Context) (data.Iterator, error) { + var iters []data.Iterator + + bucket, err := config.GetInputBucketURL() + if err != nil { + return nil, fmt.Errorf("config.GetInputBucketURL: %w", err) + } + prefix, err := getPrefix(ctx, bucket) + if err != nil { + return nil, fmt.Errorf("getPrefix: %w", err) + } + + files, err := data.GetBlobKeysWithPrefix(ctx, bucket, prefix) + if err != nil { + return nil, fmt.Errorf("data.GetBlobKeysWithPrefix: %w", err) + } + + for _, f := range files { + b, err := data.GetBlobContent(ctx, bucket, f) + if err != nil { + return nil, fmt.Errorf("data.GetBlobContent: %w", err) + } + r := bytes.NewReader(b) + i, err := data.MakeIteratorFrom(r) + if err != nil { + return nil, fmt.Errorf("data.MakeIteratorFrom: %w", err) + } + iters = append(iters, i) + } + iter, err := data.MakeNestedIterator(iters) + if err != nil { + return nil, fmt.Errorf("data.MakeNestedIterator: %w", err) + } + return iter, nil +} diff --git a/cron/internal/controller/bucket_test.go b/cron/internal/controller/bucket_test.go new file mode 100644 index 00000000000..38e78b419e9 --- /dev/null +++ b/cron/internal/controller/bucket_test.go @@ -0,0 +1,95 @@ +// Copyright 2022 Security Scorecard Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package main + +import ( + "context" + "path/filepath" + "testing" +) + +//nolint:tparallel,paralleltest // since t.Setenv is used +func TestGetPrefix(t *testing.T) { + //nolint:govet + testcases := []struct { + name string + url string + prefix string + prefixFile string + want string + wantErr bool + }{ + { + name: "no prefix", + url: "testdata/getPrefix", + prefix: "", + prefixFile: "", + want: "", + wantErr: false, + }, + { + name: "prefix set", + url: "testdata/getPrefix", + prefix: "foo", + prefixFile: "", + want: "foo", + wantErr: false, + }, + { + name: "prefix file set", + url: "testdata/getPrefix", + prefix: "", + prefixFile: "marker", + want: "bar", + wantErr: false, + }, + { + name: "prefix and prefix file set", + url: "testdata/getPrefix", + prefix: "foo", + prefixFile: "marker", + want: "foo", + wantErr: false, + }, + { + name: "non existent prefix file", + url: "testdata/getPrefix", + prefix: "", + prefixFile: "baz", + want: "", + wantErr: true, + }, + } + + for _, testcase := range testcases { + testcase := testcase + t.Run(testcase.name, func(t *testing.T) { + t.Setenv("INPUT_BUCKET_URL", testcase.url) + t.Setenv("INPUT_BUCKET_PREFIX", testcase.prefix) + t.Setenv("INPUT_BUCKET_PREFIX_FILE", testcase.prefixFile) + testdataPath, err := filepath.Abs(testcase.url) + if err != nil { + t.Errorf("unexpected error: %v", err) + } + got, err := getPrefix(context.Background(), "file:///"+testdataPath) + if (err != nil) != testcase.wantErr { + t.Errorf("unexpected error produced: %v", err) + } + if got != testcase.want { + t.Errorf("test failed: expected - %s, got = %s", testcase.want, got) + } + }) + } +} diff --git a/cron/internal/controller/main.go b/cron/internal/controller/main.go index da37de62c8e..628a25894cf 100644 --- a/cron/internal/controller/main.go +++ b/cron/internal/controller/main.go @@ -16,7 +16,6 @@ package main import ( - "bytes" "context" "flag" "fmt" @@ -84,60 +83,24 @@ func publishToRepoRequestTopic(iter data.Iterator, topicPublisher pubsub.Publish return shardNum, nil } -func localFiles(filenames []string) data.Iterator { +func localFiles(filenames []string) (data.Iterator, error) { var iters []data.Iterator for _, filename := range filenames { f, err := os.Open(filename) if err != nil { - panic(err) + return nil, fmt.Errorf("unable to open input file: %w", err) } i, err := data.MakeIteratorFrom(f) if err != nil { - panic(err) + return nil, fmt.Errorf("data.MakeIteratorFrom: %w", err) } iters = append(iters, i) } iter, err := data.MakeNestedIterator(iters) if err != nil { - panic(err) - } - return iter -} - -func bucketFiles(ctx context.Context) data.Iterator { - var iters []data.Iterator - - bucket, err := config.GetInputBucketURL() - if err != nil { - panic(err) - } - prefix, err := config.GetInputBucketPrefix() - if err != nil { - panic(err) - } - - files, err := data.GetBlobKeysWithPrefix(ctx, bucket, prefix) - if err != nil { - panic(err) + return nil, fmt.Errorf("data.MakeNestedIterator: %w", err) } - - for _, f := range files { - b, err := data.GetBlobContent(ctx, bucket, f) - if err != nil { - panic(err) - } - r := bytes.NewReader(b) - i, err := data.MakeIteratorFrom(r) - if err != nil { - panic(err) - } - iters = append(iters, i) - } - iter, err := data.MakeNestedIterator(iters) - if err != nil { - panic(err) - } - return iter + return iter, nil } func main() { @@ -175,10 +138,14 @@ func main() { var reader data.Iterator if useLocalFiles := len(flag.Args()) > 0; useLocalFiles { - reader = localFiles(flag.Args()) + reader, err = localFiles(flag.Args()) } else { - reader = bucketFiles(ctx) + reader, err = bucketFiles(ctx) } + if err != nil { + panic(err) + } + shardNum, err := publishToRepoRequestTopic(reader, topicPublisher, shardSize, t) if err != nil { panic(err) diff --git a/cron/internal/controller/testdata/getPrefix/marker b/cron/internal/controller/testdata/getPrefix/marker new file mode 100644 index 00000000000..5716ca5987c --- /dev/null +++ b/cron/internal/controller/testdata/getPrefix/marker @@ -0,0 +1 @@ +bar