Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update registry cache to (ab)use Data field of Descriptor objects #34

Merged
merged 5 commits into from
Mar 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
423 changes: 359 additions & 64 deletions .test/lookup-test.json

Large diffs are not rendered by default.

44 changes: 43 additions & 1 deletion .test/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,50 @@ lookup=(
# tianon/test:index-no-platform-smaller - a "broken" index with *zero* platform objects in it (so every manifest requires a platform lookup)
'tianon/test@sha256:347290ddd775c1b85a3e381b09edde95242478eb65153e9b17225356f4c072ac'
# (doing these in the same run means the manifest from above should be cached and exercise more codepaths for better coverage)

--type manifest 'tianon/test@sha256:347290ddd775c1b85a3e381b09edde95242478eb65153e9b17225356f4c072ac' # same manifest again, but without SynthesizeIndex
--type blob 'tianon/test@sha256:d2c94e258dcb3c5ac2798d32e1249e42ef01cba4841c2234249495f87264ac5a' # first config blob from the above
# and again, but this time HEADs
--head --type manifest 'tianon/test@sha256:347290ddd775c1b85a3e381b09edde95242478eb65153e9b17225356f4c072ac'
--head --type blob 'tianon/test@sha256:d2c94e258dcb3c5ac2798d32e1249e42ef01cba4841c2234249495f87264ac5a'

# again with things that aren't cached yet (tianon/true:oci, specifically)
--head --type blob 'tianon/true@sha256:25be82253336f0b8c4347bc4ecbbcdc85d0e0f118ccf8dc2e119c0a47a0a486e' # config blob
--head --type manifest 'tianon/true:oci@sha256:9ef42f1d602fb423fad935aac1caa0cfdbce1ad7edce64d080a4eb7b13f7cd9d'
--type blob 'tianon/true@sha256:25be82253336f0b8c4347bc4ecbbcdc85d0e0f118ccf8dc2e119c0a47a0a486e' # config blob
--type manifest 'tianon/true:oci@sha256:9ef42f1d602fb423fad935aac1caa0cfdbce1ad7edce64d080a4eb7b13f7cd9d'
'tianon/true:oci@sha256:9ef42f1d602fb423fad935aac1caa0cfdbce1ad7edce64d080a4eb7b13f7cd9d'

# tag lookup! (but with a hopefully stable example tag -- a build of notary:server)
--head 'oisupport/staging-amd64:71756dd75e41c4bc5144b64d36b4834a5a960c495470915eb69f96e9f2cb6694'
--head 'oisupport/staging-amd64:71756dd75e41c4bc5144b64d36b4834a5a960c495470915eb69f96e9f2cb6694' # twice, to exercise "tag is cached" case
--type manifest 'oisupport/staging-amd64:71756dd75e41c4bc5144b64d36b4834a5a960c495470915eb69f96e9f2cb6694'
'oisupport/staging-amd64:71756dd75e41c4bc5144b64d36b4834a5a960c495470915eb69f96e9f2cb6694'

# exercise 404 codepaths
"tianon/this-is-a-repository-that-will-never-ever-exist-$RANDOM-$RANDOM:$RANDOM-$RANDOM"
--head "tianon/this-is-a-repository-that-will-never-ever-exist-$RANDOM-$RANDOM:$RANDOM-$RANDOM"
'tianon/test@sha256:0000000000000000000000000000000000000000000000000000000000000000'
)
"$dir/../bin/lookup" "${lookup[@]}" | jq -s > "$dir/lookup-test.json"
"$dir/../bin/lookup" "${lookup[@]}" | jq -s '
[
reduce (
$ARGS.positional[]
| if startswith("tianon/this-is-a-repository-that-will-never-ever-exist-") then
gsub("[0-9]+"; "$RANDOM")
else . end
) as $a ([];
if .[-1][-1] == "--type" then
.[-1][-1] += " " + $a
elif length > 0 and (.[-1][-1] | startswith("--")) then
.[-1] += [$a]
else
. += [[$a]]
end
),
.
] | transpose
' --args -- "${lookup[@]}" > "$dir/lookup-test.json"

# don't leave around the "-cover" versions of these binaries
rm -f "$dir/../bin/builds" "$dir/../bin/lookup"
Expand Down
69 changes: 64 additions & 5 deletions cmd/lookup/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ package main
import (
"context"
"encoding/json"
"io"
"os"
"os/signal"

Expand All @@ -15,21 +16,79 @@ func main() {
ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt)
defer stop()

for _, img := range os.Args[1:] {
var (
zeroOpts registry.LookupOptions
opts = zeroOpts
)

args := os.Args[1:]
for len(args) > 0 {
img := args[0]
args = args[1:]
switch img {
case "--type":
opts.Type = registry.LookupType(args[0])
args = args[1:]
continue
case "--head":
opts.Head = true
continue
}

ref, err := registry.ParseRef(img)
if err != nil {
panic(err)
}

index, err := registry.SynthesizeIndex(ctx, ref)
if err != nil {
panic(err)
var obj any
if opts == zeroOpts {
// if we have no explicit type and didn't request a HEAD, invoke SynthesizeIndex instead of Lookup
obj, err = registry.SynthesizeIndex(ctx, ref)
if err != nil {
panic(err)
}
} else {
r, err := registry.Lookup(ctx, ref, &opts)
if err != nil {
panic(err)
}
if r != nil {
desc := r.Descriptor()
if opts.Head {
obj = desc
} else {
b, err := io.ReadAll(r)
if err != nil {
r.Close()
panic(err)
}
if opts.Type == registry.LookupTypeManifest {
// if it was a manifest lookup, cast the byte slice to json.RawMessage so we get the actual JSON (not base64)
obj = json.RawMessage(b)
} else {
obj = b
}
}
err = r.Close()
if err != nil {
panic(err)
}
} else {
obj = nil
}
}

e := json.NewEncoder(os.Stdout)
e.SetIndent("", "\t")
if err := e.Encode(index); err != nil {
if err := e.Encode(obj); err != nil {
panic(err)
}

// reset state
opts = zeroOpts
}

if opts != zeroOpts {
panic("dangling --type, --head, etc (without a following reference for it to apply to)")
}
}
4 changes: 2 additions & 2 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -20,5 +20,5 @@ require (
google.golang.org/protobuf v1.28.1 // indirect
)

// https://github.com/cue-labs/oci/pull/27
replace cuelabs.dev/go/oci/ociregistry => github.com/tianon/cuelabs-oci/ociregistry v0.0.0-20240216044210-8aa0c990bd77
// https://github.com/cue-labs/oci/pull/29
replace cuelabs.dev/go/oci/ociregistry => github.com/tianon/cuelabs-oci/ociregistry v0.0.0-20240322151419-7d3242933116
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,8 @@ github.com/sirupsen/logrus v1.9.0/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVs
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY=
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/tianon/cuelabs-oci/ociregistry v0.0.0-20240216044210-8aa0c990bd77 h1:9EPZm+sGlYHo6LleMXWR6s3P8SJEYA7/aovpJ76JSpw=
github.com/tianon/cuelabs-oci/ociregistry v0.0.0-20240216044210-8aa0c990bd77/go.mod h1:ApHceQLLwcOkCEXM1+DyCXTHEJhNGDpJ2kmV6axsx24=
github.com/tianon/cuelabs-oci/ociregistry v0.0.0-20240322151419-7d3242933116 h1:ZDy4uRAhzODJXRo4EoNpJTCiSeOs8wwrkfMJy3JyDps=
github.com/tianon/cuelabs-oci/ociregistry v0.0.0-20240322151419-7d3242933116/go.mod h1:pK23AUVXuNzzTpfMCA06sxZGeVQ/75FdVtW249de9Uo=
golang.org/x/net v0.7.0 h1:rJrUqqhjsgNp7KqAIc25s9pZnjU7TUcSY7HcVZjdn1g=
golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
Expand Down
138 changes: 99 additions & 39 deletions registry/cache.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,7 @@ func RegistryCache(r ociregistry.Interface) ociregistry.Interface {
registry: r, // TODO support "nil" here so this can be a poor-man's ocimem implementation? 👀 see also https://github.com/cue-labs/oci/issues/24
has: map[string]bool{},
tags: map[string]ociregistry.Digest{},
types: map[ociregistry.Digest]string{},
data: map[ociregistry.Digest][]byte{},
data: map[ociregistry.Digest]ociregistry.Descriptor{},
}
}

Expand All @@ -32,11 +31,10 @@ type registryCache struct {
registry ociregistry.Interface

// https://github.com/cue-labs/oci/issues/24
mu sync.Mutex // TODO some kind of per-object/name/digest mutex so we don't request the same object from the upstream registry concurrently (on *top* of our maps mutex)?
has map[string]bool // "repo/name@digest" => true (whether a given repo has the given digest)
tags map[string]ociregistry.Digest // "repo/name:tag" => digest
types map[ociregistry.Digest]string // digest => "mediaType" (most recent *storing* / "cache-miss" lookup wins, in the case of upstream/cross-repo ambiguity)
data map[ociregistry.Digest][]byte // digest => data
mu sync.Mutex // TODO some kind of per-object/name/digest mutex so we don't request the same object from the upstream registry concurrently (on *top* of our maps mutex)?
has map[string]bool // "repo/name@digest" => true (whether a given repo has the given digest)
tags map[string]ociregistry.Digest // "repo/name:tag" => digest
data map[ociregistry.Digest]ociregistry.Descriptor // digest => mediaType+size(+data) (most recent *storing* / "cache-miss" lookup wins, in the case of upstream/cross-repo ambiguity)
}

func cacheKeyDigest(repo string, digest ociregistry.Digest) string {
Expand All @@ -52,41 +50,38 @@ func (rc *registryCache) getBlob(ctx context.Context, repo string, digest ocireg
rc.mu.Lock()
defer rc.mu.Unlock()

if b, ok := rc.data[digest]; ok && rc.has[cacheKeyDigest(repo, digest)] {
return ocimem.NewBytesReader(b, ociregistry.Descriptor{
MediaType: rc.types[digest],
Digest: digest,
Size: int64(len(b)),
}), nil
if desc, ok := rc.data[digest]; ok && desc.Data != nil && rc.has[cacheKeyDigest(repo, digest)] {
return ocimem.NewBytesReader(desc.Data, desc), nil
}

r, err := f(ctx, repo, digest)
if err != nil {
return nil, err
}
//defer r.Close()
// defer r.Close() happens later when we know we aren't making Close the caller's responsibility

desc := r.Descriptor()
digest = desc.Digest // if this isn't a no-op, we've got a naughty registry

rc.has[cacheKeyDigest(repo, desc.Digest)] = true
rc.types[desc.Digest] = desc.MediaType
rc.has[cacheKeyDigest(repo, digest)] = true

if desc.Size > manifestSizeLimit {
rc.data[digest] = desc
return r, nil
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So if it is too big, we do not cache it?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Correct -- that's in the previous implementation too, but handling it earlier turns out to be better because we can then return the original reader earlier instead of buffering it.

Also, "too big" here is currently 4MiB, so it's a pretty generous amount of things that will be cached in memory anyhow (especially since we're looking at mostly just indexes, image manifests, and config blobs).

}
defer r.Close()

b, err := io.ReadAll(r)
desc.Data, err = io.ReadAll(r)
if err != nil {
r.Close()
return nil, err
}
if err := r.Close(); err != nil {
return nil, err
}

if len(b) <= manifestSizeLimit {
rc.data[desc.Digest] = b
} else {
delete(rc.data, desc.Digest)
}
rc.data[digest] = desc

return ocimem.NewBytesReader(b, desc), nil
return ocimem.NewBytesReader(desc.Data, desc), nil
}

func (rc *registryCache) GetBlob(ctx context.Context, repo string, digest ociregistry.Digest) (ociregistry.BlobReader, error) {
Expand All @@ -104,43 +99,108 @@ func (rc *registryCache) GetTag(ctx context.Context, repo string, tag string) (o
tagKey := cacheKeyTag(repo, tag)

if digest, ok := rc.tags[tagKey]; ok {
if b, ok := rc.data[digest]; ok {
return ocimem.NewBytesReader(b, ociregistry.Descriptor{
MediaType: rc.types[digest],
Digest: digest,
Size: int64(len(b)),
}), nil
if desc, ok := rc.data[digest]; ok && desc.Data != nil {
return ocimem.NewBytesReader(desc.Data, desc), nil
}
}

r, err := rc.registry.GetTag(ctx, repo, tag)
if err != nil {
return nil, err
}
//defer r.Close()
// defer r.Close() happens later when we know we aren't making Close the caller's responsibility

desc := r.Descriptor()

rc.has[cacheKeyDigest(repo, desc.Digest)] = true
rc.tags[tagKey] = desc.Digest
rc.types[desc.Digest] = desc.MediaType

b, err := io.ReadAll(r)
if desc.Size > manifestSizeLimit {
rc.data[desc.Digest] = desc
return r, nil
}
defer r.Close()

desc.Data, err = io.ReadAll(r)
if err != nil {
r.Close()
return nil, err
}
if err := r.Close(); err != nil {
return nil, err
}

if len(b) <= manifestSizeLimit {
rc.data[desc.Digest] = b
} else {
delete(rc.data, desc.Digest)
rc.data[desc.Digest] = desc

return ocimem.NewBytesReader(desc.Data, desc), nil
}

func (rc *registryCache) resolveBlob(ctx context.Context, repo string, digest ociregistry.Digest, f func(ctx context.Context, repo string, digest ociregistry.Digest) (ociregistry.Descriptor, error)) (ociregistry.Descriptor, error) {
rc.mu.Lock()
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we have to lock on reads?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unfortunately it's unsafe for two goroutines to read a map otherwise (and because if this does end up querying the remote registry, it needs to write also). The "data is cached" implementation should be really fast, though, so there shouldn't be much lock contention here (the contention is going to come from actual "load-bearing" concurrent requests 😞).

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Seeing your answer, I think I have asked that before. Sorry for the noise.

defer rc.mu.Unlock()

if desc, ok := rc.data[digest]; ok && rc.has[cacheKeyDigest(repo, digest)] {
return desc, nil
}

desc, err := f(ctx, repo, digest)
if err != nil {
return desc, err
}

digest = desc.Digest // if this isn't a no-op, we've got a naughty registry

rc.has[cacheKeyDigest(repo, digest)] = true

// carefully copy only valid Resolve* fields such that any other existing fields are kept (this matters more if we ever make our mutexes better/less aggressive 👀)
if d, ok := rc.data[digest]; ok {
d.MediaType = desc.MediaType
d.Digest = desc.Digest
d.Size = desc.Size
desc = d
}
rc.data[digest] = desc

return desc, nil
}

func (rc *registryCache) ResolveManifest(ctx context.Context, repo string, digest ociregistry.Digest) (ociregistry.Descriptor, error) {
return rc.resolveBlob(ctx, repo, digest, rc.registry.ResolveManifest)
}

func (rc *registryCache) ResolveBlob(ctx context.Context, repo string, digest ociregistry.Digest) (ociregistry.Descriptor, error) {
return rc.resolveBlob(ctx, repo, digest, rc.registry.ResolveBlob)
}

func (rc *registryCache) ResolveTag(ctx context.Context, repo string, tag string) (ociregistry.Descriptor, error) {
rc.mu.Lock()
defer rc.mu.Unlock()

tagKey := cacheKeyTag(repo, tag)

if digest, ok := rc.tags[tagKey]; ok {
if desc, ok := rc.data[digest]; ok {
return desc, nil
}
}

desc, err := rc.registry.ResolveTag(ctx, repo, tag)
if err != nil {
return desc, err
}

rc.has[cacheKeyDigest(repo, desc.Digest)] = true
rc.tags[tagKey] = desc.Digest

// carefully copy only valid Resolve* fields such that any other existing fields are kept (this matters more if we ever make our mutexes better/less aggressive 👀)
if d, ok := rc.data[desc.Digest]; ok {
d.MediaType = desc.MediaType
d.Digest = desc.Digest
d.Size = desc.Size
desc = d
}
rc.data[desc.Digest] = desc

return ocimem.NewBytesReader(b, desc), nil
return desc, nil
}

// TODO more methods (currently only implements what's actually necessary for SynthesizeIndex)
Loading