Skip to content

Commit

Permalink
Expose resource descriptors from manifests
Browse files Browse the repository at this point in the history
We aim for this to be similar to in-toto's `ResourceDescriptor`. To support cases where in-toto cannot be directly used, we make this a dataclass that can be mapped to in-toto when needed, and used on its own otherwise.

Not all fields from in-toto are specified at this moment. All fields here must be present, unlike in-toto, where all are optional.

See https://github.com/in-toto/attestation/blob/main/spec/v1/resource_descriptor.md for the in-toto specification.

This is the first separable PR for the signing support (see full draft on sigstore#253)

Signed-off-by: Mihai Maruseac <mihaimaruseac@google.com>
  • Loading branch information
mihaimaruseac committed Jul 24, 2024
1 parent 9798149 commit 3344e2e
Show file tree
Hide file tree
Showing 8 changed files with 278 additions and 6 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/lint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ jobs:
pip install -r model_signing/install/requirements_test_Linux.txt
pip install -r model_signing/install/requirements_dev_Linux.txt
# TODO: https://github.com/sigstore/model-transparency/issues/231 - Support all repo
pytype --keep-going model_signing/{hashing,manifest,serialization}
pytype --keep-going model_signing/{hashing,manifest,serialization,signing}
pylint-lint:
runs-on: ubuntu-latest
Expand All @@ -85,4 +85,4 @@ jobs:
pip install -r model_signing/install/requirements_dev_Linux.txt
# TODO: https://github.com/sigstore/model-transparency/issues/231 - Support all repo
# We should actually migrate to ruff, but that's configured via pyproject.toml which we use when we release the wheel
pylint --disable C0114,C0115,C0116,R0801,R0903,R0904,R0913,R0914,R1721,R1737,W0107,W0212,W0223,W0231,W0511,W0621 model_signing/{hashing,manifest,serialization}
pylint --disable C0114,C0115,C0116,R0801,R0903,R0904,R0913,R0914,R1721,R1737,W0107,W0212,W0223,W0231,W0511,W0621 model_signing/{hashing,manifest,serialization,signing}
64 changes: 62 additions & 2 deletions model_signing/manifest/manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,15 +55,44 @@
from collections.abc import Iterable
import dataclasses
import pathlib
from typing import Self
from typing import Iterator, Self
from typing_extensions import override

from model_signing.hashing import hashing


@dataclasses.dataclass(frozen=True)
class ResourceDescriptor:
    """Describes a single piece of content recorded in a `Manifest`.

    This is modelled after in-toto's `ResourceDescriptor`. It is a plain
    dataclass so that it can be used on its own where in-toto cannot be used
    directly, and mapped to the in-toto type where it can.

    Only a subset of the in-toto fields is specified at this moment. Unlike
    in-toto, where every field is optional, all fields here must be present.

    See github.com/in-toto/attestation/blob/main/spec/v1/resource_descriptor.md
    for the in-toto specification.

    Attributes:
        identifier: A string that uniquely identifies this
          `ResourceDescriptor`. Corresponds to `name`, `uri`, or `content` in
          the in-toto specification.
        digest: The digest of the item. Unlike in-toto, there is exactly one
          digest per item and it is always required.
    """

    identifier: str
    digest: hashing.Digest


class Manifest(metaclass=abc.ABCMeta):
    """Abstract base for every manifest used to represent a model."""

    @abc.abstractmethod
    def resource_descriptors(self) -> Iterator[ResourceDescriptor]:
        """Yields every resource recorded in the manifest, one at a time."""


@dataclasses.dataclass(frozen=True)
Expand All @@ -72,6 +101,17 @@ class DigestManifest(Manifest):

digest: hashing.Digest

@override
def resource_descriptors(self) -> Iterator[ResourceDescriptor]:
    """Yields each resource from the manifest, one by one.

    In this case, we have only one descriptor to return. Since model paths
    are already encoded in the digest, use "." for the identifier.
    Subclasses might record additional fields to have distinguishable human
    readable identifiers.
    """
    yield ResourceDescriptor(identifier=".", digest=self.digest)


class ItemizedManifest(Manifest):
"""A detailed manifest, recording integrity of every model component."""
Expand Down Expand Up @@ -130,6 +170,15 @@ def __init__(self, items: Iterable[FileManifestItem]):
def __eq__(self, other: Self):
    """Compares two manifests by their item-to-digest mapping.

    Args:
        other: The object to compare against.

    Returns:
        `NotImplemented` when `other` is not an instance of this manifest's
        type, so that Python can fall back to the reflected comparison (and
        ultimately to `False`) instead of failing with `AttributeError` on
        unrelated objects. Otherwise, whether both manifests record the same
        mapping from items to digests.
    """
    if not isinstance(other, type(self)):
        return NotImplemented
    return self._item_to_digest == other._item_to_digest

@override
def resource_descriptors(self) -> Iterator[ResourceDescriptor]:
    """Yields one descriptor for every item recorded in the manifest.

    Descriptors are produced in sorted order of the recorded items (the
    alphabetical order of the path). The identifier of each descriptor is
    the string form of the item.
    """
    ordered_entries = sorted(self._item_to_digest.items())
    yield from (
        ResourceDescriptor(identifier=str(key), digest=value)
        for key, value in ordered_entries
    )


@dataclasses.dataclass(frozen=True, order=True)
class Shard:
Expand Down Expand Up @@ -200,3 +249,14 @@ def __init__(self, items: Iterable[ShardedFileManifestItem]):
efficient updates and retrieval of digests.
"""
self._item_to_digest = {item.input_tuple: item.digest for item in items}

@override
def resource_descriptors(self) -> Iterator[ResourceDescriptor]:
    """Yields one descriptor for every shard recorded in the manifest.

    Descriptors are produced in the sorted order of the keys of this
    manifest, i.e., the `input_tuple` property of the
    `ShardedFileManifestItem` objects used to create this instance (the
    triple of file name and shard endpoints). The identifier of each
    descriptor is the string form of that key.
    """
    entries = list(self._item_to_digest.items())
    entries.sort()
    for shard_key, shard_digest in entries:
        yield ResourceDescriptor(
            identifier=str(shard_key), digest=shard_digest
        )
100 changes: 100 additions & 0 deletions model_signing/manifest/manifest_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,31 @@
# limitations under the License.

import pathlib
import pytest

from model_signing.hashing import hashing
from model_signing.manifest import manifest


class TestDigestManifest:
    """Checks the resource descriptors exposed by `DigestManifest`."""

    def test_manifest_has_just_one_resource_descriptor(self):
        """A digest manifest yields exactly one descriptor."""
        model_digest = hashing.Digest("test", b"test_digest")
        digest_manifest = manifest.DigestManifest(model_digest)

        count = len(list(digest_manifest.resource_descriptors()))

        assert count == 1

    def test_manifest_has_the_correct_resource_descriptor(self):
        """Every yielded descriptor is "." paired with the manifest digest."""
        model_digest = hashing.Digest("test", b"test_digest")
        digest_manifest = manifest.DigestManifest(model_digest)

        for resource in digest_manifest.resource_descriptors():
            assert resource.identifier == "."
            assert resource.digest == model_digest


class TestFileLevelManifest:

def test_insert_order_does_not_matter(self):
Expand All @@ -34,6 +54,39 @@ def test_insert_order_does_not_matter(self):

assert manifest1 == manifest2

@pytest.mark.parametrize("num_items", [1, 3, 5])
def test_manifest_has_all_resource_descriptors(self, num_items):
    """The manifest yields one descriptor for each item it was built from."""
    items: list[manifest.FileManifestItem] = []
    for i in range(num_items):
        path = pathlib.PurePath(f"file{i}")
        # Bytes literals cannot be f-strings: b"hash{i}" would be the same
        # literal bytes for every item. Format as text first, then encode,
        # so that every item really gets its own digest value.
        digest = hashing.Digest("test", f"hash{i}".encode())
        item = manifest.FileManifestItem(path=path, digest=digest)
        items.append(item)
    manifest_file = manifest.FileLevelManifest(items)

    descriptors = list(manifest_file.resource_descriptors())

    assert len(descriptors) == num_items

def test_manifest_has_the_correct_resource_descriptors(self):
    """Descriptors come out ordered by file, whatever the insertion order."""

    def make_item(name, digest_value):
        return manifest.FileManifestItem(
            path=pathlib.PurePath(name),
            digest=hashing.Digest("test", digest_value),
        )

    first_file = make_item("file1", b"hash1")
    second_file = make_item("file2", b"hash2")

    # Items are deliberately inserted in reverse order.
    manifest_file = manifest.FileLevelManifest([second_file, first_file])
    descriptors = list(manifest_file.resource_descriptors())

    # The descriptors are still expected in order by file.
    expected = [("file1", b"hash1"), ("file2", b"hash2")]
    for position, (identifier, _) in enumerate(expected):
        assert descriptors[position].identifier == identifier
    for position, (_, digest_value) in enumerate(expected):
        assert descriptors[position].digest.digest_value == digest_value


class TestShardLevelManifest:

Expand Down Expand Up @@ -70,3 +123,50 @@ def test_same_path_different_shards_gives_different_manifest(self):
manifest2 = manifest.ShardLevelManifest([item])

assert manifest1 != manifest2

@pytest.mark.parametrize("num_items", [1, 3, 5])
def test_manifest_has_all_resource_descriptors(self, num_items):
    """The manifest yields one descriptor for each shard it was built from."""
    items: list[manifest.ShardedFileManifestItem] = []
    for i in range(num_items):
        path = pathlib.PurePath("file")
        # Bytes literals cannot be f-strings: b"hash{i}" would be the same
        # literal bytes for every shard. Format as text first, then encode,
        # so that every shard really gets its own digest value.
        digest = hashing.Digest("test", f"hash{i}".encode())
        item = manifest.ShardedFileManifestItem(
            path=path, digest=digest, start=i, end=i + 2
        )
        items.append(item)
    manifest_file = manifest.ShardLevelManifest(items)

    descriptors = list(manifest_file.resource_descriptors())

    assert len(descriptors) == num_items

def test_manifest_has_the_correct_resource_descriptors(self):
    """Descriptors come out ordered by file shard, whatever the insertion."""

    def make_shard(name, digest_value, start, end):
        return manifest.ShardedFileManifestItem(
            path=pathlib.PurePath(name),
            digest=hashing.Digest("test", digest_value),
            start=start,
            end=end,
        )

    file1_first_shard = make_shard("file1", b"hash1", 0, 4)
    file2_first_shard = make_shard("file2", b"hash2", 0, 4)
    # Same file as the first item, but its second shard.
    file1_second_shard = make_shard("file1", b"hash3", 4, 8)

    # Items are deliberately inserted in reverse order.
    manifest_file = manifest.ShardLevelManifest(
        [file1_second_shard, file2_first_shard, file1_first_shard]
    )
    descriptors = list(manifest_file.resource_descriptors())

    # The descriptors are still expected in order by file shard.
    expected = [
        ("file1:0:4", b"hash1"),
        ("file1:4:8", b"hash3"),
        ("file2:0:4", b"hash2"),
    ]
    for position, (identifier, _) in enumerate(expected):
        assert descriptors[position].identifier == identifier
    for position, (_, digest_value) in enumerate(expected):
        assert descriptors[position].digest.digest_value == digest_value
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,11 @@
import pathlib
import pytest

from model_signing import test_support
from model_signing.hashing import file
from model_signing.hashing import memory
from model_signing.manifest import manifest
from model_signing.serialization import serialize_by_file_shard
from model_signing import test_support


class TestDigestSerializer:
Expand Down
2 changes: 1 addition & 1 deletion model_signing/serialization/serialize_by_file_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,11 @@
import pathlib
import pytest

from model_signing import test_support
from model_signing.hashing import file
from model_signing.hashing import memory
from model_signing.manifest import manifest
from model_signing.serialization import serialize_by_file
from model_signing import test_support


class TestDigestSerializer:
Expand Down
13 changes: 13 additions & 0 deletions model_signing/signing/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Copyright 2024 The Sigstore Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
80 changes: 80 additions & 0 deletions model_signing/signing/signing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
# Copyright 2024 The Sigstore Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Machinery for signing and verification of ML models.
The serialization API produces a manifest representation of the models, which
can be used to implement various verification patterns. However, when signing,
we need to actually represent this manifest in a specific disk format. But,
there are multiple ways to use `manifest.Manifest` objects, so we add a new
`SigningMaterial` class hierarchy to serialize and sign manifests.
The output of a signing process is a `Signature` instance, backed by a format to
serialize this to disk. In OSS, this is usually a Sigstore bundle.
TODO: expand on this.
"""

import abc
import pathlib
from typing import Self

from model_signing.manifest import manifest


class SigningMaterial(metaclass=abc.ABCMeta):
    """Abstract base for any material that can be signed."""

    @classmethod
    @abc.abstractmethod
    def from_manifest(cls, manifest: manifest.Manifest) -> Self:
        """Builds the signing material for the given manifest.

        Args:
            manifest: The manifest to convert.

        Returns:
            The signing material used for signing.
        """

    @abc.abstractmethod
    def sign(self) -> "Signature":
        """Signs this material with the provided key/signer.

        TODO: arguments, abstract over signing format, etc.

        Returns:
            The resulting `Signature`.
        """


class Signature(metaclass=abc.ABCMeta):
"""Generic signature support."""

@abc.abstractmethod
def write_signature(self, path: pathlib.Path):
"""Writes the signature to disk, to the given path."""
pass

@classmethod
@abc.abstractmethod
def read_signature(cls, path: pathlib.Path) -> Self:
"""Reads the signature from disk.
Does not perform any verification, except what is needed to parse the
signature file. Use `verify` to validate the signature.
"""
pass

@abc.abstractmethod
def verify(self): # TODO: signature
"""Verifies the signature.
If the verification passes, this method returns TODO: what?
TODO: Document return and raises.
"""
pass
19 changes: 19 additions & 0 deletions model_signing/test_support.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,25 @@
]


# All directory models to use in testing, where only non empty directory models
# are supported. See also `all_test_models` comments.
#
# NOTE(review): this list used to be assigned twice in a row, so the first
# assignment was dead code. That first copy additionally listed
# "symlink_model_folder". The effective (second) value is kept here; confirm
# whether "symlink_model_folder" was meant to be included.
all_non_empty_directory_test_models = [
    "sample_model_folder",
    "deep_model_folder",
    "model_folder_with_empty_file",
]


def get_first_directory(path: pathlib.Path) -> pathlib.Path:
"""Returns the first directory that is a children of path.
Expand Down

0 comments on commit 3344e2e

Please sign in to comment.