From 53c3e652a32409f50a484145f7d477c00089de1d Mon Sep 17 00:00:00 2001 From: Wauplin Date: Thu, 4 Jan 2024 10:27:49 +0100 Subject: [PATCH 1/5] Document huggingface_hub.get_safetensors_metadata --- docs/source/metadata_parsing.mdx | 59 ++++++++++++++++---------------- 1 file changed, 29 insertions(+), 30 deletions(-) diff --git a/docs/source/metadata_parsing.mdx b/docs/source/metadata_parsing.mdx index 4dd14330..daafd39d 100644 --- a/docs/source/metadata_parsing.mdx +++ b/docs/source/metadata_parsing.mdx @@ -92,40 +92,39 @@ export type SafetensorsShardedHeaders = Record<FileName, SafetensorsFileHeader>; ### Python -In this example python script, we are parsing metadata of [gpt2](https://huggingface.co/gpt2/blob/main/model.safetensors). +[`huggingface_hub`](https://huggingface.co/docs/huggingface_hub/index) provides a Python API to parse safetensors metadata. +Use [`get_safetensors_metadata`](https://huggingface.co/docs/huggingface_hub/package_reference/hf_api#huggingface_hub.HfApi.get_safetensors_metadata) in order to get all the metadata of the repo. +Depending on if the model is sharded or not, one or multiple safetensors files will be parsed. 
```python -import requests # pip install requests -import struct - -def parse_single_file(url): - # Fetch the first 8 bytes of the file - headers = {'Range': 'bytes=0-7'} - response = requests.get(url, headers=headers) - # Interpret the bytes as a little-endian unsigned 64-bit integer - length_of_header = struct.unpack('<Q', response.content)[0] - # Fetch length_of_header bytes starting from the 9th byte - headers = {'Range': f'bytes=8-{7 + length_of_header}'} - response = requests.get(url, headers=headers) - # Interpret the response as a JSON object - header = response.json() - return header - -url = "https://huggingface.co/gpt2/resolve/main/model.safetensors" -header = parse_single_file(url) - -print(header) -# { -# "__metadata__": { "format": "pt" }, -# "h.10.ln_1.weight": { -# "dtype": "F32", -# "shape": [768], -# "data_offsets": [223154176, 223157248] -# }, -# ... -# } +# Parse repo with single weights file +>>> metadata = get_safetensors_metadata("bigscience/bloomz-560m") +>>> metadata +SafetensorsRepoMetadata( + metadata=None, + sharded=False, + weight_map={'h.0.input_layernorm.bias': 'model.safetensors', ...}, + files_metadata={'model.safetensors': SafetensorsFileMetadata(...)} +) +>>> metadata.files_metadata["model.safetensors"].metadata +{'format': 'pt'} + +# Parse repo with sharded model +>>> metadata = get_safetensors_metadata("bigscience/bloom") +Parse safetensors files: 100%|██████████████████████████████████████████| 72/72 [00:12<00:00, 5.78it/s] +>>> metadata +SafetensorsRepoMetadata(metadata={'total_size': 352494542848}, sharded=True, weight_map={...}, files_metadata={...}) +>>> len(metadata.files_metadata) +72 # All safetensors files have been fetched + +# Parse repo that is not a safetensors repo +>>> get_safetensors_metadata("runwayml/stable-diffusion-v1-5") +NotASafetensorsRepoError: 'runwayml/stable-diffusion-v1-5' is not a safetensors repo. Couldn't find 'model.safetensors.index.json' or 'model.safetensors' files. ``` +To parse the metadata of a single safetensors file, use [`parse_safetensors_file_metadata`](https://huggingface.co/docs/huggingface_hub/package_reference/hf_api#huggingface_hub.HfApi.parse_safetensors_file_metadata). + + ## Example output For instance, here are the number of params per dtype for a few models on the HuggingFace Hub. Also see [this issue](https://github.com/huggingface/safetensors/issues/44) for more examples of usage. 
From ea86d3d365b6230c3aaedaee64d5ac7f8ed4ced0 Mon Sep 17 00:00:00 2001 From: Lucain Date: Thu, 4 Jan 2024 11:18:46 +0100 Subject: [PATCH 2/5] Update docs/source/metadata_parsing.mdx Co-authored-by: Mishig --- docs/source/metadata_parsing.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/metadata_parsing.mdx b/docs/source/metadata_parsing.mdx index daafd39d..8abb84d8 100644 --- a/docs/source/metadata_parsing.mdx +++ b/docs/source/metadata_parsing.mdx @@ -92,7 +92,7 @@ export type SafetensorsShardedHeaders = Record<FileName, SafetensorsFileHeader>; ### Python -[`huggingface_hub`](https://huggingface.co/docs/huggingface_hub/index) provides a Python API to parse safetensors metadata. +[`huggingface_hub`](https://huggingface.co/docs/huggingface_hub) provides a Python API to parse safetensors metadata. Use [`get_safetensors_metadata`](https://huggingface.co/docs/huggingface_hub/package_reference/hf_api#huggingface_hub.HfApi.get_safetensors_metadata) in order to get all the metadata of the repo. Depending on if the model is sharded or not, one or multiple safetensors files will be parsed. From 72e62958cf9449c860791f34858bb94beb1e7a08 Mon Sep 17 00:00:00 2001 From: Lucain Date: Thu, 4 Jan 2024 11:19:00 +0100 Subject: [PATCH 3/5] Update docs/source/metadata_parsing.mdx Co-authored-by: Mishig --- docs/source/metadata_parsing.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/metadata_parsing.mdx b/docs/source/metadata_parsing.mdx index 8abb84d8..a51be7b5 100644 --- a/docs/source/metadata_parsing.mdx +++ b/docs/source/metadata_parsing.mdx @@ -93,7 +93,7 @@ export type SafetensorsShardedHeaders = Record<FileName, SafetensorsFileHeader>; ### Python [`huggingface_hub`](https://huggingface.co/docs/huggingface_hub) provides a Python API to parse safetensors metadata. -Use [`get_safetensors_metadata`](https://huggingface.co/docs/huggingface_hub/package_reference/hf_api#huggingface_hub.HfApi.get_safetensors_metadata) in order to get all the metadata of the repo. 
+Use [`get_safetensors_metadata`](https://huggingface.co/docs/huggingface_hub/package_reference/hf_api#huggingface_hub.HfApi.get_safetensors_metadata) to get all safetensors metadata of a model. Depending on if the model is sharded or not, one or multiple safetensors files will be parsed. ```python From 9dd1bad9d44895d2c886857ab62bf1ed86081410 Mon Sep 17 00:00:00 2001 From: Wauplin Date: Thu, 4 Jan 2024 11:20:37 +0100 Subject: [PATCH 4/5] add import line --- docs/source/metadata_parsing.mdx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/source/metadata_parsing.mdx b/docs/source/metadata_parsing.mdx index a51be7b5..3aabc0c8 100644 --- a/docs/source/metadata_parsing.mdx +++ b/docs/source/metadata_parsing.mdx @@ -97,6 +97,8 @@ Use [`get_safetensors_metadata`](https://huggingface.co/docs/huggingface_hub/pac Depending on if the model is sharded or not, one or multiple safetensors files will be parsed. ```python +>>> from huggingface_hub import get_safetensors_metadata + # Parse repo with single weights file >>> metadata = get_safetensors_metadata("bigscience/bloomz-560m") >>> metadata From 0157114729bcc26b5dd6a629fa4fdd672875093c Mon Sep 17 00:00:00 2001 From: Lucain Date: Thu, 4 Jan 2024 11:26:47 +0100 Subject: [PATCH 5/5] Update docs/source/metadata_parsing.mdx Co-authored-by: Mishig --- docs/source/metadata_parsing.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/metadata_parsing.mdx b/docs/source/metadata_parsing.mdx index 3aabc0c8..ed557963 100644 --- a/docs/source/metadata_parsing.mdx +++ b/docs/source/metadata_parsing.mdx @@ -111,7 +111,7 @@ SafetensorsRepoMetadata( >>> metadata.files_metadata["model.safetensors"].metadata {'format': 'pt'} -# Parse repo with sharded model +# Parse repo with sharded model (i.e. multiple weights files) >>> metadata = get_safetensors_metadata("bigscience/bloom") Parse safetensors files: 100%|██████████████████████████████████████████| 72/72 [00:12<00:00, 5.78it/s] >>> metadata