diff --git a/client/starwhale/core/dataset/view.py b/client/starwhale/core/dataset/view.py
index 256823e332..df50a63198 100644
--- a/client/starwhale/core/dataset/view.py
+++ b/client/starwhale/core/dataset/view.py
@@ -13,7 +13,7 @@
 from starwhale.base.view import BaseTermView
 from starwhale.base.uri.project import Project
 from starwhale.base.uri.resource import Resource, ResourceType
-from starwhale.core.dataset.type import DatasetConfig
+from starwhale.core.dataset.type import Text, DatasetConfig
 from starwhale.core.runtime.process import Process as RuntimeProcess
 
 from .model import Dataset
@@ -306,6 +306,8 @@ def head(
             console.rule(f"row [{row['index']}]", align="left")
             output = f":deciduous_tree: id: {row['index']} \n" ":cyclone: features:\n"
             for _k, _v in row["features"].items():
+                if show_raw_data and isinstance(_v, Text):
+                    _v = _v.link_to_content()
                 output += f"\t :dim_button: [bold green]{_k}[/] : {_v} \n"
 
             if show_types:
diff --git a/client/starwhale/integrations/huggingface/dataset.py b/client/starwhale/integrations/huggingface/dataset.py
index c03b37b852..683f170f7b 100644
--- a/client/starwhale/integrations/huggingface/dataset.py
+++ b/client/starwhale/integrations/huggingface/dataset.py
@@ -12,15 +12,22 @@
     raise ImportError("Please install huggingface/datasets with `pip install datasets`")
 
 from starwhale.utils import console
+from starwhale.utils.fs import DIGEST_SIZE
 from starwhale.core.dataset.type import Text, Audio, Image, Binary, MIMEType
 
 
 def _transform_to_starwhale(data: t.Any, feature: t.Any) -> t.Any:
     if isinstance(feature, hf_datasets.Value):
-        if feature.dtype == "large_string":
-            return Text(content=data)
-        elif feature.dtype == "large_binary":
-            return Binary(fp=data)
+        if feature.dtype in ("large_string", "string"):
+            if len(data) > DIGEST_SIZE:
+                return Text(content=data)
+            else:
+                return data
+        elif feature.dtype in ("large_binary", "binary"):
+            if len(data) > DIGEST_SIZE:
+                return Binary(fp=data)
+            else:
+                return data
         else:
             return data
     elif isinstance(feature, hf_datasets.Audio):
diff --git a/client/starwhale/utils/fs.py b/client/starwhale/utils/fs.py
index 5558e7f588..a08f1765fc 100644
--- a/client/starwhale/utils/fs.py
+++ b/client/starwhale/utils/fs.py
@@ -15,6 +15,7 @@
 BLAKE2B_SIGNATURE_ALGO = "blake2b"
 
 _MIN_GUESS_NAME_LENGTH = 5
+DIGEST_SIZE = 32
 
 
 class FilePosition(IntEnum):
@@ -122,7 +123,7 @@ def blake2b_file(fpath: t.Union[str, Path]) -> str:
     _chunk_size = 8192
     fpath = Path(fpath)
     # blake2b is more faster and better than md5,sha1,sha2
-    _hash = hashlib.blake2b(digest_size=64)
+    _hash = hashlib.blake2b(digest_size=DIGEST_SIZE)
 
     with fpath.open("rb") as f:
         _chunk = f.read(_chunk_size)
@@ -134,7 +135,7 @@
 
 
 def blake2b_content(content: bytes) -> str:
-    _hash = hashlib.blake2b(digest_size=64)
+    _hash = hashlib.blake2b(digest_size=DIGEST_SIZE)
     _hash.update(content)
     return _hash.hexdigest()
 
diff --git a/client/tests/sdk/test_dataset_sdk.py b/client/tests/sdk/test_dataset_sdk.py
index fcb7616576..32c71b1773 100644
--- a/client/tests/sdk/test_dataset_sdk.py
+++ b/client/tests/sdk/test_dataset_sdk.py
@@ -1860,9 +1860,13 @@ def test_simple_data(self) -> None:
            (1, hf_datasets.Value("int64"), 1),
            (1.0, hf_datasets.Value("float64"), 1.0),
            (b"000", hf_datasets.Value("binary"), b"000"),
-           (b"000", hf_datasets.Value("large_binary"), Binary(b"000")),
+           (b"000" * 32, hf_datasets.Value("binary"), Binary(b"000" * 32)),
+           (b"000", hf_datasets.Value("large_binary"), b"000"),
+           (b"000" * 32, hf_datasets.Value("large_binary"), Binary(b"000" * 32)),
            ("000", hf_datasets.Value("string"), "000"),
-           ("000", hf_datasets.Value("large_string"), Text("000")),
+           ("000" * 32, hf_datasets.Value("string"), Text("000" * 32)),
+           ("000", hf_datasets.Value("large_string"), "000"),
+           ("000" * 32, hf_datasets.Value("large_string"), Text("000" * 32)),
            (1, hf_datasets.ClassLabel(num_classes=3, names=["a", "b", "c"]), 1),
            (
                [[1, 2], [11, 22]],
@@ -2029,8 +2033,8 @@ def test_build_dataset(self, m_load_dataset: MagicMock) -> None:
            "float": [1.0, 2.0],
            "str": ["test1", "test2"],
            "bin": [b"test1", b"test2"],
-           "large_str": ["test1", "test2"],
-           "large_bin": [b"test1", b"test2"],
+           "large_str": ["test1" * 20, "test2" * 20],
+           "large_bin": [b"test1" * 20, b"test2" * 20],
        }
 
        simple_features = hf_datasets.Features(
@@ -2069,11 +2073,11 @@ def test_build_dataset(self, m_load_dataset: MagicMock) -> None:
        large_bin = simple_ds["train/0"].features["large_bin"]
        assert isinstance(large_str, Text)
        assert isinstance(large_bin, Binary)
-       assert large_str.to_str() == "test1"
-       assert large_bin.to_bytes() == b"test1"
+       assert large_str.to_str() == "test1" * 20
+       assert large_bin.to_bytes() == b"test1" * 20
 
        assert simple_ds["train/1"].features.int == 2
-       assert simple_ds["train/1"].features["large_bin"].to_bytes() == b"test2"
+       assert simple_ds["train/1"].features["large_bin"].to_bytes() == b"test2" * 20
 
        m_load_dataset.return_value = hf_complex_ds
        complex_ds = Dataset.from_huggingface(name="complex", repo="complex")
diff --git a/example/text_cls_AG_NEWS/dataset.yaml b/example/text_cls_AG_NEWS/dataset.yaml
index a0a53eff60..d7e097b6dc 100644
--- a/example/text_cls_AG_NEWS/dataset.yaml
+++ b/example/text_cls_AG_NEWS/dataset.yaml
@@ -5,5 +5,5 @@ handler: tcan.dataset:iter_agnews_item
 desc: AG_NEWS data and label test dataset
 
 attr:
-  alignment_size: 4k
-  volume_size: 8M
+  alignment_size: 128
+  volume_size: 64M
diff --git a/example/text_cls_AG_NEWS/tcan/dataset.py b/example/text_cls_AG_NEWS/tcan/dataset.py
index 6c6dc70cd8..5fef917c84 100644
--- a/example/text_cls_AG_NEWS/tcan/dataset.py
+++ b/example/text_cls_AG_NEWS/tcan/dataset.py
@@ -2,6 +2,8 @@
 import typing as t
 from pathlib import Path
 
+from starwhale import Text
+
 
 def iter_agnews_item() -> t.Generator:
     root_dir = Path(__file__).parent.parent / "data"
@@ -12,6 +14,6 @@ def iter_agnews_item() -> t.Generator:
        # https://huggingface.co/datasets/ag_news#default-1
        data = " ".join(row[1:])
        yield {
-           "text": data,
+           "text": Text(data),
            "label": int(row[0]) - 1,
        }