Skip to content

Commit

Permalink
chore(client): update huggingface dataset string/binary convert logical (#2498)
Browse files Browse the repository at this point in the history
  • Loading branch information
tianweidut authored Jul 12, 2023
1 parent a7c9a28 commit e450b17
Show file tree
Hide file tree
Showing 6 changed files with 33 additions and 17 deletions.
4 changes: 3 additions & 1 deletion client/starwhale/core/dataset/view.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from starwhale.base.view import BaseTermView
from starwhale.base.uri.project import Project
from starwhale.base.uri.resource import Resource, ResourceType
from starwhale.core.dataset.type import DatasetConfig
from starwhale.core.dataset.type import Text, DatasetConfig
from starwhale.core.runtime.process import Process as RuntimeProcess

from .model import Dataset
Expand Down Expand Up @@ -306,6 +306,8 @@ def head(
console.rule(f"row [{row['index']}]", align="left")
output = f":deciduous_tree: id: {row['index']} \n" ":cyclone: features:\n"
for _k, _v in row["features"].items():
if show_raw_data and isinstance(_v, Text):
_v = _v.link_to_content()
output += f"\t :dim_button: [bold green]{_k}[/] : {_v} \n"

if show_types:
Expand Down
15 changes: 11 additions & 4 deletions client/starwhale/integrations/huggingface/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,22 @@
raise ImportError("Please install huggingface/datasets with `pip install datasets`")

from starwhale.utils import console
from starwhale.utils.fs import DIGEST_SIZE
from starwhale.core.dataset.type import Text, Audio, Image, Binary, MIMEType


def _transform_to_starwhale(data: t.Any, feature: t.Any) -> t.Any:
if isinstance(feature, hf_datasets.Value):
if feature.dtype == "large_string":
return Text(content=data)
elif feature.dtype == "large_binary":
return Binary(fp=data)
if feature.dtype in ("large_string", "string"):
if len(data) > DIGEST_SIZE:
return Text(content=data)
else:
return data
elif feature.dtype in ("large_binary", "binary"):
if len(data) > DIGEST_SIZE:
return Binary(fp=data)
else:
return data
else:
return data
elif isinstance(feature, hf_datasets.Audio):
Expand Down
5 changes: 3 additions & 2 deletions client/starwhale/utils/fs.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

BLAKE2B_SIGNATURE_ALGO = "blake2b"
_MIN_GUESS_NAME_LENGTH = 5
DIGEST_SIZE = 32


class FilePosition(IntEnum):
Expand Down Expand Up @@ -122,7 +123,7 @@ def blake2b_file(fpath: t.Union[str, Path]) -> str:
_chunk_size = 8192
fpath = Path(fpath)
# blake2b is more faster and better than md5,sha1,sha2
_hash = hashlib.blake2b(digest_size=64)
_hash = hashlib.blake2b(digest_size=DIGEST_SIZE)

with fpath.open("rb") as f:
_chunk = f.read(_chunk_size)
Expand All @@ -134,7 +135,7 @@ def blake2b_file(fpath: t.Union[str, Path]) -> str:


def blake2b_content(content: bytes) -> str:
_hash = hashlib.blake2b(digest_size=64)
_hash = hashlib.blake2b(digest_size=DIGEST_SIZE)
_hash.update(content)
return _hash.hexdigest()

Expand Down
18 changes: 11 additions & 7 deletions client/tests/sdk/test_dataset_sdk.py
Original file line number Diff line number Diff line change
Expand Up @@ -1860,9 +1860,13 @@ def test_simple_data(self) -> None:
(1, hf_datasets.Value("int64"), 1),
(1.0, hf_datasets.Value("float64"), 1.0),
(b"000", hf_datasets.Value("binary"), b"000"),
(b"000", hf_datasets.Value("large_binary"), Binary(b"000")),
(b"000" * 32, hf_datasets.Value("binary"), Binary(b"000" * 32)),
(b"000", hf_datasets.Value("large_binary"), b"000"),
(b"000" * 32, hf_datasets.Value("large_binary"), Binary(b"000" * 32)),
("000", hf_datasets.Value("string"), "000"),
("000", hf_datasets.Value("large_string"), Text("000")),
("000" * 32, hf_datasets.Value("string"), Text("000" * 32)),
("000", hf_datasets.Value("large_string"), "000"),
("000" * 32, hf_datasets.Value("large_string"), Text("000" * 32)),
(1, hf_datasets.ClassLabel(num_classes=3, names=["a", "b", "c"]), 1),
(
[[1, 2], [11, 22]],
Expand Down Expand Up @@ -2029,8 +2033,8 @@ def test_build_dataset(self, m_load_dataset: MagicMock) -> None:
"float": [1.0, 2.0],
"str": ["test1", "test2"],
"bin": [b"test1", b"test2"],
"large_str": ["test1", "test2"],
"large_bin": [b"test1", b"test2"],
"large_str": ["test1" * 20, "test2" * 20],
"large_bin": [b"test1" * 20, b"test2" * 20],
}

simple_features = hf_datasets.Features(
Expand Down Expand Up @@ -2069,11 +2073,11 @@ def test_build_dataset(self, m_load_dataset: MagicMock) -> None:
large_bin = simple_ds["train/0"].features["large_bin"]
assert isinstance(large_str, Text)
assert isinstance(large_bin, Binary)
assert large_str.to_str() == "test1"
assert large_bin.to_bytes() == b"test1"
assert large_str.to_str() == "test1" * 20
assert large_bin.to_bytes() == b"test1" * 20

assert simple_ds["train/1"].features.int == 2
assert simple_ds["train/1"].features["large_bin"].to_bytes() == b"test2"
assert simple_ds["train/1"].features["large_bin"].to_bytes() == b"test2" * 20

m_load_dataset.return_value = hf_complex_ds
complex_ds = Dataset.from_huggingface(name="complex", repo="complex")
Expand Down
4 changes: 2 additions & 2 deletions example/text_cls_AG_NEWS/dataset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,5 @@ handler: tcan.dataset:iter_agnews_item
desc: AG_NEWS data and label test dataset

attr:
alignment_size: 4k
volume_size: 8M
alignment_size: 128
volume_size: 64M
4 changes: 3 additions & 1 deletion example/text_cls_AG_NEWS/tcan/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
import typing as t
from pathlib import Path

from starwhale import Text


def iter_agnews_item() -> t.Generator:
root_dir = Path(__file__).parent.parent / "data"
Expand All @@ -12,6 +14,6 @@ def iter_agnews_item() -> t.Generator:
# https://huggingface.co/datasets/ag_news#default-1
data = " ".join(row[1:])
yield {
"text": data,
"text": Text(data),
"label": int(row[0]) - 1,
}

0 comments on commit e450b17

Please sign in to comment.