Skip to content

Commit

Permalink
chore(client): update huggingface dataset string/binary convert logical (#2498)
Browse files Browse the repository at this point in the history
  • Loading branch information
tianweidut authored Jul 12, 2023
1 parent a7c9a28 commit e450b17
Show file tree
Hide file tree
Showing 6 changed files with 33 additions and 17 deletions.
4 changes: 3 additions & 1 deletion client/starwhale/core/dataset/view.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from starwhale.base.view import BaseTermView
from starwhale.base.uri.project import Project
from starwhale.base.uri.resource import Resource, ResourceType
from starwhale.core.dataset.type import DatasetConfig
from starwhale.core.dataset.type import Text, DatasetConfig
from starwhale.core.runtime.process import Process as RuntimeProcess

from .model import Dataset
Expand Down Expand Up @@ -306,6 +306,8 @@ def head(
console.rule(f"row [{row['index']}]", align="left")
output = f":deciduous_tree: id: {row['index']} \n" ":cyclone: features:\n"
for _k, _v in row["features"].items():
if show_raw_data and isinstance(_v, Text):
_v = _v.link_to_content()
output += f"\t :dim_button: [bold green]{_k}[/] : {_v} \n"

if show_types:
Expand Down
15 changes: 11 additions & 4 deletions client/starwhale/integrations/huggingface/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,22 @@
raise ImportError("Please install huggingface/datasets with `pip install datasets`")

from starwhale.utils import console
from starwhale.utils.fs import DIGEST_SIZE
from starwhale.core.dataset.type import Text, Audio, Image, Binary, MIMEType


def _transform_to_starwhale(data: t.Any, feature: t.Any) -> t.Any:
if isinstance(feature, hf_datasets.Value):
if feature.dtype == "large_string":
return Text(content=data)
elif feature.dtype == "large_binary":
return Binary(fp=data)
if feature.dtype in ("large_string", "string"):
if len(data) > DIGEST_SIZE:
return Text(content=data)
else:
return data
elif feature.dtype in ("large_binary", "binary"):
if len(data) > DIGEST_SIZE:
return Binary(fp=data)
else:
return data
else:
return data
elif isinstance(feature, hf_datasets.Audio):
Expand Down
5 changes: 3 additions & 2 deletions client/starwhale/utils/fs.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

BLAKE2B_SIGNATURE_ALGO = "blake2b"
_MIN_GUESS_NAME_LENGTH = 5
DIGEST_SIZE = 32


class FilePosition(IntEnum):
Expand Down Expand Up @@ -122,7 +123,7 @@ def blake2b_file(fpath: t.Union[str, Path]) -> str:
_chunk_size = 8192
fpath = Path(fpath)
# blake2b is more faster and better than md5,sha1,sha2
_hash = hashlib.blake2b(digest_size=64)
_hash = hashlib.blake2b(digest_size=DIGEST_SIZE)

with fpath.open("rb") as f:
_chunk = f.read(_chunk_size)
Expand All @@ -134,7 +135,7 @@ def blake2b_file(fpath: t.Union[str, Path]) -> str:


def blake2b_content(content: bytes) -> str:
_hash = hashlib.blake2b(digest_size=64)
_hash = hashlib.blake2b(digest_size=DIGEST_SIZE)
_hash.update(content)
return _hash.hexdigest()

Expand Down
18 changes: 11 additions & 7 deletions client/tests/sdk/test_dataset_sdk.py
Original file line number Diff line number Diff line change
Expand Up @@ -1860,9 +1860,13 @@ def test_simple_data(self) -> None:
(1, hf_datasets.Value("int64"), 1),
(1.0, hf_datasets.Value("float64"), 1.0),
(b"000", hf_datasets.Value("binary"), b"000"),
(b"000", hf_datasets.Value("large_binary"), Binary(b"000")),
(b"000" * 32, hf_datasets.Value("binary"), Binary(b"000" * 32)),
(b"000", hf_datasets.Value("large_binary"), b"000"),
(b"000" * 32, hf_datasets.Value("large_binary"), Binary(b"000" * 32)),
("000", hf_datasets.Value("string"), "000"),
("000", hf_datasets.Value("large_string"), Text("000")),
("000" * 32, hf_datasets.Value("string"), Text("000" * 32)),
("000", hf_datasets.Value("large_string"), "000"),
("000" * 32, hf_datasets.Value("large_string"), Text("000" * 32)),
(1, hf_datasets.ClassLabel(num_classes=3, names=["a", "b", "c"]), 1),
(
[[1, 2], [11, 22]],
Expand Down Expand Up @@ -2029,8 +2033,8 @@ def test_build_dataset(self, m_load_dataset: MagicMock) -> None:
"float": [1.0, 2.0],
"str": ["test1", "test2"],
"bin": [b"test1", b"test2"],
"large_str": ["test1", "test2"],
"large_bin": [b"test1", b"test2"],
"large_str": ["test1" * 20, "test2" * 20],
"large_bin": [b"test1" * 20, b"test2" * 20],
}

simple_features = hf_datasets.Features(
Expand Down Expand Up @@ -2069,11 +2073,11 @@ def test_build_dataset(self, m_load_dataset: MagicMock) -> None:
large_bin = simple_ds["train/0"].features["large_bin"]
assert isinstance(large_str, Text)
assert isinstance(large_bin, Binary)
assert large_str.to_str() == "test1"
assert large_bin.to_bytes() == b"test1"
assert large_str.to_str() == "test1" * 20
assert large_bin.to_bytes() == b"test1" * 20

assert simple_ds["train/1"].features.int == 2
assert simple_ds["train/1"].features["large_bin"].to_bytes() == b"test2"
assert simple_ds["train/1"].features["large_bin"].to_bytes() == b"test2" * 20

m_load_dataset.return_value = hf_complex_ds
complex_ds = Dataset.from_huggingface(name="complex", repo="complex")
Expand Down
4 changes: 2 additions & 2 deletions example/text_cls_AG_NEWS/dataset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,5 @@ handler: tcan.dataset:iter_agnews_item
desc: AG_NEWS data and label test dataset

attr:
alignment_size: 4k
volume_size: 8M
alignment_size: 128
volume_size: 64M
4 changes: 3 additions & 1 deletion example/text_cls_AG_NEWS/tcan/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
import typing as t
from pathlib import Path

from starwhale import Text


def iter_agnews_item() -> t.Generator:
root_dir = Path(__file__).parent.parent / "data"
Expand All @@ -12,6 +14,6 @@ def iter_agnews_item() -> t.Generator:
# https://huggingface.co/datasets/ag_news#default-1
data = " ".join(row[1:])
yield {
"text": data,
"text": Text(data),
"label": int(row[0]) - 1,
}

0 comments on commit e450b17

Please sign in to comment.