chore(client): update huggingface dataset string/binary convert logical #2498

Merged: 1 commit, Jul 12, 2023
client/starwhale/core/dataset/view.py (3 additions, 1 deletion)

@@ -13,7 +13,7 @@
 from starwhale.base.view import BaseTermView
 from starwhale.base.uri.project import Project
 from starwhale.base.uri.resource import Resource, ResourceType
-from starwhale.core.dataset.type import DatasetConfig
+from starwhale.core.dataset.type import Text, DatasetConfig
 from starwhale.core.runtime.process import Process as RuntimeProcess
 
 from .model import Dataset
@@ -306,6 +306,8 @@ def head(
             console.rule(f"row [{row['index']}]", align="left")
             output = f":deciduous_tree: id: {row['index']} \n" ":cyclone: features:\n"
             for _k, _v in row["features"].items():
+                if show_raw_data and isinstance(_v, Text):
+                    _v = _v.link_to_content()
                 output += f"\t :dim_button: [bold green]{_k}[/] : {_v} \n"
 
             if show_types:
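With this change, asking `head` for raw data (the `show_raw_data` parameter above) prints a Text feature's actual string instead of its object repr. A minimal sketch of the rule with a hypothetical `render_value` helper; `link_to_content()` is the method used in the diff:

from starwhale import Text

def render_value(value: object, show_raw_data: bool) -> object:
    # Resolve a Text feature's stored link back to its raw string
    # before printing; leave every other feature type untouched.
    if show_raw_data and isinstance(value, Text):
        value = value.link_to_content()
    return value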
client/starwhale/integrations/huggingface/dataset.py (11 additions, 4 deletions)

@@ -12,15 +12,22 @@
     raise ImportError("Please install huggingface/datasets with `pip install datasets`")
 
 from starwhale.utils import console
+from starwhale.utils.fs import DIGEST_SIZE
 from starwhale.core.dataset.type import Text, Audio, Image, Binary, MIMEType
 
 
 def _transform_to_starwhale(data: t.Any, feature: t.Any) -> t.Any:
     if isinstance(feature, hf_datasets.Value):
-        if feature.dtype == "large_string":
-            return Text(content=data)
-        elif feature.dtype == "large_binary":
-            return Binary(fp=data)
+        if feature.dtype in ("large_string", "string"):
+            if len(data) > DIGEST_SIZE:
+                return Text(content=data)
+            else:
+                return data
+        elif feature.dtype in ("large_binary", "binary"):
+            if len(data) > DIGEST_SIZE:
+                return Binary(fp=data)
+            else:
+                return data
         else:
             return data
     elif isinstance(feature, hf_datasets.Audio):
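The converter now applies one size rule to the plain and `large_` dtypes alike: values of at most DIGEST_SIZE (32) characters or bytes pass through unchanged, while longer ones are wrapped as artifacts. A standalone sketch of that rule, assuming `Binary` and `Text` are importable from the top-level `starwhale` package; the `_convert` helper is illustrative, not the real function:

import typing as t

from starwhale import Binary, Text

DIGEST_SIZE = 32  # mirrors starwhale.utils.fs.DIGEST_SIZE

def _convert(data: t.Union[str, bytes]) -> t.Any:
    # Small values stay raw; big ones become linked artifacts.
    if isinstance(data, str):
        return Text(content=data) if len(data) > DIGEST_SIZE else data
    return Binary(fp=data) if len(data) > DIGEST_SIZE else data

assert _convert("short") == "short"             # 5 <= 32: kept as str
assert isinstance(_convert("x" * 64), Text)     # 64 > 32: Text artifact
assert isinstance(_convert(b"y" * 64), Binary)  # 64 > 32: Binary artifact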
client/starwhale/utils/fs.py (3 additions, 2 deletions)

@@ -15,6 +15,7 @@
 
 BLAKE2B_SIGNATURE_ALGO = "blake2b"
 _MIN_GUESS_NAME_LENGTH = 5
+DIGEST_SIZE = 32
 
 
 class FilePosition(IntEnum):

@@ -122,7 +123,7 @@ def blake2b_file(fpath: t.Union[str, Path]) -> str:
     _chunk_size = 8192
     fpath = Path(fpath)
     # blake2b is more faster and better than md5,sha1,sha2
-    _hash = hashlib.blake2b(digest_size=64)
+    _hash = hashlib.blake2b(digest_size=DIGEST_SIZE)
 
     with fpath.open("rb") as f:
         _chunk = f.read(_chunk_size)

@@ -134,7 +135,7 @@
 
 
 def blake2b_content(content: bytes) -> str:
-    _hash = hashlib.blake2b(digest_size=64)
+    _hash = hashlib.blake2b(digest_size=DIGEST_SIZE)
     _hash.update(content)
     return _hash.hexdigest()
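blake2b's `digest_size` is counted in bytes, so dropping it from 64 to 32 halves the hex digest from 128 to 64 characters; the same 32-byte constant doubles as the raw-versus-artifact threshold in the huggingface converter above. A quick standard-library check:

import hashlib

# digest_size is in bytes: a 32-byte blake2b digest hex-encodes to 64 chars.
digest = hashlib.blake2b(b"starwhale", digest_size=32).hexdigest()
assert len(digest) == 64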
client/tests/sdk/test_dataset_sdk.py (11 additions, 7 deletions)

@@ -1860,9 +1860,13 @@ def test_simple_data(self) -> None:
             (1, hf_datasets.Value("int64"), 1),
             (1.0, hf_datasets.Value("float64"), 1.0),
             (b"000", hf_datasets.Value("binary"), b"000"),
-            (b"000", hf_datasets.Value("large_binary"), Binary(b"000")),
+            (b"000" * 32, hf_datasets.Value("binary"), Binary(b"000" * 32)),
+            (b"000", hf_datasets.Value("large_binary"), b"000"),
+            (b"000" * 32, hf_datasets.Value("large_binary"), Binary(b"000" * 32)),
             ("000", hf_datasets.Value("string"), "000"),
-            ("000", hf_datasets.Value("large_string"), Text("000")),
+            ("000" * 32, hf_datasets.Value("string"), Text("000" * 32)),
+            ("000", hf_datasets.Value("large_string"), "000"),
+            ("000" * 32, hf_datasets.Value("large_string"), Text("000" * 32)),
             (1, hf_datasets.ClassLabel(num_classes=3, names=["a", "b", "c"]), 1),
             (
                 [[1, 2], [11, 22]],

@@ -2029,8 +2033,8 @@ def test_build_dataset(self, m_load_dataset: MagicMock) -> None:
             "float": [1.0, 2.0],
             "str": ["test1", "test2"],
             "bin": [b"test1", b"test2"],
-            "large_str": ["test1", "test2"],
-            "large_bin": [b"test1", b"test2"],
+            "large_str": ["test1" * 20, "test2" * 20],
+            "large_bin": [b"test1" * 20, b"test2" * 20],
         }
 
         simple_features = hf_datasets.Features(

@@ -2069,11 +2073,11 @@ def test_build_dataset(self, m_load_dataset: MagicMock) -> None:
         large_bin = simple_ds["train/0"].features["large_bin"]
         assert isinstance(large_str, Text)
         assert isinstance(large_bin, Binary)
-        assert large_str.to_str() == "test1"
-        assert large_bin.to_bytes() == b"test1"
+        assert large_str.to_str() == "test1" * 20
+        assert large_bin.to_bytes() == b"test1" * 20
 
         assert simple_ds["train/1"].features.int == 2
-        assert simple_ds["train/1"].features["large_bin"].to_bytes() == b"test2"
+        assert simple_ds["train/1"].features["large_bin"].to_bytes() == b"test2" * 20
 
         m_load_dataset.return_value = hf_complex_ds
         complex_ds = Dataset.from_huggingface(name="complex", repo="complex")
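The updated expectations fall straight out of the 32-byte threshold, for example:

# Given DIGEST_SIZE = 32:
assert len(b"000") == 3        # 3 <= 32: passes through as raw bytes
assert len(b"000" * 32) == 96  # 96 > 32: wrapped as Binary (Text for str)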
example/text_cls_AG_NEWS/dataset.yaml (2 additions, 2 deletions)

@@ -5,5 +5,5 @@ handler: tcan.dataset:iter_agnews_item
 desc: AG_NEWS data and label test dataset
 
 attr:
-  alignment_size: 4k
-  volume_size: 8M
+  alignment_size: 128
+  volume_size: 64M
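Assuming `alignment_size` pads each stored blob and `volume_size` caps each data file, as the Starwhale dataset attr options suggest, the tighter 128-byte alignment avoids wasting most of a 4k block on each short news row, and the larger 64M volume packs the linked Text blobs into fewer files.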
example/text_cls_AG_NEWS/tcan/dataset.py (3 additions, 1 deletion)

@@ -2,6 +2,8 @@
 import typing as t
 from pathlib import Path
 
+from starwhale import Text
+
 
 def iter_agnews_item() -> t.Generator:
     root_dir = Path(__file__).parent.parent / "data"

@@ -12,6 +14,6 @@ def iter_agnews_item() -> t.Generator:
         # https://huggingface.co/datasets/ag_news#default-1
         data = " ".join(row[1:])
         yield {
-            "text": data,
+            "text": Text(data),
             "label": int(row[0]) - 1,
         }
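For reference, one AG_NEWS row now yields roughly the following; the sample row is invented, and the CSV's labels are 1-based, hence the `- 1` shift:

from starwhale import Text

row = ["3", "Wall St. Bears Claw Back Into the Black", "Short-sellers see green again."]
item = {
    "text": Text(" ".join(row[1:])),  # title + description, stored as a Text artifact
    "label": int(row[0]) - 1,         # 1-based CSV label -> 0-based class id
}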