diff --git a/CHANGELOG.md b/CHANGELOG.md index e9360bdb67f..4d5e55c2512 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,7 +20,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - ### Fixed -- +- Default `label-map` parameter value for VOC converter () +- Randomness of random split transform () +- `Transform.subsets()` method () ### Security - diff --git a/datumaro/components/extractor.py b/datumaro/components/extractor.py index 8f5298ba959..7bf604ce539 100644 --- a/datumaro/components/extractor.py +++ b/datumaro/components/extractor.py @@ -643,7 +643,7 @@ def categories(self): def subsets(self): if self._subsets is None: self._subsets = set(self._extractor.subsets()) - return self._subsets + return super().subsets() def __len__(self): assert self._length in {None, 'parent'} or isinstance(self._length, int) diff --git a/datumaro/plugins/transforms.py b/datumaro/plugins/transforms.py index 317a83f7a78..f50afae070f 100644 --- a/datumaro/plugins/transforms.py +++ b/datumaro/plugins/transforms.py @@ -355,24 +355,27 @@ def __init__(self, extractor, splits, seed=None): dataset_size = len(extractor) indices = list(range(dataset_size)) - random.seed(seed) random.shuffle(indices) parts = [] s = 0 - for subset, ratio in splits: + lower_boundary = 0 + for split_idx, (subset, ratio) in enumerate(splits): s += ratio - boundary = int(s * dataset_size) - parts.append((boundary, subset)) - + upper_boundary = int(s * dataset_size) + if split_idx == len(splits) - 1: + upper_boundary = dataset_size + subset_indices = set(indices[lower_boundary : upper_boundary]) + parts.append((subset_indices, subset)) + lower_boundary = upper_boundary self._parts = parts self._subsets = set(s[0] for s in splits) self._length = 'parent' def _find_split(self, index): - for boundary, subset in self._parts: - if index < boundary: + for subset_indices, subset in self._parts: + if index in subset_indices: return subset return subset # all the possible remainder goes to the last split