This repository has been archived by the owner on Oct 9, 2023. It is now read-only.

from_tensors support for VideoClassification #1389

Merged
48 commits merged into master from video/feature/classification/from_tensors on Sep 1, 2022
Commits (48 total; changes shown from 34 commits):
bef8e50
WIP: from_tensors support
krshrimali Jul 14, 2022
b29cdb2
remove unused func in tests
krshrimali Jul 14, 2022
6d1a0be
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 14, 2022
2f8bb7e
Remove doc, add LabeledVideoTensorDataset
krshrimali Jul 14, 2022
9490e31
Fix merge conflict
krshrimali Jul 14, 2022
ce882a1
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 14, 2022
4031f5c
class for prediction
krshrimali Jul 14, 2022
22f049d
Fixes for predictions
krshrimali Jul 14, 2022
951ae93
Merge branch 'master' into video/feature/classification/from_tensors
krshrimali Jul 14, 2022
c36dbbe
minor... to fix the CI
krshrimali Jul 14, 2022
23002f2
Merge branch 'video/feature/classification/from_tensors' of github.co…
krshrimali Jul 14, 2022
d210bd8
remove make_tensor, use randint (compatible with older pytorch versions)
krshrimali Jul 15, 2022
242ca8b
Merge branch 'master' into video/feature/classification/from_tensors
krshrimali Jul 15, 2022
738a022
Separate tests for data loading for tensors
krshrimali Jul 15, 2022
465fb2f
Separate tests for data loading for tensors
krshrimali Jul 15, 2022
b235e7d
Merge branch 'video/feature/classification/from_tensors' of github.co…
krshrimali Jul 15, 2022
5da11a4
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 15, 2022
80ec1eb
Skip doctest if not video installed
krshrimali Jul 15, 2022
6617097
Fix tests
krshrimali Jul 15, 2022
22edc4e
skip if pytorchvideo not installed
krshrimali Jul 15, 2022
df37dd1
correct format in the doctest
krshrimali Jul 15, 2022
f864a3b
Merge branch 'master' into video/feature/classification/from_tensors
krshrimali Jul 21, 2022
76410e0
Add labels to the call; prediction test
krshrimali Aug 29, 2022
6f6de3a
Pass labels, add prediction test
krshrimali Aug 29, 2022
6827164
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Aug 29, 2022
1f61192
Fix doc
krshrimali Aug 29, 2022
abd7f22
Merge branch 'video/feature/classification/from_tensors' of github.co…
krshrimali Aug 29, 2022
92df464
Update tests/video/classification/test_model.py
krshrimali Aug 30, 2022
3e887f3
Update flash/video/classification/utils.py
krshrimali Aug 30, 2022
a006bb6
Address review
krshrimali Aug 30, 2022
2e65edc
Merge branch 'video/feature/classification/from_tensors' of github.co…
krshrimali Aug 30, 2022
3ef8fbd
pep8
krshrimali Aug 30, 2022
6fea612
Update flash/video/classification/utils.py
krshrimali Aug 30, 2022
d71ce41
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Aug 30, 2022
1c9e3f8
Address review: allow stack of tensors, tensor, list of tensors, matc…
krshrimali Aug 31, 2022
b81803b
Remove breakpoints
krshrimali Aug 31, 2022
13df5af
Fix doctests
krshrimali Aug 31, 2022
da2fe56
Fix doctest
krshrimali Aug 31, 2022
2381f04
Revert pre-commit change
krshrimali Aug 31, 2022
f95ef7d
Merge branch 'master' into video/feature/classification/from_tensors
krshrimali Aug 31, 2022
ecc5528
Add license, improve tests - use parametrize, refactor
krshrimali Sep 1, 2022
73613f3
Merge branch 'video/feature/classification/from_tensors' of github.co…
krshrimali Sep 1, 2022
2d69b40
Fix error for video not available
krshrimali Sep 1, 2022
7009f86
unused import
krshrimali Sep 1, 2022
62e67f5
Add check for video available or not
krshrimali Sep 1, 2022
e5b2350
If not video available, return tensors from randint
krshrimali Sep 1, 2022
3e72e1a
mock_video_tensors is removed now
krshrimali Sep 1, 2022
1cf3d75
Use _is_list_like instead of isinstance for list/tuple
krshrimali Sep 1, 2022
130 changes: 129 additions & 1 deletion flash/video/classification/data.py
@@ -41,6 +41,8 @@
VideoClassificationFilesInput,
VideoClassificationFoldersInput,
VideoClassificationPathsPredictInput,
VideoClassificationTensorsInput,
VideoClassificationTensorsPredictInput,
)
from flash.video.classification.input_transform import VideoClassificationInputTransform

@@ -63,6 +65,7 @@
"VideoClassificationData.from_folders",
"VideoClassificationData.from_data_frame",
"VideoClassificationData.from_csv",
"VideoClassificationData.from_tensors",
]
if not _VIDEO_EXTRAS_TESTING:
__doctest_skip__ += ["VideoClassificationData.from_fiftyone"]
@@ -395,7 +398,6 @@ def from_data_frame(
predict_data_frame: Optional[pd.DataFrame] = None,
predict_videos_root: Optional[str] = None,
predict_resolver: Optional[Callable[[str, str], str]] = None,
target_formatter: Optional[TargetFormatter] = None,
clip_sampler: Union[str, "ClipSampler"] = "random",
clip_duration: float = 2,
clip_sampler_kwargs: Dict[str, Any] = None,
@@ -404,6 +406,7 @@
decoder: str = "pyav",
input_cls: Type[Input] = VideoClassificationDataFrameInput,
predict_input_cls: Type[Input] = VideoClassificationDataFramePredictInput,
target_formatter: Optional[TargetFormatter] = None,
transform: INPUT_TRANSFORM_TYPE = VideoClassificationInputTransform,
transform_kwargs: Optional[Dict] = None,
**data_module_kwargs: Any,
@@ -566,6 +569,131 @@ def from_data_frame(
**data_module_kwargs,
)

@classmethod
def from_tensors(
cls,
input_field: str,
target_field: Optional[Union[str, Sequence[str]]] = None,
train_data: Optional[Dict[str, Union[torch.Tensor, Any, List[Any]]]] = None,
val_data: Optional[Dict[str, Union[torch.Tensor, Any, List[Any]]]] = None,
test_data: Optional[Dict[str, Union[torch.Tensor, Any, List[Any]]]] = None,
predict_data: Optional[Dict[str, Union[torch.Tensor, Any, List[Any]]]] = None,
video_sampler: Type[Sampler] = torch.utils.data.SequentialSampler,
input_cls: Type[Input] = VideoClassificationTensorsInput,
predict_input_cls: Type[Input] = VideoClassificationTensorsPredictInput,
target_formatter: Optional[TargetFormatter] = None,
transform: INPUT_TRANSFORM_TYPE = VideoClassificationInputTransform,
transform_kwargs: Optional[Dict] = None,
**data_module_kwargs: Any,
) -> "VideoClassificationData":
"""Load the :class:`~flash.video.classification.data.VideoClassificationData` from a dictionary containing
PyTorch tensors representing input video frames and their corresponding targets.

Input tensor(s) will be extracted from the ``input_field`` in the ``dict``.
The targets will be extracted from the ``target_field`` in the ``dict`` and can be in any of our
:ref:`supported classification target formats <formatting_classification_targets>`.

To learn how to customize the transforms applied for each stage, read our
:ref:`customizing transforms guide <customizing_transforms>`.

Args:
input_field: The field (key name) in ``dict`` containing the video tensors.
target_field: The field (key name) in the ``dict`` containing the targets.
train_data: The ``dict`` containing tensors in the ``input_field`` key and targets in the
``target_field`` key to use when training.
val_data: The ``dict`` containing tensors in the ``input_field`` key and targets in the
``target_field`` key to use when validating.
test_data: The ``dict`` containing tensors in the ``input_field`` key and targets in the
``target_field`` key to use when testing.
predict_data: The ``dict`` containing tensors in the ``input_field`` key to use when predicting.
target_formatter: Optionally provide a :class:`~flash.core.data.utilities.classification.TargetFormatter` to
control how targets are handled. See :ref:`formatting_classification_targets` for more details.
video_sampler: Sampler for the internal video container. This defines the order tensors are used and,
if necessary, the distributed split.
input_cls: The :class:`~flash.core.data.io.input.Input` type to use for loading the data.
predict_input_cls: The :class:`~flash.core.data.io.input.Input` type to use for loading the prediction data.
transform: The :class:`~flash.core.data.io.input_transform.InputTransform` type to use.
transform_kwargs: Dict of keyword arguments to be provided when instantiating the transforms.
data_module_kwargs: Additional keyword arguments to provide to the
:class:`~flash.core.data.data_module.DataModule` constructor.

Returns:
The constructed :class:`~flash.video.classification.data.VideoClassificationData`.

Examples
________

.. doctest::

>>> import torch
>>> from flash import Trainer
>>> from flash.video import VideoClassifier, VideoClassificationData
>>> input_video = torch.randint(low=0, high=255, size=(3, 10, 10, 10), dtype=torch.uint8, device="cpu")
>>> train_data = {
... "data": torch.stack(
... (
... input_video,
... input_video,
... )
... ), # 2 videos (each video: 10 frames)
... "targets": ["fruit", "vegetable"], # Labels corresponding to each video
... }
>>> predict_data = {
... "data": torch.stack((input_video,)),
... }
>>> datamodule = VideoClassificationData.from_tensors(
... input_field="data",
... target_field="targets",
... train_data=train_data,
... predict_data=predict_data,
... batch_size=1,
... )
Review comment (collaborator):
I think it would be better to make the API consistent with what we have for image classification. E.g. like this:

            >>> datamodule = VideoClassificationData.from_tensors(
            ...     train_data=[input_video, input_video, input_video],
            ...     train_targets=[1, 2, 3],
            ...     predict_data=predict_data,
            ...     batch_size=1,
            ... )

>>> datamodule.num_classes
2
>>> datamodule.labels
['fruit', 'vegetable']
>>> model = VideoClassifier(backbone="x3d_xs", num_classes=datamodule.num_classes)
>>> trainer = Trainer(fast_dev_run=True)
>>> trainer.fit(model, datamodule=datamodule) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
Training...
>>> trainer.predict(model, datamodule=datamodule) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
Predicting...

.. testcleanup::

>>> del input_video
>>> del train_data
>>> del predict_data
"""
train_tuple = (train_data, input_field, target_field)
val_tuple = (val_data, input_field, target_field)
test_tuple = (test_data, input_field, target_field)
predict_tuple = (predict_data, input_field)

train_input = input_cls(
RunningStage.TRAINING, *train_tuple, video_sampler=video_sampler, target_formatter=target_formatter
)
target_formatter = getattr(train_input, "target_formatter", None)

return cls(
train_input,
input_cls(
RunningStage.VALIDATING,
*val_tuple,
video_sampler=video_sampler,
target_formatter=target_formatter,
),
input_cls(
RunningStage.TESTING,
*test_tuple,
video_sampler=video_sampler,
target_formatter=target_formatter,
),
predict_input_cls(RunningStage.PREDICTING, *predict_tuple),
transform=transform,
transform_kwargs=transform_kwargs,
**data_module_kwargs,
)
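
The commit "Address review: allow stack of tensors, tensor, list of tensors" above suggests the ``input_field`` entry may be a plain tensor, a pre-stacked tensor, or a list of tensors. Only the stacked form appears in the doctest, so the list form below is a sketch based on that commit message and on how ``load_data`` zips inputs with targets:

>>> import torch
>>> from flash.video import VideoClassificationData
>>> input_video = torch.randint(low=0, high=255, size=(3, 10, 10, 10), dtype=torch.uint8)
>>> train_data = {"data": [input_video, input_video], "targets": ["fruit", "vegetable"]}
>>> datamodule = VideoClassificationData.from_tensors(
...     input_field="data",
...     target_field="targets",
...     train_data=train_data,
...     batch_size=1,
... )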

@classmethod
def from_csv(
cls,
78 changes: 77 additions & 1 deletion flash/video/classification/input.py
@@ -40,8 +40,17 @@
from pytorchvideo.data.encoded_video import EncodedVideo
from pytorchvideo.data.labeled_video_dataset import LabeledVideoDataset
from pytorchvideo.data.labeled_video_paths import LabeledVideoPaths

from flash.video.classification.utils import LabeledVideoTensorDataset

else:
ClipSampler, LabeledVideoDataset, EncodedVideo, ApplyTransformToKey = None, None, None, None
ClipSampler, LabeledVideoDataset, LabeledVideoTensorDataset, EncodedVideo, ApplyTransformToKey = (
None,
None,
None,
None,
None,
)


def _make_clip_sampler(
@@ -87,6 +96,29 @@ def load_sample(self, sample):
return sample


class VideoClassificationTensorsBaseInput(IterableInput, ClassificationInputMixin):
def load_data(
self,
inputs: torch.Tensor,
targets: Union[List[Any], Any],
video_sampler: Type[Sampler] = torch.utils.data.RandomSampler,
target_formatter: Optional[TargetFormatter] = None,
) -> "LabeledVideoTensorDataset":
# Note: zip() pairs inputs with targets and truncates to the shorter of the two
dataset = LabeledVideoTensorDataset(list(zip(inputs, targets)), video_sampler=video_sampler)
if not self.predicting:
self.load_target_metadata(
[sample[1] for sample in dataset._labeled_videos], target_formatter=target_formatter
)
return dataset

def load_sample(self, sample):
sample["label"] = self.format_target(sample["label"])
sample[DataKeys.INPUT] = sample.pop("video")
sample[DataKeys.TARGET] = sample.pop("label")
return sample


class VideoClassificationFoldersInput(VideoClassificationInput):
def load_data(
self,
@@ -178,6 +210,35 @@ def load_data(
return result


class VideoClassificationTensorsInput(VideoClassificationTensorsBaseInput):
labels: list

def load_data(
self,
input_data: Dict[str, Union[torch.Tensor, Any, List[Any]]],
input_key: str,
target_keys: Union[str, List[str]],
video_sampler: Type[Sampler] = torch.utils.data.RandomSampler,
target_formatter: Optional[TargetFormatter] = None,
) -> "LabeledVideoTensorDataset":
result = super().load_data(
input_data[input_key],
input_data[target_keys], # TODO: @krshrimali: this does not support list of str as of now
video_sampler=video_sampler,
target_formatter=target_formatter,
)

# If we had binary multi-class targets then we also know the labels (the target key names)
if (
self.training
and isinstance(self.target_formatter, MultiBinaryTargetFormatter)
and isinstance(target_keys, List)
):
self.labels = target_keys

return result


class VideoClassificationCSVInput(VideoClassificationDataFrameInput):
def load_data(
self,
@@ -316,6 +377,21 @@ def predict_load_data(
)


class VideoClassificationTensorsPredictInput(Input):
def predict_load_data(
self,
data: Dict[str, Union[torch.Tensor, List[Any], Any]],
data_key: str,
):
return list(data[data_key])

def predict_load_sample(self, sample: torch.Tensor) -> Dict[str, Any]:
return {
DataKeys.INPUT: sample,
"video_index": 0,
}


class VideoClassificationCSVPredictInput(VideoClassificationDataFramePredictInput):
def predict_load_data(
self,
89 changes: 89 additions & 0 deletions flash/video/classification/utils.py
@@ -0,0 +1,89 @@
from typing import Any, List, Tuple, Type

import torch

from flash.core.utilities.imports import _VIDEO_AVAILABLE

if _VIDEO_AVAILABLE:
from pytorchvideo.data.utils import MultiProcessSampler
else:
MultiProcessSampler = None


class LabeledVideoTensorDataset(torch.utils.data.IterableDataset):
"""LabeledVideoTensorDataset handles a direct tensor input data."""

def __init__(
self,
labeled_video_tensors: List[Tuple[torch.Tensor, Any]],
video_sampler: Type[torch.utils.data.Sampler] = torch.utils.data.RandomSampler,
) -> None:
self._labeled_videos = labeled_video_tensors

# If a RandomSampler is used we need to pass in a custom random generator that
# ensures all PyTorch multiprocess workers have the same random seed.
self._video_random_generator = None
if video_sampler == torch.utils.data.RandomSampler:
self._video_random_generator = torch.Generator()
self._video_sampler = video_sampler(self._labeled_videos, generator=self._video_random_generator)
else:
self._video_sampler = video_sampler(self._labeled_videos)

self._video_sampler_iter = None # Initialized on first call to self.__next__()

# Holds the most recently sampled (video tensor, label, video index) triple.
self._loaded_video_label = None

def __next__(self) -> dict:
"""Retrieves the next clip based on the clip sampling strategy and video sampler.

Returns:
A dictionary with the following format.

.. code-block:: text

{
'video': <video_tensor>,
'label': <index_label>,
'video_label': <index_label>,
'video_index': <video_index>,
}
"""
if not self._video_sampler_iter:
# Setup MultiProcessSampler here - after PyTorch DataLoader workers are spawned.
self._video_sampler_iter = iter(MultiProcessSampler(self._video_sampler))

# Load the next video tensor and its label according to the video sampler.
video_index = next(self._video_sampler_iter)
video_tensor, info_dict = self._labeled_videos[video_index]
self._loaded_video_label = (video_tensor, info_dict, video_index)

sample_dict = {
"video": self._loaded_video_label[0],
"video_name": f"video{video_index}",
"video_index": video_index,
"label": info_dict,
"video_label": info_dict,
}

return sample_dict

def __iter__(self):
self._video_sampler_iter = None # Reset video sampler

# If we're in a PyTorch DataLoader multiprocessing context, we need to use the
# same seed for each worker's RandomSampler generator. The workers at each
# __iter__ call are created from the unique value: worker_info.seed - worker_info.id,
# which we can use for this seed.
worker_info = torch.utils.data.get_worker_info()
if self._video_random_generator is not None and worker_info is not None:
base_seed = worker_info.seed - worker_info.id
self._video_random_generator.manual_seed(base_seed)

return self

def size(self):
return len(self._labeled_videos)
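
For reference, a minimal sketch of driving this dataset directly, outside of the Flash inputs above; the videos, labels, and choice of ``SequentialSampler`` are illustrative, and iteration relies on pytorchvideo being installed so that ``MultiProcessSampler`` is available:

import torch

from flash.video.classification.utils import LabeledVideoTensorDataset

# Two fake 10-frame RGB videos paired with illustrative integer labels.
videos = [torch.randint(0, 255, (3, 10, 10, 10), dtype=torch.uint8) for _ in range(2)]
dataset = LabeledVideoTensorDataset(
    list(zip(videos, [0, 1])),
    video_sampler=torch.utils.data.SequentialSampler,
)

for sample in dataset:
    # Each sample follows the dict format documented in __next__ above.
    print(sample["video_index"], sample["video"].shape, sample["label"])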