From a5c97b38e83eb6a0572f2883a2a0dff89911c977 Mon Sep 17 00:00:00 2001 From: paoxiaode <51984235+paoxiaode@users.noreply.github.com> Date: Thu, 12 Oct 2023 16:00:59 +0800 Subject: [PATCH] [Dataset] add COCOsuperpixel dataset (#6407) --- python/dgl/data/__init__.py | 1 + python/dgl/data/lrgb.py | 283 +++++++++++++++++++++++++++++++++ tests/integration/test_data.py | 17 ++ 3 files changed, 301 insertions(+) diff --git a/python/dgl/data/__init__.py b/python/dgl/data/__init__.py index 64225b6f0ac7..a4b18cc8d8d3 100644 --- a/python/dgl/data/__init__.py +++ b/python/dgl/data/__init__.py @@ -76,6 +76,7 @@ # datasets. try: from .lrgb import ( + COCOSuperpixelsDataset, PeptidesFunctionalDataset, PeptidesStructuralDataset, VOCSuperpixelsDataset, diff --git a/python/dgl/data/lrgb.py b/python/dgl/data/lrgb.py index b638d7247250..5290eb0517ec 100644 --- a/python/dgl/data/lrgb.py +++ b/python/dgl/data/lrgb.py @@ -797,3 +797,286 @@ def __getitem__(self, idx): return self.graphs[idx] return self._transform(self.graphs[idx]) + + +class COCOSuperpixelsDataset(DGLDataset): + r"""COCO superpixel dataset for the node classification task. + + DGL dataset of COCO-SP in the LRGB benchmark which contains image + superpixels and a semantic segmentation label for each node superpixel. + + Based on the COCO 2017 dataset. Original source `<https://cocodataset.org>`_ + + Reference `<https://arxiv.org/abs/2206.08164.pdf>`_ + + Statistics: + + - Train examples: 113,286 + - Valid examples: 5,000 + - Test examples: 5,000 + - Average number of nodes: 476.88 + - Average number of edges: 2,710.48 + - Number of node classes: 81 + + Parameters + ---------- + raw_dir : str + Directory to store all the downloaded raw datasets. + Default: "~/.dgl/". + split : str + Should be chosen from ["train", "val", "test"] + Default: "train". + construct_format : str, optional + Option to select the graph construction format. 
+ Should be chosen from the following formats: + + - "edge_wt_only_coord": the graphs are 8-nn graphs with the edge weights + computed based on only spatial coordinates of superpixel nodes. + - "edge_wt_coord_feat": the graphs are 8-nn graphs with the edge weights + computed based on a combination of spatial coordinates and feature + values of superpixel nodes. + - "edge_wt_region_boundary": the graphs are region boundary graphs where two + regions (i.e. superpixel nodes) have an edge between them if they + share a boundary in the original image. + + Default: "edge_wt_region_boundary". + slic_compactness : int, optional + Option to select compactness of slic that was used for superpixels. + Should be chosen from [10, 30] + Default: 30. + force_reload : bool + Whether to reload the dataset. + Default: False. + verbose : bool + Whether to print out progress information. + Default: False. + transform : callable, optional + A transform that takes in a :class:`~dgl.DGLGraph` object and returns + a transformed version. The :class:`~dgl.DGLGraph` object will be + transformed before every access. 
+ + Examples + --------- + >>> from dgl.data import COCOSuperpixelsDataset + + >>> train_dataset = COCOSuperpixelsDataset(split="train") + >>> len(train_dataset) + 113286 + >>> train_dataset.num_classes + 81 + >>> graph = train_dataset[0] + >>> graph + Graph(num_nodes=488, num_edges=2766, + ndata_schemes={'feat': Scheme(shape=(14,), dtype=torch.float32), + 'label': Scheme(shape=(), dtype=torch.uint8)} + edata_schemes={'feat': Scheme(shape=(2,), dtype=torch.float32)}) + + >>> # a 1-D tensor can be used as the index when transform is None + >>> # see details in the __getitem__ function + >>> import torch + >>> idx = torch.tensor([0, 1, 2]) + >>> train_dataset_subset = train_dataset[idx] + >>> train_dataset_subset[0] + Graph(num_nodes=488, num_edges=2766, + ndata_schemes={'feat': Scheme(shape=(14,), dtype=torch.float32), + 'label': Scheme(shape=(), dtype=torch.uint8)} + edata_schemes={'feat': Scheme(shape=(2,), dtype=torch.float32)}) + """ + + urls = { + 10: { + "edge_wt_only_coord": """ + https://www.dropbox.com/s/prqizdep8gk0ndk/coco_superpixels_edge_wt_only_coord.zip?dl=1 + """, + "edge_wt_coord_feat": """ + https://www.dropbox.com/s/zftoyln1pkcshcg/coco_superpixels_edge_wt_coord_feat.zip?dl=1 + """, + "edge_wt_region_boundary": """ + https://www.dropbox.com/s/fhihfcyx2y978u8/coco_superpixels_edge_wt_region_boundary.zip?dl=1 + """, + }, + 30: { + "edge_wt_only_coord": """ + https://www.dropbox.com/s/hrbfkxmc5z9lsaz/coco_superpixels_edge_wt_only_coord.zip?dl=1 + """, + "edge_wt_coord_feat": """ + https://www.dropbox.com/s/4rfa2d5ij1gfu9b/coco_superpixels_edge_wt_coord_feat.zip?dl=1 + """, + "edge_wt_region_boundary": """ + https://www.dropbox.com/s/r6ihg1f4pmyjjy0/coco_superpixels_edge_wt_region_boundary.zip?dl=1 + """, + }, + } + + def __init__( + self, + raw_dir=None, + split="train", + construct_format="edge_wt_region_boundary", + slic_compactness=30, + force_reload=None, + verbose=None, + transform=None, + ): + assert split in ["train", "val", "test"], "split not valid." 
+ assert construct_format in [ + "edge_wt_only_coord", + "edge_wt_coord_feat", + "edge_wt_region_boundary", + ], "construct_format not valid." + assert slic_compactness in [10, 30], "slic_compactness not valid." + + self.construct_format = construct_format + self.slic_compactness = slic_compactness + self.split = split + self.graphs = [] + + super().__init__( + name="COCO-SP", + raw_dir=raw_dir, + url=self.urls[self.slic_compactness][self.construct_format], + force_reload=force_reload, + verbose=verbose, + transform=transform, + ) + + @property + def save_path(self): + r"""Directory to save the processed dataset.""" + return os.path.join( + self.raw_path, + "slic_compactness_" + str(self.slic_compactness), + self.construct_format, + ) + + @property + def raw_data_path(self): + r"""Path to save the raw dataset file.""" + return os.path.join(self.save_path, f"{self.split}.pickle") + + @property + def graph_path(self): + r"""Path to save the processed dataset file.""" + return os.path.join(self.save_path, f"processed_{self.split}.pkl") + + @property + def num_classes(self): + r"""Number of classes for each node.""" + return 81 + + def __len__(self): + r"""The number of examples in the dataset.""" + return len(self.graphs) + + def download(self): + zip_file_path = os.path.join( + self.raw_path, "coco_superpixels_" + self.construct_format + ".zip" + ) + path = download(self.url, path=zip_file_path, overwrite=True) + extract_archive(path, self.raw_path, overwrite=True) + makedirs(self.save_path) + os.rename( + os.path.join( + self.raw_path, "coco_superpixels_" + self.construct_format + ), + self.save_path, + ) + os.unlink(path) + + def label_remap(self): + # Util function to remap the labels as the original label + # idxs are not contiguous + # fmt: off + original_label_idx = [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 27, 28, 31, 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 46, 47, 48, 49, 50, 51, 52, 53, 54, 
55, 56, 57, + 58, 59, 60, 61, 62, 63, 64, 65, 67, 70, 72, 73, 74, 75, 76, 77, 78, + 79, 80, 81, 82, 84, 85, 86, 87, 88, 89, 90 + ] + # fmt: on + label_map = {} + for i, key in enumerate(original_label_idx): + label_map[key] = i + + return label_map + + def process(self): + with open(self.raw_data_path, "rb") as file: + graphs = pickle.load(file) + + label_map = self.label_remap() + + for idx in tqdm( + range(len(graphs)), desc=f"Processing {self.split} dataset" + ): + graph = graphs[idx] + + """ + Each `graph` is a tuple (x, edge_attr, edge_index, y) + Shape of x : [num_nodes, 14] + Shape of edge_attr : [num_edges, 1] or [num_edges, 2] + Shape of edge_index : [2, num_edges] + Shape of y : [num_nodes] + """ + + DGLgraph = dgl_graph( + (graph[2][0], graph[2][1]), + num_nodes=len(graph[3]), + ) + DGLgraph.ndata["feat"] = graph[0].to(F.float32) + DGLgraph.edata["feat"] = graph[1].to(F.float32) + + y = F.tensor(graph[3]) + + # Label remapping. See self.label_remap() func + for i, label in enumerate(y): + y[i] = label_map[label.item()] + + DGLgraph.ndata["label"] = y + self.graphs.append(DGLgraph) + + def load(self): + with open(self.graph_path, "rb") as file: + graphs = pickle.load(file) + self.graphs = graphs + + def save(self): + with open(os.path.join(self.graph_path), "wb") as file: + pickle.dump(self.graphs, file) + + def has_cache(self): + return os.path.exists(self.graph_path) + + def __getitem__(self, idx): + r"""Get the idx-th sample. + + Parameters + ---------- + idx : int or tensor + The sample index. + 1-D tensor as `idx` is allowed when transform is None. + + Returns + ------- + :class:`dgl.DGLGraph` + graph structure, node features, node labels and edge features. 
+ + - ``ndata['feat']``: node features + - ``ndata['label']``: node labels + - ``edata['feat']``: edge features + or + :class:`dgl.data.utils.Subset` + Subset of the dataset at specified indices + """ + if F.is_tensor(idx) and idx.dim() == 1: + if self._transform is None: + return Subset(self, idx.cpu()) + raise ValueError( + "Tensor idx not supported when transform is not None." + ) + + if self._transform is None: + return self.graphs[idx] + + return self._transform(self.graphs[idx]) diff --git a/tests/integration/test_data.py b/tests/integration/test_data.py index ebd9ab0d9379..586c4444b5f9 100644 --- a/tests/integration/test_data.py +++ b/tests/integration/test_data.py @@ -107,6 +107,23 @@ def test_VOC_superpixels(): assert g2.num_edges() - g1.num_edges() == g1.num_nodes() +@unittest.skipIf( + F._default_context_str == "gpu", + reason="Datasets don't need to be tested on GPU.", +) +@unittest.skipIf( + dgl.backend.backend_name != "pytorch", reason="only supports pytorch" +) +def test_COCO_superpixels(): + transform = dgl.AddSelfLoop(allow_duplicate=True) + dataset1 = data.COCOSuperpixelsDataset() + g1 = dataset1[0] + dataset2 = data.COCOSuperpixelsDataset(transform=transform) + g2 = dataset2[0] + + assert g2.num_edges() - g1.num_edges() == g1.num_nodes() + + @unittest.skipIf( F._default_context_str == "gpu", reason="Datasets don't need to be tested on GPU.",