feat: add a classifier for fashion-mnist with object storage support

yjoer · Jul 29, 2024 · db7fd27 · db7fd27
1 parent 8772ba2
commit db7fd27
Show file tree

Hide file tree

Showing 6 changed files with 216 additions and 2 deletions.
diff --git a/.env.example b/.env.example
@@ -0,0 +1,5 @@
+JP_LSP_VIRTUAL_DIR=.venv/.virtual_documents
+
+S3_ENDPOINT=
+S3_ACCESS_KEY_ID=
+S3_SECRET_ACCESS_KEY=
diff --git a/.gitignore b/.gitignore
@@ -4,6 +4,7 @@ __pycache__/
 node_modules/
 target/
 
+.env
 *.pyd
 Cargo.lock
 pnpm-lock.yaml

diff --git a/camp/datasets/mnist.py b/camp/datasets/mnist.py
@@ -0,0 +1,67 @@
+import gzip
+import struct
+from typing import Literal
+
+import fsspec
+import numpy as np
+import torch
+
+TensorType = Literal["np", "pt"]
+
+
+class FashionMNIST:
+    """Load the Fashion-MNIST dataset from gzip-compressed IDX files.
+
+    The four archives named in ``files`` are opened through fsspec, so they
+    can live wherever fsspec has a filesystem for the given ``path``.
+    """
+
+    # Keys of the returned dictionary mapped to the archive names under `path`.
+    files = {
+        "train": "train-images-idx3-ubyte.gz",
+        "train_labels": "train-labels-idx1-ubyte.gz",
+        "test": "t10k-images-idx3-ubyte.gz",
+        "test_labels": "t10k-labels-idx1-ubyte.gz",
+    }
+
+    # Magic numbers that open an IDX file of unsigned bytes with three
+    # dimensions (images) and with one dimension (labels).
+    _IMAGE_MAGIC = 0x00000803
+    _LABEL_MAGIC = 0x00000801
+
+    @staticmethod
+    def load(path: str, storage_options=None, return_tensors: TensorType = "pt"):
+        """Read, decompress, and decode all four dataset files.
+
+        Args:
+            path: Directory or URL prefix that holds the archives in ``files``.
+            storage_options: Keyword arguments forwarded to ``fsspec.open``.
+                ``None`` (the default) forwards nothing.
+            return_tensors: ``"np"`` for NumPy arrays, ``"pt"`` for tensors.
+
+        Returns:
+            A dict keyed by ``"train"``, ``"train_labels"``, ``"test"`` and
+            ``"test_labels"``. Image entries are uint8 with shape
+            ``(n_items, n_rows * n_cols)``; label entries are 1-D uint8.
+
+        Raises:
+            ValueError: If ``return_tensors`` is not ``"np"`` or ``"pt"``, or
+                if a file does not match the expected IDX layout.
+        """
+        # Reject a bad mode before any I/O instead of returning None afterwards.
+        if return_tensors not in ("np", "pt"):
+            raise ValueError(
+                f"return_tensors must be 'np' or 'pt', got {return_tensors!r}"
+            )
+
+        # A fresh dict per call; a `{}` default would be shared between calls.
+        options = {} if storage_options is None else storage_options
+
+        mnist = FashionMNIST()
+        buffers = mnist._load(path, options)
+        arrays = mnist._parse(buffers)
+
+        if return_tensors == "np":
+            return arrays
+
+        return mnist._to_tensor(arrays)
+
+    def _load(self, path: str, storage_options: dict):
+        """Fetch every archive in ``files`` and return its decompressed bytes.
+
+        Returns:
+            A dict with the keys of ``files`` and ``bytearray`` values.
+        """
+        # Avoid a doubled separator when the caller passes "prefix/".
+        base = path.rstrip("/")
+        buffers = {}
+
+        for k, v in self.files.items():
+            with fsspec.open(f"{base}/{v}", **storage_options) as f:
+                # A bytearray is a writable buffer, so the arrays that
+                # np.frombuffer creates over it are writable as well.
+                buffers[k] = bytearray(gzip.decompress(f.read()))
+
+        return buffers
+
+    def _parse(self, buffers: dict[str, bytes]):
+        """Decode raw IDX buffers into NumPy arrays.
+
+        The returned arrays are views over ``buffers``; no pixel or label
+        data is copied.
+
+        Raises:
+            ValueError: If a magic number is wrong or the payload size does
+                not match the counts stored in the header.
+        """
+        arrays = {}
+
+        for subset in ["train", "test"]:
+            buffer = buffers[subset]
+            header = struct.unpack_from(">IIII", buffer)
+            magic_number, n_items, n_rows, n_cols = header
+            self._check_magic(subset, magic_number, self._IMAGE_MAGIC)
+
+            # Skip the 16-byte header in place rather than slicing a copy.
+            images = np.frombuffer(buffer, dtype=np.uint8, offset=16)
+            self._check_size(subset, images.size, n_items * n_rows * n_cols)
+
+            arrays[subset] = images.reshape(n_items, n_rows * n_cols)
+
+        for subset in ["train_labels", "test_labels"]:
+            buffer = buffers[subset]
+            magic_number, n_items = struct.unpack_from(">II", buffer)
+            self._check_magic(subset, magic_number, self._LABEL_MAGIC)
+
+            # Label files carry an 8-byte header followed by one byte per item.
+            labels = np.frombuffer(buffer, dtype=np.uint8, offset=8)
+            self._check_size(subset, labels.size, n_items)
+
+            arrays[subset] = labels
+
+        return arrays
+
+    @staticmethod
+    def _check_magic(subset: str, actual: int, expected: int):
+        """Raise ``ValueError`` when a magic number is not the expected one."""
+        if actual != expected:
+            raise ValueError(
+                f"{subset}: unexpected IDX magic number {actual:#010x}, "
+                f"expected {expected:#010x}"
+            )
+
+    @staticmethod
+    def _check_size(subset: str, actual: int, expected: int):
+        """Raise ``ValueError`` when the payload length contradicts the header."""
+        if actual != expected:
+            raise ValueError(
+                f"{subset}: header announces {expected} bytes of data, "
+                f"found {actual}"
+            )
+
+    def _to_tensor(self, arrays: dict[str, np.ndarray]):
+        """Wrap every array in a tensor created by ``torch.from_numpy``."""
+        return {k: torch.from_numpy(v) for k, v in arrays.items()}
diff --git a/package.json b/package.json
@@ -1,7 +1,7 @@
 {
   "name": "@base/camp",
   "scripts": {
-    "jupyter": "cross-env-shell JP_LSP_VIRTUAL_DIR=.venv/.virtual_documents PYTHONPATH=$INIT_CWD rye run jupyter lab"
+    "jupyter": "cross-env-shell PYTHONPATH=$INIT_CWD rye --env-file .env run jupyter lab"
   },
   "devDependencies": {
     "@commitlint/cli": "^19.3.0",

diff --git a/pyproject.toml b/pyproject.toml
@@ -2,11 +2,13 @@
 name = "camp"
 dependencies = [
     "altair[all]>=5.3.0",
+    "fsspec>=2024.6.1",
     "gymnasium[atari,box2d,classic-control,mujoco,toy-text]>=0.29.1",
     "keras>=3.4.1",
     "matplotlib>=3.9.1",
     "numpy>=1.26.4",
     "pandas>=2.2.2",
+    "s3fs>=2024.6.1",
     "scikit-learn>=1.5.1",
     "scipy>=1.14.0",
     "seaborn>=0.13.2",
@@ -19,14 +21,23 @@ tensorflow = [
 ]
 torch = [
     "torch>=2.3.1",
-    "torchvision>=0.18.1",
+    "torchmetrics>=1.4.0.post0",
     "torch-directml>=0.2.3.dev240715",
 ]
 tree = [
     "catboost>=1.2.5",
     "lightgbm>=4.4.0",
     "xgboost>=2.1.0",
 ]
+vision = [
+    "mmcv>=2.2.0",
+    "mmdet>=3.3.0",
+    "mmengine>=0.10.4",
+    "supervision>=0.22.0",
+    "timm>=1.0.7",
+    "torchvision>=0.18.1",
+    "ultralytics>=8.2.67",
+]
 
 [build-system]
 requires = ["maturin>=1,<2"]
@@ -42,6 +53,7 @@ module-name = "camp.rs"
 
 [tool.mypy]
 cache_dir = ".venv/.mypy_cache"
+explicit_package_bases = true
 ignore_missing_imports = true
 allow_redefinition = true
 

diff --git a/vision/fashion_mnist.py b/vision/fashion_mnist.py
@@ -0,0 +1,129 @@
+# %%
+import os
+
+os.environ["KERAS_BACKEND"] = "torch"
+
+# %%
+import keras
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
+from torch.utils.data import DataLoader
+from torch.utils.data import TensorDataset
+from torchmetrics import Accuracy
+from torchmetrics import MetricCollection
+from torchmetrics import Precision
+from torchmetrics import Recall
+from torchvision.transforms import v2
+
+from camp.datasets.mnist import FashionMNIST
+
+# %load_ext autoreload
+# %autoreload 2
+
+# %% [markdown]
+# ## Sample
+
+# %%
+# Connection settings for the object store, taken from the environment.
+# A variable that is not set leaves None in its entry.
+storage_options = dict(
+    endpoint_url=os.getenv("S3_ENDPOINT"),
+    key=os.getenv("S3_ACCESS_KEY_ID"),
+    secret=os.getenv("S3_SECRET_ACCESS_KEY"),
+)
+
+# %%
+# `return_tensors` keeps its default, so the entries come back as tensors.
+dataset = FashionMNIST.load(
+    path="s3://datasets/fashion_mnist",
+    storage_options=storage_options,
+)
+
+# %% [markdown]
+# ## Modify
+
+# %%
+# Turn the flat rows into (N, 1, 28, 28) batches, then convert to float32;
+# `scale=True` asks ToDtype to rescale the values during the conversion.
+to_image_batch = v2.Lambda(lambda x: x.view(-1, 1, 28, 28))
+to_float = v2.ToDtype(torch.float32, scale=True)
+transforms = v2.Compose([to_image_batch, to_float])
+
+train_images, test_images = (transforms(dataset[k]) for k in ("train", "test"))
+
+# %%
+# Pair every image with its label so both are batched together.
+train_dataset = TensorDataset(train_images, dataset["train_labels"])
+test_dataset = TensorDataset(test_images, dataset["test_labels"])
+
+# %%
+batch_size = 32
+
+# Only the training data is shuffled; the test order stays fixed.
+train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
+test_dataloader = DataLoader(test_dataset, batch_size=batch_size)
+
+# %% [markdown]
+# ## Model
+
+# %%
+# Number of progress-bar steps per epoch; rounding up counts a final
+# partial batch as a step of its own.
+n_batches = np.int32(np.ceil(len(train_dataset) / batch_size))
+n_epochs = 2
+
+# %%
+# Two conv -> ReLU -> max-pool stages, flattened for the linear head.
+conv_layers = [
+    nn.Conv2d(1, 32, kernel_size=3, padding=1),
+    nn.ReLU(),
+    nn.MaxPool2d(kernel_size=2),
+    nn.Conv2d(32, 64, kernel_size=3, padding=1),
+    nn.ReLU(),
+    nn.MaxPool2d(kernel_size=2),
+    nn.Flatten(),
+]
+feature_extractor = nn.Sequential(*conv_layers)
+
+# 64 channels on a 7x7 grid (28 halved by each of the two pools) -> 10 logits.
+classifier = nn.Sequential(nn.Linear(64 * 7 * 7, 10))
+
+# %%
+# One optimizer updates the weights of both modules.
+parameters = [*feature_extractor.parameters(), *classifier.parameters()]
+optimizer = optim.Adam(parameters, lr=1e-3)
+
+# %%
+# The evaluation cell further down switches both modules to eval mode, so
+# switch back to training mode here in case this cell is run again after it.
+feature_extractor.train()
+classifier.train()
+
+for i in range(n_epochs):
+    print(f"Epoch: {i + 1}/{n_epochs}")
+
+    pbar = keras.utils.Progbar(n_batches)
+
+    # Count batches from 1 so the bar reaches `n_batches` on the last batch.
+    for steps, (images, labels) in enumerate(train_dataloader, start=1):
+        x = feature_extractor(images)
+        x = classifier(x)
+
+        loss = F.cross_entropy(x, labels)
+        # The reported loss is the one computed before this step's update.
+        pbar.update(steps, values=[("loss", loss.item())])
+
+        optimizer.zero_grad()
+        loss.backward()
+        optimizer.step()
+
+# %%
+# Accuracy is a single score; `average=None` keeps precision and recall
+# separate for each of the 10 classes.
+multiclass = {"task": "multiclass", "num_classes": 10}
+metrics = MetricCollection(
+    [
+        Accuracy(**multiclass),
+        Precision(**multiclass, average=None),
+        Recall(**multiclass, average=None),
+    ]
+)
+
+for module in (feature_extractor, classifier):
+    module.eval()
+
+# No gradients are needed while scoring the test set.
+with torch.no_grad():
+    for images, labels in test_dataloader:
+        logits = classifier(feature_extractor(images))
+
+        # The predicted class is the index of the largest logit in each row.
+        predictions = torch.max(logits, dim=1).indices
+        metrics.update(predictions, labels)
+
+metrics.compute()
+
+# %%