Skip to content

Commit

Permalink
feat: add a classifier for fashion-mnist with object storage support
Browse files Browse the repository at this point in the history
  • Loading branch information
yjoer committed Jul 29, 2024
1 parent 8772ba2 commit db7fd27
Show file tree
Hide file tree
Showing 6 changed files with 216 additions and 2 deletions.
5 changes: 5 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
JP_LSP_VIRTUAL_DIR=.venv/.virtual_documents

S3_ENDPOINT=
S3_ACCESS_KEY_ID=
S3_SECRET_ACCESS_KEY=
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ __pycache__/
node_modules/
target/

.env
*.pyd
Cargo.lock
pnpm-lock.yaml
Expand Down
67 changes: 67 additions & 0 deletions camp/datasets/mnist.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
import gzip
import struct
from typing import Literal

import fsspec
import numpy as np
import torch

TensorType = Literal["np", "pt"]


class FashionMNIST:
    """Reader for the four gzip-compressed IDX files of Fashion-MNIST.

    Files are opened through ``fsspec``, so ``path`` may be any location
    that fsspec can open with the given ``storage_options``.
    """

    # Subset name -> file name expected directly under the dataset path.
    files = {
        "train": "train-images-idx3-ubyte.gz",
        "train_labels": "train-labels-idx1-ubyte.gz",
        "test": "t10k-images-idx3-ubyte.gz",
        "test_labels": "t10k-labels-idx1-ubyte.gz",
    }

    @staticmethod
    def load(path: str, storage_options=None, return_tensors: TensorType = "pt"):
        """Load all four subsets from ``path``.

        Args:
            path: Directory (local path or fsspec URL) containing the files
                listed in ``FashionMNIST.files``.
            storage_options: Keyword arguments forwarded to ``fsspec.open``.
                ``None`` (the default) means no extra options.
            return_tensors: ``"np"`` for NumPy arrays, ``"pt"`` for torch
                tensors.

        Returns:
            A dict keyed by ``"train"``, ``"test"``, ``"train_labels"`` and
            ``"test_labels"``. Images are ``uint8`` with shape
            ``(n_items, n_rows * n_cols)``; labels are 1-D ``uint8``.

        Raises:
            ValueError: If ``return_tensors`` is neither ``"np"`` nor ``"pt"``.
        """
        # Validate before any I/O so a typo does not cost a full download
        # only to end in a silent ``None`` result.
        if return_tensors not in ("np", "pt"):
            raise ValueError(
                f"return_tensors must be 'np' or 'pt', got {return_tensors!r}"
            )

        # A fresh dict per call; a ``{}`` default would be shared across calls.
        options = {} if storage_options is None else storage_options

        mnist = FashionMNIST()
        buffers = mnist._load(path, options)
        arrays = mnist._parse(buffers)

        if return_tensors == "np":
            return arrays

        return mnist._to_tensor(arrays)

    def _load(self, path: str, storage_options: dict):
        """Read and gunzip every file in ``files``; return subset -> bytearray."""
        buffers = {}

        for subset, filename in self.files.items():
            with fsspec.open(f"{path}/{filename}", **storage_options) as f:
                # A mutable bytearray makes the NumPy views created in
                # ``_parse`` writable (views over ``bytes`` are read-only).
                buffers[subset] = bytearray(gzip.decompress(f.read()))

        return buffers

    def _parse(self, buffers: dict[str, bytes]):
        """Decode the IDX payloads in ``buffers`` into ``uint8`` arrays.

        The returned arrays are views onto the given buffers (no copy).
        """
        arrays = {}

        for subset in ("train", "test"):
            buffer = buffers[subset]
            # Image header: four big-endian uint32 values.
            _magic, n_items, n_rows, n_cols = struct.unpack_from(">IIII", buffer, 0)

            # ``offset`` skips the header without copying the payload.
            images = np.frombuffer(buffer, dtype=np.uint8, offset=16)
            arrays[subset] = images.reshape(n_items, n_rows * n_cols)

        for subset in ("train_labels", "test_labels"):
            buffer = buffers[subset]
            # Label header: two big-endian uint32 values. They are not used,
            # but unpacking still rejects a buffer too short to hold them.
            _magic, _n_items = struct.unpack_from(">II", buffer, 0)
            arrays[subset] = np.frombuffer(buffer, dtype=np.uint8, offset=8)

        return arrays

    def _to_tensor(self, arrays: dict[str, np.ndarray]):
        """Wrap each array as a torch tensor via ``torch.from_numpy``."""
        return {name: torch.from_numpy(array) for name, array in arrays.items()}
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"name": "@base/camp",
"scripts": {
"jupyter": "cross-env-shell JP_LSP_VIRTUAL_DIR=.venv/.virtual_documents PYTHONPATH=$INIT_CWD rye run jupyter lab"
"jupyter": "cross-env-shell PYTHONPATH=$INIT_CWD rye --env-file .env run jupyter lab"
},
"devDependencies": {
"@commitlint/cli": "^19.3.0",
Expand Down
14 changes: 13 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,13 @@
name = "camp"
dependencies = [
"altair[all]>=5.3.0",
"fsspec>=2024.6.1",
"gymnasium[atari,box2d,classic-control,mujoco,toy-text]>=0.29.1",
"keras>=3.4.1",
"matplotlib>=3.9.1",
"numpy>=1.26.4",
"pandas>=2.2.2",
"s3fs>=2024.6.1",
"scikit-learn>=1.5.1",
"scipy>=1.14.0",
"seaborn>=0.13.2",
Expand All @@ -19,14 +21,23 @@ tensorflow = [
]
torch = [
"torch>=2.3.1",
"torchvision>=0.18.1",
"torchmetrics>=1.4.0.post0",
"torch-directml>=0.2.3.dev240715",
]
tree = [
"catboost>=1.2.5",
"lightgbm>=4.4.0",
"xgboost>=2.1.0",
]
vision = [
"mmcv>=2.2.0",
"mmdet>=3.3.0",
"mmengine>=0.10.4",
"supervision>=0.22.0",
"timm>=1.0.7",
"torchvision>=0.18.1",
"ultralytics>=8.2.67",
]

[build-system]
requires = ["maturin>=1,<2"]
Expand All @@ -42,6 +53,7 @@ module-name = "camp.rs"

[tool.mypy]
cache_dir = ".venv/.mypy_cache"
explicit_package_bases = true
ignore_missing_imports = true
allow_redefinition = true

Expand Down
129 changes: 129 additions & 0 deletions vision/fashion_mnist.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
# %%
import os

# Select the PyTorch backend for Keras. This assignment sits before the
# `import keras` in the next cell because Keras reads KERAS_BACKEND when it
# is imported.
os.environ["KERAS_BACKEND"] = "torch"

# %%
import keras
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset
from torchmetrics import Accuracy
from torchmetrics import MetricCollection
from torchmetrics import Precision
from torchmetrics import Recall
from torchvision.transforms import v2

from camp.datasets.mnist import FashionMNIST

# %load_ext autoreload
# %autoreload 2

# %% [markdown]
# ## Sample

# %%
# Endpoint and credentials are read from the environment; any variable that
# is not set is passed through as None.
storage_options = dict(
    endpoint_url=os.environ.get("S3_ENDPOINT"),
    key=os.environ.get("S3_ACCESS_KEY_ID"),
    secret=os.environ.get("S3_SECRET_ACCESS_KEY"),
)

# %%
dataset = FashionMNIST.load("s3://datasets/fashion_mnist", storage_options)

# %% [markdown]
# ## Modify

# %%
# View each flat row as a single-channel 28x28 image, then convert to float32
# with value scaling enabled.
transforms = v2.Compose(
    [
        v2.Lambda(lambda flat: flat.view(-1, 1, 28, 28)),
        v2.ToDtype(torch.float32, scale=True),
    ]
)

train_images, test_images = (
    transforms(dataset[split]) for split in ("train", "test")
)

# %%
# Pair every image tensor with its label tensor.
train_dataset = TensorDataset(train_images, dataset["train_labels"])
test_dataset = TensorDataset(test_images, dataset["test_labels"])

# %%
batch_size = 32

# Only the training data is shuffled.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

# %% [markdown]
# ## Model

# %%
# Mini-batches per epoch, rounded up so a final partial batch is counted;
# used as the progress-bar target in the training loop.
n_batches = np.int32(np.ceil(len(train_dataset) / batch_size))
n_epochs = 2

# %%
# Two conv -> ReLU -> 2x2 max-pool stages, then flatten. With padding=1 the
# convolutions keep the spatial size, so 28x28 inputs end up as 64 x 7 x 7.
feature_extractor = nn.Sequential(
    nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, padding=1),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=2),
    nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=2),
    nn.Flatten(),
)

# Single linear layer mapping the flattened features to 10 class scores.
classifier = nn.Sequential(nn.Linear(in_features=64 * 7 * 7, out_features=10))

# %%
# One optimizer updates both modules.
parameters = [*feature_extractor.parameters(), *classifier.parameters()]
optimizer = optim.Adam(parameters, lr=1e-3)

# %%
for epoch in range(n_epochs):
    print(f"Epoch: {epoch + 1}/{n_epochs}")

    progress = keras.utils.Progbar(n_batches)

    # Steps are 1-based so the bar reaches its target on the last batch.
    for step, (images, labels) in enumerate(train_dataloader, start=1):
        logits = classifier(feature_extractor(images))

        loss = F.cross_entropy(logits, labels)
        progress.update(step, values=[("loss", loss.item())])

        # Clear old gradients, backpropagate, then apply the update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

# %%
# Overall accuracy plus per-class precision and recall (average=None).
multiclass = {"task": "multiclass", "num_classes": 10}

metrics = MetricCollection(
    [
        Accuracy(**multiclass),
        Precision(**multiclass, average=None),
        Recall(**multiclass, average=None),
    ]
)

# Put both modules in evaluation mode before scoring the test set.
for module in (feature_extractor, classifier):
    module.eval()

with torch.no_grad():
    for images, labels in test_dataloader:
        logits = classifier(feature_extractor(images))

        # Predicted class = index of the largest score in each row.
        predictions = torch.max(logits, dim=1).indices
        metrics.update(predictions, labels)

metrics.compute()

# %%

0 comments on commit db7fd27

Please sign in to comment.