From e7de0a7d1802dc29b088ab00ef1f69f865a5f477 Mon Sep 17 00:00:00 2001 From: Matthew Archer <36638242+ma595@users.noreply.github.com> Date: Tue, 9 Jul 2024 14:54:31 +0100 Subject: [PATCH 1/2] Define a simpler PenguinDataset in Exercise 1 notebook (#73) * Update the classification solutions notebook to utilise the PenguinDataset defined within the notebook itself * Add solution outputs * Add 'task 4' to exercises * Add docstrings to class and re-add the Compose object approach as an n optional exercise * Add e * Improve task 2 text * Improve task 3 text and remove x_tfms and y_tfms comments * Fix Task 4 comment * Fix Task 3 comment with more on PyTorch Tensors --- exercises/01_penguin_classification.ipynb | 176 +++++++++- .../01_penguin_classification_solutions.ipynb | 315 ++++++++++++++---- 2 files changed, 410 insertions(+), 81 deletions(-) diff --git a/exercises/01_penguin_classification.ipynb b/exercises/01_penguin_classification.ipynb index 95eb953..cf532cb 100644 --- a/exercises/01_penguin_classification.ipynb +++ b/exercises/01_penguin_classification.ipynb @@ -55,9 +55,13 @@ "source": [ "### Task 2: creating a ``torch.utils.data.Dataset``\n", "\n", + "The penguin data reading and processing can be encapsulated in a PyTorch dataset class.\n", + "\n", + "- Why is a class representation helpful?\n", + "\n", "All PyTorch dataset objects are subclasses of the ``torch.utils.data.Dataset`` class. 
To make a custom dataset, create a class which inherits from the ``Dataset`` class, implement some methods (the Python magic (or dunder) methods ``__len__`` and ``__getitem__``) and supply some data.\n", "\n", - "Spoiler alert: we've done this for you already in ``src/ml_workshop/_penguins.py``.\n", + "Spoiler alert: we've done this for you already below (see ``src/ml_workshop/_penguins.py`` for a more sophisticated implementation)\n", "\n", "- Open the file ``src/ml_workshop/_penguins.py``.\n", "- Let's examine, and discuss, each of the methods together.\n", @@ -70,9 +74,118 @@ "- Review and discuss the class arguments.\n", " - ``input_keys``— ...\n", " - ``target_keys``— ...\n", - " - ``train``— ...\n", - " - ``x_tfms``— ...\n", - " - ``y_tfms``— ..." + " - ``train``— ..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from typing import List, Tuple, Any\n", + "\n", + "# import some useful functions here, see https://pytorch.org/docs/stable/torch.html\n", + "# where `tensor` and `eye` are used for constructing tensors,\n", + "# and using a lower-precision float32 is advised for performance\n", + "# Task 4: add imports here\n", + "# from torch import tensor, eye, float32\n", + "\n", + "from torch.utils.data import Dataset\n", + "\n", + "from palmerpenguins import load_penguins\n", + "\n", + "\n", + "class PenguinDataset(Dataset):\n", + " \"\"\"Penguin dataset class.\n", + "\n", + " Parameters\n", + " ----------\n", + " input_keys : List[str]\n", + " The column titles to use in the input feature vectors.\n", + " target_keys : List[str]\n", + " The column titles to use in the target feature vectors.\n", + " train : bool\n", + " If ``True``, this object will serve as the training set, and if\n", + " ``False``, the validation set.\n", + "\n", + " Notes\n", + " -----\n", + " The validation split contains 10 male and 10 female penguins of each\n", + " species.\n", + "\n", + " \"\"\"\n", + "\n", + " def 
__init__(\n", + " self,\n", + " input_keys: List[str],\n", + " target_keys: List[str],\n", + " train: bool,\n", + " ):\n", + " \"\"\"Build ``PenguinDataset``.\"\"\"\n", + " self.input_keys = input_keys\n", + " self.target_keys = target_keys\n", + "\n", + " data = load_penguins()\n", + " data = (\n", + " data.loc[~data.isna().any(axis=1)]\n", + " .sort_values(by=sorted(data.keys()))\n", + " .reset_index(drop=True)\n", + " )\n", + " # Transform the sex field into a float, with male represented by 1.0, female by 0.0\n", + " data.sex = (data.sex == \"male\").astype(float)\n", + " self.full_df = data\n", + "\n", + " valid_df = self.full_df.groupby(by=[\"species\", \"sex\"]).sample(\n", + " n=10,\n", + " random_state=123,\n", + " )\n", + " # The training items are simply the items *not* in the valid split\n", + " train_df = self.full_df.loc[~self.full_df.index.isin(valid_df.index)]\n", + "\n", + " self.split = {\"train\": train_df, \"valid\": valid_df}[\n", + " \"train\" if train is True else \"valid\"\n", + " ]\n", + "\n", + " def __len__(self) -> int:\n", + " \"\"\"Return the length of requested split.\n", + "\n", + " Returns\n", + " -------\n", + " int\n", + " The number of items in the dataset.\n", + "\n", + " \"\"\"\n", + " return len(self.split)\n", + "\n", + " def __getitem__(self, idx: int) -> Tuple[Any, Any]:\n", + " \"\"\"Return an input-target pair.\n", + "\n", + " Parameters\n", + " ----------\n", + " idx : int\n", + " Index of the input-target pair to return.\n", + "\n", + " Returns\n", + " -------\n", + " in_feats : Any\n", + " Inputs.\n", + " target : Any\n", + " Targets.\n", + "\n", + " \"\"\"\n", + " # get the row index (idx) from the dataframe and\n", + " # select relevant column features (provided as input_keys)\n", + " feats = tuple(self.split.iloc[idx][self.input_keys])\n", + "\n", + " # this gives a 'species' i.e. 
one of ('Gentoo',), ('Chinstrap',), or ('Adelie',)\n", + " tgts = tuple(self.split.iloc[idx][self.target_keys])\n", + "\n", + " # Task 4 - Exercise #1: convert the features to PyTorch Tensors\n", + "\n", + " # Task 4 - Exercise #2: convert target to a 'one-hot' vector.\n", + "\n", + " return feats, tgts" ] }, { @@ -97,8 +210,6 @@ "metadata": {}, "outputs": [], "source": [ - "from ml_workshop import PenguinDataset\n", - "\n", "data_set = PenguinDataset(\n", " input_keys=[\"bill_length_mm\", \"body_mass_g\"],\n", " target_keys=[\"species\"],\n", @@ -117,7 +228,12 @@ "source": [ "- Can we give these items to a neural network, or do they need to be transformed first?\n", " - Short answer: no, we can't just pass tuples of numbers or strings to a neural network.\n", - " - We must represent these data as ``torch.Tensor``s." + " - We must represent these data as ``torch.Tensor``s. This is the fundamental data abstraction used by PyTorch; they are the PyTorch equivalent to Numpy arrays, while also providing support for GPU acceleration. See [pytorch tensors documentation](https://pytorch.org/tutorials/beginner/introyt/tensors_deeper_tutorial.html).\n", + " - The targets are tuples of strings i.e. ('Gentoo', )\n", + " - One idea is to represent as ordinal values i.e. [1] or [2] or [3]. But this implies that the class encoded by value 1 is closer to 2 than 1 is to 3. This is not desirable for categorical data. One-hot encoding avoids this by representing each species independently.\\\n", + " \"A\" — [1, 0, 0]\\\n", + " \"B\" — [0, 1, 0]\\\n", + " \"C\" — [0, 0, 1]" ] }, { @@ -126,14 +242,46 @@ "source": [ "### Task 4: Applying transforms to the data\n", "\n", - "A common way of transforming inputs to neural networks is to apply a series of transforms using ``torchvision.transforms.Compose``. 
The [``Compose``](https://pytorch.org/vision/stable/generated/torchvision.transforms.Compose.html) object takes a list of callable objects (i.e., functions) and applies them to the incoming data.\n", + "Modify the `PenguinDataset` class above so that the tuples of numbers are converted to PyTorch `torch.Tensor` s and the string targets are converted to one-hot vectors.\n", + "\n", + "- Begin by importing relevant PyTorch functions.\n", + "- Apply transformations inside `__getitem__()` function above.\n", + "\n", + "Then create a training and validation set.\n", + "\n", + " - We allow the model to learn directly from the training set—i.e. we fit the function to these data.\n", + " - During training, we monitor the model's performance on the validation set in order to check how it's doing on unseen data. Normally, people use the validation performance to determine when to stop the training process.\n", + " \n", + "For the validation set, we choose ten males and ten females of each species. This means the validation set is less likely to be biased by sex and species, and is potentially a more reliable measure of performance. You should always be _very_ careful when choosing metrics and splitting data.\n", + "\n", + "- Is this solution general?\n", "\n", - "These transforms can be very useful for mapping between file paths and tensors of images, etc.\n", + "A common way of transforming inputs to neural networks is to apply a series of transforms using `torchvision.transforms.Compose`. The [ `Compose` ](https://pytorch.org/vision/stable/generated/torchvision.transforms.Compose.html) object takes a list of callable objects and applies them to the incoming data. See how this is done more generally in the `src/ml_workshop/_penguins.py` file. 
\n", + "\n", + "These transforms can be very useful for mapping between file paths and tensors of images, etc.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Apply transforms we need to PenguinDataset to convert input data and target class to tensors. \n", + "# See Task 4 exercise comments above.\n", + "\n", + "# Create train_set\n", + "\n", + "# Create valid_set\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### (Optional) Task 4b: \n", "\n", - "- Note: here we create a training and validation set.\n", - " - We allow the model to learn directly from the training set — i.e. we fit the function to these data.\n", - " - During training, we monitor the model's performance on the validation set in order to check how it's doing on unseen data. Normally, people use the validation performance to determine when to stop the training process.\n", - "- For the validation set, we choose ten males and ten females of each species. This means the validation set is less likely to be biased by sex and species, and is potentially a more reliable measure of performance. You should always be _very_ careful when choosing metrics and splitting data." + "Apply the `torchvision.transforms.Compose` transformations instead of hardcoding as above. 
" ] }, { @@ -417,7 +565,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.4" + "version": "3.12.2" } }, "nbformat": 4, diff --git a/worked-solutions/01_penguin_classification_solutions.ipynb b/worked-solutions/01_penguin_classification_solutions.ipynb index fcab877..25b6f49 100644 --- a/worked-solutions/01_penguin_classification_solutions.ipynb +++ b/worked-solutions/01_penguin_classification_solutions.ipynb @@ -108,9 +108,16 @@ "source": [ "### Task 2: creating a ``torch.utils.data.Dataset``\n", "\n", + "The penguin data reading and processing can be encapsulated in a PyTorch dataset class.\n", + "\n", + "- Why is a class representation helpful?\n", + " - Modularity - Separation of concerns makes the code easier to understand, maintain and test.\n", + " - Maintainability - Changes are localised, therefore we only need to change a single file to update. \n", + " - Abstraction - Users do not need to know how the data is read or processed, they only need to know how to interact with the class. \n", + "\n", "All PyTorch dataset objects are subclasses of the ``torch.utils.data.Dataset`` class. 
To make a custom dataset, create a class which inherits from the ``Dataset`` class, implement some methods (the Python magic (or dunder) methods ``__len__`` and ``__getitem__``) and supply some data.\n", "\n", - "Spoiler alert: we've done this for you already in ``src/ml_workshop/_penguins.py``.\n", + "Spoiler alert: we've done this for you already below (see ``src/ml_workshop/_penguins.py`` for a more sophisticated implementation)\n", "\n", "- Open the file ``src/ml_workshop/_penguins.py``.\n", "- Let's examine, and discuss, each of the methods together.\n", @@ -123,9 +130,120 @@ "- Review and discuss the class arguments.\n", " - ``input_keys``— A sequence of strings telling the data set which objects to return as inputs to the model.\n", " - ``target_keys``— Same as ``input_keys`` but specifying the targets.\n", - " - ``train``— A boolean variable determining if the model returns the training or validation split (``True`` for training).\n", - " - ``x_tfms``— A ``Compose`` object with functions which will convert the raw input to a tensor. This argument is _optional_.\n", - " - ``y_tfms``— A ``Compose`` object with functions which will convert the raw target to a tensor. This argument is _optional_." + " - ``train``— A boolean variable determining if the model returns the training or validation split (``True`` for training)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from typing import List, Tuple, Any\n", + "\n", + "# import some useful functions here, see https://pytorch.org/docs/stable/torch.html\n", + "# where `tensor` and `eye` are used for constructing tensors,\n", + "# and using a lower-precision float32 is advised for performance\n", + "from torch import tensor, float32, eye\n", + "\n", + "from torch.utils.data import Dataset\n", + "\n", + "from palmerpenguins import load_penguins\n", + "\n", + "\n", + "class PenguinDataset(Dataset):\n", + " \"\"\"Penguin dataset class.\n", + "\n", + " Parameters\n", + " ----------\n", + " input_keys : List[str]\n", + " The column titles to use in the input feature vectors.\n", + " target_keys : List[str]\n", + " The column titles to use in the target feature vectors.\n", + " train : bool\n", + " If ``True``, this object will serve as the training set, and if\n", + " ``False``, the validation set.\n", + "\n", + " Notes\n", + " -----\n", + " The validation split contains 10 male and 10 female penguins of each\n", + " species.\n", + "\n", + " \"\"\"\n", + "\n", + " def __init__(\n", + " self,\n", + " input_keys: List[str],\n", + " target_keys: List[str],\n", + " train: bool,\n", + " ):\n", + " \"\"\"Build ``PenguinDataset``.\"\"\"\n", + " self.input_keys = input_keys\n", + " self.target_keys = target_keys\n", + "\n", + " data = load_penguins()\n", + " data = (\n", + " data.loc[~data.isna().any(axis=1)]\n", + " .sort_values(by=sorted(data.keys()))\n", + " .reset_index(drop=True)\n", + " )\n", + " # Transform the sex field into a float, with male represented by 1.0, female by 0.0\n", + " data.sex = (data.sex == \"male\").astype(float)\n", + " self.full_df = data\n", + "\n", + " valid_df = self.full_df.groupby(by=[\"species\", \"sex\"]).sample(\n", + " n=10,\n", + " random_state=123,\n", + " )\n", + " # The training items are simply the items *not* in the valid split\n", + " 
train_df = self.full_df.loc[~self.full_df.index.isin(valid_df.index)]\n", + "\n", + " self.split = {\"train\": train_df, \"valid\": valid_df}[\n", + " \"train\" if train is True else \"valid\"\n", + " ]\n", + "\n", + " def __len__(self) -> int:\n", + " \"\"\"Return the length of requested split.\n", + "\n", + " Returns\n", + " -------\n", + " int\n", + " The number of items in the dataset.\n", + "\n", + " \"\"\"\n", + " return len(self.split)\n", + "\n", + " def __getitem__(self, idx: int) -> Tuple[Any, Any]:\n", + " \"\"\"Return an input-target pair.\n", + "\n", + " Parameters\n", + " ----------\n", + " idx : int\n", + " Index of the input-target pair to return.\n", + "\n", + " Returns\n", + " -------\n", + " in_feats : Any\n", + " Inputs.\n", + " target : Any\n", + " Targets.\n", + "\n", + " \"\"\"\n", + " # get the row index (idx) from the dataframe and\n", + " # select relevant column features (provided as input_keys)\n", + " feats = tuple(self.split.iloc[idx][self.input_keys])\n", + "\n", + " # this gives a 'species' i.e. 
one of ('Gentoo',), ('Chinstrap',), or ('Adelie',)\n", + " tgts = tuple(self.split.iloc[idx][self.target_keys])\n", + "\n", + " # Task 4 - Exercise #1: convert the features to PyTorch Tensors\n", + " feats = tensor(feats, dtype=float32)\n", + "\n", + " # Task 4 - Exercise #2: convert target to a 'one-hot' vector.\n", + " target_names = sorted(self.full_df.species.unique())\n", + " tgts = eye(len(target_names))[target_names.index(tgts[0])]\n", + "\n", + " return feats, tgts" ] }, { @@ -146,39 +264,37 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "(42.9, 13.1, 5000.0, 215.0, 0.0) ('Gentoo',)\n", - "(46.1, 13.2, 4500.0, 211.0, 0.0) ('Gentoo',)\n", - "(44.9, 13.3, 5100.0, 213.0, 0.0) ('Gentoo',)\n", - "(43.3, 13.4, 4400.0, 209.0, 0.0) ('Gentoo',)\n", - "(42.0, 13.5, 4150.0, 210.0, 0.0) ('Gentoo',)\n", - "(46.5, 13.5, 4550.0, 210.0, 0.0) ('Gentoo',)\n", - "(44.0, 13.6, 4350.0, 208.0, 0.0) ('Gentoo',)\n", - "(40.9, 13.7, 4650.0, 214.0, 0.0) ('Gentoo',)\n", - "(42.6, 13.7, 4950.0, 213.0, 0.0) ('Gentoo',)\n", - "(42.7, 13.7, 3950.0, 208.0, 0.0) ('Gentoo',)\n", - "(45.3, 13.7, 4300.0, 210.0, 0.0) ('Gentoo',)\n", - "(47.2, 13.7, 4925.0, 214.0, 0.0) ('Gentoo',)\n", - "(45.2, 13.8, 4750.0, 215.0, 0.0) ('Gentoo',)\n", - "(43.6, 13.9, 4900.0, 217.0, 0.0) ('Gentoo',)\n", - "(43.8, 13.9, 4300.0, 208.0, 0.0) ('Gentoo',)\n", - "(45.5, 13.9, 4200.0, 210.0, 0.0) ('Gentoo',)\n", - "(45.7, 13.9, 4400.0, 214.0, 0.0) ('Gentoo',)\n", - "(43.3, 14.0, 4575.0, 208.0, 0.0) ('Gentoo',)\n", - "(47.5, 14.0, 4875.0, 212.0, 0.0) ('Gentoo',)\n", - "(46.2, 14.1, 4375.0, 217.0, 0.0) ('Gentoo',)\n" + "tensor([ 42.9000, 13.1000, 5000.0000, 215.0000, 0.0000]) tensor([0., 0., 1.])\n", + "tensor([ 46.1000, 13.2000, 4500.0000, 211.0000, 0.0000]) tensor([0., 0., 1.])\n", + "tensor([ 44.9000, 13.3000, 5100.0000, 213.0000, 0.0000]) tensor([0., 0., 1.])\n", + "tensor([ 43.3000, 13.4000, 
4400.0000, 209.0000, 0.0000]) tensor([0., 0., 1.])\n", + "tensor([ 42.0000, 13.5000, 4150.0000, 210.0000, 0.0000]) tensor([0., 0., 1.])\n", + "tensor([ 46.5000, 13.5000, 4550.0000, 210.0000, 0.0000]) tensor([0., 0., 1.])\n", + "tensor([ 44.0000, 13.6000, 4350.0000, 208.0000, 0.0000]) tensor([0., 0., 1.])\n", + "tensor([ 40.9000, 13.7000, 4650.0000, 214.0000, 0.0000]) tensor([0., 0., 1.])\n", + "tensor([ 42.6000, 13.7000, 4950.0000, 213.0000, 0.0000]) tensor([0., 0., 1.])\n", + "tensor([ 42.7000, 13.7000, 3950.0000, 208.0000, 0.0000]) tensor([0., 0., 1.])\n", + "tensor([ 45.3000, 13.7000, 4300.0000, 210.0000, 0.0000]) tensor([0., 0., 1.])\n", + "tensor([ 47.2000, 13.7000, 4925.0000, 214.0000, 0.0000]) tensor([0., 0., 1.])\n", + "tensor([ 45.2000, 13.8000, 4750.0000, 215.0000, 0.0000]) tensor([0., 0., 1.])\n", + "tensor([ 43.6000, 13.9000, 4900.0000, 217.0000, 0.0000]) tensor([0., 0., 1.])\n", + "tensor([ 43.8000, 13.9000, 4300.0000, 208.0000, 0.0000]) tensor([0., 0., 1.])\n", + "tensor([ 45.5000, 13.9000, 4200.0000, 210.0000, 0.0000]) tensor([0., 0., 1.])\n", + "tensor([ 45.7000, 13.9000, 4400.0000, 214.0000, 0.0000]) tensor([0., 0., 1.])\n", + "tensor([ 43.3000, 14.0000, 4575.0000, 208.0000, 0.0000]) tensor([0., 0., 1.])\n", + "tensor([ 47.5000, 14.0000, 4875.0000, 212.0000, 0.0000]) tensor([0., 0., 1.])\n", + "tensor([ 46.2000, 14.1000, 4375.0000, 217.0000, 0.0000]) tensor([0., 0., 1.])\n" ] } ], "source": [ - "from ml_workshop import PenguinDataset\n", - "\n", "features = [\n", " \"bill_length_mm\",\n", " \"bill_depth_mm\",\n", @@ -196,16 +312,21 @@ ")\n", "\n", "for _, (input_feats, target) in zip(range(20), data_set):\n", - " print(input_feats, target)" + " print(input_feats, target)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "- Can we give these items to a neural network, or do they need to be transformed first?\n", - " - Short answer: no, we can't just pass tuples of numbers or strings to a neural network.\n", - " - We must represent 
these data as ``torch.Tensor``s." + "* Can we give these items to a neural network, or do they need to be transformed first?\n", + " + Short answer: no, we can't just pass tuples of numbers or strings to a neural network.\n", + " - We must represent these data as ``torch.Tensor``s. This is the fundamental data abstraction used by PyTorch; they are the PyTorch equivalent to Numpy arrays. See [pytorch tensors documentation](https://pytorch.org/tutorials/beginner/introyt/tensors_deeper_tutorial.html) . \n", + " - The targets are tuples of strings i.e. ('Gentoo', )\n", + " - One idea is to represent as ordinal values i.e. [1] or [2] or [3]. But this implies that the class encoded by value 1 is closer to 2 than 1 is to 3. This is not desirable for categorical data. One-hot encoding avoids this by representing each species independently.\\\n", + " \"A\" — [1, 0, 0]\\\n", + " \"B\" — [0, 1, 0]\\\n", + " \"C\" — [0, 0, 1]\n" ] }, { @@ -214,19 +335,77 @@ "source": [ "### Task 4: Applying transforms to the data\n", "\n", - "A common way of transforming inputs to neural networks is to apply a series of transforms using ``torchvision.transforms.Compose``. The [``Compose``](https://pytorch.org/vision/stable/generated/torchvision.transforms.Compose.html) object takes a list of callable objects and applies them to the incoming data.\n", + "Modify the `PenguinDataset` class above so that the tuples of numbers are converted to PyTorch `torch.Tensor` s and the string targets are converted to one-hot vectors.\n", + "\n", + "- Begin by importing relevant PyTorch functions.\n", + "- Apply transformations inside `__getitem__()` function above.\n", + "\n", + "Then create a training and validation set.\n", "\n", - "These transforms can be very useful for mapping between file paths and tensors of images, etc.\n", + " - We allow the model to learn directly from the training set—i.e. 
we fit the function to these data.\n", + " - During training, we monitor the model's performance on the validation set in order to check how it's doing on unseen data. Normally, people use the validation performance to determine when to stop the training process.\n", + " \n", + "For the validation set, we choose ten males and ten females of each species. This means the validation set is less likely to be biased by sex and species, and is potentially a more reliable measure of performance. You should always be _very_ careful when choosing metrics and splitting data.\n", "\n", - "- Note: here we create a training and validation set.\n", - " - We allow the model to learn directly from the training set—i.e. we fit the function to these data.\n", - " - During training, we monitor the model's performance on the validation set in order to check how it's doing on unseen data. Normally, people use the validation performance to determine when to stop the training process.\n", - "- For the validation set, we choose ten males and ten females of each species. This means the validation set is less likely to be biased by sex and species, and is potentially a more reliable measure of performance. You should always be _very_ careful when choosing metrics and splitting data." + "- Is this solution general?\n", + " - No. The transformations have been hardcoded. A more flexible way of transforming inputs to neural networks is to apply a series of transforms using `torchvision.transforms.Compose`. The [ `Compose` ](https://pytorch.org/vision/stable/generated/torchvision.transforms.Compose.html) object takes a list of callable objects and applies them to the incoming data. See how this is done more generally in the `src/ml_workshop/_penguins.py` file. \n", + "\n", + "These transforms can be very useful for mapping between file paths and tensors of images, etc." 
] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor([ 42.9000, 13.1000, 5000.0000, 215.0000, 0.0000]) tensor([0., 0., 1.])\n", + "tensor([ 46.1000, 13.2000, 4500.0000, 211.0000, 0.0000]) tensor([0., 0., 1.])\n", + "tensor([ 44.9000, 13.3000, 5100.0000, 213.0000, 0.0000]) tensor([0., 0., 1.])\n", + "tensor([ 43.3000, 13.4000, 4400.0000, 209.0000, 0.0000]) tensor([0., 0., 1.])\n", + "tensor([ 42.0000, 13.5000, 4150.0000, 210.0000, 0.0000]) tensor([0., 0., 1.])\n" + ] + } + ], + "source": [ + "# Apply transforms we need to PenguinDataset to convert input data and target class to tensors. \n", + "# See Task 4 exercise comments above.\n", + "\n", + "\n", + "# Create train_set\n", + "train_set = PenguinDataset(\n", + " input_keys=features,\n", + " target_keys=[\"species\"],\n", + " train=True,\n", + ")\n", + "\n", + "# Create valid_set\n", + "valid_set = PenguinDataset(\n", + " input_keys=features,\n", + " target_keys=[\"species\"],\n", + " train=False,\n", + ")\n", + "\n", + "\n", + "for _, (input_feats, target) in zip(range(5), train_set):\n", + " print(input_feats, target)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### (Optional) Task 4b: \n", + "\n", + "Apply the `torchvision.transforms.Compose` transformations instead of hardcoding as above. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -244,6 +423,8 @@ "source": [ "from torchvision.transforms import Compose\n", "\n", + "from ml_workshop import PenguinDataset\n", + "\n", "# import some useful functions here, see https://pytorch.org/docs/stable/torch.html\n", "# where `tensor` and `eye` are used for constructing tensors,\n", "# and using a lower-precision float32 is advised for performance\n", @@ -336,7 +517,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -387,7 +568,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -485,7 +666,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -507,7 +688,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -535,7 +716,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -692,34 +873,34 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Epoch 0-25 time: 1.934173 seconds\n", - "Epoch 25-50 time: 1.844448 seconds\n", - "Epoch 50-75 time: 1.831056 seconds\n", - "Epoch 75-100 time: 1.817979 seconds\n", - "Epoch 100-125 time: 1.822820 seconds\n", - "Epoch 125-150 time: 1.842434 seconds\n", - "Epoch 150-175 time: 1.967782 seconds\n", + "Epoch 0-25 time: 8.108920 seconds\n", + "Epoch 25-50 time: 8.245825 seconds\n", + "Epoch 50-75 time: 7.894095 seconds\n", + "Epoch 75-100 time: 8.292500 seconds\n", + "Epoch 100-125 time: 7.116918 seconds\n", + "Epoch 125-150 time: 6.541059 seconds\n", + "Epoch 150-175 time: 7.708282 seconds\n", "\n", "\n", " loss_train accuracy_train loss_valid accuracy_valid\n", - "0 0.578070 0.496324 
0.586362 0.484375\n", - "1 0.490388 0.742647 0.495531 0.750000\n", - "2 0.417000 0.819853 0.406423 0.781250\n", - "3 0.371912 0.841912 0.356070 0.828125\n", - "4 0.325209 0.871324 0.310226 0.890625\n", + "0 0.614220 0.452206 0.668509 0.375000\n", + "1 0.524949 0.698529 0.527703 0.703125\n", + "2 0.460917 0.786765 0.463121 0.781250\n", + "3 0.380868 0.886029 0.396204 0.828125\n", + "4 0.347903 0.878676 0.337664 0.859375\n", ".. ... ... ... ...\n", - "195 0.019916 0.988971 0.026766 0.984375\n", - "196 0.021192 0.988971 0.023146 0.984375\n", - "197 0.022928 0.988971 0.024764 0.984375\n", - "198 0.023786 0.985294 0.026085 0.984375\n", - "199 0.023932 0.981618 0.031793 0.984375\n", + "195 0.050222 0.966912 0.013005 0.984375\n", + "196 0.036788 0.985294 0.012601 1.000000\n", + "197 0.033748 0.970588 0.011316 1.000000\n", + "198 0.038716 0.988971 0.020271 0.984375\n", + "199 0.015950 0.988971 0.019603 0.984375\n", "\n", "[200 rows x 4 columns]\n" ] @@ -774,12 +955,12 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 12, "metadata": {}, "outputs": [ { "data": { - "image/png": "", + "image/png": "", "text/plain": [ "
" ] @@ -832,7 +1013,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -844,8 +1025,8 @@ " [3.3600e+01, 1.1300e+01, 2.0000e+03, 2.1100e+02, 1.0000e+00]])\n", "\n", "Raw output:\n", - "tensor([[2.4082e-05, 4.3393e-06, 9.9997e-01],\n", - " [8.5355e-01, 6.9033e-06, 1.4644e-01]])\n", + "tensor([[8.2419e-07, 8.8322e-09, 1.0000e+00],\n", + " [6.8586e-01, 4.3171e-06, 3.1413e-01]])\n", "\n", "Predicted species:\n", "['Gentoo', 'Adelie']\n", @@ -894,7 +1075,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.9" + "version": "3.12.2" } }, "nbformat": 4, From 88fdb653b8ec5a03c14fa1589f774fe190745705 Mon Sep 17 00:00:00 2001 From: Matthew Archer <36638242+ma595@users.noreply.github.com> Date: Wed, 10 Jul 2024 19:38:08 +0100 Subject: [PATCH 2/2] More helper comments to make exercises easier to complete in a self directed way. (#75) * Additional helper comments and content throughout * Comment on the softmax function * Fix solution comment * More content * Fix formatting * Fix comment on output format * Forward modification * Updated autograd comment * Small changes --- exercises/01_penguin_classification.ipynb | 125 ++++++++++++++++-- .../01_penguin_classification_solutions.ipynb | 2 +- 2 files changed, 113 insertions(+), 14 deletions(-) diff --git a/exercises/01_penguin_classification.ipynb b/exercises/01_penguin_classification.ipynb index cf532cb..05adb52 100644 --- a/exercises/01_penguin_classification.ipynb +++ b/exercises/01_penguin_classification.ipynb @@ -311,8 +311,11 @@ " - The ``DataLoader`` object allows us to put our inputs and targets in mini-batches, which makes for more efficient training.\n", " - Note: rather than supplying one input-target pair to the model at a time, we supply \"mini-batches\" of these data at once (typically a small power of 2, like 16 or 32).\n", " - The number of items we supply at once is called the batch size.\n", - " - The 
``DataLoader`` can also randomly shuffle the data each epoch (when training).\n", - " - It allows us to load different mini-batches in parallel, which can be very useful for larger datasets and images that can't all fit in memory at once.\n", + " - Q. What number should we choose for the batch size?\n", + " - The ``DataLoader`` can also randomly shuffle the data each epoch (when training). This avoids accidental patterns in the data harming the fitting process. If we provided lots of the positive class followed by lots of the negative class,\n", + "the network would simply learn to say yes all the time. We therefore need to intersperse positives and negatives.\n", + "\n", + " - The ``DataLoader`` also allows us to load different mini-batches in parallel, which can be very useful for larger datasets and images that can't all fit in memory at once.\n", "\n", "\n", "Note: we are going to use batch normalisation layers in our network, which don't work if the batch size is one. This can happen on the last batch, if we don't choose a batch size that evenly divides the number of items in the data set. To avoid this, we can set the ``drop_last`` argument to ``True``. The last batch, which will be of size ``len(data_set) % batch_size`` gets dropped, and the data are reshuffled. This is only relevant during the training process - validation will use population statistics." @@ -337,23 +340,48 @@ "\n", "Here we will create our neural network in PyTorch, and have a general discussion on clean and messy ways of going about it.\n", "\n", + "  The module `torch.nn` contains different classes that help you build neural network models. All models in PyTorch inherit from the base class `nn.Module`, which has useful methods like `parameters()`, `__call__()` and others.\n", + "\n", + "  `torch.nn` also has various layers that you can use to build your neural network. For example, we will use `nn.Linear` in our code below, which constructs a fully connected layer. 
`torch.nn.Linear` is a subclass of `torch.nn.Module`. \n", + "\n", + "  What exactly is a \"layer\"? It is essentially a step in the neural network computation. For example, the `nn.Linear` layer computes a linear transformation of the input vector $x$: $y = x W^T + b$, where $W$ is the matrix of tunable parameters (the weights) and $b$ is a bias vector.\n", + "\n", + "We can also think of the ReLU activation as a \"layer\". However, there are no tunable parameters associated with the ReLU activation function.\n", + "\n", + "  The `__init__()` method is where we typically define the attributes of a class. In our case, all the \"sub-components\" of our model should be defined here.\n", + "\n", + "  The `forward` method is called when we use the neural network to make a prediction. Another term for \"making a prediction\" is running the forward pass, because information flows forward from the input through the hidden layers to the output. This builds a computational graph. To compute parameter updates, we run the backward pass by calling the function `loss.backward()`. During the backward pass, `autograd` traverses this graph to compute the gradients, which are then used to update the model's parameters.\n", + "\n", + "  The `forward` method is called from the `__call__()` function of `nn.Module`, so that when we run `model(batch)`, the `forward` method is called. \n", "- First, we will create quite an ugly network to highlight how to make a neural network in PyTorch on a very basic level.\n", - "- We will then discuss a trick for making the print-out nicer.\n", + "- We will then utilise `torch.nn.Sequential` as a neater approach.\n", "- Finally, we will discuss how the best approach would be to write a class where various parameters (e.g. number of layers, dropout probabilities, etc.) are passed as arguments." 
] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "from torch.nn import Module\n", "from torch.nn import BatchNorm1d, Linear, ReLU, Dropout\n", + "from torch import Tensor\n", "\n", "\n", "class FCNet(Module):\n", - " \"\"\"Fully-connected neural network.\"\"\"" + " \"\"\"Fully-connected neural network.\"\"\"\n", + "\n", + " # define __init__ function - model defined here.\n", + " def __init__(self):\n", + " pass\n", + "\n", + " # define forward function which calls network\n", + " def forward(self, batch: Tensor) -> Tensor:\n", + " pass\n", + "\n", + "\n", + "# define a model and print and test (try with torch.rand() function)" ] }, { @@ -384,7 +412,9 @@ "\n", "While we talked about stochastic gradient descent in the slides, most people use the so-called [Adam optimiser](https://pytorch.org/docs/stable/generated/torch.optim.Adam.html).\n", "\n", - "You can think of it as a more complex and improved implementation of SGD." + "You can think of it as a more complex and improved implementation of SGD.\n", + "\n", + "Here we will tell the optimiser what parameters to fit in order to minimise the loss. 
" ] }, { @@ -397,20 +427,58 @@ "from torch.optim import Adam" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Have a go at importing the model weights for a large model like ResNet50" + ] + }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Task 9: Writing basic training and validation loops\n", "\n", - "- Before we jump in and write these loops, we must first choose an activation function to apply to the model's outputs.\n", - " - Here we are going to use the softmax activation function: see [the PyTorch docs](https://pytorch.org/docs/stable/generated/torch.nn.Softmax.html).\n", - " - For those of you who've studied physics, you may be remininded of the partition function in thermodynamics.\n", - " - This activation function is good for classifcation when the result is one of ``A or B or C``.\n", - " - It's bad if you even want to assign two classification to one images—say a photo of a dog _and_ a cat.\n", + "- Before we jump in and write these loops, we must first choose an activation function to apply to the model's outputs so that they can be compared to our targets, e.g. `[0, 0, 1]`. We chose not to include this in the network itself.\n", + " - Here we are going to use the softmax activation function: see [the PyTorch docs](https://pytorch.org/docs/stable/generated/torch.nn.Softmax.html). 
It can be seen as a generalization of the sigmoid (logistic) function to multi-class classification tasks.\n", + " - For those of you who've studied physics, you may be reminded of the partition function in thermodynamics.\n", + " - This activation function is good for classification when the result is one of ``A or B or C``.\n", + " - It's bad if you ever want to assign two classifications to a single image—say a photo of a dog _and_ a cat.\n", " - It turns the raw outputs, or logits, into \"psuedo probabilities\", and we take our prediction to be the most probable class.\n", "\n", - "- We will write the training loop together, then you can go ahead and write the (simpler) validation loop." + "- Have a go at writing these loops. Read the comments below for help.\n", + "\n", + "TIPS:\n", + "\n", + "- The model needs to be configured for training and validation.\n", + "- We need to tell the softmax function over which dimension the probabilities should sum to 1. This should be along the column axis (`dim=1`). \n", + "- By default, PyTorch accumulates gradients across backward passes, so they must be zeroed at the start of each training iteration.\n", + "\n", + "- Extracting metrics: \n", + " - Define a dictionary `metrics = {\"loss\": [], \"accuracy\" : []}`\n", + " - Append the loss using `loss.item()`, which converts the single-element loss tensor into a plain Python number. We do not need gradients for the metrics.\n", + " - Get the accuracy by writing a function `get_batch_accuracy(preds: Tensor, targets: Tensor)`.\n", + " - A decision can be computed as follows: `decision = preds.argmax(dim=1)`\n", + " - We need to supply the metrics as `means` over each epoch.\n", + " - The metrics should be a dictionary containing \"loss\" and \"accuracy\" as keys and lists as values which we append to each iteration. We can then use dictionary comprehension to get epoch statistics. 
\n", + " ```\n", + " metrics = {\"loss\" : [1.0, 2.0, 3.0], \"accuracy\" : [0.7, 0.8, 0.9]}\n", + " return {k : mean(v) for k, v in metrics.items() }\n", + " ```\n", + " - If the validation performance becomes really poor, this is a sign that we may have overfitted. \n", + "\n", + "- Utilise `@no_grad` where possible. It temporarily disables gradient calculation, which is beneficial during evaluation phases when gradient updates are not required. \n", + "\n", + "\n", + "NOTE: In PyTorch, `requires_grad=True` is set automatically for the parameters of layers defined using `torch.nn.Module` subclasses. Examine the following example:\n", + "```\n", + "x = ones(10, requires_grad=True)\n", + "y = 2*x.exp()\n", + "print(y)\n", + "```\n", + "- Why use BCELoss?\n", + " - It may seem odd to be using BCELoss for a multi-class classification problem. In this case, BCELoss treats each element of the prediction vector as an independent binary classification problem. For each class, it compares the predicted probability against the target and computes the loss. It might be better to use `CrossEntropyLoss` instead (ground truth does not need to be one-hot encoded). `CrossEntropyLoss` combines softmax and negative log likelihood. \n" ] }, { @@ -448,6 +516,27 @@ "\n", " \"\"\"\n", "\n", + " # set up the model for training. IMPORTANT!\n", + "\n", + " # set up loss and accuracy metrics dictionary\n", + "\n", + " # iterate over the batch, targets in the train_loader\n", + " for batch, targets in train_loader:\n", + " pass\n", + "\n", + " # zero the gradients (otherwise gradients accumulate)\n", + "\n", + " # run forward model and compute proxy probabilities over dimension 1 (columns of tensor).\n", + "\n", + " # compute loss\n", + " # e.g. 
pred = [0.2, 0.7, 0.1] and target = [0, 1, 0]\n", + "\n", + " # compute gradients\n", + "\n", + " # nudge parameters in direction of steepest descent\n", + "\n", + " # append metrics\n", + "\n", "\n", "def validate_one_epoch(\n", " model: Module,\n", @@ -470,7 +559,10 @@ " Dict[str, float]\n", " Metrics of interest.\n", "\n", - " \"\"\"" + " \"\"\"\n", + "\n", + " for batch, targets in valid_loader:\n", + " pass" ] }, { @@ -498,7 +590,14 @@ "source": [ "epochs = 3\n", "\n", + "# define train_metrics and valid_metrics lists. \n", + "\n", "for _ in range(epochs):\n", + "\n", + " # append output of train_one_epoch() to train_metrics\n", + "\n", + " # append output of validate_one_epoch() to valid_metrics\n", + "\n", " pass" ] }, diff --git a/worked-solutions/01_penguin_classification_solutions.ipynb b/worked-solutions/01_penguin_classification_solutions.ipynb index 25b6f49..04c51d2 100644 --- a/worked-solutions/01_penguin_classification_solutions.ipynb +++ b/worked-solutions/01_penguin_classification_solutions.ipynb @@ -810,7 +810,7 @@ " and to instead use the stats it has built up from the training set.\n", " The model should not \"remember\" anything from the validation set.\n", " - We also protect this function with ``torch.no_grad()``, because having\n", - " gradients enable while validating is a pointless waste of\n", + " gradients enabled while validating is a pointless waste of\n", " resources — they are only needed for training.\n", "\n", " \"\"\"\n",