
Commit

add exercise outputs
ma595 committed Jul 4, 2024
1 parent ed375db commit bb826e4
Showing 1 changed file with 121 additions and 12 deletions.
133 changes: 121 additions & 12 deletions exercises/01_penguin_classification.ipynb
@@ -32,11 +32,30 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 75,
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"['Adelie', 'Chinstrap', 'Gentoo']"
]
},
"execution_count": 75,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from palmerpenguins import load_penguins"
"from palmerpenguins import load_penguins\n",
"\n",
"data = load_penguins()\n",
"\n",
"data\n",
"\n",
"target_names = sorted(data.species.unique())\n",
"\n",
"target_names\n"
]
},
{
@@ -55,9 +74,13 @@
"source": [
"### Task 2: creating a ``torch.utils.data.Dataset``\n",
"\n",
"The penguin data reading and processing can be encapsulated in a PyTorch dataset class.\n",
"\n",
"- This is helpful because...\n",
"\n",
"All PyTorch dataset objects are subclasses of the ``torch.utils.data.Dataset`` class. To make a custom dataset, create a class which inherits from the ``Dataset`` class, implement some methods (the Python magic (or dunder) methods ``__len__`` and ``__getitem__``) and supply some data.\n",
"\n",
"Spoiler alert: we've done this for you already in ``src/ml_workshop/_penguins.py``.\n",
"Spoiler alert: we've done this for you already below (see ``src/ml_workshop/_penguins.py`` for a more sophisticated implementation)\n",
"\n",
"- Open the file ``src/ml_workshop/_penguins.py``.\n",
"- Let's examine, and discuss, each of the methods together.\n",
@@ -75,6 +98,78 @@
" - ``y_tfms``— ..."
]
},
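{
"cell_type": "markdown",
"metadata": {},
"source": [
"Before reading the full implementation, here is a minimal, self-contained sketch of the ``Dataset`` pattern (``ToyDataset`` and its data are illustrative only; they are not part of the workshop code):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# A minimal, illustrative ``torch.utils.data.Dataset`` subclass.\n",
"# ``ToyDataset`` is a sketch for discussion, not part of the workshop code.\n",
"from torch.utils.data import Dataset\n",
"\n",
"\n",
"class ToyDataset(Dataset):\n",
"    def __init__(self, items, labels):\n",
"        self.items = items\n",
"        self.labels = labels\n",
"\n",
"    def __len__(self) -> int:\n",
"        # The number of (item, label) pairs in the dataset.\n",
"        return len(self.items)\n",
"\n",
"    def __getitem__(self, idx: int):\n",
"        # Return the (item, label) pair at position ``idx``.\n",
"        return self.items[idx], self.labels[idx]\n",
"\n",
"\n",
"toy = ToyDataset([1.0, 2.0, 3.0], [0, 1, 0])\n",
"len(toy), toy[1]"
]
},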
{
"cell_type": "code",
"execution_count": 108,
"metadata": {},
"outputs": [],
"source": [
"from typing import Optional, List, Dict, Tuple, Any\n",
"\n",
"# import pytorch functions necessary for transformations:\n",
"from torch import tensor, float32, eye\n",
"\n",
"from torch.utils.data import Dataset\n",
"from torchvision.transforms import Compose\n",
"\n",
"from pandas import DataFrame\n",
"\n",
"from palmerpenguins import load_penguins\n",
"\n",
"\n",
"class PenguinDataset(Dataset):\n",
" def __init__(\n",
" self,\n",
" input_keys: List[str],\n",
" target_keys: List[str],\n",
" train: bool,\n",
" ):\n",
" \"\"\"Build ``PenguinDataset``.\"\"\"\n",
" self.input_keys = input_keys\n",
" self.target_keys = target_keys\n",
"\n",
" data = load_penguins()\n",
" data = (\n",
" data.loc[~data.isna().any(axis=1)]\n",
" .sort_values(by=sorted(data.keys()))\n",
" .reset_index(drop=True)\n",
" )\n",
" # Transform the sex field into a float, with male represented by 1.0, female by 0.0\n",
" data.sex = (data.sex == \"male\").astype(float)\n",
" self.full_df = data\n",
"\n",
" valid_df = self.full_df.groupby(by=[\"species\", \"sex\"]).sample(\n",
" n=10,\n",
" random_state=123,\n",
" )\n",
" # The training items are simply the items *not* in the valid split\n",
" train_df = self.full_df.loc[~self.full_df.index.isin(valid_df.index)]\n",
"\n",
" self.split = {\"train\": train_df, \"valid\": valid_df}[\"train\" if train is True else \"valid\"]\n",
"\n",
"\n",
" def __len__(self) -> int:\n",
" return len(self.split)\n",
" \n",
" def __getitem__(self, idx: int) -> Tuple[Any, Any]:\n",
" # get the row index (idx) from the dataframe and \n",
" # select relevant column features (provided as input_keys)\n",
" feats = self.split.iloc[idx][self.input_keys]\n",
"\n",
" # this gives a 'species' i.e. one of ('Gentoo',), ('Chinstrap',), or ('Adelie',) \n",
" tgts = self.split.iloc[idx][self.target_keys]\n",
"\n",
" # Exercise #1: convert the feats to PyTorch\n",
" feats = tensor(feats.values, dtype=float32)\n",
"\n",
" # Exercise #2: convert this to a 'one-hot vector' \n",
" target_names = sorted(self.full_df.species.unique())\n",
" \n",
" tgts = eye(len(target_names))[target_names.index(tgts.values[0])]\n",
" \n",
" return (feats, tgts)"
]
},
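{
"cell_type": "markdown",
"metadata": {},
"source": [
"The one-hot conversion in ``__getitem__`` works by indexing into an identity matrix: row ``i`` of ``eye(n)`` is the one-hot vector for class ``i``. A quick sketch (the class list is illustrative):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch of the one-hot trick used in __getitem__: row i of the identity\n",
"# matrix is the one-hot encoding of class i.\n",
"from torch import eye\n",
"\n",
"classes = [\"Adelie\", \"Chinstrap\", \"Gentoo\"]\n",
"eye(len(classes))[classes.index(\"Gentoo\")]  # tensor([0., 0., 1.])"
]
},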
{
"cell_type": "markdown",
"metadata": {},
@@ -93,22 +188,36 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 109,
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"(tensor([ 42.9000, 5000.0000]), tensor([0., 0., 1.]))"
]
},
"execution_count": 109,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from ml_workshop import PenguinDataset\n",
"# from ml_workshop import PenguinDataset\n",
"\n",
"data_set = PenguinDataset(\n",
"data_set_1 = PenguinDataset(\n",
" input_keys=[\"bill_length_mm\", \"body_mass_g\"],\n",
" target_keys=[\"species\"],\n",
" train=True,\n",
")\n",
"\n",
"\n",
"for features, target in data_set:\n",
" # print the features and targets here\n",
" pass"
"# for features, target in data_set:\n",
"# # print the features and targets here\n",
"# print(features, target)\n",
"\n",
"\n",
"data_set_1[0]"
]
},
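{
"cell_type": "markdown",
"metadata": {},
"source": [
"A common next step, not shown in this commit, is to wrap the dataset in a ``torch.utils.data.DataLoader`` to draw shuffled mini-batches; a sketch with an illustrative batch size:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: batch the dataset with a DataLoader (the batch size is illustrative).\n",
"from torch.utils.data import DataLoader\n",
"\n",
"loader = DataLoader(data_set_1, batch_size=16, shuffle=True)\n",
"\n",
"feats_batch, tgts_batch = next(iter(loader))\n",
"feats_batch.shape, tgts_batch.shape  # (16, 2) features, (16, 3) one-hot targets"
]
},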
{
@@ -417,7 +526,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.4"
"version": "3.12.4"
}
},
"nbformat": 4,
