Simpler PenguinDataset #71
Comments
Below is a first attempt at this:
```python
from typing import Optional, List, Dict, Tuple, Any

# Import the PyTorch functions needed for the transformations:
from torch import tensor, float32, eye
from torch.utils.data import Dataset
from torchvision.transforms import Compose

from pandas import DataFrame
from palmerpenguins import load_penguins


class PenguinDataset(Dataset):
    def __init__(
        self,
        input_keys: List[str],
        target_keys: List[str],
        train: bool,
        transform: Optional[bool] = False,
    ):
        """Build ``PenguinDataset``."""
        self.input_keys = input_keys
        self.target_keys = target_keys
        self.full_df = _load_penguin_data()
        self.split = _split_data(self.full_df)["train" if train is True else "valid"]
        self.transform = transform

    def __len__(self) -> int:
        return len(self.split)

    def __getitem__(self, idx: int) -> Tuple[Any, Any]:
        # Take row ``idx`` of the dataframe and select the feature columns
        # (provided as ``input_keys``).
        feats = self.split.iloc[idx][self.input_keys]

        # This gives a species, i.e. one of ('Gentoo',), ('Chinstrap',) or ('Adelie',).
        tgts = self.split.iloc[idx][self.target_keys]

        if self.transform:
            # Put the features in a format more suitable for PyTorch.
            # Exercise #1: convert the feats to a PyTorch tensor.
            feats = tensor(feats, dtype=float32)

            # Exercise #2: convert the target to a one-hot vector.
            target_names = sorted(self.full_df.species.unique())
            tgts = eye(len(target_names))[target_names.index(tgts[0])]

        return feats, tgts


def _load_penguin_data() -> DataFrame:
    data = load_penguins()
    data = (
        data.loc[~data.isna().any(axis=1)]
        .sort_values(by=sorted(data.keys()))
        .reset_index(drop=True)
    )
    # Encode the sex field as a float: male -> 1.0, female -> 0.0.
    data.sex = (data.sex == "male").astype(float)
    return data


def _split_data(penguin_df: DataFrame) -> Dict[str, DataFrame]:
    valid_df = penguin_df.groupby(by=["species", "sex"]).sample(
        n=10,
        random_state=123,
    )
    # The training items are simply the items *not* in the validation split.
    train_df = penguin_df.loc[~penguin_df.index.isin(valid_df.index)]
    return {"train": train_df, "valid": valid_df}
```
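For reference, a minimal usage sketch of this first version (the feature and target column names below are illustrative choices from the palmerpenguins dataframe, not prescribed by the class itself):

```python
# Hedged usage sketch: column names are example choices, not fixed by the class.
train_set = PenguinDataset(
    input_keys=["bill_length_mm", "body_mass_g"],
    target_keys=["species"],
    train=True,
    transform=False,  # return raw pandas rows; set True for the tensor exercises
)

print(len(train_set))  # number of training rows left after the 10-per-group validation split
print(train_set[0])    # (features Series, target Series) for the first row
```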
Another 2c: Could you get rid of the …? The same for …
Thanks @jatkinson1000, I'll make those changes.
```python
from typing import Optional, List, Dict, Tuple, Any

# Import the PyTorch functions needed for the transformations:
from torch import tensor, float32, eye
from torch.utils.data import Dataset
from torchvision.transforms import Compose

from pandas import DataFrame
from palmerpenguins import load_penguins


class PenguinDataset(Dataset):
    def __init__(
        self,
        input_keys: List[str],
        target_keys: List[str],
        train: bool,
    ):
        """Build ``PenguinDataset``."""
        self.input_keys = input_keys
        self.target_keys = target_keys

        data = load_penguins()
        data = (
            data.loc[~data.isna().any(axis=1)]
            .sort_values(by=sorted(data.keys()))
            .reset_index(drop=True)
        )
        # Encode the sex field as a float: male -> 1.0, female -> 0.0.
        data.sex = (data.sex == "male").astype(float)
        self.full_df = data

        valid_df = self.full_df.groupby(by=["species", "sex"]).sample(
            n=10,
            random_state=123,
        )
        # The training items are simply the items *not* in the validation split.
        train_df = self.full_df.loc[~self.full_df.index.isin(valid_df.index)]
        self.split = {"train": train_df, "valid": valid_df}["train" if train is True else "valid"]

    def __len__(self) -> int:
        return len(self.split)

    def __getitem__(self, idx: int) -> Tuple[Any, Any]:
        # Take row ``idx`` of the dataframe and select the feature columns
        # (provided as ``input_keys``).
        feats = self.split.iloc[idx][self.input_keys]

        # This gives a species, i.e. one of ('Gentoo',), ('Chinstrap',) or ('Adelie',).
        tgts = self.split.iloc[idx][self.target_keys]

        # Exercise #1: convert the feats (a pandas Series) to a PyTorch tensor.
        feats = tensor(tuple(feats), dtype=float32)

        # Exercise #2: convert the target to a one-hot vector.
        target_names = sorted(self.full_df.species.unique())
        tgts = eye(len(target_names))[target_names.index(tuple(tgts)[0])]

        return feats, tgts
```
If the above all works, I think it is a lot more transparent to students what is going on, and it directly introduces the Torch …
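For illustration, a sketch (assumed here, not taken from the issue) of how students might wrap this simpler class in a DataLoader inside the notebook; the column names are again example choices from the palmerpenguins dataframe:

```python
from torch.utils.data import DataLoader

# Hypothetical notebook cell: feature/target keys are illustrative, not fixed by the class.
train_set = PenguinDataset(
    input_keys=["bill_length_mm", "body_mass_g"],
    target_keys=["species"],
    train=True,
)
train_loader = DataLoader(train_set, batch_size=16, shuffle=True)

for feats, tgts in train_loader:
    # Batched feature tensors and one-hot targets, e.g. [16, 2] and [16, 3].
    print(feats.shape, tgts.shape)
    break
```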
To make the data reading aspect a little easier to understand, we intend to embed a simpler version of the PenguinDataset directly into the notebook. The src/ml_workshop/_penguins.py will remain untouched and can still be used as before.

Thought process around the 'simpler' class (discussion between @jatkinson1000 and @ma595):
- Load the pandas df in the notebook: …
- Put the definition of the PenguinDataset in the notebook: …
- __getitem__ … (see the one-hot sketch below)
- Propagate the change to …
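As a standalone illustration of the one-hot conversion used in __getitem__ above (a minimal sketch, assuming the three species names that appear in the palmerpenguins data):

```python
from torch import eye

# The sorted unique values of the 'species' column in the palmerpenguins data.
target_names = ["Adelie", "Chinstrap", "Gentoo"]
species = "Chinstrap"

# eye(3) is the 3x3 identity matrix; selecting a row gives the one-hot vector.
one_hot = eye(len(target_names))[target_names.index(species)]
print(one_hot)  # tensor([0., 1., 0.])
```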