From f12c4766f8c0aa64b30fd1b967c0d53dddb847ac Mon Sep 17 00:00:00 2001 From: Shivam Sharma Date: Thu, 21 Mar 2024 16:20:08 +0100 Subject: [PATCH] Refactor: Add Numpy style documentation in dataset_classes.py --- dicee/dataset_classes.py | 648 +++++++++++++++++++++++++++++++-------- 1 file changed, 513 insertions(+), 135 deletions(-) diff --git a/dicee/dataset_classes.py b/dicee/dataset_classes.py index 9b6b38bd..faafc4af 100644 --- a/dicee/dataset_classes.py +++ b/dicee/dataset_classes.py @@ -8,8 +8,29 @@ @timeit -def reload_dataset(path: str, form_of_labelling, scoring_technique, neg_ratio, label_smoothing_rate): - """ Reload the files from disk to construct the Pytorch dataset """ +def reload_dataset(path: str, form_of_labelling: str, scoring_technique: str, + neg_ratio: float, label_smoothing_rate: float) -> torch.utils.data.Dataset: + """ + Reloads the dataset from disk and constructs a PyTorch dataset for training. + + Parameters + ---------- + path : str + The path to the directory where the dataset is stored. + form_of_labelling : str + The form of labelling used in the dataset. Determines how data points are represented. + scoring_technique : str + The scoring technique used for evaluating the embeddings. + neg_ratio : float + The ratio of negative samples to positive samples in the dataset. + label_smoothing_rate : float + The rate of label smoothing applied to the dataset. + + Returns + ------- + torch.utils.data.Dataset + A PyTorch dataset object ready for training. + """ return construct_dataset(train_set=np.load(path + '/train_set.npy'), valid_set=None, test_set=None, @@ -37,6 +58,45 @@ def construct_dataset(*, byte_pair_encoding=None, block_size: int = None ) -> torch.utils.data.Dataset: + """ + Constructs a dataset based on the specified parameters and returns a PyTorch Dataset object. + + Parameters + ---------- + train_set : Union[np.ndarray, list] + The training set consisting of triples or tokens. + valid_set : Optional + The validation set. Not currently used in dataset construction. + test_set : Optional + The test set. Not currently used in dataset construction. + ordered_bpe_entities : Optional + Ordered byte pair encoding entities for the dataset. + train_target_indices : Optional + Indices of target entities or relations for training. + target_dim : int, optional + The dimension of target entities or relations. + entity_to_idx : dict + A dictionary mapping entity strings to indices. + relation_to_idx : dict + A dictionary mapping relation strings to indices. + form_of_labelling : str + Specifies the form of labelling, such as 'EntityPrediction' or 'RelationPrediction'. + scoring_technique : str + The scoring technique used for generating negative samples or evaluating the model. + neg_ratio : int + The ratio of negative samples to positive samples. + label_smoothing_rate : float + The rate of label smoothing applied to labels. + byte_pair_encoding : Optional + Indicates if byte pair encoding is used. + block_size : int, optional + The block size for transformer-based models. + + Returns + ------- + torch.utils.data.Dataset + A PyTorch dataset object ready for model training. + """ if ordered_bpe_entities and byte_pair_encoding and scoring_technique == 'NegSample': train_set = BPE_NegativeSamplingDataset( train_set=torch.tensor(train_set, dtype=torch.long), @@ -95,6 +155,30 @@ def construct_dataset(*, class BPE_NegativeSamplingDataset(torch.utils.data.Dataset): + """ + A PyTorch Dataset for handling negative sampling with Byte Pair Encoding (BPE) entities. 
+
+    This dataset extends the PyTorch Dataset class to provide functionality for negative sampling
+    in the context of knowledge graph embeddings. It uses byte pair encoding for entities
+    to handle large vocabularies efficiently.
+
+    Parameters
+    ----------
+    train_set : torch.LongTensor
+        A tensor containing the training set triples with byte pair encoded entities and relations.
+        The shape of the tensor is [N, 3], where N is the number of triples.
+    ordered_shaped_bpe_entities : torch.LongTensor
+        A tensor containing the ordered and shaped byte pair encoded entities.
+    neg_ratio : int
+        The ratio of negative samples to generate per positive sample.
+
+    Attributes
+    ----------
+    num_bpe_entities : int
+        The number of byte pair encoded entities.
+    num_datapoints : int
+        The number of data points (triples) in the training set.
+    """
 def __init__(self, train_set: torch.LongTensor, ordered_shaped_bpe_entities: torch.LongTensor, neg_ratio: int):
 super().__init__()
 assert isinstance(train_set, torch.LongTensor)
@@ -105,13 +189,62 @@ def __init__(self, train_set: torch.LongTensor, ordered_shaped_bpe_entities: tor
 self.neg_ratio = neg_ratio
 self.num_datapoints = len(self.train_set)
-    def __len__(self):
+    def __len__(self) -> int:
+        """
+        Returns the total number of data points in the dataset.
+
+        Returns
+        -------
+        int
+            The number of data points.
+        """
 return self.num_datapoints
-    def __getitem__(self, idx):
+    def __getitem__(self, idx: int) -> torch.Tensor:
+        """
+        Retrieves the BPE-encoded triple at the specified index.
+
+        Parameters
+        ----------
+        idx : int
+            Index of the triple to retrieve.
+
+        Returns
+        -------
+        torch.Tensor
+            The BPE-encoded triple as a torch.Tensor of shape (3, token_length). Labels for positive and
+            negative examples are generated later in `collate_fn`, not here.
+        """
 return self.train_set[idx]
-    def collate_fn(self, batch_shaped_bpe_triples: List[Tuple[torch.Tensor, torch.Tensor]]):
+    def collate_fn(self, batch_shaped_bpe_triples: List[Tuple[torch.Tensor, torch.Tensor]]) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Collate function for the BPE_NegativeSamplingDataset. It processes a batch of byte pair encoded triples,
+        performs negative sampling, and returns the batch along with corresponding labels.
+
+        This function is designed to be used with a PyTorch DataLoader. It takes a list of byte pair encoded triples
+        as input and generates negative samples according to the specified negative sampling ratio. The function
+        ensures that the negative samples are combined with the original triples to form a single batch, which is
+        suitable for training a knowledge graph embedding model.
+
+        Parameters
+        ----------
+        batch_shaped_bpe_triples : List[Tuple[torch.Tensor, torch.Tensor]]
+            A list of tuples, where each tuple contains byte pair encoded representations of head entities, relations,
+            and tail entities for a batch of triples.
+
+        Returns
+        -------
+        Tuple[torch.Tensor, torch.Tensor]
+            A tuple containing two elements:
+            - The first element is a torch.Tensor of shape [N * (1 + neg_ratio), 3] that contains both the original
+              byte pair encoded triples and the generated negative samples. N is the original number of triples in the
+              batch, and neg_ratio is the negative sampling ratio.
+            - The second element is a torch.Tensor of shape [N * (1 + neg_ratio)] that contains the labels for each
+              triple in the batch.
Positive samples are labeled as 1, and negative samples are labeled as 0. + """ batch_of_bpe_triples = torch.stack(batch_shaped_bpe_triples, dim=0) size_of_batch, _, token_length = batch_of_bpe_triples.shape @@ -140,6 +273,44 @@ def collate_fn(self, batch_shaped_bpe_triples: List[Tuple[torch.Tensor, torch.Te class MultiLabelDataset(torch.utils.data.Dataset): + """ + A dataset class for multi-label knowledge graph embedding tasks. This dataset is designed for models where + the output involves predicting multiple labels (entities or relations) for a given input (e.g., predicting all + possible tail entities given a head entity and a relation). + + Parameters + ---------- + train_set : torch.LongTensor + A tensor containing the training set triples with byte pair encoding, shaped as [num_triples, 3], + where each triple is [head, relation, tail]. + + train_indices_target : torch.LongTensor + A tensor where each row corresponds to the indices of the target labels for each training example. + The length of this tensor must match the number of triples in `train_set`. + + target_dim : int + The dimensionality of the target space, typically the total number of possible labels (entities or relations). + + torch_ordered_shaped_bpe_entities : torch.LongTensor + A tensor containing ordered byte pair encoded entities used for creating embeddings. + This tensor is not directly used in generating targets but may be utilized for additional processing + or embedding lookup. + + Attributes + ---------- + num_datapoints : int + The number of data points (triples) in the dataset. + + collate_fn : None or callable + Optional custom collate function to be used with a PyTorch DataLoader. + It's set to None by default and can be specified after initializing the dataset if needed. + + Note + ---- + This dataset is particularly suited for KvsAll (K entities vs. All entities) and AllvsAll training strategies + in knowledge graph embedding, where a model predicts a set of possible tail entities given a head entity + and a relation (or vice versa), and where each training example can have multiple correct labels. + """ def __init__(self, train_set: torch.LongTensor, train_indices_target: torch.LongTensor, target_dim: int, torch_ordered_shaped_bpe_entities: torch.LongTensor): super().__init__() @@ -153,10 +324,34 @@ def __init__(self, train_set: torch.LongTensor, train_indices_target: torch.Long self.torch_ordered_shaped_bpe_entities = torch_ordered_shaped_bpe_entities self.collate_fn = None - def __len__(self): + def __len__(self) -> int: + """ + Returns the total number of data points in the dataset. + + Returns + ------- + int + The number of data points. + """ return self.num_datapoints - def __getitem__(self, idx): + def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Retrieves the knowledge graph triple and its corresponding multi-label target vector at the specified index. + + Parameters + ---------- + idx : int + Index of the triple to retrieve. + + Returns + ------- + tuple + A tuple containing the following elements: + - The triple as a torch.Tensor of shape (3,). + - The multi-label target vector as a torch.Tensor of shape (`target_dim`,), where each element + indicates the presence (1) or absence (0) of a label for the given triple. + """ # (1) Initialize as all zeros. y_vec = torch.zeros(self.target_dim) # (2) Indices of labels. 
@@ -169,27 +364,37 @@ def __getitem__(self, idx): class MultiClassClassificationDataset(torch.utils.data.Dataset): """ - Dataset for the 1vsALL training strategy + A dataset class for multi-class classification tasks, specifically designed for the 1vsALL training strategy + in knowledge graph embedding models. This dataset supports tasks where the model predicts a single correct + label from all possible labels for a given input. - Parameters - ---------- - train_set_idx - Indexed triples for the training. - entity_idxs - mapping. - relation_idxs - mapping. - form - ? - num_workers - int for https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader - - - - Returns - ------- - torch.utils.data.Dataset - """ + Parameters + ---------- + subword_units : np.ndarray + An array of subword unit indices representing the training data. Each row in the array corresponds to a + sequence of subword units (e.g., Byte Pair Encoding tokens) that have been converted to their respective + numeric indices. + + block_size : int, optional + The size of each sequence of subword units to be used as input to the model. This defines the length of + the sequences that the model will receive as input, by default 8. + + Attributes + ---------- + num_of_data_points : int + The number of sequences or data points available in the dataset, calculated based on the length of the + `subword_units` array and the `block_size`. + + collate_fn : None or callable + An optional custom collate function to be used with a PyTorch DataLoader. It's set to None by default + and can be specified after initializing the dataset if needed. + + Note + ---- + This dataset is tailored for training knowledge graph embedding models on tasks where the output is a single + label out of many possible labels (1vsALL strategy). It is especially suited for models trained with subword + tokenization methods like Byte Pair Encoding (BPE), where inputs are sequences of subword unit indices. + """ def __init__(self, subword_units: np.ndarray, block_size: int = 8): super().__init__() @@ -200,10 +405,34 @@ def __init__(self, subword_units: np.ndarray, block_size: int = 8): self.num_of_data_points = len(self.train_data) - block_size self.collate_fn = None - def __len__(self): + def __len__(self) -> int: + """ + Returns the total number of sequences or data points available in the dataset. + + Returns + ------- + int + The number of sequences or data points. + """ return self.num_of_data_points - def __getitem__(self, idx): + def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Retrieves an input sequence and its subsequent target sequence for next token prediction. + + Parameters + ---------- + idx : int + The starting index for the sequence to be retrieved from the dataset. + + Returns + ------- + Tuple[torch.Tensor, torch.Tensor] + A tuple containing two elements: + - `x`: The input sequence as a torch.Tensor of shape (`block_size`,). + - `y`: The target sequence as a torch.Tensor of shape (`block_size`,), offset by one position + from the input sequence. + """ x = self.train_data[idx:idx + self.block_size] y = self.train_data[idx + 1: idx + 1 + self.block_size] return x, y @@ -211,27 +440,37 @@ def __getitem__(self, idx): class OnevsAllDataset(torch.utils.data.Dataset): """ - Dataset for the 1vsALL training strategy + A dataset for the One-vs-All (1vsAll) training strategy designed for knowledge graph embedding tasks. 
+ This dataset structure is particularly suited for models predicting a single correct label (entity) out of + all possible entities for a given pair of head entity and relation. - Parameters - ---------- - train_set_idx - Indexed triples for the training. - entity_idxs - mapping. - relation_idxs - mapping. - form - ? - num_workers - int for https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader - - - - Returns - ------- - torch.utils.data.Dataset - """ + Parameters + ---------- + train_set_idx : np.ndarray + An array containing indexed triples from the knowledge graph. Each row represents a triple consisting of + indices for the head entity, relation, and tail entity, respectively. + + entity_idxs : dict + A dictionary mapping entity names to their corresponding unique integer indices. This is used to determine + the dimensionality of the target vector in the 1vsAll setting. + + Attributes + ---------- + train_data : torch.LongTensor + A tensor version of `train_set_idx`, prepared for use with PyTorch models. + + target_dim : int + The dimensionality of the target vector, equivalent to the total number of unique entities in the dataset. + + collate_fn : None or callable + An optional custom collate function for use with a PyTorch DataLoader. By default, it is set to None and can + be specified after initializing the dataset. + + Note + ---- + This dataset is optimized for training knowledge graph embedding models using the 1vsAll strategy, where the + model aims to correctly predict the tail entity from all possible entities given the head entity and relation. + """ def __init__(self, train_set_idx: np.ndarray, entity_idxs): super().__init__() @@ -242,49 +481,94 @@ def __init__(self, train_set_idx: np.ndarray, entity_idxs): self.collate_fn = None def __len__(self): + """ + Returns the total number of triples in the dataset. + + Returns + ------- + int + The total number of triples. + """ return len(self.train_data) def __getitem__(self, idx): + """ + Retrieves the input data and target vector for the triple at index `idx`. + + The input data consists of the indices for the head entity and relation, while the target vector is a + one-hot encoded vector with a `1` at the position corresponding to the tail entity's index and `0`s elsewhere. + + Parameters + ---------- + idx : int + The index of the triple to retrieve. + + Returns + ------- + Tuple[torch.Tensor, torch.Tensor] + A tuple containing two elements: + - The input data as a torch.Tensor of shape (2,), containing the indices of the head entity and relation. + - The target vector as a torch.Tensor of shape (`target_dim`,), a one-hot encoded vector for the tail entity. + """ y_vec = torch.zeros(self.target_dim) y_vec[self.train_data[idx, 2]] = 1 return self.train_data[idx, :2], y_vec class KvsAll(torch.utils.data.Dataset): - """ Creates a dataset for KvsAll training by inheriting from torch.utils.data.Dataset. - Let D denote a dataset for KvsAll training and be defined as D:= {(x,y)_i}_i ^N, where - x: (h,r) is an unique tuple of an entity h \in E and a relation r \in R that has been seed in the input graph. - y: denotes a multi-label vector \in [0,1]^{|E|} is a binary label. \forall y_i =1 s.t. (h r E_i) \in KG + """ + Creates a dataset for K-vs-All training strategy, inheriting from torch.utils.data.Dataset. + This dataset is tailored for training scenarios where a model predicts all valid tail entities + given a head entity and relation pair or vice versa. 
The labels are multi-hot encoded to represent
+    the presence of multiple valid entities.
-    .. note::
-        TODO
+    Let \(D\) denote a dataset for KvsAll training and be defined as \(D := \{(x, y)_i\}_{i=1}^{N}\), where:
+    \(x: (h, r)\) is a unique tuple of an entity \(h \in E\) and a relation \(r \in R\) that has been seen in the input graph.
+    \(y \in \{0, 1\}^{|E|}\) denotes a binary multi-label vector, where \(y_i = 1\) if and only if \((h, r, E_i) \in KG\).
 Parameters
 ----------
 train_set_idx : numpy.ndarray
-        n by 3 array representing n triples
-
-    entity_idxs : dictonary
-        string representation of an entity to its integer id
-
-    relation_idxs : dictonary
-        string representation of a relation to its integer id
-
-    Returns
-    -------
-    self : torch.utils.data.Dataset
-
-    See Also
-    --------
-
-    Notes
+        A numpy array of shape `(n, 3)` representing `n` triples, where each triple consists of
+        integer indices corresponding to a head entity, a relation, and a tail entity.
+    entity_idxs : dict
+        A dictionary mapping entity names (strings) to their unique integer identifiers.
+    relation_idxs : dict
+        A dictionary mapping relation names (strings) to their unique integer identifiers.
+    form : str
+        A string indicating the prediction form, either 'RelationPrediction' or 'EntityPrediction'.
+    store : dict, optional
+        A precomputed dictionary storing the training data points. If provided, it should map
+        tuples of entity and relation indices to lists of entity indices. If `None`, the store
+        will be constructed from `train_set_idx`.
+    label_smoothing_rate : float, default=0.0
+        A float representing the rate of label smoothing to be applied. A value of 0 means no
+        label smoothing is applied.
+
+    Attributes
+    ----------
+    train_data : torch.LongTensor
+        Tensor containing the input features for the model, typically consisting of pairs of
+        entity and relation indices.
+    train_target : torch.LongTensor
+        Tensor containing the target labels for the model, multi-hot encoded to indicate the
+        presence of multiple valid entities.
+    target_dim : int
+        The dimensionality of the target labels, corresponding to the number of unique entities
+        or relations, depending on the `form`.
+    collate_fn : None
+        Placeholder for a custom collate function to be used with a PyTorch DataLoader. This is
+        typically set to `None` and can be overridden as needed.
+
+    Note
 -----
+    The K-vs-All training strategy is used in scenarios where the task is to predict multiple
+    valid entities given a single entity and relation pair. This dataset supports both predicting
+    multiple valid tail entities given a head entity and relation (EntityPrediction) and predicting
+    multiple valid relations given a pair of entities (RelationPrediction).
-    Examples
-    --------
-    >>> a = KvsAll()
-    >>> a
-    ? array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
+    The label smoothing rate can be adjusted to control the degree of smoothing applied to the
+    target labels, which can help with regularization and model generalization.
 """
 def __init__(self, train_set_idx: np.ndarray, entity_idxs, relation_idxs, form, store=None,
@@ -331,11 +615,39 @@ def __init__(self, train_set_idx: np.ndarray, entity_idxs, relation_idxs, form,
 assert isinstance(self.train_target[0], list)
 del store
-    def __len__(self):
+    def __len__(self) -> int:
+        """
+        Returns the number of items in the dataset.
+
+        Returns
+        -------
+        int
+            The total number of items.
+        """
 assert len(self.train_data) == len(self.train_target)
 return len(self.train_data)
-    def __getitem__(self, idx):
+    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Retrieves the input pair (head entity, relation) and the corresponding multi-label target vector for the
+        item at index `idx`.
+
+        The target vector is a binary vector of length `target_dim`, where each element indicates the presence or
+        absence of a tail entity for the given input pair.
+
+        Parameters
+        ----------
+        idx : int
+            The index of the item to retrieve.
+
+        Returns
+        -------
+        Tuple[torch.Tensor, torch.Tensor]
+            A tuple containing two elements:
+            - The input pair as a torch.Tensor of shape (2,), containing the indices of the head entity and relation.
+            - The multi-label target vector as a torch.Tensor of shape (`target_dim`,), indicating the presence or
+              absence of each possible tail entity.
+        """
 # 1. Initialize a vector of output.
 y_vec = torch.zeros(self.target_dim)
 y_vec[self.train_target[idx]] = 1.0
@@ -346,41 +658,42 @@
 class AllvsAll(torch.utils.data.Dataset):
-    """ Creates a dataset for AllvsAll training by inheriting from torch.utils.data.Dataset.
+    """
+    A dataset class for the All-versus-All (AllvsAll) training strategy suitable for knowledge graph embedding models.
+    This strategy considers all possible pairs of entities and relations, regardless of whether they exist in the
+    knowledge graph, to predict the associated tail entities.
+    Let D denote a dataset for AllvsAll training and be defined as D:= {(x,y)_i}_i ^N, where x: (h,r) is a possible unique tuple of an entity h \in E and a relation r \in R. Hence N = |E| x |R|
-    y: denotes a multi-label vector \in [0,1]^{|E|} is a binary label. \forall y_i =1 s.t. (h r E_i) \in KG
-
-    .. note::
-        AllvsAll extends KvsAll via none existing (h,r). Hence, it adds data points that are labelled without 1s,
-        only with 0s.
+    y \in {0,1}^{|E|} denotes a binary multi-label vector, where y_i = 1 if and only if (h, r, E_i) \in KG.
+    This setup extends beyond observed triples to include all possible combinations of entities and relations,
+    marking non-existent combinations as negatives. It aims to enrich the training data with hard negatives.
 Parameters
 ----------
 train_set_idx : numpy.ndarray
-        n by 3 array representing n triples
-
-    entity_idxs : dictonary
-        string representation of an entity to its integer id
-
-    relation_idxs : dictonary
-        string representation of a relation to its integer id
-
-    Returns
-    -------
-    self : torch.utils.data.Dataset
-
-    See Also
-    --------
-
-    Notes
-    -----
-
-    Examples
-    --------
-    >>> a = AllvsAll()
-    >>> a
-    ? array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
+        An array of shape `(n, 3)`, where each row represents a triple (head entity index, relation index,
+        tail entity index).
+    entity_idxs : dict
+        A dictionary mapping entity names to their unique integer indices.
+    relation_idxs : dict
+        A dictionary mapping relation names to their unique integer indices.
+    label_smoothing_rate : float, default=0.0
+        A parameter for label smoothing to mitigate overfitting by softening the hard labels.
+
+    Attributes
+    ----------
+    train_data : torch.LongTensor
+        A tensor containing all possible pairs of entities and relations derived from the input triples.
+    train_target : Union[np.ndarray, list]
+        A target structure (either a Numpy array or a list) indicating the existence of a tail entity for
+        each head entity and relation pair.
It supports multi-label classification where a pair can have
+        multiple correct tail entities.
+    target_dim : int
+        The dimension of the target vector, equal to the total number of unique entities.
+    collate_fn : None or callable
+        An optional function to merge a list of samples into a batch for loading. If not provided, the default
+        collate function of PyTorch's DataLoader will be used.
 """
 def __init__(self, train_set_idx: np.ndarray, entity_idxs, relation_idxs,
@@ -414,11 +727,37 @@ def __init__(self, train_set_idx: np.ndarray, entity_idxs, relation_idxs,
 assert isinstance(self.train_target[0], list)
 del store
-    def __len__(self):
+    def __len__(self) -> int:
+        """
+        Returns the number of items in the dataset, including both existing and potential triples.
+
+        Returns
+        -------
+        int
+            The total number of items.
+        """
 assert len(self.train_data) == len(self.train_target)
 return len(self.train_data)
-    def __getitem__(self, idx):
+    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Retrieves the input pair (head entity, relation) and the corresponding multi-label target vector for the
+        item at index `idx`. The target vector is a binary vector of length `target_dim`, where each element indicates
+        the presence or absence of a tail entity for the given input pair, including negative samples.
+
+        Parameters
+        ----------
+        idx : int
+            The index of the item to retrieve.
+
+        Returns
+        -------
+        Tuple[torch.Tensor, torch.Tensor]
+            A tuple containing two elements:
+            - The input pair as a torch.Tensor of shape (2,), containing the indices of the head entity and relation.
+            - The multi-label target vector as a torch.Tensor of shape (`target_dim`,), indicating the presence or
+              absence of each possible tail entity, including negative samples.
+        """
 # 1. Initialize a vector of output.
 y_vec = torch.zeros(self.target_dim)
 existing_indices = self.train_target[idx]
@@ -432,32 +771,44 @@
 class KvsSampleDataset(torch.utils.data.Dataset):
 """
-    KvsSample a Dataset:
-        D:= {(x,y)_i}_i ^N, where
-            . x:(h,r) is a unique h \in E and a relation r \in R and
-            . y \in [0,1]^{|E|} is a binary label. \forall y_i =1 s.t. (h r E_i) \in KG
-        At each mini-batch construction, we subsample(y), hence n - |new_y| << |E|
-        new_y contains all 1's if sum(y)< neg_sample ratio
-        new_y contains
-    Parameters
-    ----------
-    train_set_idx
-        Indexed triples for the training.
-    entity_idxs
-        mapping.
-    relation_idxs
-        mapping.
-    form
-        ?
-    store
-        ?
-    label_smoothing_rate
-        ?
-    Returns
-    -------
-    torch.utils.data.Dataset
-    """
+    Constructs a dataset for the KvsSample training strategy, specifically designed for knowledge graph embedding models.
+    This dataset formulation is aimed at handling the imbalance between positive and negative examples for each
+    (head, relation) pair by subsampling tail entities. The subsampling ensures a balanced representation of positive
+    and negative examples in each training batch, according to the specified negative sampling ratio.
+
+    The dataset is defined as \(D:= \{(x,y)_i\}_{i=1}^{N}\), where:
+    - \(x: (h,r)\) is a unique head entity \(h \in E\) and a relation \(r \in R\).
+    - \(y \in \{0,1\}^{|E|}\) is a binary label vector, where \(y_i = 1\) if and only if \((h, r, E_i) \in KG\).
+
+    At each mini-batch construction, we subsample \(y\), hence \(|new_y| \ll |E|\).
+    The new \(y\) contains all 1's if \(sum(y) <\) neg_sample_ratio; otherwise, it contains a balanced mix of 1's and 0's.
+ + Parameters + ---------- + train_set : np.ndarray + An array of shape \((n, 3)\), where \(n\) is the number of triples in the dataset. Each row in the array + represents a triple \((h, r, t)\), consisting of head entity index \(h\), relation index \(r\), and + tail entity index \(t\). + num_entities : int + The total number of unique entities in the dataset. + num_relations : int + The total number of unique relations in the dataset. + neg_sample_ratio : int + The ratio of negative samples to positive samples for each (head, relation) pair. If the number of + available positive samples is less than this ratio, additional negative samples are generated to meet the ratio. + label_smoothing_rate : float, default=0.0 + A parameter for label smoothing, aiming to mitigate overfitting by softening the hard labels. The labels + are adjusted towards a uniform distribution, with the smoothing rate determining the degree of softening. + + Attributes + ---------- + train_data : torch.IntTensor + A tensor containing the (head, relation) pairs derived from the input triples, used to index the training set. + train_target : list of numpy.ndarray + A list where each element corresponds to the tail entity indices associated with a given (head, relation) pair. + collate_fn : None or callable + A function to merge a list of samples to form a batch. If None, PyTorch's default collate function is used. + """ def __init__(self, train_set: np.ndarray, num_entities, num_relations, neg_sample_ratio: int = None, label_smoothing_rate: float = 0.0): @@ -492,10 +843,37 @@ def __init__(self, train_set: np.ndarray, num_entities, num_relations, neg_sampl # print(gc.get_referrers(self.train_target[0])) def __len__(self): + """ + Returns the total number of unique (head, relation) pairs in the dataset. + + Returns + ------- + int + The number of unique (head, relation) pairs. + """ assert len(self.train_data) == len(self.train_target) return len(self.train_data) def __getitem__(self, idx): + """ + Retrieves the data for the given index, including the (head, relation) pair, selected tail entity indices, + and their labels. Positive examples are sampled from the training set, and negative examples are generated + by randomly selecting tail entities not associated with the (head, relation) pair. + + Parameters + ---------- + idx : int + The index of the (head, relation) pair in the dataset. + + Returns + ------- + tuple + A tuple containing the following elements: + - x: The (head, relation) pair as a torch.Tensor. + - y_idx: The indices of selected tail entities, both positive and negative, as a torch.IntTensor. + - y_vec: The labels for the selected tail entities, with 1s indicating positive and 0s indicating negative + examples, as a torch.Tensor. + """ # (1) Get i.th unique (head,relation) pair. x = self.train_data[idx] # (2) Get tail entities given (1).
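For reference, a minimal usage sketch of the OnevsAllDataset interface documented above. This snippet is illustrative only and not part of the patch: the toy triples and the entity index map are invented, and it assumes, per the docstring, that the target dimension equals len(entity_idxs).

import numpy as np
import torch
from torch.utils.data import DataLoader

from dicee.dataset_classes import OnevsAllDataset

# Invented toy data: indexed triples (head, relation, tail) and an entity-to-index map.
entity_idxs = {"anna": 0, "berlin": 1, "germany": 2}
train_set_idx = np.array([[0, 0, 1],
                          [1, 0, 2]], dtype=np.int64)

# __getitem__ returns ((head, relation) indices, one-hot tail vector); collate_fn is None,
# so passing it to the DataLoader simply selects PyTorch's default collation.
dataset = OnevsAllDataset(train_set_idx, entity_idxs)
loader = DataLoader(dataset, batch_size=2, shuffle=True, collate_fn=dataset.collate_fn)

for x, y_vec in loader:
    # x: LongTensor of shape (batch, 2); y_vec: FloatTensor of shape (batch, target_dim).
    assert x.shape == (2, 2)
    assert y_vec.shape == (2, len(entity_idxs))

The other dataset classes in this patch follow the same pattern, differing mainly in what __getitem__ returns and in whether a custom collate_fn (for example, the negative-sampling one defined by BPE_NegativeSamplingDataset) has to be passed to the DataLoader.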