Skip to content

Commit

Permalink
Add docstrings to Corpus object
Browse files Browse the repository at this point in the history
  • Loading branch information
alanakbik committed Dec 7, 2024
1 parent 8ae1ab8 commit 0795f63
Showing 1 changed file with 43 additions and 0 deletions.
43 changes: 43 additions & 0 deletions flair/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -1372,6 +1372,14 @@ def unlabeled_identifier(self) -> str:


class Corpus(typing.Generic[T_co]):
    """The main object in Flair for holding a dataset used for training and testing.

    A corpus consists of three splits: A `train` split used for training, a `dev` split used for model selection
    and/or early stopping and a `test` split used for testing. All three splits are optional, so it is possible
    to create a corpus only using one or two splits. If the option `sample_missing_splits` is set to True,
    missing splits will be randomly sampled from the training split.
    """

def __init__(
self,
train: Optional[Dataset[T_co]] = None,
Expand All @@ -1381,6 +1389,26 @@ def __init__(
sample_missing_splits: Union[bool, str] = True,
random_seed: Optional[int] = None,
) -> None:
"""
Constructor method to initialize a :class:`Corpus`. You can define the train, dev and test split
by passing the corresponding Dataset object to the constructor. At least one split should be defined.
If the option `sample_missing_splits` is set to True, missing splits will be randomly sampled from the
train split.
In most cases, you will not use the constructor yourself. Rather, you will create a corpus using one of our
helper methods that read common NLP filetypes. For instance, you can use
:class:`flair.datasets.sequence_labeling.ColumnCorpus` to read CoNLL-formatted files directly into
a :class:`Corpus`.
Args:
train (torch.utils.data.Dataset): The split you use for model training.
dev (torch.utils.data.Dataset): A holdout split typically used for model selection or early stopping.
test (torch.utils.data.Dataset): The final test data to compute the score of the model.
name (str): A name that identifies the corpus.
sample_missing_splits (bool): If set to True, missing splits are sampled from train. If set to False,
missing splits are not sampled and left empty. Default: True.
random_seed (int): Set a random seed to control the sampling of missing splits.
"""
# set name
self.name: str = name

Expand Down Expand Up @@ -1419,14 +1447,17 @@ def __init__(

@property
def train(self) -> Optional[Dataset[T_co]]:
    """Training data of this corpus as a :class:`torch.utils.data.Dataset` (``None`` if no train split is set)."""
    training_split = self._train
    return training_split

@property
def dev(self) -> Optional[Dataset[T_co]]:
    """Development (dev) data of this corpus as a :class:`torch.utils.data.Dataset` (``None`` if no dev split is set)."""
    dev_split = self._dev
    return dev_split

@property
def test(self) -> Optional[Dataset[T_co]]:
    """Test data of this corpus as a :class:`torch.utils.data.Dataset` (``None`` if no test split is set)."""
    test_split = self._test
    return test_split

def downsample(
Expand Down Expand Up @@ -1833,13 +1864,25 @@ def add_label_noise(
)

def get_label_distribution(self):
    """Counts occurrences of each label in the corpus and returns them as a dictionary object.

    This allows you to get an idea of which label appears how often in the Corpus.
    Note that only the ``train`` split is counted; dev and test labels are not included.

    Returns:
        Dictionary mapping each label value to its number of occurrences in the train split.
    """
    # defaultdict(int) is the idiomatic zero-initialized counter
    # (equivalent to, but clearer than, defaultdict(lambda: 0)).
    class_to_count = defaultdict(int)
    for sentence in self.train:
        for label in sentence.labels:
            class_to_count[label.value] += 1
    return class_to_count

def get_all_sentences(self) -> ConcatDataset:
"""Returns all sentences (spanning all three splits) in the :class:`Corpus`.
Returns:
A :class:`torch.utils.data.Dataset` object that includes all sentences of this corpus.
"""
parts = []
if self.train:
parts.append(self.train)
Expand Down

0 comments on commit 0795f63

Please sign in to comment.