From 4f82732b14571a1504aee1faa39b24cbdc86dfbb Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Sat, 7 Dec 2024 15:11:01 +0100 Subject: [PATCH] Add docstrings to Corpus object --- flair/data.py | 32 +++++++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/flair/data.py b/flair/data.py index aa5d5ba2d..fe92a3cbd 100644 --- a/flair/data.py +++ b/flair/data.py @@ -1397,8 +1397,8 @@ def __init__( In most cases, you will not use the constructor yourself. Rather, you will create a corpus using one of our helper methods that read common NLP filetypes. For instance, you can use - :class:`flair.datasets.sequence_labeling.ColumnCorpus` to read CoNLL-formatted files directly into - a :class:`Corpus`. + :class:`flair.datasets.sequence_labeling.ColumnCorpus` to read CoNLL-formatted files directly into + a :class:`Corpus`. Args: train (torch.utils.data.Dataset): The split you use for model training. @@ -1611,9 +1611,17 @@ def _downsample_to_proportion(dataset: Dataset, proportion: float, random_seed: return splits[0] def obtain_statistics(self, label_type: Optional[str] = None, pretty_print: bool = True) -> Union[dict, str]: - """Print statistics about the class distribution and sentence sizes. + """Print statistics about the corpus, including the length of the sentences and the labels in the corpus. - only labels of sentences are taken into account + Args: + label_type (str): Optionally set this value to obtain statistics only for one specific type of label (such + as "ner" or "pos"). If not set, statistics for all labels will be returned. + pretty_print (bool): If set to True, returns pretty json (indented for readability). If not, the statistics are + returned as a plain dictionary. Default: True. + + Returns: + A pretty print formatted string in json format if pretty_print is set to True. + A dictionary holding a json if pretty_print is set to False. 
""" json_data = { "TRAIN": self._obtain_statistics_for(self.train, "TRAIN", label_type), @@ -1685,7 +1693,21 @@ def make_label_dictionary( ) -> Dictionary: """Creates a dictionary of all labels assigned to the sentences in the corpus. - :return: dictionary of labels + Args: + label_type (str): The name of the label type for which the dictionary should be created. Some corpora have + multiple layers of annotation, such as "pos" and "ner". In this case, you should choose the label type + you are interested in. + min_count (int): Optionally set this to exclude rare labels from the dictionary (i.e., labels seen fewer + than the provided integer value). + add_unk (bool): Optionally set this to True to include a "UNK" value in the dictionary. In most cases, this + is not needed since the label dictionary is well-defined, but some use cases might have open classes + and require this. + add_dev_test (bool): Optionally set this to True to construct the label dictionary not only from the train + split, but also from dev and test. This is only necessary if some labels never appear in train but do + appear in one of the other splits. + + Returns: + A Dictionary of all unique labels in the corpus. """ if min_count > 0 and not add_unk: add_unk = True