updated docstrings

Bayer-Group · Nov 20, 2024 · 843be1a · 843be1a
1 parent 0febb90
commit 843be1a
Show file tree

Hide file tree

Showing 12 changed files with 144 additions and 136 deletions.
diff --git a/docs/api/codelists/local_csv_codelist_factory.md b/docs/api/codelists/local_csv_codelist_factory.md
@@ -0,0 +1,3 @@
+# LocalCSVCodelistFactory
+
+::: phenex.codelists.codelists.LocalCSVCodelistFactory
diff --git a/mkdocs.yml b/mkdocs.yml
@@ -27,6 +27,7 @@ nav:
           - Cohort: api/phenotypes/cohort.md
       - Codelists:
           - Codelist: api/codelists/codelists.md
+          - LocalCSVCodelistFactory: api/codelists/local_csv_codelist_factory.md
       - Connectors: api/ibis_connect.md
   - License: LICENSE.md
 

diff --git a/phenex/codelists/codelists.py b/phenex/codelists/codelists.py
@@ -5,16 +5,15 @@
 
 class Codelist:
     """
-    A Codelist has two fields:
+    Codelist is a class that allows us to conveniently work with medical codes used in RWD analyses. A Codelist represents a (single) specific medical concept, such as 'atrial fibrillation' or 'myocardial infarction'. A Codelist is associated with a set of medical codes from one or multiple source vocabularies (such as ICD10CM or CPT); we call these vocabularies 'code types'. Code type is important, as there are no assurances that codes from different vocabularies (different code types) do not overlap. It is therefore highly recommended to always specify the code type when using a codelist.
+
+    Codelist is a simple class that stores the codelist as a dictionary. The dictionary is keyed by code type and each value is a list of codes. Codelist also provides convenience methods for reading from Excel, CSV or YAML files and for exporting to Excel files.
 
     Parameters:
-    name: Descriptive name of codelist
-    codelist: User can enter codelists as either a string, a list of strings
-    or a dictionary keyed by code type. In first two cases, the class will convert
-    the input to a dictionary with a single key None. All consumers of the Codelist
-    instance can then assume the codelist in that format.
+        name: Descriptive name of codelist
+        codelist: User can enter codelists as either a string, a list of strings or a dictionary keyed by code type. In the first two cases, the class will convert the input to a dictionary with a single key None. All consumers of the Codelist instance can then assume the codelist is in that format.
 
-    Example: 
+    Example:
     ```python
     # Initialize with a list
     cl = Codelist(
@@ -34,7 +33,7 @@ class Codelist:
     print(cl.codelist)
     {None: ['SBP']}
     ```
-    
+
     Example:
     ```python
     # Initialize with a dictionary
@@ -127,53 +126,50 @@ def from_excel(
         codelist_column: Optional[str] = "codelist",
     ) -> "Codelist":
         """
-        Load a codelist from an Excel file.
+         Load a single codelist located in an Excel file.
 
-        The Excel file should contain columns for code types, codes, and optionally
-        codelist names. Each row represents a code entry.
+         It is required that the Excel file contains a minimum of two columns for code and code_type. The actual column names can be specified using the code_column and code_type_column parameters.
 
-        The codelists can be in one sheet or spread across multiple sheets:
+         If multiple codelists exist in the same excel table, the codelist_column and codelist_name are required to point to the specific codelist of interest.
 
-        1. Single Sheet:
-        If all codelists are in one sheet, the sheet should have a column for codelist names.
-        Use codelist_name to point to the specific codelist of interest.
+         It is possible to specify the sheet name if the codelist is in a specific sheet.
 
-        Example:
-        ```markdown
-        | code_type | code   | codelist           |
-        |-----------|--------|--------------------|
-        | ICD-9     | 427.31 | atrial_fibrillation|
-        | ICD-10    | I48.0  | atrial_fibrillation|
-        | ICD-10    | I48.1  | atrial_fibrillation|
-        | ICD-10    | I48.2  | atrial_fibrillation|
-        | ICD-10    | I48.91 | atrial_fibrillation|
-        ```
-        
-        2. Multiple Sheets:
-        If codelists are spread across multiple sheets, each sheet should represent a single codelist.
-        Use sheet_name to point to the specific codelist of interest.
-        
-        Example:
-        ```markdown
-        | code_type | code   |
-        |-----------|--------|
-        | ICD-9     | 427.31 |
-        | ICD-10    | I48.0  |
-        | ICD-10    | I48.1  |
-        | ICD-10    | I48.2  |
-        | ICD-10    | I48.91 |
-        ```
-        
-        Parameters:
-            path: Path to the Excel file.
-            sheet_name: An optional label for the sheet to read from. If defined, the codelist will be taken from that sheet. If no sheet_name is defined, the first sheet is taken.
-            codelist_name: An optional name of the codelist which to extract. If defined, codelist_column must be present and the codelist_name must occur within the codelist_column.
-            code_column: The name of the column containing the codes.
-            code_type_column: The name of the column containing the code types.
-            codelist_column: The name of the column containing the codelist names.
+         1. Single table, single codelist: The table (whether an entire excel file, or a single sheet in an excel file) contains only one codelist. The table should have columns for code and code_type.
 
-        Returns:
-            Codelist instance.
+             ```markdown
+             | code_type | code   |
+             |-----------|--------|
+             | ICD-9     | 427.31 |
+             | ICD-10    | I48.0  |
+             | ICD-10    | I48.1  |
+             | ICD-10    | I48.2  |
+             | ICD-10    | I48.91 |
+             ```
+
+        2. Single table, multiple codelists: A single table (whether an entire file, or a single sheet in an excel file) contains multiple codelists. A column for the name of each codelist is required. Use codelist_name to point to the specific codelist of interest.
+
+             ```markdown
+             | code_type | code   | codelist           |
+             |-----------|--------|--------------------|
+             | ICD-9     | 427.31 | atrial_fibrillation|
+             | ICD-10    | I48.0  | atrial_fibrillation|
+             | ICD-10    | I48.1  | atrial_fibrillation|
+             | ICD-10    | I48.2  | atrial_fibrillation|
+             | ICD-10    | I48.91 | atrial_fibrillation|
+             ```
+
+
+
+         Parameters:
+             path: Path to the Excel file.
+             sheet_name: An optional label for the sheet to read from. If defined, the codelist will be taken from that sheet. If no sheet_name is defined, the first sheet is taken.
+             codelist_name: An optional name of the codelist to extract. If defined, codelist_column must be present and the codelist_name must occur within the codelist_column.
+             code_column: The name of the column containing the codes.
+             code_type_column: The name of the column containing the code types.
+             codelist_column: The name of the column containing the codelist names.
+
+         Returns:
+             Codelist instance.
         """
         import pandas as pd
 
@@ -220,17 +216,20 @@ def __repr__(self):
 
     def to_pandas(self) -> pd.DataFrame:
         """
-        Convert the codelist to a pandas DataFrame.
+        Export the codelist to a pandas DataFrame. The DataFrame will have three columns: code_type, code, and codelist.
         """
 
         _df = pd.DataFrame(self.to_tuples(), columns=["code_type", "code"])
-        _df['codelist'] = self.name
+        _df["codelist"] = self.name
         return _df
 
 
-
 class LocalCSVCodelistFactory:
-    """ """
+    """
+    LocalCSVCodelistFactory allows for the creation of multiple codelists from a single CSV file. Use this class when you have a single CSV file that contains multiple codelists.
+
+    To use, create an instance of the class and then call the `create_codelist` method with the name of the codelist you want to create; this codelist name must be an entry in the name_codelist_column.
+    """
 
     def __init__(
         self,
@@ -239,6 +238,13 @@ def __init__(
         name_codelist_column: str = "codelist",
         name_code_type_column: str = "code_type",
     ) -> None:
+        """
+        Parameters:
+            path: Path to the CSV file.
+            name_code_column: The name of the column containing the codes.
+            name_codelist_column: The name of the column containing the codelist names.
+            name_code_type_column: The name of the column containing the code types.
+        """
         self.path = path
         self.name_code_column = name_code_column
         self.name_codelist_column = name_codelist_column

diff --git a/phenex/filters/aggregator.py b/phenex/filters/aggregator.py
@@ -8,7 +8,7 @@ def __init__(
         aggregation_index=["PERSON_ID"],
         aggregation_function="sum",
         event_date_column="EVENT_DATE",
-        reduce=False
+        reduce=False,
     ):
         self.aggregation_index = aggregation_index
         self.aggregation_function = aggregation_function
@@ -40,7 +40,9 @@ def aggregate(self, input_table: Table):
         input_table = input_table.mutate(aggregated_date=aggregated_date)
 
         # Filter rows where the original date matches the aggregated date
-        input_table = input_table.filter(input_table[self.event_date_column] == input_table.aggregated_date)
+        input_table = input_table.filter(
+            input_table[self.event_date_column] == input_table.aggregated_date
+        )
 
         # Select the necessary columns
 
@@ -52,6 +54,7 @@ def aggregate(self, input_table: Table):
 
         return input_table
 
+
 class Nearest(VerticalDateAggregator):
     def __init__(self, **kwargs):
         super().__init__(aggregation_function="max", **kwargs)

diff --git a/phenex/filters/categorical_filter.py b/phenex/filters/categorical_filter.py
@@ -2,6 +2,7 @@
 from typing import List, Optional, Union
 from ibis.expr.types.relations import Table
 
+
 class CategoricalFilter(Filter):
     """
     This class filters events in an EventTable based on specified categorical values
@@ -19,10 +20,10 @@ class CategoricalFilter(Filter):
     """
 
     def __init__(
-            self,
-            column_name: str,
-            allowed_values: List[Union[str, int]],
-            domain: Optional[str] = None
+        self,
+        column_name: str,
+        allowed_values: List[Union[str, int]],
+        domain: Optional[str] = None,
     ):
         self.column_name = column_name
         self.allowed_values = allowed_values

diff --git a/phenex/ibis_connect.py b/phenex/ibis_connect.py
@@ -23,13 +23,14 @@ def _check_env_vars(*vars: str) -> None:
 
 
 def ibis_snowflake_connect(
-        SNOWFLAKE_USER: Optional[str] = None,
-        SNOWFLAKE_ACCOUNT: Optional[str] = None,
-        SNOWFLAKE_WAREHOUSE: Optional[str] = None,
-        SNOWFLAKE_DATABASE: Optional[str] = None,
-        SNOWFLAKE_SCHEMA: Optional[str] = None,
-        SNOWFLAKE_ROLE: Optional[str] = None,
-        SNOWFLAKE_PASSWORD: Optional[str] = None) -> BaseBackend:
+    SNOWFLAKE_USER: Optional[str] = None,
+    SNOWFLAKE_ACCOUNT: Optional[str] = None,
+    SNOWFLAKE_WAREHOUSE: Optional[str] = None,
+    SNOWFLAKE_DATABASE: Optional[str] = None,
+    SNOWFLAKE_SCHEMA: Optional[str] = None,
+    SNOWFLAKE_ROLE: Optional[str] = None,
+    SNOWFLAKE_PASSWORD: Optional[str] = None,
+) -> BaseBackend:
     """
     Establish a connection to Snowflake using Ibis. Variables for the connection can
     be passed either via this function call or as environment variables of the same name.

diff --git a/phenex/mappers.py b/phenex/mappers.py
@@ -58,7 +58,13 @@ def rename(self, table: Table) -> Table:
         mapping = copy.deepcopy(asdict(self))
         mapping.pop("NAME_TABLE")
         # delete optional params from mapping
-        for key in ["DATE_OF_BIRTH", "DATE_OF_DEATH", "YEAR_OF_BIRTH", "SEX", "ETHNICITY"]:
+        for key in [
+            "DATE_OF_BIRTH",
+            "DATE_OF_DEATH",
+            "YEAR_OF_BIRTH",
+            "SEX",
+            "ETHNICITY",
+        ]:
             if getattr(self, key) is None:
                 del mapping[key]
         return table.rename(**mapping)
@@ -118,6 +124,7 @@ class MeasurementTableColumnMapper(CodeTableColumnMapper):
 
     VALUE: str = "VALUE"
 
+
 @dataclass
 class ObservationPeriodTableMapper:
     NAME_TABLE: str = "OBSERVATION_PERIOD"
@@ -139,26 +146,30 @@ def rename(self, table: Table) -> Table:
         mapping.pop("NAME_TABLE")
         return table.rename(**mapping)
 
+
 #
 # OMOP Column Mappers
 #
 OMOPPersonTableColumnMapper = PersonTableColumnMapper(
-    NAME_TABLE="PERSON", PERSON_ID="PERSON_ID", 
+    NAME_TABLE="PERSON",
+    PERSON_ID="PERSON_ID",
     DATE_OF_BIRTH="BIRTH_DATETIME",
     YEAR_OF_BIRTH="YEAR_OF_BIRTH",
-    SEX="GENDER_CONCEPT_ID", ETHNICITY="ETHNICITY_CONCEPT_ID"
+    SEX="GENDER_CONCEPT_ID",
+    ETHNICITY="ETHNICITY_CONCEPT_ID",
 )
 
 OMOPDeathTableColumnMapper = PersonTableColumnMapper(
-    NAME_TABLE="DEATH", PERSON_ID="PERSON_ID",
-    DATE_OF_DEATH="DEATH_DATE"
+    NAME_TABLE="DEATH", PERSON_ID="PERSON_ID", DATE_OF_DEATH="DEATH_DATE"
 )
 
 OMOPPersonTableSourceColumnMapper = PersonTableColumnMapper(
-    NAME_TABLE="PERSON", PERSON_ID="PERSON_ID", 
+    NAME_TABLE="PERSON",
+    PERSON_ID="PERSON_ID",
     DATE_OF_BIRTH="BIRTH_DATETIME",
     YEAR_OF_BIRTH="YEAR_OF_BIRTH",
-    SEX="GENDER_SOURCE_VALUE", ETHNICITY="ETHNICITY_SOURCE_VALUE"
+    SEX="GENDER_SOURCE_VALUE",
+    ETHNICITY="ETHNICITY_SOURCE_VALUE",
 )
 
 OMOPConditionOccurrenceColumnMapper = CodeTableColumnMapper(
@@ -223,12 +234,14 @@ def rename(self, table: Table) -> Table:
 OMOPDomains = DomainsDictionary(**OMOPColumnMappers)
 
 
-
 #
 # Vera Column Mappers
 #
 VeraPersonTableColumnMapper = PersonTableColumnMapper(
-    NAME_TABLE="PERSON", PERSON_ID="PERSON_ID", DATE_OF_BIRTH="BIRTH_DATETIME", DATE_OF_DEATH="DEATH_DATETIME"
+    NAME_TABLE="PERSON",
+    PERSON_ID="PERSON_ID",
+    DATE_OF_BIRTH="BIRTH_DATETIME",
+    DATE_OF_DEATH="DEATH_DATETIME",
 )
 
 VeraConditionOccurrenceColumnMapper = CodeTableColumnMapper(
@@ -268,4 +281,4 @@ def rename(self, table: Table) -> Table:
 #
 # Domains
 #
-VeraDomains = DomainsDictionary(**VeraColumnMappers)
+VeraDomains = DomainsDictionary(**VeraColumnMappers)
diff --git a/phenex/phenotypes/categorical_phenotype.py b/phenex/phenotypes/categorical_phenotype.py
@@ -29,12 +29,13 @@ class HospitalizationPhenotype(Phenotype):
         _execute(tables: Dict[str, Table]) -> PhenotypeTable:
             Executes the filtering process on the provided tables and returns the filtered phenotype table.
     """
+
     def __init__(
         self,
         domain,
         column_name: str,
         allowed_values: List[str],
-        name = None,
+        name=None,
         date_range: DateRangeFilter = None,
         relative_time_range: Union[
             RelativeTimeRangeFilter, List[RelativeTimeRangeFilter]
@@ -43,7 +44,9 @@ def __init__(
     ):
         super(HospitalizationPhenotype, self).__init__()
 
-        self.categorical_filter = CategoricalFilter(column_name=column_name, allowed_values=allowed_values)
+        self.categorical_filter = CategoricalFilter(
+            column_name=column_name, allowed_values=allowed_values
+        )
         self.name = name
         self.date_range = date_range
         self.return_date = return_date
@@ -93,4 +96,4 @@ def _perform_date_selection(self, code_table):
             aggregator = Last()
         else:
             raise ValueError(f"Unknown return_date: {self.return_date}")
-        return aggregator.aggregate(code_table)
+        return aggregator.aggregate(code_table)
diff --git a/phenex/phenotypes/death_phenotype.py b/phenex/phenotypes/death_phenotype.py
@@ -3,7 +3,8 @@
 from ibis.expr.types.relations import Table
 from phenex.phenotypes.phenotype import Phenotype
 from phenex.tables import PhenotypeTable, is_phenex_person_table
-import ibis 
+import ibis
+
 
 class DeathPhenotype(Phenotype):
     """
@@ -20,10 +21,14 @@ class DeathPhenotype(Phenotype):
             Executes the phenotype calculation and returns a table with the filtered individuals.
     """
 
-    def __init__(self, name: str = "death", domain: str = "PERSON",
+    def __init__(
+        self,
+        name: str = "death",
+        domain: str = "PERSON",
         relative_time_range: Union[
             RelativeTimeRangeFilter, List[RelativeTimeRangeFilter]
-        ] = None):
+        ] = None,
+    ):
         self.name = name
         self.domain = domain
         self.children = []
@@ -46,4 +51,4 @@ def _execute(self, tables: Dict[str, Table]) -> PhenotypeTable:
             for rtr in self.relative_time_range:
                 death_table = rtr.filter(death_table)
         death_table = death_table.mutate(VALUE=ibis.null())
-        return death_table.mutate(EVENT_DATE=death_table.DATE_OF_DEATH)
+        return death_table.mutate(EVENT_DATE=death_table.DATE_OF_DEATH)
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		# LocalCSVCodelistFactory

		::: phenex.codelists.codelists.LocalCSVCodelistFactory