Skip to content

Commit

Permalink
fix merges
Browse files Browse the repository at this point in the history
  • Loading branch information
sprivite committed Nov 20, 2024
2 parents a0b5768 + 843be1a commit c5bab7a
Show file tree
Hide file tree
Showing 6 changed files with 170 additions and 43 deletions.
3 changes: 3 additions & 0 deletions docs/api/codelists/local_csv_codelist_factory.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# LocalCSVCodelistFactory

::: phenex.codelists.codelists.LocalCSVCodelistFactory
3 changes: 3 additions & 0 deletions docs/api/ibis_connect.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Ibis Connectors

::: phenex.ibis_connect
4 changes: 3 additions & 1 deletion mkdocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ nav:
- CodelistPhenotype: api/phenotypes/codelist_phenotype.md
- MeasurementPhenotype: api/phenotypes/measurement_phenotype.md
- AgePhenotype: api/phenotypes/age_phenotype.md
- SexPhenotype: api/phenotypes/sec_phenotype.md
- SexPhenotype: api/phenotypes/sex_phenotype.md
- DeathPhenotype: api/phenotypes/death_phenotype.md
- ContinuousCoveragePhenotype: api/phenotypes/continuous_coverage_phenotype.md
- AgePhenotype: api/phenotypes/age_phenotype.md
Expand All @@ -28,6 +28,8 @@ nav:
- Cohort: api/phenotypes/cohort.md
- Codelists:
- Codelist: api/codelists/codelists.md
- LocalCSVCodelistFactory: api/codelists/local_csv_codelist_factory.md
- Connectors: api/ibis_connect.md
- License: LICENSE.md

plugins:
Expand Down
119 changes: 97 additions & 22 deletions phenex/codelists/codelists.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,29 +5,37 @@

class Codelist:
"""
A Codelist has two fields:
Codelist is a class that allows us to conveniently work with medical codes used in RWD analyses. A Codelist represents a (single) specific medical concept, such as 'atrial fibrillation' or 'myocardial infarction'. A Codelist is associated with a set of medical codes from one or multiple source vocabularies (such as ICD10CM or CPT); we call these vocabularies 'code types'. Code type is important, as there are no assurances that codes from different vocabularies (different code types) do not overlap. It is therefore highly recommended to always specify the code type when using a codelist.
:param name: Descriptive name of codelist
:param codelist: User can enter codelists as either a string, a list of strings
or a dictionary keyed by code type. In first two cases, the class will convert
the input to a dictionary with a single key None. All consumers of the Codelist
instance can then assume the codelist in that format.
Codelist is a simple class that stores the codelist as a dictionary. The dictionary is keyed by code type and the value is a list of codes. Codelist also has various convenience methods such as read from excel, csv or yaml files, and export to excel files.
Parameters:
name: Descriptive name of codelist
codelist: User can enter codelists as either a string, a list of strings or a dictionary keyed by code type. In the first two cases, the class will convert the input to a dictionary with a single key None. All consumers of the Codelist instance can then assume the codelist is in that format.
Example:
```python
# Initialize with a list
>> cl = Codelist(
cl = Codelist(
['x', 'y', 'z'],
'mycodelist'
)
>> print(cl.codelist)
print(cl.codelist)
{None: ['x', 'y', 'z']}
```
Example:
```python
# Initialize with string
>> cl = Codelist(
cl = Codelist(
'SBP'
)
>> print(cl.codelist)
print(cl.codelist)
{None: ['SBP']}
```
Example:
```python
# Initialize with a dictionary
>> atrial_fibrillation_icd_codes = {
"ICD-9": [
Expand All @@ -40,11 +48,11 @@ class Codelist:
"I48.91", # Unspecified atrial fibrillation
]
}
>> cl = Codelist(
cl = Codelist(
atrial_fibrillation_icd_codes,
'atrial_fibrillation',
)
>> print(cl.codelist)
print(cl.codelist)
{
"ICD-9": [
"427.31" # Atrial fibrillation
Expand All @@ -56,6 +64,7 @@ class Codelist:
"I48.91", # Unspecified atrial fibrillation
]
}
```
"""

def __init__(
Expand All @@ -77,6 +86,26 @@ def __init__(
def from_yaml(cls, path: str) -> "Codelist":
"""
Load a codelist from a yaml file.
The YAML file should contain a dictionary where the keys are code types
(e.g., "ICD-9", "ICD-10") and the values are lists of codes for each type.
Example:
```yaml
ICD-9:
- "427.31" # Atrial fibrillation
ICD-10:
- "I48.0" # Paroxysmal atrial fibrillation
- "I48.1" # Persistent atrial fibrillation
- "I48.2" # Chronic atrial fibrillation
- "I48.91" # Unspecified atrial fibrillation
```
Parameters:
path: Path to the YAML file.
Returns:
Codelist instance.
"""
import yaml

Expand All @@ -97,15 +126,50 @@ def from_excel(
codelist_column: Optional[str] = "codelist",
) -> "Codelist":
"""
Load a codelist from an Excel file.
Load a single codelist located in an Excel file.
Parameters:
path: path to the excel file.
sheet_name: an optional label for the sheet to read from. If defined, the codelist will be taken from that sheet. If no sheet_name is defined, the first sheet is taken.
codelist_name: an optional name of the codelist which to extract. If defined, codelist_column must be present and the codelist_name must occur within the codelist_column.
code_column: the name of the column containing the codes.
code_type_column: the name of the column containing the code types.
codelist_column: the name of the column containing the codelist names.
The Excel file must contain at least two columns: one for the code and one for the code type. The actual column names can be specified using the code_column and code_type_column parameters.
If multiple codelists exist in the same excel table, the codelist_column and codelist_name are required to point to the specific codelist of interest.
It is possible to specify the sheet name if the codelist is in a specific sheet.
1. Single table, single codelist : The table (whether an entire excel file, or a single sheet in an excel file) contains only one codelist. The table should have columns for code and code_type.
```markdown
| code_type | code |
|-----------|--------|
| ICD-9 | 427.31 |
| ICD-10 | I48.0 |
| ICD-10 | I48.1 |
| ICD-10 | I48.2 |
| ICD-10 | I48.91 |
```
2. Single table, multiple codelists: A single table (whether an entire file, or a single sheet in an excel file) contains multiple codelists. A column for the name of each codelist is required. Use codelist_name to point to the specific codelist of interest.
```markdown
| code_type | code | codelist |
|-----------|--------|--------------------|
| ICD-9 | 427.31 | atrial_fibrillation|
| ICD-10 | I48.0 | atrial_fibrillation|
| ICD-10 | I48.1 | atrial_fibrillation|
| ICD-10 | I48.2 | atrial_fibrillation|
| ICD-10 | I48.91 | atrial_fibrillation|
```
Parameters:
path: Path to the Excel file.
sheet_name: An optional label for the sheet to read from. If defined, the codelist will be taken from that sheet. If no sheet_name is defined, the first sheet is taken.
codelist_name: An optional name of the codelist to extract. If defined, codelist_column must be present and the codelist_name must occur within the codelist_column.
code_column: The name of the column containing the codes.
code_type_column: The name of the column containing the code types.
codelist_column: The name of the column containing the codelist names.
Returns:
Codelist instance.
"""
import pandas as pd

Expand Down Expand Up @@ -152,7 +216,7 @@ def __repr__(self):

def to_pandas(self) -> pd.DataFrame:
"""
Convert the codelist to a pandas DataFrame.
Export the codelist to a pandas DataFrame. The DataFrame will have three columns: code_type, code, and codelist.
"""

_df = pd.DataFrame(self.to_tuples(), columns=["code_type", "code"])
Expand All @@ -161,7 +225,11 @@ def to_pandas(self) -> pd.DataFrame:


class LocalCSVCodelistFactory:
""" """
"""
LocalCSVCodelistFactory allows for the creation of multiple codelists from a single CSV file. Use this class when you have a single CSV file that contains multiple codelists.
To use, create an instance of the class and then call the `create_codelist` method with the name of the codelist you want to create; this codelist name must be an entry in the column named by name_codelist_column.
"""

def __init__(
self,
Expand All @@ -170,6 +238,13 @@ def __init__(
name_codelist_column: str = "codelist",
name_code_type_column: str = "code_type",
) -> None:
"""
Parameters:
path: Path to the CSV file.
name_code_column: The name of the column containing the codes.
name_codelist_column: The name of the column containing the codelist names.
name_code_type_column: The name of the column containing the code types.
"""
self.path = path
self.name_code_column = name_code_column
self.name_codelist_column = name_codelist_column
Expand Down
82 changes: 63 additions & 19 deletions phenex/ibis_connect.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,55 @@
from typing import Optional
import os
import ibis
from ibis.backends import BaseBackend


# Snowflake connection function
def _check_env_vars(*vars: str) -> None:
    """
    Verify that every named environment variable is defined.

    Args:
        *vars: Names of the environment variables that must be set.

    Raises:
        EnvironmentError: If one or more of the named variables is not set.
            The message lists every missing name, in the order given.
    """
    unset_names = []
    for name in vars:
        # A variable counts as missing only when it is absent; an empty
        # string value is still considered "set".
        if os.getenv(name) is None:
            unset_names.append(name)

    if not unset_names:
        return

    raise EnvironmentError(
        f"Missing required environment variables: {', '.join(unset_names)}. Add to .env file or set in the environment."
    )


def ibis_snowflake_connect(
    SNOWFLAKE_USER: Optional[str] = None,
    SNOWFLAKE_ACCOUNT: Optional[str] = None,
    SNOWFLAKE_WAREHOUSE: Optional[str] = None,
    SNOWFLAKE_DATABASE: Optional[str] = None,
    SNOWFLAKE_SCHEMA: Optional[str] = None,
    SNOWFLAKE_ROLE: Optional[str] = None,
    SNOWFLAKE_PASSWORD: Optional[str] = None,
) -> BaseBackend:
    """
    Establish a connection to Snowflake using Ibis.

    Each setting can be supplied either as an argument to this function or as
    an environment variable of the same name. When both are present the
    environment variable takes precedence and the argument acts as the
    fallback. Every setting except SNOWFLAKE_PASSWORD must be provided by one
    of these two routes.

    Args:
        SNOWFLAKE_USER: Snowflake user name.
        SNOWFLAKE_ACCOUNT: Snowflake account identifier.
        SNOWFLAKE_WAREHOUSE: Snowflake warehouse name.
        SNOWFLAKE_DATABASE: Snowflake database name.
        SNOWFLAKE_SCHEMA: Snowflake schema name.
        SNOWFLAKE_ROLE: Snowflake role name.
        SNOWFLAKE_PASSWORD: Snowflake password. If not specified (neither as
            an argument nor in the environment), authentication is attempted
            with the "externalbrowser" authenticator.

    Returns:
        BaseBackend: An Ibis backend connection to Snowflake.

    Raises:
        EnvironmentError: If any required setting is missing from both the
            arguments and the environment.
    """
    # Resolve every required setting once: environment first, argument second.
    required_settings = {
        "SNOWFLAKE_USER": os.getenv("SNOWFLAKE_USER", SNOWFLAKE_USER),
        "SNOWFLAKE_ACCOUNT": os.getenv("SNOWFLAKE_ACCOUNT", SNOWFLAKE_ACCOUNT),
        "SNOWFLAKE_WAREHOUSE": os.getenv("SNOWFLAKE_WAREHOUSE", SNOWFLAKE_WAREHOUSE),
        "SNOWFLAKE_DATABASE": os.getenv("SNOWFLAKE_DATABASE", SNOWFLAKE_DATABASE),
        "SNOWFLAKE_SCHEMA": os.getenv("SNOWFLAKE_SCHEMA", SNOWFLAKE_SCHEMA),
        "SNOWFLAKE_ROLE": os.getenv("SNOWFLAKE_ROLE", SNOWFLAKE_ROLE),
    }

    # Validate the *resolved* values, so that settings passed as arguments
    # satisfy the requirement even when the environment is empty.
    missing_vars = [name for name, value in required_settings.items() if value is None]
    if missing_vars:
        raise EnvironmentError(
            f"Missing required environment variables: {', '.join(missing_vars)}. Add to .env file or set in the environment."
        )

    connect_kwargs = {
        "user": required_settings["SNOWFLAKE_USER"],
        "account": required_settings["SNOWFLAKE_ACCOUNT"],
        "warehouse": required_settings["SNOWFLAKE_WAREHOUSE"],
        "database": required_settings["SNOWFLAKE_DATABASE"],
        "role": required_settings["SNOWFLAKE_ROLE"],
        "schema": required_settings["SNOWFLAKE_SCHEMA"],
    }

    # The password is optional; honour it whether it came from the
    # environment or from the argument, otherwise fall back to browser SSO.
    password = os.getenv("SNOWFLAKE_PASSWORD", SNOWFLAKE_PASSWORD)
    if password is not None:
        connect_kwargs["password"] = password
    else:
        connect_kwargs["authenticator"] = "externalbrowser"

    return ibis.snowflake.connect(**connect_kwargs)


# DuckDB connection function
def ibis_duckdb_connect(DUCKDB_PATH: Optional[str] = ":memory:") -> BaseBackend:
    """
    Establish a connection to DuckDB using Ibis.

    The database path can be supplied either as an argument to this function
    or as an environment variable of the same name. When the DUCKDB_PATH
    environment variable is set it takes precedence; otherwise the argument
    is used.

    Args:
        DUCKDB_PATH: Path to the DuckDB database. Defaults to ":memory:",
            DuckDB's sentinel for an in-memory database.

    Returns:
        BaseBackend: An Ibis backend connection to DuckDB.
    """
    # No mandatory environment check here: the argument (and its default)
    # must remain usable when DUCKDB_PATH is not exported.
    resolved_path = os.getenv("DUCKDB_PATH", DUCKDB_PATH)

    # NOTE(review): ibis.connect is documented as taking a resource URL/path as
    # its first positional argument - confirm this keyword-only form works with
    # the pinned ibis version.
    return ibis.connect(backend="duckdb", path=resolved_path)
2 changes: 1 addition & 1 deletion phenex/phenotypes/sex_phenotype.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ class SexPhenotype(Phenotype):
def __init__(
self,
name: str = "sex",
allowed_values: Optional[List[Union[str, int, float]]] = ["male", "female"],
allowed_values: Optional[List[Union[str, int, float]]] = None,
domain: str = "PERSON",
):
self.name = name
Expand Down

0 comments on commit c5bab7a

Please sign in to comment.