diff --git a/phenex/filters/filter.py b/phenex/filters/filter.py index 945a5ac..aa0b736 100644 --- a/phenex/filters/filter.py +++ b/phenex/filters/filter.py @@ -8,6 +8,7 @@ class Filter: but cannot trigger recursive execution. Fitlers can add columns but may not remove columns. All classes in the filters module should subclass this class. Subclasses must implement the _filter method. + """ def __init__(self): @@ -19,6 +20,7 @@ def filter(self, table: Table) -> Table: if not set(input_columns) <= set(table.columns): raise ValueError(f"Filter must not remove columns.") + table = table.select(input_columns) return table def _filter(self, table: Table) -> Table: diff --git a/phenex/test/phenotype_test_generator.py b/phenex/test/phenotype_test_generator.py index 29faf5b..56c6f60 100644 --- a/phenex/test/phenotype_test_generator.py +++ b/phenex/test/phenotype_test_generator.py @@ -99,7 +99,7 @@ def df_from_test_info(test_info): if test_info.get("dates") is not None: df[columnname_date] = test_info["dates"] else: - test_info[columnname_date] = ibis.null(type=datetime.date) + df[columnname_date] = None if test_info.get("values") is not None: df[columnname_value] = test_info["values"] diff --git a/phenex/test/phenotypes/test_clpt_anchor_phenotype.py b/phenex/test/phenotypes/test_clpt_anchor_phenotype.py deleted file mode 100644 index 43704e1..0000000 --- a/phenex/test/phenotypes/test_clpt_anchor_phenotype.py +++ /dev/null @@ -1,201 +0,0 @@ -import datetime, os -import pandas as pd - -from phenex.phenotypes.codelist_phenotype import CodelistPhenotype -from phenex.codelists import LocalCSVCodelistFactory -from phenex.filters.date_range_filter import DateRangeFilter -from phenex.filters.relative_time_range_filter import RelativeTimeRangeFilter -from phenex.test.util.dummy.generate_dummy_data import ( - sdf_and_tt_dummycodes_3variables, -) -from phenex.test.phenotype_test_generator import PhenotypeTestGenerator -from phenex.filters.value import ( - GreaterThan, - GreaterThanOrEqualTo, - LessThan, - LessThanOrEqualTo, - EqualTo, -) - - -class CodelistPhenotypeAnchorPhenotypeRelativeTimeRangeFilterTestGenerator( - PhenotypeTestGenerator -): - name_space = "clpt_anchor_phenotype" - - def define_input_tables(self): - """ - want to test eg breast cancer pre index, gives the anchor date, code before breast cancer - time component phenotype 1 : anchor date is index date - time component phenotype 2 : anchor date is phenotype 1 - """ - min_days = datetime.timedelta(days=90) - max_days = datetime.timedelta(days=180) - index_date = datetime.date(2022, 1, 1) - - dates = [ - # phenotype 1 before index - index_date - max_days, - index_date - min_days, # pass - index_date, - index_date + min_days, - index_date + max_days, - ] - - phenotype1_eventdates = [] - phenotype2_eventdates = [] - - pids = [] - i = 0 - daysdif_p1 = [] - daysdif_p2 = [] - - for phenotype1_eventdate in dates: - phenotype1_eventdates += [phenotype1_eventdate] * 5 - daysdif_p1 += [0] * 5 - - new = [] - new.append(phenotype1_eventdate - max_days) - new.append(phenotype1_eventdate - min_days) - new.append(phenotype1_eventdate) - new.append(phenotype1_eventdate + min_days) - new.append(phenotype1_eventdate + max_days) - - daysdif_p2 += [(x - phenotype1_eventdate).days for x in new] - phenotype2_eventdates += new - - for _unused in range(5): - pids.append(f"P{i}") - i += 1 - - N = len(phenotype1_eventdates) + len(phenotype2_eventdates) - - df = pd.DataFrame.from_dict( - { - "CODE": ["c1"] * len(phenotype1_eventdates) - + ["c2"] * len(phenotype2_eventdates), - "PERSON_ID": pids + pids, - "CODE_TYPE": ["ICD10CM"] * N, - "INDEX_DATE": [index_date] * N, - "EVENT_DATE": phenotype1_eventdates + phenotype2_eventdates, - "days_from_anchor": daysdif_p1 + daysdif_p2, - } - ) - - df["days_from_index"] = [ - y.days - for y in ( - [x - index_date for x in phenotype1_eventdates] - + [x - index_date for x in phenotype2_eventdates] - ) - ] - - info_input = {"name": "CONDITION_OCCURRENCE", "df": df} - - return [info_input] - - def define_phenotype_tests(self): - # INDEX PHENOTYPES - codelist_factory = LocalCSVCodelistFactory( - os.path.join(os.path.dirname(__file__), "../util/dummy/codelists.csv") - ) - - phenotypeindex1 = CodelistPhenotype( - name="anchor_g0_leq90", - codelist=codelist_factory.get_codelist("c1"), - domain="CONDITION_OCCURRENCE", - return_date="last", - relative_time_range=RelativeTimeRangeFilter( - min_days=GreaterThan(0), - max_days=LessThanOrEqualTo(90), - ), - ) - - phenotypeindex2 = CodelistPhenotype( - name="anchor_ge0_leq180", - codelist=codelist_factory.get_codelist("c1"), - domain="CONDITION_OCCURRENCE", - return_date="last", - relative_time_range=RelativeTimeRangeFilter( - min_days=GreaterThanOrEqualTo(0), - max_days=LessThanOrEqualTo(180), - ), - ) - - # second phenotype must occur any time prior to phenotype 1, but bounded within 1 year of index date - - #### USE INDEX PHENOTYEPS AS ANCHOR : - - phenotype1 = CodelistPhenotype( - name="p1", - codelist=codelist_factory.get_codelist("c2"), - domain="CONDITION_OCCURRENCE", - relative_time_range=RelativeTimeRangeFilter( - anchor_phenotype=phenotypeindex1, - min_days=GreaterThanOrEqualTo(91), - ), - ) - - t1 = {"name": "p1", "persons": ["P5"], "phenotype": phenotype1} - - phenotype2 = CodelistPhenotype( - name="p2", - codelist=codelist_factory.get_codelist("c2"), - domain="CONDITION_OCCURRENCE", - relative_time_range=RelativeTimeRangeFilter( - anchor_phenotype=phenotypeindex2, - max_days=LessThanOrEqualTo(90), - ), - ) - - t2 = { - "name": "p2", - "persons": [f"P{i}" for i in [1, 2, 6, 7, 11, 12]], - "phenotype": phenotype2, - } - - # Test that a baseline period works even with a linked time component. - # the anchor event occurs at some period pre baseline - # and we can add additional time components to the linked time component that ensures - # the verification phenotype is also within the baseline period - # phenotype3 = CodelistPhenotype( - # name="p3", - # codelist=codelist_factory.get_codelist("c2"), - # domain='CONDITION_OCCURRENCE', - # relative_time_range=[ - # RelativeTimeRangeFilter( - # anchor_phenotype=phenotypeindex2, - # max_days=LessThanOrEqualTo(90), - # ), - # RelativeTimeRangeFilter( - # max_days=LessThan(180) - # ), # ensure this event is within the baseline period - # ], - # ) - # t3 = {"persons": [f"P{i}" for i in [7, 11, 12]], "phenotype": phenotype3} - - phenotype4 = CodelistPhenotype( - name="p4", - codelist=codelist_factory.get_codelist("c2"), - domain="CONDITION_OCCURRENCE", - relative_time_range=RelativeTimeRangeFilter( - anchor_phenotype=phenotypeindex2, - min_days=GreaterThanOrEqualTo(-90), - max_days=LessThanOrEqualTo(90), - ), - ) - - t4 = { - "name": "p4", - "persons": [f"P{i}" for i in [1, 2, 3, 6, 7, 8, 11, 12, 13]], - "phenotype": phenotype4, - } - - test_infos = [t1, t2, t4] # t3 # TODO implement list of relative time ranges - - return test_infos - - -def test_anchor_phenotype(): - tg = CodelistPhenotypeAnchorPhenotypeRelativeTimeRangeFilterTestGenerator() - tg.run_tests() diff --git a/phenex/test/phenotypes/test_clpt_return_date.py b/phenex/test/phenotypes/test_clpt_return_date.py deleted file mode 100644 index 765182d..0000000 --- a/phenex/test/phenotypes/test_clpt_return_date.py +++ /dev/null @@ -1,190 +0,0 @@ -import datetime, os -import pandas as pd - -from phenex.phenotypes.codelist_phenotype import CodelistPhenotype -from phenex.codelists import LocalCSVCodelistFactory -from phenex.filters.date_range_filter import DateRangeFilter -from phenex.filters.relative_time_range_filter import RelativeTimeRangeFilter -from phenex.test.util.dummy.generate_dummy_data import ( - sdf_and_tt_dummycodes_3variables, -) -from phenex.test.phenotype_test_generator import PhenotypeTestGenerator -from phenex.filters.value import ( - GreaterThan, - GreaterThanOrEqualTo, - LessThan, - LessThanOrEqualTo, - EqualTo, -) - - -class CodelistPhenotypeReturnDateFilterTestGenerator(PhenotypeTestGenerator): - name_space = "clpt_return_date" - - def define_input_tables(self): - min_days = datetime.timedelta(days=90) - max_days = datetime.timedelta(days=180) - one_day = datetime.timedelta(days=1) - index_date = datetime.date(2022, 1, 1) - - self.event_dates = [ - index_date - min_days - one_day, # P0 c1 0 - index_date - min_days, # P1 c1 1 - index_date - min_days + one_day, # P2 c1 2 - index_date - min_days - one_day, # P0 c2 3 - index_date - min_days, # P1 c2 4 - index_date - min_days + one_day, # P2 c2 5 - index_date + min_days - one_day, # P0 c1 6 - index_date + min_days, # P1 c1 7 - index_date + min_days + one_day, # P2 c1 8 - index_date + min_days - one_day, # P0 c2 9 - index_date + min_days, # P1 c2 10 - index_date + min_days + one_day, # P2 c2 11 - ] - - """ - - idx - - -min_days +min_days - 1 7 - 4 10 - -min_days-1 -min_days+1 +min_days-1 +min_days+1 - 0 2 6 8 - 3 5 9 11 - """ - N = len(self.event_dates) - - df = pd.DataFrame.from_dict( - { - "CODE": ["c1"] * 3 + ["c2"] * 3 + ["c1"] * 3 + ["c2"] * 3, - "PERSON_ID": [f"P0" for x in list(range(N))], - "CODE_TYPE": ["ICD10CM"] * N, - "INDEX_DATE": [index_date] * N, - "EVENT_DATE": self.event_dates, - } - ) - - return [{"name": "CONDITION_OCCURRENCE", "df": df}] - - def define_phenotype_tests(self): - t1 = { - "name": "returndate", - "return_date": "all", - "persons": ["P0", "P0", "P0"] * 2, - "dates": self.event_dates[:3] + self.event_dates[6:9], - } - - t2 = { - "name": "l90", - "return_date": "all", - "persons": ["P0"], - "dates": [self.event_dates[2]], - "relative_time_range": RelativeTimeRangeFilter(max_days=LessThan(90)), - } - - t3 = { - "name": "leq90", - "return_date": "all", - "persons": ["P0", "P0"], - "dates": self.event_dates[1:3], - "relative_time_range": RelativeTimeRangeFilter( - max_days=LessThanOrEqualTo(90) - ), - } - - t4 = { - "name": "first_preindex", - "return_date": "first", - "persons": ["P0"], - "dates": [self.event_dates[0]], - } - - t5 = { - "name": "last_preindex", - "return_date": "last", - "persons": ["P0"], - "dates": [self.event_dates[2]], - "relative_time_range": RelativeTimeRangeFilter(when="before"), - } - - t6 = { - "name": "first_leq90", - "return_date": "first", - "persons": ["P0"], - "dates": [self.event_dates[1]], - "relative_time_range": RelativeTimeRangeFilter( - max_days=LessThanOrEqualTo(90) - ), - } - - # POST INDEX TESTS - t7 = { - "name": "last_postindex", - "return_date": "last", - "persons": ["P0"], - "dates": [self.event_dates[8]], - "relative_time_range": RelativeTimeRangeFilter(when="after"), - } - - t8 = { - "name": "first_postindex", - "return_date": "first", - "persons": ["P0"], - "dates": [self.event_dates[6]], - "relative_time_range": RelativeTimeRangeFilter(when="after"), - } - - t9 = { - "name": "postindex_leq90", - "return_date": "all", - "persons": ["P0", "P0"], - "dates": [self.event_dates[6], self.event_dates[7]], - "relative_time_range": RelativeTimeRangeFilter( - when="after", max_days=LessThanOrEqualTo(90) - ), - } - - # TODO implement nearest - - t10 = { - "name": "nearest_prior", - "return_date": "nearest", - "persons": ["P0"], - "dates": [self.event_dates[2]], - "relative_time_range": RelativeTimeRangeFilter( - when="before", min_days=GreaterThanOrEqualTo(0) - ), - } - - t11 = { - "name": "nearest_all", - "return_date": "nearest", - "persons": ["P0"], - "dates": [self.event_dates[2]], - "relative_time_range": RelativeTimeRangeFilter( - when="before", max_days=LessThanOrEqualTo(1000) - ), - } - - test_infos = [t1, t2, t3, t4, t5, t6, t7, t8, t9] # , t10, t11] - codelist_factory = LocalCSVCodelistFactory( - os.path.join(os.path.dirname(__file__), "../util/dummy/codelists.csv") - ) - for test_info in test_infos: - test_info["column_types"] = {f"{test_info['name']}_date": "date"} - - test_info["phenotype"] = CodelistPhenotype( - name=test_info["name"], - domain="CONDITION_OCCURRENCE", - codelist=codelist_factory.get_codelist("c1"), - relative_time_range=test_info.get("relative_time_range"), - return_date=test_info["return_date"], - ) - - return test_infos - - -def test_codelist_phenotype(): - tg = CodelistPhenotypeReturnDateFilterTestGenerator() - tg.run_tests() diff --git a/phenex/test/phenotypes/test_continuous_coverage_phenotype.py b/phenex/test/phenotypes/test_continuous_coverage_phenotype.py index 166caa3..8875bf8 100644 --- a/phenex/test/phenotypes/test_continuous_coverage_phenotype.py +++ b/phenex/test/phenotypes/test_continuous_coverage_phenotype.py @@ -2,7 +2,8 @@ import pandas as pd from phenex.phenotypes.continuous_coverage_phenotype import ContinuousCoveragePhenotype -from phenex.codelists import LocalCSVCodelistFactory +from phenex.phenotypes.codelist_phenotype import CodelistPhenotype +from phenex.codelists import LocalCSVCodelistFactory, Codelist from phenex.filters.date_range_filter import DateRangeFilter from phenex.filters.relative_time_range_filter import RelativeTimeRangeFilter @@ -124,11 +125,54 @@ def define_phenotype_tests(self): min_days=test_info.get("coverage_period_min"), when="after", ) - test_info["column_types"] = {f"{test_info['name']}_date": "date"} return test_infos +class ContinuousCoverageWithAnchorPhenotype(ContinuousCoveragePhenotypeTestGenerator): + name_space = "ccpt_anchorphenotype" + + def define_input_tables(self): + tables = super().define_input_tables() + + tables[0]["df"].drop(columns=["INDEX_DATE"], inplace=True) + df = pd.DataFrame() + n_patients = tables[0]["df"]["PERSON_ID"].unique().shape[0] + c1s = ["c1"] * 12 + df["CODE"] = c1s + ["c2"] * (n_patients - len(c1s)) + df["CODE_TYPE"] = "ICD10" + df["EVENT_DATE"] = datetime.date(2022, 1, 1) + df["PERSON_ID"] = ["P" + str(x) for x in range(n_patients)] + tables.append({"name": "CONDITION_OCCURRENCE", "df": df}) + return tables + + def define_phenotype_tests(self): + + entry = CodelistPhenotype( + name="entry", + codelist=Codelist(name="c1", codelist={"ICD10": ["c1"]}), + domain="CONDITION_OCCURRENCE", + ) + + cc1 = ContinuousCoveragePhenotype( + name="cc_prior_entry", + min_days=GreaterThanOrEqualTo(90), + when="before", + anchor_phenotype=entry, + ) + + persons = ["P7", "P10", "P11"] + + t1 = { + "name": "coverage_min_geq_90", + "persons": persons, + "phenotype": cc1, + } + + test_infos = [t1] + return test_infos + + def test_continuous_coverage_phenotypes(): spg = ContinuousCoveragePhenotypeTestGenerator() spg.run_tests() @@ -139,6 +183,12 @@ def test_continuous_coverage_return_last(): spg.run_tests() +def test_continuous_coverage_with_anchor_phenotype(): + spg = ContinuousCoverageWithAnchorPhenotype() + spg.run_tests() + + if __name__ == "__main__": test_continuous_coverage_phenotypes() test_continuous_coverage_return_last() + test_continuous_coverage_with_anchor_phenotype() diff --git a/phenex/test/util/check_equality.py b/phenex/test/util/check_equality.py index c34f67d..1a61b81 100644 --- a/phenex/test/util/check_equality.py +++ b/phenex/test/util/check_equality.py @@ -13,6 +13,7 @@ def check_equality( result.loc[:, "DUMMY"] = 1 expected = expected.to_pandas() expected.loc[:, "DUMMY"] = 1 + full_results = result.merge( expected, on=join_on, suffixes=("_result", "_expected"), how="outer" )