Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Split Assessment Factors into separate dataframe to reduce duplication of error rows passed to FE #475

Merged
merged 6 commits into from
May 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cin_validator/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ def run_all(filename: str, ruleset, select, output):

# click.echo(full_issue_df)
# click.echo(validator.multichild_issues)
click.echo(validator.data_files["ChildIdentifiers"])
click.echo(validator.data_files["Assessments"])


@cli.command(name="test")
Expand Down
1 change: 1 addition & 0 deletions cin_validator/cin_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ def convert_data(root: ET.Element):
"Reviews": data_files.Reviews,
"Section47": data_files.Section47,
"Assessments": data_files.Assessments,
"AssessmentFactorsList": data_files.AssessmentFactorsList,
"Disabilities": data_files.Disabilities,
}
return cin_tables
Expand Down
65 changes: 47 additions & 18 deletions cin_validator/ingress.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,12 +86,21 @@ class XMLtoCSV:
columns=[
"LAchildID",
"CINdetailsID",
"AssessmentID",
"AssessmentActualStartDate",
"AssessmentInternalReviewDate",
"AssessmentAuthorisationDate",
"AssessmentFactors",
]
)
AssessmentFactorsList = pd.DataFrame(
columns=[
"LAchildID",
"CINdetailsID",
"AssessmentID",
"AssessmentFactor",
]
)
CINplanDates = pd.DataFrame(
columns=["LAchildID", "CINdetailsID", "CINPlanStartDate", "CINPlanEndDate"]
)
Expand Down Expand Up @@ -121,7 +130,7 @@ class XMLtoCSV:
columns=["LAchildID", "CINdetailsID", "CPPID", "CPPreviewDate"]
)

id_cols = ["LAchildID", "CINdetailsID", "CPPID"]
id_cols = ["LAchildID", "CINdetailsID", "AssessmentID", "CPPID"]

def __init__(self, root):
"""
Expand Down Expand Up @@ -314,34 +323,54 @@ def create_Assessments(self, cin_detail):
:rtype: DataFrame
"""

def assessment_block_maker(assmnt):
assessment_dict = {
"LAchildID": self.LAchildID,
"CINdetailsID": self.CINdetailsID,
}
assessment_dict = get_values(elements, assessment_dict, assessment)
# the get_values function will not find AssessmentFactors on that level so it'll assign it to NaN
assessment_dict["AssessmentFactors"] = assmnt.text
assessments_list.append(assessment_dict)
return assessment_dict

assessments_list = []
columns = self.Assessments.columns
elements = list(set(columns).difference(set(self.id_cols)))

self.AssessmentID = 0
assessments = cin_detail.findall("Assessments")

for assessment in assessments:
# all the assessment descriptors repeat to create a row for each assessment factor.
self.AssessmentID += 1
assessment_dict = {
"LAchildID": self.LAchildID,
"CINdetailsID": self.CINdetailsID,
"AssessmentID": self.AssessmentID,
}

assessment_dict = get_values(elements, assessment_dict, assessment)

# the get_values function will not find AssessmentFactors on that level so we retrieve these separately.
assessment_factors = assessment.find("FactorsIdentifiedAtAssessment")
assessment_factors_list = []
assessment_columns = self.AssessmentFactorsList.columns
assessment_elements = list(
set(assessment_columns).difference(set(self.id_cols))
)

if assessment_factors is not None:
# if statement handles the non-iterable NoneType that .find produces if the element is not present.
for factor in assessment_factors:
assessment_block_maker(factor)
assessment_factors_dict = {
"LAchildID": self.LAchildID,
"CINdetailsID": self.CINdetailsID,
"AssessmentID": self.AssessmentID,
}
assessment_factors_dict = get_values(
assessment_elements, assessment_factors_dict, factor
)
assessment_factors_dict["AssessmentFactor"] = factor.text
assessment_factors_list.append(assessment_factors_dict)
assessment_factors_df = pd.DataFrame(assessment_factors_list)
self.AssessmentFactorsList = pd.concat(
[self.AssessmentFactorsList, assessment_factors_df],
ignore_index=True,
)
assessment_dict["AssessmentFactors"] = assessment_factors_df[
"AssessmentFactor"
].tolist()

# else needed to build blocks in instances where assessments aren't completed, which means there are no assessment factors to build with.
else:
for assessment in assessments:
assessment_block_maker(assessment)
assessments_list.append(assessment_dict)

assessments_df = pd.DataFrame(assessments_list)
self.Assessments = pd.concat(
Expand Down
10 changes: 10 additions & 0 deletions cin_validator/rule_engine/__api.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,12 +67,22 @@ class CINTable(Enum):
[
"LAchildID",
"CINdetailsID",
"AssessmentID",
"AssessmentActualStartDate",
"AssessmentInternalReviewDate",
"AssessmentAuthorisationDate",
"AssessmentFactors",
],
)
AssessmentFactorsList = Enum(
"AssessmentFactorsList",
[
"LAchildID",
"CINdetailsID",
"AssessmentID",
"AssessmentFactor",
],
)
CINplanDates = Enum(
"CINplanDates",
["LAchildID", "CINdetailsID", "CINPlanStartDate", "CINPlanEndDate"],
Expand Down
56 changes: 41 additions & 15 deletions cin_validator/rules/cin2022_23/rule_8873Q.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,13 @@

Assessments = CINTable.Assessments
LAchildID = Assessments.LAchildID
AssessmentFactors = Assessments.AssessmentFactors
CINdetailsID = Assessments.CINdetailsID
AssessmentID = Assessments.AssessmentID
AssessmentActualStartDate = Assessments.AssessmentActualStartDate
AssessmentFactors = Assessments.AssessmentFactors

AssessmentFactorsList = CINTable.AssessmentFactorsList
AssessmentFactor = AssessmentFactorsList.AssessmentFactor

CINdetails = CINTable.CINdetails
LAchildID = CINdetails.LAchildID
Expand All @@ -38,43 +42,48 @@ def validate(
# PREPARING DATA

df_ass = data_container[Assessments].copy()
df_asslist = data_container[AssessmentFactorsList].copy()
df_cin = data_container[CINdetails].copy()

# Before you begin, rename the index so that the initial row positions can be kept intact.
df_ass.index.name = "ROW_ID"
df_asslist.index.name = "ROW_ID"
df_cin.index.name = "ROW_ID"

# Resetting the index causes the ROW_IDs to become columns of their respective DataFrames
# so that they can come along when the merge is done.
df_ass.reset_index(inplace=True)
df_asslist.reset_index(inplace=True)
df_cin.reset_index(inplace=True)

# LOGIC
# Within a <CINDetails> group, if there is only one <Assessment> group present and <AssessmentFactors> (N00181) = “21”, <ReasonForClosure> (N00103) must = RC8 or RC9.

# Eliminates rows with more than 1 assessment per CINdetails group by determining if there's more than 1 AssessmentActualStartDate per CINdetailsID per child
# Eliminates rows with more than 1 assessment per CINdetails group by determining if there's more than 1 AssessmentID per CINdetailsID per child
df_ass_merged = df_ass.merge(df_ass, on=["LAchildID", "CINdetailsID"])
df_ass_merged = df_ass_merged[
(
df_ass_merged["AssessmentActualStartDate_x"]
!= df_ass_merged["AssessmentActualStartDate_y"]
)
(df_ass_merged["AssessmentID_x"] != df_ass_merged["AssessmentID_y"])
]
more_than_1_ass = df_ass_merged["ROW_ID_x"].tolist()

df_ass = df_ass[~df_ass["ROW_ID"].isin(more_than_1_ass)]

df_ass = df_ass[
(df_ass[AssessmentFactors] == "21")
| (df_ass[AssessmentFactors] == "21 No factors identified")
| (df_ass[AssessmentFactors].str.contains("21"))
df_ass_merged = df_ass.merge(
df_asslist[["LAchildID", "CINdetailsID", "AssessmentID", "AssessmentFactor"]],
on=["LAchildID", "CINdetailsID", "AssessmentID"],
)

# Keep only the factor rows that record code 21 ("No factors identified"), the code this rule is about.
# The first comparison previously tested "2B", a different factor code, which pulled unrelated assessments into the rule.
df_ass_merged = df_ass_merged[
    (df_ass_merged[AssessmentFactor] == "21")
    | (df_ass_merged[AssessmentFactor] == "21 No factors identified")
    | (df_ass_merged[AssessmentFactor].str.contains("21"))
]

# left merge means that only the filtered cpp children will be considered and there is no possibility of additional children coming in from other tables.
# left merge means that only the filtered cin children will be considered and there is no possibility of additional children coming in from other tables.

# get only the CINdetails groups with AssessmentFactors including 21.
merged_df = df_ass.copy().merge(
df_cin.copy(),
merged_df = df_ass_merged.merge(
df_cin,
on=[LAchildID, "CINdetailsID"],
how="left",
suffixes=["_ass", "_cin"],
Expand All @@ -88,7 +97,12 @@ def validate(

# create an identifier for each error instance.
merged_df["ERROR_ID"] = tuple(
zip(merged_df[LAchildID], merged_df[CINdetailsID], merged_df[ReasonForClosure])
zip(
merged_df[LAchildID],
merged_df[CINdetailsID],
merged_df[AssessmentID],
merged_df[ReasonForClosure],
)
)

# The merges were done on copies of df_ass, and df_cin so that the column names in dataframes themselves aren't affected by the suffixes.
Expand Down Expand Up @@ -123,48 +137,56 @@ def test_validate():
"LAchildID": "child1",
"AssessmentFactors": "21", # 0 pass
"CINdetailsID": "cinID1",
"AssessmentID": "11",
"AssessmentActualStartDate": "5/12/1993",
},
{
"LAchildID": "child1",
"AssessmentFactors": "BOO", # 1 ignore: factor!=21
"CINdetailsID": "cinID2",
"AssessmentID": "12",
"AssessmentActualStartDate": "5/12/1993",
},
{
"LAchildID": "child2",
"AssessmentFactors": "BOO", # 2 ignore: factor!=21
"CINdetailsID": "cinID1",
"AssessmentID": "21",
"AssessmentActualStartDate": "5/12/1993",
},
{
"LAchildID": "child3",
"AssessmentFactors": "21", # 3 fail. reason!=RC8
"CINdetailsID": "cinID1",
"AssessmentID": "31",
"AssessmentActualStartDate": "5/12/1993",
},
{ # absent
"LAchildID": "child3",
"AssessmentFactors": pd.NA, # 4 ignore: factor!=21
"CINdetailsID": "cinID2",
"AssessmentID": "32",
"AssessmentActualStartDate": "5/12/1993",
},
{ # fail
"LAchildID": "child3",
"AssessmentFactors": "21", # 5 fail. reason!=RC8
"CINdetailsID": "cinID3",
"AssessmentID": "33",
"AssessmentActualStartDate": "5/12/1993",
},
{
"LAchildID": "child3",
"AssessmentFactors": "21", # ignore: more than one assessment in CIN episode
"CINdetailsID": "cinID4",
"AssessmentID": "34",
"AssessmentActualStartDate": "5/12/1993",
},
{
"LAchildID": "child3",
"AssessmentFactors": "20", # 6 ignore: factor!=21
"CINdetailsID": "cinID4",
"AssessmentID": "35",
"AssessmentActualStartDate": "5/12/1994",
},
]
Expand Down Expand Up @@ -216,6 +238,9 @@ def test_validate():
validate,
{
Assessments: sample_ass,
AssessmentFactorsList: sample_ass.rename(
columns={"AssessmentFactors": "AssessmentFactor"}
),
CINdetails: sample_cin,
},
)
Expand Down Expand Up @@ -254,7 +279,7 @@ def test_validate():
"ERROR_ID": (
"child3", # ChildID
"cinID1", # CINdetailsID
# corresponding CPPstartDate
"31", # AssessmentID
"RC10",
),
"ROW_ID": [3],
Expand All @@ -263,6 +288,7 @@ def test_validate():
"ERROR_ID": (
"child3",
"cinID3",
"33",
"RC10",
),
"ROW_ID": [5],
Expand Down
Loading
Loading