diff --git a/cin_validator/__main__.py b/cin_validator/__main__.py index d0d223fd..29624ba0 100644 --- a/cin_validator/__main__.py +++ b/cin_validator/__main__.py @@ -91,7 +91,7 @@ def run_all(filename: str, ruleset, select, output): # click.echo(full_issue_df) # click.echo(validator.multichild_issues) - click.echo(validator.data_files["ChildIdentifiers"]) + click.echo(validator.data_files["Assessments"]) @cli.command(name="test") diff --git a/cin_validator/cin_validator.py b/cin_validator/cin_validator.py index efb8aa57..6f17cc02 100644 --- a/cin_validator/cin_validator.py +++ b/cin_validator/cin_validator.py @@ -52,6 +52,7 @@ def convert_data(root: ET.Element): "Reviews": data_files.Reviews, "Section47": data_files.Section47, "Assessments": data_files.Assessments, + "AssessmentFactorsList": data_files.AssessmentFactorsList, "Disabilities": data_files.Disabilities, } return cin_tables diff --git a/cin_validator/ingress.py b/cin_validator/ingress.py index e8766263..59b30d90 100644 --- a/cin_validator/ingress.py +++ b/cin_validator/ingress.py @@ -86,12 +86,21 @@ class XMLtoCSV: columns=[ "LAchildID", "CINdetailsID", + "AssessmentID", "AssessmentActualStartDate", "AssessmentInternalReviewDate", "AssessmentAuthorisationDate", "AssessmentFactors", ] ) + AssessmentFactorsList = pd.DataFrame( + columns=[ + "LAchildID", + "CINdetailsID", + "AssessmentID", + "AssessmentFactor", + ] + ) CINplanDates = pd.DataFrame( columns=["LAchildID", "CINdetailsID", "CINPlanStartDate", "CINPlanEndDate"] ) @@ -121,7 +130,7 @@ class XMLtoCSV: columns=["LAchildID", "CINdetailsID", "CPPID", "CPPreviewDate"] ) - id_cols = ["LAchildID", "CINdetailsID", "CPPID"] + id_cols = ["LAchildID", "CINdetailsID", "AssessmentID", "CPPID"] def __init__(self, root): """ @@ -314,34 +323,54 @@ def create_Assessments(self, cin_detail): :rtype: DataFrame """ - def assessment_block_maker(assmnt): - assessment_dict = { - "LAchildID": self.LAchildID, - "CINdetailsID": self.CINdetailsID, - } - assessment_dict = 
get_values(elements, assessment_dict, assessment) - # the get_values function will not find AssessmentFactors on that level so it'll assign it to NaN - assessment_dict["AssessmentFactors"] = assmnt.text - assessments_list.append(assessment_dict) - return assessment_dict - assessments_list = [] columns = self.Assessments.columns elements = list(set(columns).difference(set(self.id_cols))) + self.AssessmentID = 0 assessments = cin_detail.findall("Assessments") + for assessment in assessments: - # all the assessment descriptors repeat to create a row for each assessment factor. + self.AssessmentID += 1 + assessment_dict = { + "LAchildID": self.LAchildID, + "CINdetailsID": self.CINdetailsID, + "AssessmentID": self.AssessmentID, + } + + assessment_dict = get_values(elements, assessment_dict, assessment) + + # the get_values function will not find AssessmentFactors on that level so we retrieve these separately. assessment_factors = assessment.find("FactorsIdentifiedAtAssessment") + assessment_factors_list = [] + assessment_columns = self.AssessmentFactorsList.columns + assessment_elements = list( + set(assessment_columns).difference(set(self.id_cols)) + ) + if assessment_factors is not None: # if statement handles the non-iterable NoneType that .find produces if the element is not present. 
for factor in assessment_factors: - assessment_block_maker(factor) + assessment_factors_dict = { + "LAchildID": self.LAchildID, + "CINdetailsID": self.CINdetailsID, + "AssessmentID": self.AssessmentID, + } + assessment_factors_dict = get_values( + assessment_elements, assessment_factors_dict, factor + ) + assessment_factors_dict["AssessmentFactor"] = factor.text + assessment_factors_list.append(assessment_factors_dict) + assessment_factors_df = pd.DataFrame(assessment_factors_list) + self.AssessmentFactorsList = pd.concat( + [self.AssessmentFactorsList, assessment_factors_df], + ignore_index=True, + ) + assessment_dict["AssessmentFactors"] = assessment_factors_df[ + "AssessmentFactor" + ].tolist() - # else needed to build blocks in instances where assessments aren't completed which means there's no assessment factors to build with. - else: - for assessment in assessments: - assessment_block_maker(assessment) + assessments_list.append(assessment_dict) assessments_df = pd.DataFrame(assessments_list) self.Assessments = pd.concat( diff --git a/cin_validator/rule_engine/__api.py b/cin_validator/rule_engine/__api.py index 79ee1197..9af3c18f 100644 --- a/cin_validator/rule_engine/__api.py +++ b/cin_validator/rule_engine/__api.py @@ -67,12 +67,22 @@ class CINTable(Enum): [ "LAchildID", "CINdetailsID", + "AssessmentID", "AssessmentActualStartDate", "AssessmentInternalReviewDate", "AssessmentAuthorisationDate", "AssessmentFactors", ], ) + AssessmentFactorsList = Enum( + "AssessmentFactorsList", + [ + "LAchildID", + "CINdetailsID", + "AssessmentID", + "AssessmentFactor", + ], + ) CINplanDates = Enum( "CINplanDates", ["LAchildID", "CINdetailsID", "CINPlanStartDate", "CINPlanEndDate"], diff --git a/cin_validator/rules/cin2022_23/rule_8873Q.py b/cin_validator/rules/cin2022_23/rule_8873Q.py index bce40676..2e1b87ef 100644 --- a/cin_validator/rules/cin2022_23/rule_8873Q.py +++ b/cin_validator/rules/cin2022_23/rule_8873Q.py @@ -9,9 +9,13 @@ Assessments = CINTable.Assessments 
LAchildID = Assessments.LAchildID -AssessmentFactors = Assessments.AssessmentFactors CINdetailsID = Assessments.CINdetailsID +AssessmentID = Assessments.AssessmentID AssessmentActualStartDate = Assessments.AssessmentActualStartDate +AssessmentFactors = Assessments.AssessmentFactors + +AssessmentFactorsList = CINTable.AssessmentFactorsList +AssessmentFactor = AssessmentFactorsList.AssessmentFactor CINdetails = CINTable.CINdetails LAchildID = CINdetails.LAchildID @@ -38,43 +42,48 @@ def validate( # PREPARING DATA df_ass = data_container[Assessments].copy() + df_asslist = data_container[AssessmentFactorsList].copy() df_cin = data_container[CINdetails].copy() # Before you begin, rename the index so that the initial row positions can be kept intact. df_ass.index.name = "ROW_ID" + df_asslist.index.name = "ROW_ID" df_cin.index.name = "ROW_ID" # Resetting the index causes the ROW_IDs to become columns of their respective DataFrames # so that they can come along when the merge is done. df_ass.reset_index(inplace=True) + df_asslist.reset_index(inplace=True) df_cin.reset_index(inplace=True) # lOGIC # Within a group, if there is only one group present and (N00181) = “21”, (N00103) must should = RC8 or RC9. 
- # Eliminates rows with more than 1 assessment per CINdetails group by determining if there's more than 1 AssessmentActualStartDate per CINdetailsID per child + # Eliminates rows with more than 1 assessment per CINdetails group by determining if there's more than 1 AssessmentID per CINdetailsID per child df_ass_merged = df_ass.merge(df_ass, on=["LAchildID", "CINdetailsID"]) df_ass_merged = df_ass_merged[ - ( - df_ass_merged["AssessmentActualStartDate_x"] - != df_ass_merged["AssessmentActualStartDate_y"] - ) + (df_ass_merged["AssessmentID_x"] != df_ass_merged["AssessmentID_y"]) ] more_than_1_ass = df_ass_merged["ROW_ID_x"].tolist() df_ass = df_ass[~df_ass["ROW_ID"].isin(more_than_1_ass)] - df_ass = df_ass[ - (df_ass[AssessmentFactors] == "21") - | (df_ass[AssessmentFactors] == "21 No factors identified") - | (df_ass[AssessmentFactors].str.contains("21")) + df_ass_merged = df_ass.merge( + df_asslist[["LAchildID", "CINdetailsID", "AssessmentID", "AssessmentFactor"]], + on=["LAchildID", "CINdetailsID", "AssessmentID"], + ) + + df_ass_merged = df_ass_merged[ + (df_ass_merged[AssessmentFactor] == "21") + | (df_ass_merged[AssessmentFactor] == "21 No factors identified") + | (df_ass_merged[AssessmentFactor].str.contains("21")) ] - # left merge means that only the filtered cpp children will be considered and there is no possibility of additonal children coming in from other tables. + # left merge means that only the filtered cin children will be considered and there is no possibility of additional children coming in from other tables. # get only the CINdetails groups with AssessmentFactors including 21. - merged_df = df_ass.copy().merge( - df_cin.copy(), + merged_df = df_ass_merged.merge( + df_cin, on=[LAchildID, "CINdetailsID"], how="left", suffixes=["_ass", "_cin"], @@ -88,7 +97,12 @@ def validate( # create an identifier for each error instance.
merged_df["ERROR_ID"] = tuple( - zip(merged_df[LAchildID], merged_df[CINdetailsID], merged_df[ReasonForClosure]) + zip( + merged_df[LAchildID], + merged_df[CINdetailsID], + merged_df[AssessmentID], + merged_df[ReasonForClosure], + ) ) # The merges were done on copies of df_ass, and df_cin so that the column names in dataframes themselves aren't affected by the suffixes. @@ -123,48 +137,56 @@ def test_validate(): "LAchildID": "child1", "AssessmentFactors": "21", # 0 pass "CINdetailsID": "cinID1", + "AssessmentID": "11", "AssessmentActualStartDate": "5/12/1993", }, { "LAchildID": "child1", "AssessmentFactors": "BOO", # 1 ignore: factor!=21 "CINdetailsID": "cinID2", + "AssessmentID": "12", "AssessmentActualStartDate": "5/12/1993", }, { "LAchildID": "child2", "AssessmentFactors": "BOO", # 2 ignore: factor!=21 "CINdetailsID": "cinID1", + "AssessmentID": "21", "AssessmentActualStartDate": "5/12/1993", }, { "LAchildID": "child3", "AssessmentFactors": "21", # 3 fail. reason!=RC8 "CINdetailsID": "cinID1", + "AssessmentID": "31", "AssessmentActualStartDate": "5/12/1993", }, { # absent "LAchildID": "child3", "AssessmentFactors": pd.NA, # 4 ignore: factor!=21 "CINdetailsID": "cinID2", + "AssessmentID": "32", "AssessmentActualStartDate": "5/12/1993", }, { # fail "LAchildID": "child3", "AssessmentFactors": "21", # 5 fail. 
reason!=RC8 "CINdetailsID": "cinID3", + "AssessmentID": "33", "AssessmentActualStartDate": "5/12/1993", }, { "LAchildID": "child3", "AssessmentFactors": "21", # ignore: more than one assessment in CIN episode "CINdetailsID": "cinID4", + "AssessmentID": "34", "AssessmentActualStartDate": "5/12/1993", }, { "LAchildID": "child3", "AssessmentFactors": "20", # 6 ignore: factor!=21 "CINdetailsID": "cinID4", + "AssessmentID": "35", "AssessmentActualStartDate": "5/12/1994", }, ] @@ -216,6 +238,9 @@ def test_validate(): validate, { Assessments: sample_ass, + AssessmentFactorsList: sample_ass.rename( + columns={"AssessmentFactors": "AssessmentFactor"} + ), CINdetails: sample_cin, }, ) @@ -254,7 +279,7 @@ def test_validate(): "ERROR_ID": ( "child3", # ChildID "cinID1", # CINdetailsID - # corresponding CPPstartDate + "31", # AssessmentID "RC10", ), "ROW_ID": [3], @@ -263,6 +288,7 @@ def test_validate(): "ERROR_ID": ( "child3", "cinID3", + "33", "RC10", ), "ROW_ID": [5], diff --git a/cin_validator/rules/cin2022_23/rule_8897Q.py b/cin_validator/rules/cin2022_23/rule_8897Q.py index ab9ba43d..0a87d37f 100644 --- a/cin_validator/rules/cin2022_23/rule_8897Q.py +++ b/cin_validator/rules/cin2022_23/rule_8897Q.py @@ -12,6 +12,10 @@ AssessmentAuthorisationDate = Assessments.AssessmentAuthorisationDate AssessmentFactors = Assessments.AssessmentFactors LAchildID = Assessments.LAchildID +AssessmentID = Assessments.AssessmentID + +AssessmentFactorsList = CINTable.AssessmentFactorsList +AssessmentFactor = AssessmentFactorsList.AssessmentFactor Header = CINTable.Header ReferenceDate = Header.ReferenceDate @@ -32,14 +36,16 @@ def validate( ): # PREPARING DATA - df = data_container[Assessments] + df_ass = data_container[Assessments] + df_asslist = data_container[AssessmentFactorsList] header = data_container[Header] ref_date_series = header[ReferenceDate] collection_start, collection_end = make_census_period(ref_date_series) # Before you begin, rename the index so that the initial row positions 
can be kept intact. - df.index.name = "ROW_ID" + df_ass.index.name = "ROW_ID" + df_ass.reset_index(inplace=True) # lOGIC # Implement rule logic as described by the Github issue. @@ -95,15 +101,21 @@ def validate( "24A", ] - condition1 = (df[AssessmentAuthorisationDate] >= collection_start) & ( - df[AssessmentAuthorisationDate].notna() + df_ass_merged = df_ass.merge( + df_asslist[["LAchildID", "AssessmentID", "AssessmentFactor"]], + on=["LAchildID", "AssessmentID"], + how="left", + ) + + condition1 = (df_ass_merged[AssessmentAuthorisationDate] >= collection_start) & ( + df_ass_merged[AssessmentAuthorisationDate].notna() ) - condition2 = (df[AssessmentFactors].notna()) & ( - df[AssessmentFactors].isin(factors_list) + condition2 = (df_ass_merged[AssessmentFactors].notna()) & ( + df_ass_merged[AssessmentFactor].isin(factors_list) ) # get all the data that fits the failing condition. Reset the index so that ROW_ID now becomes a column of df - df_issues = df[condition1 & ~condition2].reset_index() + df_issues = df_ass_merged[condition1 & ~condition2].reset_index() # SUBMIT ERRORS # Generate a unique ID for each instance of an error. 
In this case, @@ -118,7 +130,7 @@ def validate( link_id = tuple( zip( df_issues[LAchildID], - df_issues[AssessmentAuthorisationDate], + df_issues[AssessmentID], df_issues[AssessmentFactors], ) ) @@ -142,36 +154,43 @@ def test_validate(): [ { "LAchildID": "child1", + "AssessmentID": "11", "AssessmentFactors": pd.NA, "AssessmentAuthorisationDate": "26/05/2000", }, # Fails as no assessment factor code { "LAchildID": "child2", + "AssessmentID": "21", "AssessmentFactors": "99", "AssessmentAuthorisationDate": "26/05/2000", }, # Fails as incorrect assessment factor code { "LAchildID": "child3", + "AssessmentID": "31", "AssessmentFactors": "1A", "AssessmentAuthorisationDate": "26/05/2000", }, { "LAchildID": "child3", + "AssessmentID": "32", "AssessmentAuthorisationDate": "26/05/2000", "AssessmentFactors": pd.NA, }, # Fails as no factor selected { "LAchildID": "child4", + "AssessmentID": "41", "AssessmentFactors": "1A", "AssessmentAuthorisationDate": "26/05/2000", }, { "LAchildID": "child5", + "AssessmentID": "51", "AssessmentAuthorisationDate": pd.NA, "AssessmentFactors": pd.NA, }, { "LAchildID": "child5", + "AssessmentID": "52", "AssessmentAuthorisationDate": "26/05/1945", "AssessmentFactors": pd.NA, # Passes as before census year }, @@ -186,7 +205,16 @@ def test_validate(): sample_header = pd.DataFrame([{ReferenceDate: "31/03/2001"}]) # Run rule function passing in our sample data - result = run_rule(validate, {Assessments: fake_data, Header: sample_header}) + result = run_rule( + validate, + { + Assessments: fake_data, + AssessmentFactorsList: fake_data.rename( + columns={"AssessmentFactors": "AssessmentFactor"} + ), + Header: sample_header, + }, + ) # Use .type1_issues to check for the result of .push_type1_issues() which you used above. 
issues = result.type1_issues @@ -215,7 +243,7 @@ def test_validate(): { "ERROR_ID": ( "child1", - pd.to_datetime("26/05/2000", format="%d/%m/%Y", errors="coerce"), + "11", pd.NA, ), "ROW_ID": [0], @@ -223,7 +251,7 @@ def test_validate(): { "ERROR_ID": ( "child2", - pd.to_datetime("26/05/2000", format="%d/%m/%Y", errors="coerce"), + "21", "99", ), "ROW_ID": [1], @@ -231,7 +259,7 @@ def test_validate(): { "ERROR_ID": ( "child3", - pd.to_datetime("26/05/2000", format="%d/%m/%Y", errors="coerce"), + "32", pd.NA, ), "ROW_ID": [3], diff --git a/cin_validator/rules/cin2022_23/rule_8898.py b/cin_validator/rules/cin2022_23/rule_8898.py index 832cffeb..cbf03786 100644 --- a/cin_validator/rules/cin2022_23/rule_8898.py +++ b/cin_validator/rules/cin2022_23/rule_8898.py @@ -8,11 +8,15 @@ # Get tables and columns of interest from the CINTable object defined in rule_engine/__api.py Assessments = CINTable.Assessments -AssessmentActualStartDate = Assessments.AssessmentActualStartDate +AssessmentID = Assessments.AssessmentID AssessmentFactors = Assessments.AssessmentFactors +AssessmentActualStartDate = Assessments.AssessmentActualStartDate LAchildID = Assessments.LAchildID CINdetailsID = Assessments.CINdetailsID +AssessmentFactorsList = CINTable.AssessmentFactorsList +AssessmentFactor = AssessmentFactorsList.AssessmentFactor + # define characteristics of rule @rule_definition( @@ -23,29 +27,36 @@ # replace the message with the corresponding value for this rule, gotten from the excel sheet. message=" The assessment has more than one parental or child factors with the same code", # The column names tend to be the words within the < > signs in the github issue description. 
- affected_fields=[AssessmentActualStartDate], + affected_fields=[AssessmentFactors], ) def validate( data_container: Mapping[CINTable, pd.DataFrame], rule_context: RuleContext ): - df = data_container[Assessments] - df.index.name = "ROW_ID" - df.reset_index(inplace=True) + df_ass = data_container[Assessments] + df_asslist = data_container[AssessmentFactorsList] + df_ass.index.name = "ROW_ID" + df_ass.reset_index(inplace=True) + + df_ass_merged = df_ass.merge( + df_asslist[["LAchildID", "CINdetailsID", "AssessmentID", "AssessmentFactor"]], + on=["LAchildID", "CINdetailsID", "AssessmentID"], + ) # If there is more than one (N00181) for an assessment recorded, then none of the values should appear more than once for an assessment. - # .duplicated() returns True in all the locations where LAchildID-CINdetailsID-AssessmentActualStartDate-AssessmentFactors combination is a duplicate - condition = df.duplicated( - [LAchildID, CINdetailsID, AssessmentActualStartDate, AssessmentFactors], + # .duplicated() returns True in all the locations where LAchildID-CINdetailsID-AssessmentID-AssessmentFactors combination is a duplicate + condition = df_ass_merged.duplicated( + [LAchildID, CINdetailsID, AssessmentID, AssessmentFactor], keep=False, ) - df_issues = df[condition].reset_index() + + df_issues = df_ass_merged[condition].drop_duplicates().reset_index() df_issues["ERROR_ID"] = tuple( zip( df_issues[LAchildID], - df_issues[CINdetailsID], - df_issues[AssessmentActualStartDate], + df_issues[AssessmentID], + df_issues[AssessmentFactors], ) ) @@ -57,7 +68,7 @@ def validate( # Ensure that you do not change the ROW_ID, and ERROR_ID column names which are shown above. They are keywords in this project. 
rule_context.push_type_3( - table=Assessments, columns=[AssessmentActualStartDate], row_df=df_issues + table=Assessments, columns=[AssessmentFactors], row_df=df_issues ) @@ -68,26 +79,44 @@ def test_validate(): { LAchildID: "child1", CINdetailsID: "cinID1", - AssessmentFactors: "111", # fail: duplicate in Assessment group AssessmentActualStartDate: "01/01/2000", + AssessmentFactors: ("111", "111"), # fail: duplicate Assessment Factor + AssessmentID: "1", # fail: duplicate Assessment Factor }, { LAchildID: "child1", CINdetailsID: "cinID1", - AssessmentFactors: "111", # fail: duplicate in Assessment group - AssessmentActualStartDate: "01/01/2000", + AssessmentActualStartDate: "02/01/2000", + AssessmentFactors: ("111", "222"), + AssessmentID: "2", # pass: different Assessment Factor }, + ] + ) + sample_factors = pd.DataFrame( + [ # child1 { LAchildID: "child1", CINdetailsID: "cinID1", - AssessmentFactors: "111", # pass: different Assessment group - AssessmentActualStartDate: "02/01/2000", + AssessmentFactor: "111", # fail: duplicate Assessment Factor + AssessmentID: "1", + }, + { + LAchildID: "child1", + CINdetailsID: "cinID1", + AssessmentFactor: "111", # fail: duplicate Assessment Factor + AssessmentID: "1", + }, + { + LAchildID: "child1", + CINdetailsID: "cinID1", + AssessmentFactor: "111", # pass: different Assessment Factor + AssessmentID: "2", }, { LAchildID: "child1", CINdetailsID: "cinID2", - AssessmentFactors: "222", # pass: not duplicate - AssessmentActualStartDate: "02/01/2001", + AssessmentFactor: "222", # pass: different Assessment Factor + AssessmentID: "2", }, ] ) @@ -99,7 +128,13 @@ def test_validate(): ) # Run rule function passing in our sample data - result = run_rule(validate, {Assessments: sample_assessments}) + result = run_rule( + validate, + { + Assessments: sample_assessments, + AssessmentFactorsList: sample_factors, + }, + ) # Use .type3_issues to check for the result of .push_type3_issues() which you used above. 
issues_list = result.type3_issues @@ -112,7 +147,7 @@ def test_validate(): assert issue_table == Assessments issue_columns = issues.columns - assert issue_columns == [AssessmentActualStartDate] + assert issue_columns == [AssessmentFactors] # check that the location linking dataframe was formed properly. issue_rows = issues.row_df @@ -121,7 +156,7 @@ def test_validate(): # check that the failing locations are contained in a DataFrame having the appropriate columns. These lines do not change. assert isinstance(issue_rows, pd.DataFrame) assert issue_rows.columns.to_list() == ["ERROR_ID", "ROW_ID"] - + print(issue_rows) # Create the dataframe which you expect, based on the fake data you created. It should have two columns. # - The first column is ERROR_ID which contains the unique combination that identifies each error instance, which you decided on earlier. # - The second column in ROW_ID which contains a list of index positions that belong to each error instance. @@ -132,10 +167,10 @@ def test_validate(): { "ERROR_ID": ( "child1", - "cinID1", - pd.to_datetime("01/01/2000", format="%d/%m/%Y"), + "1", + ("111", "111"), ), - "ROW_ID": [0, 1], + "ROW_ID": [0], }, ] )