From 70995856de04ae300cb4ffd3e3388b40d40a5ca9 Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Wed, 22 Nov 2023 12:43:38 -0800 Subject: [PATCH] script/delete_records: Add option to match fields with regex pattern MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Uses rethinkdb's `match` command to filter for records with field value that matches the provided regex pattern. See rethinkdb docs for more details: https://rethinkdb.com/api/python/match/ This was prompted by our need to delete flu sequence records that have accessions with pattern "EPIEPI". We've fixed the accession with #148, but we need to manually remove the old duplicate sequence records because the flu sequence table uses the accession as the index.¹ ¹ https://github.com/nextstrain/fauna/blob/ec1feb679715890ae6d14efe11c979f27d6f1d6f/vdb/upload.py#L82 --- scripts/delete_records.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/scripts/delete_records.py b/scripts/delete_records.py index 021c723..fad9235 100644 --- a/scripts/delete_records.py +++ b/scripts/delete_records.py @@ -11,6 +11,7 @@ parser.add_argument("-db", "--database", help="database to delete from") parser.add_argument("-v", "--virus", default="flu", help="virus table to interact with") parser.add_argument("--filter", nargs="*", default=[], help="Filters for records to delete, i.e. inclusion_date: 2021-02-12") + parser.add_argument("--match", nargs="*", default=[], help="Match for records to delete with regex pattern, e.g. accession:^EPIEPI") parser.add_argument('--interval', nargs="*", default=[], help="Select date interval of values for fields, e.g. 
assay_date:2019-09-03,2023-10-25") parser.add_argument("--preview", action="store_true", help="Preview records to be deleted without deleting from db.") @@ -29,6 +30,15 @@ rethinkdb_command = r.table(args.virus).filter(delete_filters) + delete_matches = {} + for delete_match in args.match: + # Split on the first ":" only; the regex itself may contain colons, e.g. "(?:...)". + field, pattern = delete_match.split(":", 1) + rethinkdb_command = rethinkdb_command.filter(lambda doc: doc[field].match(pattern)) + delete_matches[field] = pattern + + print(f"Delete matches: {delete_matches}") + delete_intervals = {} for interval in args.interval: field, values = interval.split(':')