From 102f11d57891abd40a00ac43c05f978f5edd6954 Mon Sep 17 00:00:00 2001 From: Eugene Clark Date: Thu, 22 Feb 2024 13:31:34 -0500 Subject: [PATCH 1/4] Restore line that recomputes repeat subunit length --- src/ga4gh/vrs/normalize.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/ga4gh/vrs/normalize.py b/src/ga4gh/vrs/normalize.py index 6fda9bd1..14d95f52 100644 --- a/src/ga4gh/vrs/normalize.py +++ b/src/ga4gh/vrs/normalize.py @@ -192,6 +192,7 @@ def _normalize_allele(input_allele, data_proxy, rle_seq_limit=50): # an RLE allele. len_extended_alt = len(new_alleles[1]) len_extended_ref = len(extended_ref_seq) + repeat_subunit_length = math.gcd(len_extended_ref, len_extended_alt) if len_extended_alt > len_extended_ref: repeat_sequence = itertools.cycle(extended_ref_seq[:repeat_subunit_length]) From 0079aef6914df1140605c83911ebac8ab5d7a616 Mon Sep 17 00:00:00 2001 From: Eugene Clark Date: Thu, 22 Feb 2024 13:32:51 -0500 Subject: [PATCH 2/4] Added additional HGVS test case --- ....10:g.289464_289465insCACA-expected8].yaml | 527 ++++++++++++++++++ ...0:g.289464_289465insCACA-vo_as_dict8].yaml | 62 +++ tests/extras/test_allele_translator.py | 21 +- .../test_variation_normalizer_rest_dp.py | 4 +- 4 files changed, 611 insertions(+), 3 deletions(-) create mode 100644 tests/extras/cassettes/test_hgvs[NC_000019.10:g.289464_289465insCACA-expected8].yaml create mode 100644 tests/extras/cassettes/test_rest_dp_to_hgvs[NC_000019.10:g.289464_289465insCACA-vo_as_dict8].yaml diff --git a/tests/extras/cassettes/test_hgvs[NC_000019.10:g.289464_289465insCACA-expected8].yaml b/tests/extras/cassettes/test_hgvs[NC_000019.10:g.289464_289465insCACA-expected8].yaml new file mode 100644 index 00000000..f9ad64ff --- /dev/null +++ b/tests/extras/cassettes/test_hgvs[NC_000019.10:g.289464_289465insCACA-expected8].yaml @@ -0,0 +1,527 @@ +interactions: +- request: + body: null + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + User-Agent: + - python-requests/2.31.0 + method: GET + uri: http://localhost:5000/seqrepo/1/metadata/refseq:NC_000019.10 + response: + body: + string: "{\n \"added\": \"2016-08-24T08:19:02Z\",\n \"aliases\": [\n \"Ensembl:19\",\n + \ \"ensembl:19\",\n \"GRCh38:19\",\n \"GRCh38:chr19\",\n \"GRCh38.p1:19\",\n + \ \"GRCh38.p1:chr19\",\n \"GRCh38.p10:19\",\n \"GRCh38.p10:chr19\",\n + \ \"GRCh38.p11:19\",\n \"GRCh38.p11:chr19\",\n \"GRCh38.p12:19\",\n + \ \"GRCh38.p12:chr19\",\n \"GRCh38.p2:19\",\n \"GRCh38.p2:chr19\",\n + \ \"GRCh38.p3:19\",\n \"GRCh38.p3:chr19\",\n \"GRCh38.p4:19\",\n \"GRCh38.p4:chr19\",\n + \ \"GRCh38.p5:19\",\n \"GRCh38.p5:chr19\",\n \"GRCh38.p6:19\",\n \"GRCh38.p6:chr19\",\n + \ \"GRCh38.p7:19\",\n \"GRCh38.p7:chr19\",\n \"GRCh38.p8:19\",\n \"GRCh38.p8:chr19\",\n + \ \"GRCh38.p9:19\",\n \"GRCh38.p9:chr19\",\n \"MD5:b0eba2c7bb5c953d1e06a508b5e487de\",\n + \ \"NCBI:NC_000019.10\",\n \"refseq:NC_000019.10\",\n \"SEGUID:AHxM5/L8jIX08UhBBkKXkiO5rhY\",\n + \ \"SHA1:007c4ce7f2fc8c85f4f148410642979223b9ae16\",\n \"VMC:GS_IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl\",\n + \ \"sha512t24u:IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl\",\n \"ga4gh:SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl\"\n + \ ],\n \"alphabet\": \"ACGNT\",\n \"length\": 58617616\n}\n" + headers: + Connection: + - close + Content-Length: + - '1035' + Content-Type: + - application/json + Date: + - Thu, 22 Feb 2024 17:17:55 GMT + Server: + - Werkzeug/2.2.2 Python/3.10.4 + status: + code: 200 + message: OK +- request: + body: null + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + User-Agent: + - python-requests/2.31.0 + method: GET + uri: http://localhost:5000/seqrepo/1/metadata/ga4gh:SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl + response: + body: + string: "{\n \"added\": \"2016-08-24T08:19:02Z\",\n \"aliases\": [\n \"Ensembl:19\",\n + \ \"ensembl:19\",\n \"GRCh38:19\",\n \"GRCh38:chr19\",\n \"GRCh38.p1:19\",\n + \ \"GRCh38.p1:chr19\",\n \"GRCh38.p10:19\",\n \"GRCh38.p10:chr19\",\n + \ \"GRCh38.p11:19\",\n \"GRCh38.p11:chr19\",\n \"GRCh38.p12:19\",\n + \ \"GRCh38.p12:chr19\",\n \"GRCh38.p2:19\",\n \"GRCh38.p2:chr19\",\n + \ \"GRCh38.p3:19\",\n \"GRCh38.p3:chr19\",\n \"GRCh38.p4:19\",\n \"GRCh38.p4:chr19\",\n + \ \"GRCh38.p5:19\",\n \"GRCh38.p5:chr19\",\n \"GRCh38.p6:19\",\n \"GRCh38.p6:chr19\",\n + \ \"GRCh38.p7:19\",\n \"GRCh38.p7:chr19\",\n \"GRCh38.p8:19\",\n \"GRCh38.p8:chr19\",\n + \ \"GRCh38.p9:19\",\n \"GRCh38.p9:chr19\",\n \"MD5:b0eba2c7bb5c953d1e06a508b5e487de\",\n + \ \"NCBI:NC_000019.10\",\n \"refseq:NC_000019.10\",\n \"SEGUID:AHxM5/L8jIX08UhBBkKXkiO5rhY\",\n + \ \"SHA1:007c4ce7f2fc8c85f4f148410642979223b9ae16\",\n \"VMC:GS_IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl\",\n + \ \"sha512t24u:IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl\",\n \"ga4gh:SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl\"\n + \ ],\n \"alphabet\": \"ACGNT\",\n \"length\": 58617616\n}\n" + headers: + Connection: + - close + Content-Length: + - '1035' + Content-Type: + - application/json + Date: + - Thu, 22 Feb 2024 17:17:56 GMT + Server: + - Werkzeug/2.2.2 Python/3.10.4 + status: + code: 200 + message: OK +- request: + body: null + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + User-Agent: + - python-requests/2.31.0 + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl?start=289464&end=289464 + response: + body: + string: '' + headers: + Connection: + - close + Content-Length: + - '0' + Content-Type: + - text/plain; charset=utf-8 + Date: + - Thu, 22 Feb 2024 17:17:56 GMT + Server: + - Werkzeug/2.2.2 Python/3.10.4 + status: + code: 200 + message: OK +- request: + body: null + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + User-Agent: + - python-requests/2.31.0 + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl?start=289463&end=289464 + response: + body: + string: T + headers: + Connection: + - close + Content-Length: + - '1' + Content-Type: + - text/plain; charset=utf-8 + Date: + - Thu, 22 Feb 2024 17:17:56 GMT + Server: + - Werkzeug/2.2.2 Python/3.10.4 + status: + code: 200 + message: OK +- request: + body: null + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + User-Agent: + - python-requests/2.31.0 + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl?start=289464&end=289465 + response: + body: + string: C + headers: + Connection: + - close + Content-Length: + - '1' + Content-Type: + - text/plain; charset=utf-8 + Date: + - Thu, 22 Feb 2024 17:17:56 GMT + Server: + - Werkzeug/2.2.2 Python/3.10.4 + status: + code: 200 + message: OK +- request: + body: null + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + User-Agent: + - python-requests/2.31.0 + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl?start=289465&end=289466 + response: + body: + string: A + headers: + Connection: + - close + Content-Length: + - '1' + Content-Type: + - text/plain; charset=utf-8 + Date: + - Thu, 22 Feb 2024 17:17:56 GMT + Server: + - Werkzeug/2.2.2 Python/3.10.4 + status: + code: 200 + message: OK +- request: + body: null + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + User-Agent: + - python-requests/2.31.0 + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl?start=289466&end=289467 + response: + body: + string: G + headers: + Connection: + - close + Content-Length: + - '1' + Content-Type: + - text/plain; charset=utf-8 + Date: + - Thu, 22 Feb 2024 17:17:56 GMT + Server: + - Werkzeug/2.2.2 Python/3.10.4 + status: + code: 200 + message: OK +- request: + body: null + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + User-Agent: + - python-requests/2.31.0 + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl?start=289464&end=289466 + response: + body: + string: CA + headers: + Connection: + - close + Content-Length: + - '2' + Content-Type: + - text/plain; charset=utf-8 + Date: + - Thu, 22 Feb 2024 17:17:56 GMT + Server: + - Werkzeug/2.2.2 Python/3.10.4 + status: + code: 200 + message: OK +- request: + body: null + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + User-Agent: + - python-requests/2.31.0 + method: GET + uri: https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=NC_000019.10&rettype=fasta&seq_start=289465&seq_stop=289466&tool=bioutils&email=biocommons-dev@googlegroups.com + response: + body: + string: !!binary | + H4sIAAAAAAAAALLzc443AAJDSz1DAysjC0sTM1NdMGWm4JGfm69QnFiQmZpXrJCcUQTkFufnpioY + WuoouAc5Zxhb6BUYmigEFGXmJhZVKjgWF6fmJuVUcjk7cnEBAAAA//8DAN6im+lYAAAA + headers: + Access-Control-Allow-Origin: + - '*' + Access-Control-Expose-Headers: + - X-RateLimit-Limit,X-RateLimit-Remaining + Cache-Control: + - private + Connection: + - Keep-Alive + Content-Disposition: + - attachment; filename="sequence.fasta" + Content-Security-Policy: + - upgrade-insecure-requests + Content-Type: + - text/plain + Date: + - Thu, 22 Feb 2024 17:17:58 GMT + Keep-Alive: + - timeout=4, max=40 + NCBI-PHID: + - 939B247C4D7BE0F5000061C9999F8492.1.1.m_5 + NCBI-SID: + - FB6A5F3ECEC87F93_5A24SID + Referrer-Policy: + - origin-when-cross-origin + Server: + - Finatra + Set-Cookie: + - ncbi_sid=FB6A5F3ECEC87F93_5A24SID; domain=.nih.gov; path=/; expires=Sat, 22 + Feb 2025 17:17:59 GMT + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Transfer-Encoding: + - chunked + X-RateLimit-Limit: + - '3' + X-RateLimit-Remaining: + - '2' + X-UA-Compatible: + - IE=Edge + X-XSS-Protection: + - 1; mode=block + content-encoding: + - gzip + status: + code: 200 + message: OK +- request: + body: null + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + User-Agent: + - python-requests/2.31.0 + method: GET + uri: https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=NC_000019.10&rettype=fasta&seq_start=289466&seq_stop=289466&tool=bioutils&email=biocommons-dev@googlegroups.com + response: + body: + string: !!binary | + H4sIAAAAAAAAALLzc443AAJDSz1DAysjC0sTMzNdCKXgkZ+br1CcWJCZmleskJxRBOQW5+emKhha + 6ii4BzlnGFvoFRiaKAQUZeYmFlUqOBYXp+Ym5VRyOXJxAQAAAP//AwDscSJIVwAAAA== + headers: + Access-Control-Allow-Origin: + - '*' + Access-Control-Expose-Headers: + - X-RateLimit-Limit,X-RateLimit-Remaining + Cache-Control: + - private + Connection: + - Keep-Alive + Content-Disposition: + - attachment; filename="sequence.fasta" + Content-Security-Policy: + - upgrade-insecure-requests + Content-Type: + - text/plain + Date: + - Thu, 22 Feb 2024 17:17:59 GMT + Keep-Alive: + - timeout=4, max=40 + NCBI-PHID: + - 939B247C4D7BE0F500005AC9A142192D.1.1.m_5 + NCBI-SID: + - DD7E32BFC75A3A29_2025SID + Referrer-Policy: + - origin-when-cross-origin + Server: + - Finatra + Set-Cookie: + - ncbi_sid=DD7E32BFC75A3A29_2025SID; domain=.nih.gov; path=/; expires=Sat, 22 + Feb 2025 17:18:00 GMT + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Transfer-Encoding: + - chunked + X-RateLimit-Limit: + - '3' + X-RateLimit-Remaining: + - '1' + X-UA-Compatible: + - IE=Edge + X-XSS-Protection: + - 1; mode=block + content-encoding: + - gzip + status: + code: 200 + message: OK +- request: + body: null + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + User-Agent: + - python-requests/2.31.0 + method: GET + uri: https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=NC_000019.10&rettype=fasta&seq_start=289465&seq_stop=289486&tool=bioutils&email=biocommons-dev@googlegroups.com + response: + body: + string: !!binary | + H4sIAAAAAAAAALLzc443AAJDSz1DAysjC0sTM1NdEGVhpuCRn5uvUJxYkJmaV6yQnFEE5Bbn56Yq + GFrqKLgHOWcYW+gVGJooBBRl5iYWVSo4Fhen5iblVHI5O7o7OzqHhIS4u7s7urs7O4NJLi4AAAAA + //8DAME0cP1sAAAA + headers: + Access-Control-Allow-Origin: + - '*' + Access-Control-Expose-Headers: + - X-RateLimit-Limit,X-RateLimit-Remaining + Cache-Control: + - private + Connection: + - Keep-Alive + Content-Disposition: + - attachment; filename="sequence.fasta" + Content-Security-Policy: + - upgrade-insecure-requests + Content-Type: + - text/plain + Date: + - Thu, 22 Feb 2024 17:18:00 GMT + Keep-Alive: + - timeout=4, max=40 + NCBI-PHID: + - 939B247C4D7BE0F5000044C9A8CCAD75.1.1.m_5 + NCBI-SID: + - C12E8A945D8F2E08_3217SID + Referrer-Policy: + - origin-when-cross-origin + Server: + - Finatra + Set-Cookie: + - ncbi_sid=C12E8A945D8F2E08_3217SID; domain=.nih.gov; path=/; expires=Sat, 22 + Feb 2025 17:18:00 GMT + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Transfer-Encoding: + - chunked + X-RateLimit-Limit: + - '3' + X-RateLimit-Remaining: + - '1' + X-UA-Compatible: + - IE=Edge + X-XSS-Protection: + - 1; mode=block + content-encoding: + - gzip + status: + code: 200 + message: OK +- request: + body: null + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + User-Agent: + - python-requests/2.31.0 + method: GET + uri: https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=NC_000019.10&rettype=fasta&seq_start=289463&seq_stop=289466&tool=bioutils&email=biocommons-dev@googlegroups.com + response: + body: + string: !!binary | + H4sIAAAAAAAAALLzc443AAJDSz1DAysjC0sTM2NdMGWm4JGfm69QnFiQmZpXrJCcUQTkFufnpioY + WuoouAc5Zxhb6BUYmigEFGXmJhZVKjgWF6fmJuVUcjmHODtycQEAAAD//wMAcnFU/1oAAAA= + headers: + Access-Control-Allow-Origin: + - '*' + Access-Control-Expose-Headers: + - X-RateLimit-Limit,X-RateLimit-Remaining + Cache-Control: + - private + Connection: + - Keep-Alive + Content-Disposition: + - attachment; filename="sequence.fasta" + Content-Security-Policy: + - upgrade-insecure-requests + Content-Type: + - text/plain + Date: + - Thu, 22 Feb 2024 17:18:01 GMT + Keep-Alive: + - timeout=4, max=40 + NCBI-PHID: + - 939B247C4D7BE0F5000041C9B0605DB1.1.1.m_5 + NCBI-SID: + - 55ABAA95384A40C3_7046SID + Referrer-Policy: + - origin-when-cross-origin + Server: + - Finatra + Set-Cookie: + - ncbi_sid=55ABAA95384A40C3_7046SID; domain=.nih.gov; path=/; expires=Sat, 22 + Feb 2025 17:18:01 GMT + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Transfer-Encoding: + - chunked + X-RateLimit-Limit: + - '3' + X-RateLimit-Remaining: + - '1' + X-UA-Compatible: + - IE=Edge + X-XSS-Protection: + - 1; mode=block + content-encoding: + - gzip + status: + code: 200 + message: OK +version: 1 diff --git a/tests/extras/cassettes/test_rest_dp_to_hgvs[NC_000019.10:g.289464_289465insCACA-vo_as_dict8].yaml b/tests/extras/cassettes/test_rest_dp_to_hgvs[NC_000019.10:g.289464_289465insCACA-vo_as_dict8].yaml new file mode 100644 index 00000000..518f7079 --- /dev/null +++ b/tests/extras/cassettes/test_rest_dp_to_hgvs[NC_000019.10:g.289464_289465insCACA-vo_as_dict8].yaml @@ -0,0 +1,62 @@ +interactions: +- request: + body: '{"variation": {"id": "ga4gh:VA.6WEakVj5V1rTYtbNX8Xqx4bMN4pMA6fh", "type": + "Allele", "digest": "6WEakVj5V1rTYtbNX8Xqx4bMN4pMA6fh", "location": {"id": "ga4gh:SL.L145KFLJeJ334YnOVm59pPlbdqfHhgXZ", + "type": "SequenceLocation", "digest": "L145KFLJeJ334YnOVm59pPlbdqfHhgXZ", "sequenceReference": + {"type": "SequenceReference", "refgetAccession": "SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl"}, + "start": 289464, "end": 289466}, "state": {"type": "ReferenceLengthExpression", + "length": 6, "sequence": "CACACA", "repeatSubunitLength": 2}}, "namespace": + "refseq"}' + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + Content-Length: + - '543' + Content-Type: + - application/json; charset=utf-8 + User-Agent: + - python-requests/2.31.0 + method: POST + uri: https://normalize.cancervariants.org/variation/vrs_allele_to_hgvs + response: + body: + string: '{"query":{"variation":{"id":"ga4gh:VA.6WEakVj5V1rTYtbNX8Xqx4bMN4pMA6fh","digest":"6WEakVj5V1rTYtbNX8Xqx4bMN4pMA6fh","type":"Allele","location":{"id":"ga4gh:SL.L145KFLJeJ334YnOVm59pPlbdqfHhgXZ","digest":"L145KFLJeJ334YnOVm59pPlbdqfHhgXZ","type":"SequenceLocation","sequenceReference":{"type":"SequenceReference","refgetAccession":"SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl"},"start":289464,"end":289466},"state":{"type":"ReferenceLengthExpression","length":6,"sequence":"CACACA","repeatSubunitLength":2}},"namespace":"refseq"},"warnings":[],"service_meta_":{"name":"variation-normalizer","version":"0.8.1-dev0","response_datetime":"2024-02-22T16:07:02.761099","url":"https://github.com/cancervariants/variation-normalization"},"vrs_python_meta_":{"name":"vrs-python","version":"2.0.0a2","url":"https://github.com/ga4gh/vrs-python"},"variations":["NC_000019.10:g.289466_289467insCACA"]}' + headers: + Connection: + - keep-alive + Content-Length: + - '878' + Content-Type: + - application/json + Date: + - Thu, 22 Feb 2024 16:07:02 GMT + Via: + - 1.1 04d5f6961d9b76b97c908d8ed9816378.cloudfront.net (CloudFront) + X-Amz-Cf-Id: + - xxXr_CZF5qrl4ENyT0nqEtGa0IIvdBgq_3fmAXk7H0QEvvqIG1Id1g== + X-Amz-Cf-Pop: + - EWR50-C1 + X-Amzn-Trace-Id: + - Root=1-65d77126-5f1bce5d7e02846f5333558c + X-Cache: + - Miss from cloudfront + x-amz-apigw-id: + - Ti6eGHBvCYcEGUg= + x-amzn-Remapped-Connection: + - keep-alive + x-amzn-Remapped-Content-Length: + - '878' + x-amzn-Remapped-Date: + - Thu, 22 Feb 2024 16:07:02 GMT + x-amzn-Remapped-Server: + - nginx + x-amzn-RequestId: + - c3910297-d5d7-4692-a2b6-f32ce444f100 + status: + code: 200 + message: OK +version: 1 diff --git a/tests/extras/test_allele_translator.py b/tests/extras/test_allele_translator.py index 55d805d6..990eecef 100644 --- a/tests/extras/test_allele_translator.py +++ b/tests/extras/test_allele_translator.py @@ -367,8 +367,27 @@ def test_to_spdi(tlr): 'type': 'SequenceLocation'}, 'state': {'sequence': 'T', 'type': 'LiteralSequenceExpression'}, 'type': 'Allele'}), + ("NC_000019.10:g.289464_289465insCACA", + {'digest': 'YFUR4oR_84b-rRFf0UzOjfI4eE5FTKAP', + 'id': 'ga4gh:VA.YFUR4oR_84b-rRFf0UzOjfI4eE5FTKAP', + 'type': 'Allele', + 'location': {'digest': 'L145KFLJeJ334YnOVm59pPlbdqfHhgXZ', + 'end': 289466, + 'id': 'ga4gh:SL.L145KFLJeJ334YnOVm59pPlbdqfHhgXZ', + 'sequenceReference': {'refgetAccession': 'SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl', + 'type': 'SequenceReference'}, + 'start': 289464, + 'type': 'SequenceLocation'}, + 'state': {'length': 6, + 'repeatSubunitLength': 2, + 'sequence': 'CACACA', + 'type': 'ReferenceLengthExpression'}}), ) +hgvs_tests_to_hgvs_map = { + "NC_000019.10:g.289464_289465insCACA": "NC_000019.10:g.289466_289467insCACA" +} + @pytest.mark.parametrize("hgvsexpr,expected", hgvs_tests) @pytest.mark.vcr @@ -380,7 +399,7 @@ def test_hgvs(tlr, hgvsexpr, expected): to_hgvs = tlr.translate_to(allele, "hgvs") assert 1 == len(to_hgvs) - assert hgvsexpr == to_hgvs[0] + assert hgvs_tests_to_hgvs_map.get(hgvsexpr, hgvsexpr) == to_hgvs[0] @pytest.mark.vcr diff --git a/tests/extras/test_variation_normalizer_rest_dp.py b/tests/extras/test_variation_normalizer_rest_dp.py index 1fc77c32..03aad933 100644 --- a/tests/extras/test_variation_normalizer_rest_dp.py +++ b/tests/extras/test_variation_normalizer_rest_dp.py @@ -2,7 +2,7 @@ from ga4gh.vrs import models from ga4gh.vrs.extras.variation_normalizer_rest_dp import VariationNormalizerRESTDataProxy -from tests.extras.test_allele_translator import hgvs_tests +from tests.extras.test_allele_translator import hgvs_tests, hgvs_tests_to_hgvs_map @pytest.fixture(scope="module") @@ -15,4 +15,4 @@ def variation_norm_rest_dp(): def test_rest_dp_to_hgvs(variation_norm_rest_dp, expected, vo_as_dict): vo = models.Allele(**vo_as_dict) resp = variation_norm_rest_dp.to_hgvs(vo) - assert resp == [expected] + assert resp == [hgvs_tests_to_hgvs_map.get(expected, expected)] From 27f9b1858fb0281d3e296b18eaf7d5e9621ec9fb Mon Sep 17 00:00:00 2001 From: Eugene Clark Date: Thu, 22 Feb 2024 14:23:56 -0500 Subject: [PATCH 3/4] Only recompute subunit length for ambiguous insertions or deletion/insertions --- src/ga4gh/vrs/normalize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ga4gh/vrs/normalize.py b/src/ga4gh/vrs/normalize.py index 14d95f52..a9da5760 100644 --- a/src/ga4gh/vrs/normalize.py +++ b/src/ga4gh/vrs/normalize.py @@ -192,9 +192,9 @@ def _normalize_allele(input_allele, data_proxy, rle_seq_limit=50): # an RLE allele. len_extended_alt = len(new_alleles[1]) len_extended_ref = len(extended_ref_seq) - repeat_subunit_length = math.gcd(len_extended_ref, len_extended_alt) if len_extended_alt > len_extended_ref: + repeat_subunit_length = math.gcd(len_extended_ref, len_extended_alt) repeat_sequence = itertools.cycle(extended_ref_seq[:repeat_subunit_length]) ref_derived_alt = ''.join([next(repeat_sequence) for _ in range(len_extended_alt)]) # TODO: The space and time efficiency can be improved by iterating over the new_allele[1] From 2bffc3f810964005419a02f93e9d1f3294306a9c Mon Sep 17 00:00:00 2001 From: Eugene Clark Date: Thu, 22 Feb 2024 14:24:17 -0500 Subject: [PATCH 4/4] Added test case with ambiguous deletion --- ...0019.10:g.289485_289500del-expected9].yaml | 499 ++++++++++++++++++ ...19.10:g.289485_289500del-vo_as_dict9].yaml | 62 +++ tests/extras/test_allele_translator.py | 18 +- 3 files changed, 578 insertions(+), 1 deletion(-) create mode 100644 tests/extras/cassettes/test_hgvs[NC_000019.10:g.289485_289500del-expected9].yaml create mode 100644 tests/extras/cassettes/test_rest_dp_to_hgvs[NC_000019.10:g.289485_289500del-vo_as_dict9].yaml diff --git a/tests/extras/cassettes/test_hgvs[NC_000019.10:g.289485_289500del-expected9].yaml b/tests/extras/cassettes/test_hgvs[NC_000019.10:g.289485_289500del-expected9].yaml new file mode 100644 index 00000000..e4d48088 --- /dev/null +++ b/tests/extras/cassettes/test_hgvs[NC_000019.10:g.289485_289500del-expected9].yaml @@ -0,0 +1,499 @@ +interactions: +- request: + body: null + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + User-Agent: + - python-requests/2.31.0 + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl?start=289484&end=289500 + response: + body: + string: GCGGGCAGATCACGAG + headers: + Connection: + - close + Content-Length: + - '16' + Content-Type: + - text/plain; charset=utf-8 + Date: + - Thu, 22 Feb 2024 19:17:58 GMT + Server: + - Werkzeug/2.2.2 Python/3.10.4 + status: + code: 200 + message: OK +- request: + body: null + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + User-Agent: + - python-requests/2.31.0 + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl?start=289483&end=289484 + response: + body: + string: G + headers: + Connection: + - close + Content-Length: + - '1' + Content-Type: + - text/plain; charset=utf-8 + Date: + - Thu, 22 Feb 2024 19:17:58 GMT + Server: + - Werkzeug/2.2.2 Python/3.10.4 + status: + code: 200 + message: OK +- request: + body: null + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + User-Agent: + - python-requests/2.31.0 + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl?start=289482&end=289483 + response: + body: + string: A + headers: + Connection: + - close + Content-Length: + - '1' + Content-Type: + - text/plain; charset=utf-8 + Date: + - Thu, 22 Feb 2024 19:17:58 GMT + Server: + - Werkzeug/2.2.2 Python/3.10.4 + status: + code: 200 + message: OK +- request: + body: null + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + User-Agent: + - python-requests/2.31.0 + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl?start=289481&end=289482 + response: + body: + string: G + headers: + Connection: + - close + Content-Length: + - '1' + Content-Type: + - text/plain; charset=utf-8 + Date: + - Thu, 22 Feb 2024 19:17:58 GMT + Server: + - Werkzeug/2.2.2 Python/3.10.4 + status: + code: 200 + message: OK +- request: + body: null + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + User-Agent: + - python-requests/2.31.0 + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl?start=289480&end=289481 + response: + body: + string: C + headers: + Connection: + - close + Content-Length: + - '1' + Content-Type: + - text/plain; charset=utf-8 + Date: + - Thu, 22 Feb 2024 19:17:58 GMT + Server: + - Werkzeug/2.2.2 Python/3.10.4 + status: + code: 200 + message: OK +- request: + body: null + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + User-Agent: + - python-requests/2.31.0 + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl?start=289479&end=289480 + response: + body: + string: C + headers: + Connection: + - close + Content-Length: + - '1' + Content-Type: + - text/plain; charset=utf-8 + Date: + - Thu, 22 Feb 2024 19:17:58 GMT + Server: + - Werkzeug/2.2.2 Python/3.10.4 + status: + code: 200 + message: OK +- request: + body: null + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + User-Agent: + - python-requests/2.31.0 + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl?start=289500&end=289501 + response: + body: + string: G + headers: + Connection: + - close + Content-Length: + - '1' + Content-Type: + - text/plain; charset=utf-8 + Date: + - Thu, 22 Feb 2024 19:17:58 GMT + Server: + - Werkzeug/2.2.2 Python/3.10.4 + status: + code: 200 + message: OK +- request: + body: null + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + User-Agent: + - python-requests/2.31.0 + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl?start=289501&end=289502 + response: + body: + string: T + headers: + Connection: + - close + Content-Length: + - '1' + Content-Type: + - text/plain; charset=utf-8 + Date: + - Thu, 22 Feb 2024 19:17:58 GMT + Server: + - Werkzeug/2.2.2 Python/3.10.4 + status: + code: 200 + message: OK +- request: + body: null + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + User-Agent: + - python-requests/2.31.0 + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl?start=289480&end=289484 + response: + body: + string: CGAG + headers: + Connection: + - close + Content-Length: + - '4' + Content-Type: + - text/plain; charset=utf-8 + Date: + - Thu, 22 Feb 2024 19:17:58 GMT + Server: + - Werkzeug/2.2.2 Python/3.10.4 + status: + code: 200 + message: OK +- request: + body: null + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + User-Agent: + - python-requests/2.31.0 + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl?start=289480&end=289501 + response: + body: + string: CGAGGCGGGCAGATCACGAGG + headers: + Connection: + - close + Content-Length: + - '21' + Content-Type: + - text/plain; charset=utf-8 + Date: + - Thu, 22 Feb 2024 19:17:58 GMT + Server: + - Werkzeug/2.2.2 Python/3.10.4 + status: + code: 200 + message: OK +- request: + body: null + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + User-Agent: + - python-requests/2.31.0 + method: GET + uri: https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=NC_000019.10&rettype=fasta&seq_start=289481&seq_stop=289501&tool=bioutils&email=biocommons-dev@googlegroups.com + response: + body: + string: !!binary | + H4sIAAAAAAAAALLzc443AAJDSz1DAysjC0sTC0NdIGVqYKjgkZ+br1CcWJCZmleskJxRBOQW5+em + Khha6ii4BzlnGFvoFRiaKAQUZeYmFlUqOBYXp+Ym5VRyObs7urs7uwOxo7tjiLMjmM/FBQAAAP// + AwCUo69uawAAAA== + headers: + Access-Control-Allow-Origin: + - '*' + Access-Control-Expose-Headers: + - X-RateLimit-Limit,X-RateLimit-Remaining + Cache-Control: + - private + Connection: + - Keep-Alive + Content-Disposition: + - attachment; filename="sequence.fasta" + Content-Security-Policy: + - upgrade-insecure-requests + Content-Type: + - text/plain + Date: + - Thu, 22 Feb 2024 19:18:01 GMT + Keep-Alive: + - timeout=4, max=40 + NCBI-PHID: + - 322C2CA7550E811500003085136AA300.1.1.m_5 + NCBI-SID: + - 09803CFF56A4005F_8568SID + Referrer-Policy: + - origin-when-cross-origin + Server: + - Finatra + Set-Cookie: + - ncbi_sid=09803CFF56A4005F_8568SID; domain=.nih.gov; path=/; expires=Sat, 22 + Feb 2025 19:18:01 GMT + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Transfer-Encoding: + - chunked + X-RateLimit-Limit: + - '3' + X-RateLimit-Remaining: + - '2' + X-UA-Compatible: + - IE=Edge + X-XSS-Protection: + - 1; mode=block + content-encoding: + - gzip + status: + code: 200 + message: OK +- request: + body: null + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + User-Agent: + - python-requests/2.31.0 + method: GET + uri: https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=NC_000019.10&rettype=fasta&seq_start=289501&seq_stop=289501&tool=bioutils&email=biocommons-dev@googlegroups.com + response: + body: + string: !!binary | + H4sIAAAAAAAAALLzc443AAJDSz1DAysjC0tTA0NdCKXgkZ+br1CcWJCZmleskJxRBOQW5+emKhha + 6ii4BzlnGFvoFRiaKAQUZeYmFlUqOBYXp+Ym5VRyuXNxAQAAAP//AwDTOp6eVwAAAA== + headers: + Access-Control-Allow-Origin: + - '*' + Access-Control-Expose-Headers: + - X-RateLimit-Limit,X-RateLimit-Remaining + Cache-Control: + - private + Connection: + - Keep-Alive + Content-Disposition: + - attachment; filename="sequence.fasta" + Content-Security-Policy: + - upgrade-insecure-requests + Content-Type: + - text/plain + Date: + - Thu, 22 Feb 2024 19:18:01 GMT + Keep-Alive: + - timeout=4, max=40 + NCBI-PHID: + - 939B247C4D7BE0F500003E4E6CC91B56.1.1.m_5 + NCBI-SID: + - 563574F7E99B22F2_181DSID + Referrer-Policy: + - origin-when-cross-origin + Server: + - Finatra + Set-Cookie: + - ncbi_sid=563574F7E99B22F2_181DSID; domain=.nih.gov; path=/; expires=Sat, 22 + Feb 2025 19:18:02 GMT + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Transfer-Encoding: + - chunked + X-RateLimit-Limit: + - '3' + X-RateLimit-Remaining: + - '1' + X-UA-Compatible: + - IE=Edge + X-XSS-Protection: + - 1; mode=block + content-encoding: + - gzip + status: + code: 200 + message: OK +- request: + body: null + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + User-Agent: + - python-requests/2.31.0 + method: GET + uri: https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=NC_000019.10&rettype=fasta&seq_start=289481&seq_stop=289521&tool=bioutils&email=biocommons-dev@googlegroups.com + response: + body: + string: !!binary | + H4sIAAAAAAAAAByHwQqDQAxE7/sV+YAqRi3s9lAIOaSnUsR7sWVBwe3K5uTfNzow8+bdn/xuLBhq + bG6tD73HynBtER45ZdBpW+JP4TsXU80pAoYLyMBz5+sNe3iVJU1lB1KN6bPujoVEWKwkNDKdbhQ5 + /VhmO+zcHwAA//8DAPeNIyZ/AAAA + headers: + Access-Control-Allow-Origin: + - '*' + Access-Control-Expose-Headers: + - X-RateLimit-Limit,X-RateLimit-Remaining + Cache-Control: + - private + Connection: + - Keep-Alive + Content-Disposition: + - attachment; filename="sequence.fasta" + Content-Security-Policy: + - upgrade-insecure-requests + Content-Type: + - text/plain + Date: + - Thu, 22 Feb 2024 19:18:02 GMT + Keep-Alive: + - timeout=4, max=40 + NCBI-PHID: + - D0BD2A910DB406F500004793D25D7785.1.1.m_5 + NCBI-SID: + - FC0E607D3ECDAFBC_6C1BSID + Referrer-Policy: + - origin-when-cross-origin + Server: + - Finatra + Set-Cookie: + - ncbi_sid=FC0E607D3ECDAFBC_6C1BSID; domain=.nih.gov; path=/; expires=Sat, 22 + Feb 2025 19:18:03 GMT + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Transfer-Encoding: + - chunked + X-RateLimit-Limit: + - '3' + X-RateLimit-Remaining: + - '1' + X-UA-Compatible: + - IE=Edge + X-XSS-Protection: + - 1; mode=block + content-encoding: + - gzip + status: + code: 200 + message: OK +version: 1 diff --git a/tests/extras/cassettes/test_rest_dp_to_hgvs[NC_000019.10:g.289485_289500del-vo_as_dict9].yaml b/tests/extras/cassettes/test_rest_dp_to_hgvs[NC_000019.10:g.289485_289500del-vo_as_dict9].yaml new file mode 100644 index 00000000..4fb9ec04 --- /dev/null +++ b/tests/extras/cassettes/test_rest_dp_to_hgvs[NC_000019.10:g.289485_289500del-vo_as_dict9].yaml @@ -0,0 +1,62 @@ +interactions: +- request: + body: '{"variation": {"id": "ga4gh:VA.Djc_SwVDFunsArqwUM00PciVaF70VTcU", "type": + "Allele", "digest": "Djc_SwVDFunsArqwUM00PciVaF70VTcU", "location": {"id": "ga4gh:SL.WTE7jyihK4qvRRzEqM7u5nSD4iS2k3xp", + "type": "SequenceLocation", "digest": "WTE7jyihK4qvRRzEqM7u5nSD4iS2k3xp", "sequenceReference": + {"type": "SequenceReference", "refgetAccession": "SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl"}, + "start": 289480, "end": 289501}, "state": {"type": "ReferenceLengthExpression", + "length": 5, "sequence": "CGAGG", "repeatSubunitLength": 16}}, "namespace": + "refseq"}' + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + Content-Length: + - '543' + Content-Type: + - application/json; charset=utf-8 + User-Agent: + - python-requests/2.31.0 + method: POST + uri: https://normalize.cancervariants.org/variation/vrs_allele_to_hgvs + response: + body: + string: '{"query":{"variation":{"id":"ga4gh:VA.Djc_SwVDFunsArqwUM00PciVaF70VTcU","digest":"Djc_SwVDFunsArqwUM00PciVaF70VTcU","type":"Allele","location":{"id":"ga4gh:SL.WTE7jyihK4qvRRzEqM7u5nSD4iS2k3xp","digest":"WTE7jyihK4qvRRzEqM7u5nSD4iS2k3xp","type":"SequenceLocation","sequenceReference":{"type":"SequenceReference","refgetAccession":"SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl"},"start":289480,"end":289501},"state":{"type":"ReferenceLengthExpression","length":5,"sequence":"CGAGG","repeatSubunitLength":16}},"namespace":"refseq"},"warnings":[],"service_meta_":{"name":"variation-normalizer","version":"0.8.1-dev0","response_datetime":"2024-02-22T19:18:10.026968","url":"https://github.com/cancervariants/variation-normalization"},"vrs_python_meta_":{"name":"vrs-python","version":"2.0.0a2","url":"https://github.com/ga4gh/vrs-python"},"variations":["NC_000019.10:g.289486_289501del"]}' + headers: + Connection: + - keep-alive + Content-Length: + - '874' + Content-Type: + - application/json + Date: + - Thu, 22 Feb 2024 19:18:10 GMT + Via: + - 1.1 00fd85d5c5d5bd788f272591be9ecbca.cloudfront.net (CloudFront) + X-Amz-Cf-Id: + - Dt6V2mzb1NcFUMK7T5rPjs3C2mQcCbnSjBhSUI_DANbFmmLGhAtcFA== + X-Amz-Cf-Pop: + - EWR50-C1 + X-Amzn-Trace-Id: + - Root=1-65d79df0-0604cab30d4f7c3d564e442b + X-Cache: + - Miss from cloudfront + x-amz-apigw-id: + - TjWdoEw_iYcEORQ= + x-amzn-Remapped-Connection: + - keep-alive + x-amzn-Remapped-Content-Length: + - '874' + x-amzn-Remapped-Date: + - Thu, 22 Feb 2024 19:18:10 GMT + x-amzn-Remapped-Server: + - nginx + x-amzn-RequestId: + - 049b99c8-0266-43a9-a5cb-0d312f09b260 + status: + code: 200 + message: OK +version: 1 diff --git a/tests/extras/test_allele_translator.py b/tests/extras/test_allele_translator.py index 990eecef..a7b93b8e 100644 --- a/tests/extras/test_allele_translator.py +++ b/tests/extras/test_allele_translator.py @@ -382,10 +382,26 @@ def test_to_spdi(tlr): 'repeatSubunitLength': 2, 'sequence': 'CACACA', 'type': 'ReferenceLengthExpression'}}), + ("NC_000019.10:g.289485_289500del", + {'digest': 'Djc_SwVDFunsArqwUM00PciVaF70VTcU', + 'id': 'ga4gh:VA.Djc_SwVDFunsArqwUM00PciVaF70VTcU', + 'type': 'Allele', + 'location': {'digest': 'WTE7jyihK4qvRRzEqM7u5nSD4iS2k3xp', + 'end': 289501, + 'id': 'ga4gh:SL.WTE7jyihK4qvRRzEqM7u5nSD4iS2k3xp', + 'sequenceReference': {'refgetAccession': 'SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl', + 'type': 'SequenceReference'}, + 'start': 289480, + 'type': 'SequenceLocation'}, + 'state': {'length': 5, + 'repeatSubunitLength': 16, + 'sequence': 'CGAGG', + 'type': 'ReferenceLengthExpression'}}), ) hgvs_tests_to_hgvs_map = { - "NC_000019.10:g.289464_289465insCACA": "NC_000019.10:g.289466_289467insCACA" + "NC_000019.10:g.289464_289465insCACA": "NC_000019.10:g.289466_289467insCACA", + "NC_000019.10:g.289485_289500del": "NC_000019.10:g.289486_289501del" }