From 46338736a7b20b32e661c7b5abb010b359a0ff73 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Thu, 7 Dec 2023 23:17:23 -0800 Subject: [PATCH 1/5] Cleaning up the size comparison table. % Reduced is now just PyUnicode vs. ShortSeq. --- README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 93a858d..add5465 100644 --- a/README.md +++ b/README.md @@ -2,13 +2,13 @@ ShortSeqs are compact and efficient Python objects that hold short sequences while using up to 73% less memory compared to built-in types. They are prehashed and comparable, they support slicing and indexing, and they easily convert back to their original string form. -| Sequence Length | PyUnicode Size* | PyBytes Size* | ShortSeq Size* | % Mem. Reduction | -|-----------------|----------------------------|--------------------------|---------------------------:|------------------| -| 0-32 nt | 56-88 bytes | 40-72 bytes | 32 bytes (fixed) | **20-64%** | -| 33-64 nt | 88-120 bytes | 72-104 bytes | 48 bytes (fixed) | **33-60%** | -| 65-1024 nt | 120-1080 bytes | 104-1064 bytes | 48-288 bytes | **53-73%** | +| Sequence Length | PyUnicode Size | PyBytes Size | ShortSeq Size | % Reduced | +|-----------------|----------------------------|--------------------------|--------------------------:|--------------------| +| 0-32 nt | 56-88 bytes | 40-72 bytes | 32 bytes (fixed) | **43-64%** | +| 33-64 nt | 88-120 bytes | 72-104 bytes | 48 bytes (fixed) | **45-60%** | +| 65-1024 nt | 120-1080 bytes | 104-1064 bytes | 56-288 bytes | **53-73%** | -* Object sizes were measured on Python 3.10 using `asizeof()` from the `pympler` package. +* Object sizes were measured on Python 3.10 using `asizeof()` from the `pympler` package. % Reduced is PyUnicode vs. ShortSeq In the table above, you can see that Python's memory representation of DNA sequences is larger than a C-style `char *` array, which would only need one byte per base. Using Cython we can move some of this memory representation out of Python space and into C space for faster facilities and a more compact bitwise representation. From 6fe5259fc69ca257cb27abe3d84c9ebd3ac3e3b8 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Thu, 7 Dec 2023 23:17:58 -0800 Subject: [PATCH 2/5] Expanding usage section to include recent feature additions --- README.md | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index add5465..e2a6040 100644 --- a/README.md +++ b/README.md @@ -25,20 +25,31 @@ mamba install -c bioconda -c conda-forge shortseq ```python import shortseq as sq -# Construct from PyUnicode +# Construct from PyUnicode or PyBytes seq_str = "ATGC" -seq_1 = sq.pack(seq_str) - -# Or, construct from PyBytes seq_bytes = b"ATGC" +seq_1 = sq.pack(seq_str) seq_2 = sq.pack(seq_bytes) # Verify outputs (optional) -assert seq_1 == seq_2 -assert seq_str == str(seq_1) == str(seq_2) -assert len(seq_str) == len(seq_1) == len(seq_2) +assert seq_1 == seq_2 == seq_str +assert len(seq_1) == len(seq_2) == len(seq_str) + +# TATTA G CGATTGACAGTTGTCCTGTAATAACG C CGGGTAAATTTGC C G +# TATTA C CGATTGACAGTTGTCCTGTAATAACG G CGGGTAAATTTGC T G +seq_3 = sq.pack("TATTAGCGATTGACAGTTGTCCTGTAATAACGCCGGGTAAATTTGCCG") +seq_4 = sq.pack("TATTACCGATTGACAGTTGTCCTGTAATAACGGCGGGTAAATTTGCTG") +seq_str = str(seq_4) + +# Slice and subscript +assert seq_4[5:15] == seq_str[5:15] +assert seq_4[-2] == seq_str[-2] + +# Vectorized hamming distance (differing bases) +hammd = sum(a!=b for a, b in zip(seq_3, seq_4)) +assert seq_3 ^ seq_4 == hammd == 3 -# Count unique sequences +# Count unique sequences similar to collections.Counter from shortseq import ShortSeqCounter counts = ShortSeqCounter([seq_bytes] * 10) assert counts == {sq.pack("ATGC"): 10} From b50477776a13e4a1423628eecc6017f5c654ac29 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Thu, 7 Dec 2023 23:19:43 -0800 Subject: [PATCH 3/5] Adding __sizeof__ to ShortSeqVar so that it can correctly report its heap allocated storage to sys.getsizeof(). Adding unit test for expected object sizes for min/max lengths. --- shortseq/short_seq_var.pyx | 3 ++ shortseq/tests/unit_tests_main.py | 57 +++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+) diff --git a/shortseq/short_seq_var.pyx b/shortseq/short_seq_var.pyx index 372490e..7faa393 100644 --- a/shortseq/short_seq_var.pyx +++ b/shortseq/short_seq_var.pyx @@ -77,6 +77,9 @@ cdef class ShortSeqVar: return pop_cnt + def __sizeof__(self): + return sizeof(ShortSeqVar) + _length_to_block_num(self._length) * sizeof(uint64_t) + def __repr__(self): # Clips the sequence to MAX_REPR_LEN characters to avoid overwhelming the debugger cdef unicode clipped_seq = _unmarshall_bytes_var(self._packed, MAX_REPR_LEN) diff --git a/shortseq/tests/unit_tests_main.py b/shortseq/tests/unit_tests_main.py index f0ab28b..74c4a75 100644 --- a/shortseq/tests/unit_tests_main.py +++ b/shortseq/tests/unit_tests_main.py @@ -1,4 +1,5 @@ import unittest +import sys from random import randint @@ -65,6 +66,23 @@ def test_incompatible_seq_chars(self): with self.assertRaisesRegex(Exception, "Unsupported base character"): sq.pack(prob) + """Are min and max length ShortSeqs the correct size?""" + + def test_size(self): + # ShortSeq64 + seq_min = sq.pack(rand_sequence(MIN_64_NT)) + seq_max = sq.pack(rand_sequence(MAX_64_NT)) + + self.assertEqual(sys.getsizeof(seq_min), 32) + self.assertEqual(sys.getsizeof(seq_max), 32) + + # ShortSeq128 + seq_min = sq.pack(rand_sequence(MIN_128_NT)) + seq_max = sq.pack(rand_sequence(MAX_128_NT)) + + self.assertEqual(sys.getsizeof(seq_min), 48) + self.assertEqual(sys.getsizeof(seq_max), 48) + """Checks that randomly generated sequences encode and decode correctly for the entire valid range of lengths.""" @@ -279,6 +297,45 @@ def str_ham(a, b): return sum(a_nt != b_nt for a_nt, b_nt in zip(a, b)) self.assertEqual(sq.pack(a) ^ sq.pack(b), str_ham(a, b)) + def test_readme(self): + # Construct from PyUnicode or PyBytes + seq_str = "ATGC" + seq_bytes = b"ATGC" + seq_1 = sq.pack(seq_str) + seq_2 = sq.pack(seq_bytes) + + # Verify outputs (optional) + assert seq_1 == seq_2 == seq_str + assert len(seq_1) == len(seq_2) == len(seq_str) + + # TATTA G CGATTGACAGTTGTCCTGTAATAACG C CGGGTAAATTTGC C G + # TATTA C CGATTGACAGTTGTCCTGTAATAACG G CGGGTAAATTTGC T G + seq_3 = sq.pack("TATTAGCGATTGACAGTTGTCCTGTAATAACGCCGGGTAAATTTGCCG") + seq_4 = sq.pack("TATTACCGATTGACAGTTGTCCTGTAATAACGGCGGGTAAATTTGCTG") + seq_str = str(seq_4) + + # Slice and subscript + assert seq_4[5:15] == seq_str[5:15] + assert seq_4[-2] == seq_str[-2] + + # Vectorized hamming distance (differing bases) + hammd = sum(a != b for a, b in zip(seq_3, seq_4)) + assert seq_3 ^ seq_4 == hammd == 3 + + # Count unique sequences similar to collections.Counter + from shortseq import ShortSeqCounter + counts = ShortSeqCounter([seq_bytes] * 10) + assert counts == {sq.pack("ATGC"): 10} + + """Are min and max length ShortSeqVars the correct size?""" + + def test_size(self): + seq_min = sq.pack(rand_sequence(MIN_VAR_NT)) + seq_max = sq.pack(rand_sequence(MAX_VAR_NT)) + + self.assertEqual(sys.getsizeof(seq_min), 56) + self.assertEqual(sys.getsizeof(seq_max), 288) + if __name__ == '__main__': From d5ec42892102fd921cf51bbc0768f88553ef49f9 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Fri, 8 Dec 2023 09:55:26 -0800 Subject: [PATCH 4/5] A little extra polish on the usage section --- README.md | 4 +--- shortseq/tests/unit_tests_main.py | 4 +--- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index e2a6040..8694245 100644 --- a/README.md +++ b/README.md @@ -35,10 +35,8 @@ seq_2 = sq.pack(seq_bytes) assert seq_1 == seq_2 == seq_str assert len(seq_1) == len(seq_2) == len(seq_str) -# TATTA G CGATTGACAGTTGTCCTGTAATAACG C CGGGTAAATTTGC C G -# TATTA C CGATTGACAGTTGTCCTGTAATAACG G CGGGTAAATTTGC T G seq_3 = sq.pack("TATTAGCGATTGACAGTTGTCCTGTAATAACGCCGGGTAAATTTGCCG") -seq_4 = sq.pack("TATTACCGATTGACAGTTGTCCTGTAATAACGGCGGGTAAATTTGCTG") +seq_4 = sq.pack("TATTACCGATTGACAGTTGTCCTGTAATAACGGCGGGTAAATTTGCTG") # 5M1X26M1X13M1X1M seq_str = str(seq_4) # Slice and subscript diff --git a/shortseq/tests/unit_tests_main.py b/shortseq/tests/unit_tests_main.py index 74c4a75..967e47f 100644 --- a/shortseq/tests/unit_tests_main.py +++ b/shortseq/tests/unit_tests_main.py @@ -308,10 +308,8 @@ def test_readme(self): assert seq_1 == seq_2 == seq_str assert len(seq_1) == len(seq_2) == len(seq_str) - # TATTA G CGATTGACAGTTGTCCTGTAATAACG C CGGGTAAATTTGC C G - # TATTA C CGATTGACAGTTGTCCTGTAATAACG G CGGGTAAATTTGC T G seq_3 = sq.pack("TATTAGCGATTGACAGTTGTCCTGTAATAACGCCGGGTAAATTTGCCG") - seq_4 = sq.pack("TATTACCGATTGACAGTTGTCCTGTAATAACGGCGGGTAAATTTGCTG") + seq_4 = sq.pack("TATTACCGATTGACAGTTGTCCTGTAATAACGGCGGGTAAATTTGCTG") # 5M1X26M1X13M1X1M seq_str = str(seq_4) # Slice and subscript From cb64a0c1e68a731e10bdd25e31fb64eb6bbd3597 Mon Sep 17 00:00:00 2001 From: AlexTate <0xalextate@gmail.com> Date: Sun, 10 Dec 2023 10:29:47 -0800 Subject: [PATCH 5/5] Update check.yml Adding a command to list CPU details on macOS runners. Hopefully this will illuminate why their runners intermittently fail due to incompatible CPU instruction sets --- .github/workflows/check.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/check.yml b/.github/workflows/check.yml index 5122e90..6f9eef8 100644 --- a/.github/workflows/check.yml +++ b/.github/workflows/check.yml @@ -22,6 +22,9 @@ jobs: - 3.9 - 3.8 steps: + - name: Print agent CPU info + if: runner.os == 'macOS' + run: sysctl -a machdep.cpu - name: Setup Python uses: actions/setup-python@v4 with: @@ -46,4 +49,4 @@ jobs: run: tox --skip-pkg-install env: CI_RUN: "yes" - DIFF_AGAINST: HEAD \ No newline at end of file + DIFF_AGAINST: HEAD