diff --git a/.github/workflows/check.yml b/.github/workflows/check.yml
index 5122e90..6f9eef8 100644
--- a/.github/workflows/check.yml
+++ b/.github/workflows/check.yml
@@ -22,6 +22,9 @@ jobs:
- 3.9
- 3.8
steps:
+ - name: Print agent CPU info
+ if: runner.os == 'macOS'
+ run: sysctl -a machdep.cpu
- name: Setup Python
uses: actions/setup-python@v4
with:
@@ -46,4 +49,4 @@ jobs:
run: tox --skip-pkg-install
env:
CI_RUN: "yes"
- DIFF_AGAINST: HEAD
\ No newline at end of file
+ DIFF_AGAINST: HEAD
diff --git a/README.md b/README.md
index 93a858d..8694245 100644
--- a/README.md
+++ b/README.md
@@ -2,13 +2,13 @@
ShortSeqs are compact and efficient Python objects that hold short sequences while using up to 73% less memory compared to built-in types. They are prehashed and comparable, they support slicing and indexing, and they easily convert back to their original string form.
-| Sequence Length | PyUnicode Size* | PyBytes Size* | ShortSeq Size* | % Mem. Reduction |
-|-----------------|----------------------------|--------------------------|---------------------------:|------------------|
-| 0-32 nt | 56-88 bytes | 40-72 bytes | 32 bytes (fixed) | **20-64%** |
-| 33-64 nt | 88-120 bytes | 72-104 bytes | 48 bytes (fixed) | **33-60%** |
-| 65-1024 nt | 120-1080 bytes | 104-1064 bytes | 48-288 bytes | **53-73%** |
+| Sequence Length | PyUnicode Size | PyBytes Size | ShortSeq Size | % Reduced |
+|-----------------|----------------------------|--------------------------|--------------------------:|--------------------|
+| 0-32 nt | 56-88 bytes | 40-72 bytes | 32 bytes (fixed) | **43-64%** |
+| 33-64 nt | 88-120 bytes | 72-104 bytes | 48 bytes (fixed) | **45-60%** |
+| 65-1024 nt | 120-1080 bytes | 104-1064 bytes | 56-288 bytes | **53-73%** |
-* Object sizes were measured on Python 3.10 using `asizeof()` from the `pympler` package.
+* Object sizes were measured on Python 3.10 using `asizeof()` from the `pympler` package. The "% Reduced" column compares PyUnicode size to ShortSeq size.
In the table above, you can see that Python's memory representation of DNA sequences is larger than a C-style `char *` array, which would only need one byte per base. Using Cython we can move some of this memory representation out of Python space and into C space for faster facilities and a more compact bitwise representation.
@@ -25,20 +25,29 @@ mamba install -c bioconda -c conda-forge shortseq
```python
import shortseq as sq
-# Construct from PyUnicode
+# Construct from PyUnicode or PyBytes
seq_str = "ATGC"
-seq_1 = sq.pack(seq_str)
-
-# Or, construct from PyBytes
seq_bytes = b"ATGC"
+seq_1 = sq.pack(seq_str)
seq_2 = sq.pack(seq_bytes)
# Verify outputs (optional)
-assert seq_1 == seq_2
-assert seq_str == str(seq_1) == str(seq_2)
-assert len(seq_str) == len(seq_1) == len(seq_2)
+assert seq_1 == seq_2 == seq_str
+assert len(seq_1) == len(seq_2) == len(seq_str)
+
+seq_3 = sq.pack("TATTAGCGATTGACAGTTGTCCTGTAATAACGCCGGGTAAATTTGCCG")
+seq_4 = sq.pack("TATTACCGATTGACAGTTGTCCTGTAATAACGGCGGGTAAATTTGCTG") # 5M1X26M1X13M1X1M
+seq_str = str(seq_4)
+
+# Slice and subscript
+assert seq_4[5:15] == seq_str[5:15]
+assert seq_4[-2] == seq_str[-2]
+
+# Vectorized hamming distance (differing bases)
+hammd = sum(a!=b for a, b in zip(seq_3, seq_4))
+assert seq_3 ^ seq_4 == hammd == 3
-# Count unique sequences
+# Count unique sequences, similar to collections.Counter
from shortseq import ShortSeqCounter
counts = ShortSeqCounter([seq_bytes] * 10)
assert counts == {sq.pack("ATGC"): 10}
diff --git a/shortseq/short_seq_var.pyx b/shortseq/short_seq_var.pyx
index 372490e..7faa393 100644
--- a/shortseq/short_seq_var.pyx
+++ b/shortseq/short_seq_var.pyx
@@ -77,6 +77,9 @@ cdef class ShortSeqVar:
return pop_cnt
+ def __sizeof__(self):  # Size reported to sys.getsizeof(): the fixed object struct plus the per-length block storage
+ return sizeof(ShortSeqVar) + _length_to_block_num(self._length) * sizeof(uint64_t)  # presumably one uint64_t per block held in self._packed -- TODO confirm
+
def __repr__(self):
# Clips the sequence to MAX_REPR_LEN characters to avoid overwhelming the debugger
cdef unicode clipped_seq = _unmarshall_bytes_var(self._packed, MAX_REPR_LEN)
diff --git a/shortseq/tests/unit_tests_main.py b/shortseq/tests/unit_tests_main.py
index f0ab28b..967e47f 100644
--- a/shortseq/tests/unit_tests_main.py
+++ b/shortseq/tests/unit_tests_main.py
@@ -1,4 +1,5 @@
import unittest
+import sys
from random import randint
@@ -65,6 +66,23 @@ def test_incompatible_seq_chars(self):
with self.assertRaisesRegex(Exception, "Unsupported base character"):
sq.pack(prob)
+ """Are min and max length ShortSeqs the correct size?"""
+
+ def test_size(self):  # sys.getsizeof() must be the same at the min and max length of each fixed-width type
+ # ShortSeq64: sequences of MIN_64_NT and MAX_64_NT bases
+ seq_min = sq.pack(rand_sequence(MIN_64_NT))
+ seq_max = sq.pack(rand_sequence(MAX_64_NT))
+
+ self.assertEqual(sys.getsizeof(seq_min), 32)  # NOTE(review): byte counts presumably assume a 64-bit CPython build -- confirm
+ self.assertEqual(sys.getsizeof(seq_max), 32)
+
+ # ShortSeq128: sequences of MIN_128_NT and MAX_128_NT bases
+ seq_min = sq.pack(rand_sequence(MIN_128_NT))
+ seq_max = sq.pack(rand_sequence(MAX_128_NT))
+
+ self.assertEqual(sys.getsizeof(seq_min), 48)
+ self.assertEqual(sys.getsizeof(seq_max), 48)
+
"""Checks that randomly generated sequences encode and decode correctly
for the entire valid range of lengths."""
@@ -279,6 +297,43 @@ def str_ham(a, b): return sum(a_nt != b_nt for a_nt, b_nt in zip(a, b))
self.assertEqual(sq.pack(a) ^ sq.pack(b), str_ham(a, b))
+ def test_readme(self):  # Runs the README usage example verbatim so the documented snippet stays tested
+ # Construct from PyUnicode or PyBytes
+ seq_str = "ATGC"
+ seq_bytes = b"ATGC"
+ seq_1 = sq.pack(seq_str)
+ seq_2 = sq.pack(seq_bytes)
+
+ # Verify outputs (optional): packed sequences compare equal to each other and to the source str
+ assert seq_1 == seq_2 == seq_str
+ assert len(seq_1) == len(seq_2) == len(seq_str)
+
+ seq_3 = sq.pack("TATTAGCGATTGACAGTTGTCCTGTAATAACGCCGGGTAAATTTGCCG")
+ seq_4 = sq.pack("TATTACCGATTGACAGTTGTCCTGTAATAACGGCGGGTAAATTTGCTG") # match/mismatch runs vs. seq_3: 5M1X26M1X13M1X1M (3 mismatches)
+ seq_str = str(seq_4)
+
+ # Slice and subscript: results must agree with the same operations on the str form
+ assert seq_4[5:15] == seq_str[5:15]
+ assert seq_4[-2] == seq_str[-2]
+
+ # Hamming distance (number of differing bases): the ^ operator must equal the per-base count
+ hammd = sum(a != b for a, b in zip(seq_3, seq_4))
+ assert seq_3 ^ seq_4 == hammd == 3
+
+ # Count unique sequences, similar to collections.Counter
+ from shortseq import ShortSeqCounter
+ counts = ShortSeqCounter([seq_bytes] * 10)
+ assert counts == {sq.pack("ATGC"): 10}
+
+ """Are min and max length ShortSeqVars the correct size?"""
+
+ def test_size(self):  # NOTE(review): same name as the fixed-width test_size added earlier in this patch -- confirm they are in different TestCase classes, otherwise one shadows the other
+ seq_min = sq.pack(rand_sequence(MIN_VAR_NT))  # shortest variable-length sequence
+ seq_max = sq.pack(rand_sequence(MAX_VAR_NT))  # longest variable-length sequence
+
+ self.assertEqual(sys.getsizeof(seq_min), 56)  # NOTE(review): byte counts presumably assume a 64-bit CPython build -- confirm
+ self.assertEqual(sys.getsizeof(seq_max), 288)
+
if __name__ == '__main__':