Skip to content

Commit

Permalink
Merge pull request #5 from AlexTate/api-cleanup
Browse files Browse the repository at this point in the history
Improved size reporting for ShortSeqVars and updated usage docs
  • Loading branch information
AlexTate authored Dec 10, 2023
2 parents 4434683 + cb64a0c commit cb79c4c
Show file tree
Hide file tree
Showing 4 changed files with 85 additions and 15 deletions.
5 changes: 4 additions & 1 deletion .github/workflows/check.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@ jobs:
- 3.9
- 3.8
steps:
- name: Print agent CPU info
if: runner.os == 'macOS'
run: sysctl -a machdep.cpu
- name: Setup Python
uses: actions/setup-python@v4
with:
Expand All @@ -46,4 +49,4 @@ jobs:
run: tox --skip-pkg-install
env:
CI_RUN: "yes"
DIFF_AGAINST: HEAD
DIFF_AGAINST: HEAD
37 changes: 23 additions & 14 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,13 @@

ShortSeqs are compact and efficient Python objects that hold short sequences while using up to 73% less memory compared to built-in types. They are prehashed and comparable, they support slicing and indexing, and they easily convert back to their original string form.

| Sequence Length | PyUnicode Size<sup>*</sup> | PyBytes Size<sup>*</sup> | ShortSeq Size<sup>*</sup> | % Mem. Reduction |
|-----------------|----------------------------|--------------------------|---------------------------:|------------------|
| 0-32 nt | 56-88 bytes | 40-72 bytes | 32 bytes (fixed) | **20-64%** |
| 33-64 nt | 88-120 bytes | 72-104 bytes | 48 bytes (fixed) | **33-60%** |
| 65-1024 nt | 120-1080 bytes | 104-1064 bytes | 48-288 bytes | **53-73%** |
| Sequence Length | PyUnicode Size | PyBytes Size | ShortSeq Size | % Reduced |
|-----------------|----------------------------|--------------------------|--------------------------:|--------------------|
| 0-32 nt | 56-88 bytes | 40-72 bytes | 32 bytes (fixed) | **43-64%** |
| 33-64 nt | 88-120 bytes | 72-104 bytes | 48 bytes (fixed) | **45-60%** |
| 65-1024 nt | 120-1080 bytes | 104-1064 bytes | 56-288 bytes | **53-73%** |

<sup>* Object sizes were measured on Python 3.10 using `asizeof()` from the `pympler` package.</sup>
<sup>* Object sizes were measured on Python 3.10 using `asizeof()` from the `pympler` package. "% Reduced" compares the PyUnicode size to the ShortSeq size.</sup>

In the table above, you can see that Python's memory representation of DNA sequences is larger than a C-style `char *` array, which would only need one byte per base. Using Cython, we can move some of this memory representation out of Python space and into C space for faster operations and a more compact bitwise representation.

Expand All @@ -25,20 +25,29 @@ mamba install -c bioconda -c conda-forge shortseq
```python
import shortseq as sq

# Construct from PyUnicode
# Construct from PyUnicode or PyBytes
seq_str = "ATGC"
seq_1 = sq.pack(seq_str)

# Or, construct from PyBytes
seq_bytes = b"ATGC"
seq_1 = sq.pack(seq_str)
seq_2 = sq.pack(seq_bytes)

# Verify outputs (optional)
assert seq_1 == seq_2
assert seq_str == str(seq_1) == str(seq_2)
assert len(seq_str) == len(seq_1) == len(seq_2)
assert seq_1 == seq_2 == seq_str
assert len(seq_1) == len(seq_2) == len(seq_str)

seq_3 = sq.pack("TATTAGCGATTGACAGTTGTCCTGTAATAACGCCGGGTAAATTTGCCG")
seq_4 = sq.pack("TATTACCGATTGACAGTTGTCCTGTAATAACGGCGGGTAAATTTGCTG") # 5M1X26M1X13M1X1M
seq_str = str(seq_4)

# Slice and subscript
assert seq_4[5:15] == seq_str[5:15]
assert seq_4[-2] == seq_str[-2]

# Vectorized hamming distance (differing bases)
hammd = sum(a!=b for a, b in zip(seq_3, seq_4))
assert seq_3 ^ seq_4 == hammd == 3

# Count unique sequences
# Count unique sequences similar to collections.Counter
from shortseq import ShortSeqCounter
counts = ShortSeqCounter([seq_bytes] * 10)
assert counts == {sq.pack("ATGC"): 10}
Expand Down
3 changes: 3 additions & 0 deletions shortseq/short_seq_var.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,9 @@ cdef class ShortSeqVar:

return pop_cnt

def __sizeof__(self):
    """Report the object's size in bytes, including its packed blocks.

    The result is the size of the ShortSeqVar struct itself plus one
    uint64_t for each block that `_length_to_block_num` reports for this
    sequence's length.
    """
    cdef size_t block_bytes = _length_to_block_num(self._length) * sizeof(uint64_t)
    return sizeof(ShortSeqVar) + block_bytes

def __repr__(self):
# Clips the sequence to MAX_REPR_LEN characters to avoid overwhelming the debugger
cdef unicode clipped_seq = _unmarshall_bytes_var(self._packed, MAX_REPR_LEN)
Expand Down
55 changes: 55 additions & 0 deletions shortseq/tests/unit_tests_main.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import unittest
import sys

from random import randint

Expand Down Expand Up @@ -65,6 +66,23 @@ def test_incompatible_seq_chars(self):
with self.assertRaisesRegex(Exception, "Unsupported base character"):
sq.pack(prob)

"""Are min and max length ShortSeqs the correct size?"""

def test_size(self):
    # One row per fixed-size type: (shortest length, longest length, bytes)
    size_table = (
        (MIN_64_NT, MAX_64_NT, 32),    # ShortSeq64
        (MIN_128_NT, MAX_128_NT, 48),  # ShortSeq128
    )

    for shortest, longest, expected_bytes in size_table:
        # Pack both boundary-length sequences before checking either one
        boundary_seqs = [sq.pack(rand_sequence(nt)) for nt in (shortest, longest)]

        for packed in boundary_seqs:
            self.assertEqual(sys.getsizeof(packed), expected_bytes)

"""Checks that randomly generated sequences encode and decode correctly
for the entire valid range of lengths."""

Expand Down Expand Up @@ -279,6 +297,43 @@ def str_ham(a, b): return sum(a_nt != b_nt for a_nt, b_nt in zip(a, b))

self.assertEqual(sq.pack(a) ^ sq.pack(b), str_ham(a, b))

def test_readme(self):
    # Mirrors the usage example in README.md. The example's bare `assert`
    # statements are written as unittest assertions here so the checks
    # still run under `python -O` and report both operands on failure.
    # Chained comparisons (a == b == c) are split into pairwise checks
    # that preserve the original operand order.

    # Construct from PyUnicode or PyBytes
    seq_str = "ATGC"
    seq_bytes = b"ATGC"
    seq_1 = sq.pack(seq_str)
    seq_2 = sq.pack(seq_bytes)

    # Verify outputs (optional)
    self.assertEqual(seq_1, seq_2)
    self.assertEqual(seq_2, seq_str)
    self.assertEqual(len(seq_1), len(seq_2))
    self.assertEqual(len(seq_2), len(seq_str))

    seq_3 = sq.pack("TATTAGCGATTGACAGTTGTCCTGTAATAACGCCGGGTAAATTTGCCG")
    seq_4 = sq.pack("TATTACCGATTGACAGTTGTCCTGTAATAACGGCGGGTAAATTTGCTG")  # 5M1X26M1X13M1X1M
    seq_str = str(seq_4)

    # Slice and subscript
    self.assertEqual(seq_4[5:15], seq_str[5:15])
    self.assertEqual(seq_4[-2], seq_str[-2])

    # Vectorized hamming distance (differing bases)
    hammd = sum(a != b for a, b in zip(seq_3, seq_4))
    self.assertEqual(seq_3 ^ seq_4, hammd)
    self.assertEqual(hammd, 3)

    # Count unique sequences similar to collections.Counter
    from shortseq import ShortSeqCounter
    counts = ShortSeqCounter([seq_bytes] * 10)
    self.assertEqual(counts, {sq.pack("ATGC"): 10})

"""Are min and max length ShortSeqVars the correct size?"""

def test_size(self):
    # (sequence length, expected size in bytes) at each end of the
    # ShortSeqVar length range
    boundaries = ((MIN_VAR_NT, 56), (MAX_VAR_NT, 288))

    # Pack both boundary-length sequences before checking either one
    packed_seqs = [sq.pack(rand_sequence(nt)) for nt, _ in boundaries]

    for packed, (_, expected_bytes) in zip(packed_seqs, boundaries):
        self.assertEqual(sys.getsizeof(packed), expected_bytes)



if __name__ == '__main__':
Expand Down

0 comments on commit cb79c4c

Please sign in to comment.