From 8cdd4f7ef2355e2db293d3833bcd103f16f23a00 Mon Sep 17 00:00:00 2001 From: joocer Date: Sun, 7 Apr 2024 10:48:15 +0100 Subject: [PATCH 01/12] #1567 --- opteryx/compiled/bloom_filter/__init__.py | 1 - .../compiled/bloom_filter/bloom_filter.pyx | 52 -------- opteryx/compiled/functions/varchar_array.pyx | 120 ------------------ setup.py | 12 -- tests/storage/test_sql_duckdb.py | 6 +- 5 files changed, 5 insertions(+), 186 deletions(-) delete mode 100644 opteryx/compiled/bloom_filter/__init__.py delete mode 100644 opteryx/compiled/bloom_filter/bloom_filter.pyx delete mode 100644 opteryx/compiled/functions/varchar_array.pyx diff --git a/opteryx/compiled/bloom_filter/__init__.py b/opteryx/compiled/bloom_filter/__init__.py deleted file mode 100644 index b4b03f3b1..000000000 --- a/opteryx/compiled/bloom_filter/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from bloom_filter import create_bloom_filter diff --git a/opteryx/compiled/bloom_filter/bloom_filter.pyx b/opteryx/compiled/bloom_filter/bloom_filter.pyx deleted file mode 100644 index 1b548bc99..000000000 --- a/opteryx/compiled/bloom_filter/bloom_filter.pyx +++ /dev/null @@ -1,52 +0,0 @@ -# cython: language_level=3 - -""" -approximately -- 500 items, with two hashes, in 4092 bits, would have a 5% FP rate. -This implementation runs in about 1/3 the time of the one in Orso -""" - - -from libc.stdlib cimport malloc, free -from libc.string cimport memset - -cdef class BloomFilter: - cdef: - unsigned char* bit_array - long size - - def __cinit__(self, long size): - self.size = size - # Allocate memory for the bit array and initialize to 0 - self.bit_array = malloc(size // 8 + 1) - if not self.bit_array: - raise MemoryError("Failed to allocate memory for the bit array.") - memset(self.bit_array, 0, size // 8 + 1) - - def __dealloc__(self): - if self.bit_array: - free(self.bit_array) - - cpdef void add(self, long item): - """Add an item to the Bloom filter""" - h1 = item % self.size - # Apply the golden ratio to the item and use modulo to wrap within the size of the bit array - h2 = (item * 1.618033988749895) % self.size - # Set bits using bitwise OR - self.bit_array[h1 // 8] |= 1 << (h1 % 8) - self.bit_array[h2 // 8] |= 1 << (h2 % 8) - - cpdef int possibly_contains(self, long item): - """Check if the item might be in the set""" - h1 = item % self.size - h2 = (item * item + 1) % self.size - # Check bits using bitwise AND - return (self.bit_array[h1 // 8] & (1 << (h1 % 8))) and \ - (self.bit_array[h2 // 8] & (1 << (h2 % 8))) - -def create_bloom_filter(int size, items): - """Create and populate a Bloom filter""" - cdef BloomFilter bf = BloomFilter(size) - for item in items: - bf.add(item) - return bf diff --git a/opteryx/compiled/functions/varchar_array.pyx b/opteryx/compiled/functions/varchar_array.pyx deleted file mode 100644 index 798afce57..000000000 --- a/opteryx/compiled/functions/varchar_array.pyx +++ /dev/null @@ -1,120 +0,0 @@ -""" -Native accelerators for Parquet encoding and decoding. -""" -# cython: profile=False -# cython: linetrace=False -# cython: binding=False -# cython: language_level=3 -# cython: initializedcheck=False -# cython: boundscheck=False -# cython: wraparound=False -# cython: overflowcheck=False -# cython: initializedcheck=False -# cython: cdivision=True -# cython: always_allow_keywords=False - -# from: https://github.com/dask/fastparquet/blob/main/fastparquet/speedups.pyx - -cdef extern from "string.h": - void *memcpy(void *dest, const void *src, size_t n) - -from cpython cimport (PyUnicode_AsUTF8String, PyUnicode_DecodeUTF8, - PyBytes_CheckExact, PyBytes_FromStringAndSize, - PyBytes_GET_SIZE, PyBytes_AS_STRING) -from cpython.unicode cimport PyUnicode_DecodeUTF8 - -import numpy as np -cimport numpy as np -import cython - - -_obj_dtype = np.dtype('object') - - -def array_encode_utf8(inp): - """ - utf-8 encode all elements of a 1d ndarray of "object" dtype. - A new ndarray of bytes objects is returned. - """ - # TODO: combine with pack_byte_array as is done for unpack - cdef: - Py_ssize_t i, n - np.ndarray[object, ndim=1] arr - np.ndarray[object] result - - arr = np.array(inp, copy=False) - - n = arr.shape[0] - # TODO: why not inplace? - result = np.empty(n, dtype=object) - for i in range(n): - # Fast utf-8 encoding, avoiding method call and codec lookup indirection - result[i] = PyUnicode_AsUTF8String(arr[i]) - - return result - - -def pack_byte_array(list items): - """ - Pack a variable length byte array column. - A bytes object is returned. - """ - cdef: - Py_ssize_t i, n, itemlen, total_size - unsigned char *start - unsigned char *data - object val, out - - # Strategy: compute the total output size and allocate it in one go. - n = len(items) - total_size = 0 - for i in range(n): - val = items[i] - if not PyBytes_CheckExact(val): - raise TypeError("expected list of bytes") - total_size += 4 + PyBytes_GET_SIZE(val) - - out = PyBytes_FromStringAndSize(NULL, total_size) - start = data = PyBytes_AS_STRING(out) - - # Copy data to output. - for i in range(n): - val = items[i] - # `itemlen` should be >= 0, so no signed extension issues - itemlen = PyBytes_GET_SIZE(val) - ( data)[0] = itemlen - data += 4 - memcpy(data, PyBytes_AS_STRING(val), itemlen) - data += itemlen - - assert (data - start) == total_size - return out - - -@cython.boundscheck(False) -def unpack_byte_array(const unsigned char[::1] raw_bytes, Py_ssize_t n, const char utf=False): - """ - Unpack a variable length byte array column. - An array of bytes objects is returned. - """ - cdef: - Py_ssize_t i = 0 - char* ptr = &raw_bytes[0] - int itemlen, bytecount - np.ndarray[object, ndim=1, mode="c"] out = np.empty(n, dtype="object") - - assert out is not None - bytecount = raw_bytes.shape[0] - while i < n and bytecount > 0: - - itemlen = ( ptr)[0] - ptr += 4 - if utf: - out[i] = PyUnicode_DecodeUTF8(ptr, itemlen, "ignore") - else: - out[i] = PyBytes_FromStringAndSize(ptr, itemlen) - ptr += itemlen - bytecount -= 4 + itemlen - i += 1 - - return out \ No newline at end of file diff --git a/setup.py b/setup.py index 35534d1a3..3be123fc1 100644 --- a/setup.py +++ b/setup.py @@ -91,18 +91,6 @@ def rust_build(setup_kwargs: Dict[str, Any]) -> None: language="c++", extra_compile_args=COMPILE_FLAGS + ["-std=c++11"], ), - Extension( - name="bloom_filter", - sources=["opteryx/compiled/bloom_filter/bloom_filter.pyx"], - extra_compile_args=COMPILE_FLAGS, - ), - Extension( - name="varchar_array", - sources=["opteryx/compiled/functions/varchar_array.pyx"], - include_dirs=[numpy.get_include()], - language="c++", - extra_compile_args=COMPILE_FLAGS + ["-std=c++11"], - ), ] setup_config = { diff --git a/tests/storage/test_sql_duckdb.py b/tests/storage/test_sql_duckdb.py index 32e888906..f80ceef6f 100644 --- a/tests/storage/test_sql_duckdb.py +++ b/tests/storage/test_sql_duckdb.py @@ -32,7 +32,11 @@ def test_duckdb_storage(): - create_duck_db() + # We have some problems with creating duckdb, particularly in GitHub Actions + # we're going to brute force. + for i in range(5): + if create_duck_db() is None: + break opteryx.register_store( "duckdb", From ca0148971ae3bc07984f046c00c520f84744519f Mon Sep 17 00:00:00 2001 From: XB500 Date: Sun, 7 Apr 2024 09:48:38 +0000 Subject: [PATCH 02/12] Opteryx Version 0.13.4-alpha.405 --- opteryx/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index e6b54c1ed..f07511486 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -1,4 +1,4 @@ -__build__ = 404 +__build__ = 405 # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 888bb82a6abf5c4d1e0e11463b81862693293eae Mon Sep 17 00:00:00 2001 From: joocer Date: Sun, 7 Apr 2024 10:55:33 +0100 Subject: [PATCH 03/12] #1567 --- opteryx/functions/__init__.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/opteryx/functions/__init__.py b/opteryx/functions/__init__.py index d17227964..1c1dcfbc6 100644 --- a/opteryx/functions/__init__.py +++ b/opteryx/functions/__init__.py @@ -24,7 +24,6 @@ from pyarrow import compute import opteryx -from opteryx.compiled.functions import array_encode_utf8 from opteryx.exceptions import FunctionNotFoundError from opteryx.exceptions import IncorrectTypeError from opteryx.exceptions import UnsupportedSyntaxError @@ -35,6 +34,11 @@ from opteryx.utils import dates +def array_encode_utf8(arr): + # this is not the fastest way to do this, orso has a Cython method + return [None if s is None else s.encode() for s in arr] + + def _get(array, key): # Determine the type of the first element (assuming homogeneous array) first_element = next((item for item in array if item is not None), None) From 131350be8fe82d16effc92a73c3ec26dd5eb4e59 Mon Sep 17 00:00:00 2001 From: XB500 Date: Sun, 7 Apr 2024 09:55:55 +0000 Subject: [PATCH 04/12] Opteryx Version 0.13.4-alpha.406 --- opteryx/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index f07511486..60e4de459 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -1,4 +1,4 @@ -__build__ = 405 +__build__ = 406 # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From d8998c495b82569e7d65290f7a5156a02421644c Mon Sep 17 00:00:00 2001 From: joocer Date: Sun, 7 Apr 2024 11:05:32 +0100 Subject: [PATCH 05/12] #1567 --- opteryx/compiled/functions/__init__.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/opteryx/compiled/functions/__init__.py b/opteryx/compiled/functions/__init__.py index b86adec01..1b03f36aa 100644 --- a/opteryx/compiled/functions/__init__.py +++ b/opteryx/compiled/functions/__init__.py @@ -3,6 +3,3 @@ from hash_table import HashTable from hash_table import distinct from ip_address import ip_in_cidr -from varchar_array import array_encode_utf8 -from varchar_array import pack_byte_array -from varchar_array import unpack_byte_array From df522cce69ff0b6b17ed5c429708241403099d4e Mon Sep 17 00:00:00 2001 From: XB500 Date: Sun, 7 Apr 2024 10:05:55 +0000 Subject: [PATCH 06/12] Opteryx Version 0.13.4-alpha.407 --- opteryx/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index 60e4de459..9f234cb9e 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -1,4 +1,4 @@ -__build__ = 406 +__build__ = 407 # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From d600122ed1b3c21176e171fbe905ba32127c96f0 Mon Sep 17 00:00:00 2001 From: joocer Date: Sun, 7 Apr 2024 11:49:18 +0100 Subject: [PATCH 07/12] #1567 --- tests/storage/test_sql_duckdb.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/storage/test_sql_duckdb.py b/tests/storage/test_sql_duckdb.py index f80ceef6f..082e504b1 100644 --- a/tests/storage/test_sql_duckdb.py +++ b/tests/storage/test_sql_duckdb.py @@ -59,7 +59,11 @@ def test_duckdb_storage(): def test_duckdb_battery(): from opteryx.utils.formatter import format_sql - create_duck_db() + # We have some problems with creating duckdb, particularly in GitHub Actions + # we're going to brute force. + for i in range(5): + if create_duck_db() is None: + break opteryx.register_store( "duckdb", From 8535b0254fb9cceb7f74d078c6c539fc5261cfe4 Mon Sep 17 00:00:00 2001 From: XB500 Date: Sun, 7 Apr 2024 10:49:45 +0000 Subject: [PATCH 08/12] Opteryx Version 0.13.4-alpha.408 --- opteryx/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index 9f234cb9e..6afc6f191 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -1,4 +1,4 @@ -__build__ = 407 +__build__ = 408 # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 381f17e40e68c51fbe0e8954f6fc9648115f65b5 Mon Sep 17 00:00:00 2001 From: joocer Date: Sun, 7 Apr 2024 12:00:59 +0100 Subject: [PATCH 09/12] #1567 --- opteryx/__version__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index 6afc6f191..0bf2edfc5 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -27,9 +27,9 @@ class VersionStatus(Enum): _major = 0 -_minor = 13 -_revision = 4 -_status = VersionStatus.ALPHA +_minor = 14 +_revision = 0 +_status = VersionStatus.RELEASE __author__ = "@joocer" __version__ = f"{_major}.{_minor}.{_revision}" + ( From e90d27c055702f9f261ede5a96c2f53b30f0a1b2 Mon Sep 17 00:00:00 2001 From: XB500 Date: Sun, 7 Apr 2024 11:01:23 +0000 Subject: [PATCH 10/12] Opteryx Version 0.14.0 --- opteryx/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index 0bf2edfc5..71d5e8fba 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -1,4 +1,4 @@ -__build__ = 408 +__build__ = 409 # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From c52c3c4f489a8d3858ab39a48c865885edf8bfd9 Mon Sep 17 00:00:00 2001 From: joocer Date: Sun, 7 Apr 2024 13:34:36 +0100 Subject: [PATCH 11/12] #1576 --- Makefile | 3 ++- opteryx/components/binder/binder_visitor.py | 2 +- opteryx/connectors/cql_connector.py | 6 +++--- opteryx/managers/expression/__init__.py | 8 ++++---- tests/tools.py | 2 +- 5 files changed, 11 insertions(+), 10 deletions(-) diff --git a/Makefile b/Makefile index 77ddef45e..7ad1fee37 100644 --- a/Makefile +++ b/Makefile @@ -29,7 +29,8 @@ test: mypy: clear - mypy --ignore-missing-imports --python-version 3.10 --no-strict-optional --check-untyped-defs opteryx + python -m pip install --upgrade mypy + python -m mypy --ignore-missing-imports --python-version 3.10 --no-strict-optional --check-untyped-defs opteryx coverage: clear diff --git a/opteryx/components/binder/binder_visitor.py b/opteryx/components/binder/binder_visitor.py index e3b5f4902..f856de206 100644 --- a/opteryx/components/binder/binder_visitor.py +++ b/opteryx/components/binder/binder_visitor.py @@ -226,7 +226,7 @@ def visit_node(self, node: Node, context: BindingContext) -> Tuple[Node, Binding Tuple[Node, Dict] The node and context after binding. """ - node_type = node.node_type.name + node_type = node.node_type.name # type:ignore visit_method_name = f"visit_{CAMEL_TO_SNAKE.sub('_', node_type).lower()}" visit_method = getattr(self, visit_method_name, None) if visit_method is None: diff --git a/opteryx/connectors/cql_connector.py b/opteryx/connectors/cql_connector.py index 6d751b0b9..833c8cf74 100644 --- a/opteryx/connectors/cql_connector.py +++ b/opteryx/connectors/cql_connector.py @@ -39,7 +39,7 @@ from opteryx.third_party.query_builder import Query -def _handle_operand(operand: Node, parameters: list) -> Tuple[Any, dict]: +def _handle_operand(operand: Node, parameters: list) -> Tuple[Any, list]: if operand.node_type == NodeType.IDENTIFIER: return f'"{operand.source_column}"', parameters @@ -140,7 +140,7 @@ def read_dataset( # type:ignore col for col in self.schema.columns if f'"{col.name}"' in column_names # type:ignore ] else: - query_builder.add("SELECT", f'"{self.single_column.name}"') + query_builder.add("SELECT", f'"{self.single_column.name}"') # type:ignore self.schema.columns = [self.single_column] # type:ignore # Update SQL if we've pushed predicates @@ -211,6 +211,6 @@ def get_dataset_schema(self) -> RelationSchema: ], ) - self.single_column = self.schema.columns[0] + self.single_column = self.schema.columns[0] # type:ignore return self.schema diff --git a/opteryx/managers/expression/__init__.py b/opteryx/managers/expression/__init__.py index b6cc8ee9a..8f2db9c02 100644 --- a/opteryx/managers/expression/__init__.py +++ b/opteryx/managers/expression/__init__.py @@ -194,7 +194,7 @@ def prioritize_evaluation(expressions): def _inner_evaluate(root: Node, table: Table, context: ExecutionContext): - node_type = root.node_type + node_type = root.node_type # type:ignore if node_type == NodeType.SUBQUERY: raise UnsupportedSyntaxError("IN () temporarily not supported.") @@ -228,7 +228,7 @@ def _inner_evaluate(root: Node, table: Table, context: ExecutionContext): ) # type:ignore # BOOLEAN OPERATORS - if node_type & LOGICAL_TYPE == LOGICAL_TYPE: + if node_type & LOGICAL_TYPE == LOGICAL_TYPE: # type:ignore if node_type == NodeType.OR: return short_cut_or(root, table, context) if node_type == NodeType.AND: @@ -237,7 +237,7 @@ def _inner_evaluate(root: Node, table: Table, context: ExecutionContext): if node_type in LOGICAL_OPERATIONS: left = _inner_evaluate(root.left, table, context) if root.left else [None] right = _inner_evaluate(root.right, table, context) if root.right else [None] - return LOGICAL_OPERATIONS[node_type](left, right) + return LOGICAL_OPERATIONS[node_type](left, right) # type:ignore if node_type == NodeType.NOT: centre = _inner_evaluate(root.centre, table, context) if root.centre else [None] @@ -245,7 +245,7 @@ def _inner_evaluate(root: Node, table: Table, context: ExecutionContext): return pyarrow.compute.invert(centre) # INTERAL IDENTIFIERS - if node_type & INTERNAL_TYPE == INTERNAL_TYPE: + if node_type & INTERNAL_TYPE == INTERNAL_TYPE: # type:ignore if node_type == NodeType.FUNCTION: parameters = [_inner_evaluate(param, table, context) for param in root.parameters] # zero parameter functions get the number of rows as the parameter diff --git a/tests/tools.py b/tests/tools.py index b07dc7e08..19afe1ce4 100644 --- a/tests/tools.py +++ b/tests/tools.py @@ -439,7 +439,7 @@ def create_duck_db(): try: res = cur.execute(CREATE_DATABASE) except: - pass + return -1 finally: if res is not None: res.commit() From b50e03e4e55fbc4e5499caee7db5c9b50762eb43 Mon Sep 17 00:00:00 2001 From: XB500 Date: Sun, 7 Apr 2024 12:35:04 +0000 Subject: [PATCH 12/12] Opteryx Version 0.14.0 --- opteryx/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index 71d5e8fba..3075ff669 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -1,4 +1,4 @@ -__build__ = 409 +__build__ = 410 # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License.