Commit

joocer committed Apr 14, 2024
1 parent 5bf3cfd commit a356767
Showing 6 changed files with 357 additions and 330 deletions.
1 change: 1 addition & 0 deletions opteryx/compiled/functions/__init__.py
@@ -4,5 +4,6 @@
from hash_table import distinct
from ip_address import ip_in_cidr
from vectors import possible_match
from vectors import possible_match_indices
from vectors import tokenize_and_remove_punctuation
from vectors import vectorize
60 changes: 43 additions & 17 deletions opteryx/compiled/functions/vectors.pyx
@@ -1,19 +1,21 @@
# cython: language_level=3
# cython: boundscheck=False
# cython: wraparound=False
# cython: nonecheck=False
# cython: overflowcheck=False

import numpy as np
cimport numpy as cnp
cimport cython

from libc.stdint cimport uint32_t, int32_t, uint16_t, uint64_t
from libc.stdint cimport uint32_t, uint16_t, uint64_t
from cpython cimport PyUnicode_AsUTF8String, PyBytes_GET_SIZE
from cpython.bytes cimport PyBytes_AsString

cdef double GOLDEN_RATIO_APPROX = 1.618033988749895
cdef uint32_t VECTOR_SIZE = 1024

cdef uint16_t djb2_hash(char* byte_array, uint64_t length) nogil:
cdef inline uint16_t djb2_hash(char* byte_array, uint64_t length) nogil:
"""
Hashes a byte array using the djb2 algorithm, designed to be called without
holding the Global Interpreter Lock (GIL).
@@ -25,7 +27,7 @@ cdef uint16_t djb2_hash(char* byte_array, uint64_t length) nogil:
The length of the byte array.
Returns:
uint64_t: The hash value.
uint16_t: The hash value.
"""
cdef uint32_t hash_value = 5381
cdef uint32_t i = 0
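
The body of djb2_hash is collapsed in this view; as a reference point, here is a pure-Python sketch of the classic djb2 loop it names (the final 16-bit truncation is an assumption inferred from the uint16_t return type, it is not visible in this hunk).

def djb2_sketch(byte_array: bytes) -> int:
    # classic djb2: start at 5381, then hash = hash * 33 + byte for every byte
    hash_value = 5381
    for b in byte_array:
        # keep the accumulator within 32 bits, like the uint32_t hash_value above
        hash_value = (hash_value * 33 + b) & 0xFFFFFFFF
    # assumed: truncate to 16 bits to match the declared uint16_t return type
    return hash_value & 0xFFFF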
@@ -48,8 +50,13 @@ def vectorize(list tokens):
if token_size > 1:
hash_1 = djb2_hash(token_bytes, token_size)
hash_2 = <uint16_t>((hash_1 * GOLDEN_RATIO_APPROX)) & (VECTOR_SIZE - 1)
vector[hash_1 & (VECTOR_SIZE - 1)] += 1
vector[hash_2] += 1

hash_1 = hash_1 & (VECTOR_SIZE - 1)
if vector[hash_1] < 65535:
vector[hash_1] += 1

if vector[hash_2] < 65535:
vector[hash_2] += 1

return vector
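
For readers following the change, a rough NumPy sketch of what the updated vectorize does: each token is hashed to two slots of a fixed 1024-wide uint16 count vector, and the new guards saturate the counters at 65535 rather than letting them wrap back to zero. The hash_fn parameter stands in for the compiled djb2_hash.

import numpy as np

VECTOR_SIZE = 1024
GOLDEN_RATIO_APPROX = 1.618033988749895

def vectorize_sketch(tokens, hash_fn):
    # one fixed-size uint16 count vector per document
    vector = np.zeros(VECTOR_SIZE, dtype=np.uint16)
    for token in tokens:
        if len(token) > 1:
            hash_1 = hash_fn(token)
            hash_2 = int(hash_1 * GOLDEN_RATIO_APPROX) & (VECTOR_SIZE - 1)
            hash_1 = hash_1 & (VECTOR_SIZE - 1)
            # saturate at 65535 so the uint16 counters cannot overflow and wrap
            if vector[hash_1] < 65535:
                vector[hash_1] += 1
            if vector[hash_2] < 65535:
                vector[hash_2] += 1
    return vector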

@@ -70,11 +77,28 @@ def possible_match(list query_tokens, cnp.ndarray[cnp.uint16_t, ndim=1] vector):
return True


def possible_match_indices(cnp.ndarray[cnp.uint16_t, ndim=1] indices, cnp.ndarray[cnp.uint16_t, ndim=1] vector):
"""
Check if all specified indices in 'indices' have non-zero values in 'vector'.
Parameters:
indices: cnp.ndarray[cnp.uint16_t, ndim=1]
Array of indices to check in the vector.
vector: cnp.ndarray[cnp.uint16_t, ndim=1]
Array where non-zero values are expected at the indices specified by 'indices'.
Returns:
bool: True if all specified indices have non-zero values, otherwise False.
"""
cdef int i
for i in range(indices.shape[0]):
if vector[indices[i]] == 0:
return False
return True
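
A brief usage sketch of the new function (the query and row strings are invented; the imports mirror the ones this commit adds to string_functions.py below): the query is vectorised once, its non-zero slots become the indices, and each row's vector is then probed only at those positions.

import numpy as np
from opteryx.compiled.functions import possible_match_indices, tokenize_and_remove_punctuation, vectorize
from opteryx.virtual_datasets.stop_words import STOP_WORDS

# one-off preprocessing of the query
query_tokens = tokenize_and_remove_punctuation("machine learning", STOP_WORDS)
query_offsets = np.nonzero(vectorize(query_tokens))[0].astype(np.uint16)

# cheap per-row pre-filter before any exact token comparison
row_vector = vectorize(tokenize_and_remove_punctuation("an introduction to machine learning", STOP_WORDS))
maybe_match = possible_match_indices(query_offsets, row_vector)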


from libc.string cimport strlen, strcpy, strtok, strchr
from libc.stdlib cimport malloc, free
import numpy as np
cimport numpy as cnp

cdef char* strdup(const char* s) nogil:
cdef char* d = <char*>malloc(strlen(s) + 1)
@@ -96,24 +120,26 @@ cpdef list tokenize_and_remove_punctuation(str text, set stop_words):
c_text = strdup(PyBytes_AsString(py_text))

try:
token = strtok(c_text, " ,.!?\n\t")
token = strtok(c_text, " ")
while token != NULL:
word = <char*>malloc(strlen(token) + 1)
i = 0
j = 0
while token[i] != b'\0':
# Check if the character is a lowercase or uppercase letter
if (b'a' <= token[i] <= b'z' or b'A' <= token[i] <= b'Z'):
while token[i] != 0:
if 97 <= token[i] <= 122:
word[j] = token[i]
j += 1
elif 65 <= token[i] <= 90:
# Convert to lowercase if it's uppercase
word[j] = token[i] + 32 if b'A' <= token[i] <= b'Z' else token[i]
word[j] = token[i] + 32
j += 1
i += 1
word[j] = b'\0'
# Ensure the token is longer than one character and not a stop word
if strlen(word) > 1 and word.decode('utf-8') not in stop_words:
tokens.append(word)
word[j] = 0
if j > 1:
if word not in stop_words:
tokens.append(word)
free(word)
token = strtok(NULL, " ,.!?\n\t")
token = strtok(NULL, " ")
finally:
free(c_text)
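
A rough pure-Python equivalent of the revised tokenizer, for orientation (the compiled version works on C strings and returns bytes tokens): the text is now split on spaces only, only ASCII letters are kept with upper case folded to lower, and one-character tokens and stop words are dropped.

def tokenize_sketch(text: str, stop_words: set) -> list:
    tokens = []
    for raw in text.split(" "):
        # keep ASCII letters only, folding upper case to lower, mirroring the 97-122 / 65-90 checks
        word = "".join(
            ch.lower() for ch in raw if ("a" <= ch <= "z") or ("A" <= ch <= "Z")
        )
        # drop one-character tokens and stop words, mirroring the j > 1 and stop_words checks
        if len(word) > 1 and word not in stop_words:
            tokens.append(word)
    return tokens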

2 changes: 0 additions & 2 deletions opteryx/components/logical_planner/logical_planner.py
@@ -729,8 +729,6 @@ def create_node_relation(relation):

def analyze_query(statement) -> LogicalPlan:

print(statement)

root_node = "Analyze"
plan = LogicalPlan()

(another changed file in this commit; its name is not shown in this view)
@@ -541,8 +541,6 @@ def map_access(branch, alias: Optional[List[str]] = None, key=None):

def match_against(branch, alias: Optional[List[str]] = None, key=None):

print(branch)

columns = [identifier(col) for col in branch["columns"]]
match_to = build(branch["match_value"])
mode = branch["opt_search_modifier"]
10 changes: 7 additions & 3 deletions opteryx/functions/string_functions.py
@@ -326,19 +326,23 @@ def match_against(arr, val):
2 indexes)
"""

from opteryx.compiled.functions import possible_match
from opteryx.compiled.functions import possible_match_indices
from opteryx.compiled.functions import tokenize_and_remove_punctuation
from opteryx.compiled.functions import vectorize
from opteryx.virtual_datasets.stop_words import STOP_WORDS

if len(val) == 0:
return []
tokenized_literal = tokenize_and_remove_punctuation(str(val[0]), STOP_WORDS)
literal_offsets = numpy.nonzero(vectorize(tokenized_literal))[0].astype(numpy.uint16)

if len(tokenized_literal) == 0:
return [False] * len(arr)

tokenized_strings = [tokenize_and_remove_punctuation(s, STOP_WORDS) for s in arr]
tokenized_strings = (tokenize_and_remove_punctuation(s, STOP_WORDS) for s in arr)

return [
possible_match(tokenized_literal, vectorize(tok)) and set(tokenized_literal).issubset(tok)
possible_match_indices(literal_offsets, vectorize(tok))
and set(tokenized_literal).issubset(tok)
for tok in tokenized_strings
]
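
A hedged usage sketch of the reworked wrapper; the document and query strings are invented, and the expected result assumes none of the query tokens are stop words.

import numpy
from opteryx.functions.string_functions import match_against

documents = numpy.array(["the cat sat on the mat", "dogs bark at night"], dtype=object)
result = match_against(documents, numpy.array(["cat mat"], dtype=object))
# expected: [True, False], since only the first row contains every query token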
