Commit

joocer committed Apr 14, 2024
1 parent 5bf3cfd commit a356767
Showing 6 changed files with 357 additions and 330 deletions.
1 change: 1 addition & 0 deletions opteryx/compiled/functions/__init__.py
@@ -4,5 +4,6 @@
from hash_table import distinct
from ip_address import ip_in_cidr
from vectors import possible_match
from vectors import possible_match_indices
from vectors import tokenize_and_remove_punctuation
from vectors import vectorize
60 changes: 43 additions & 17 deletions opteryx/compiled/functions/vectors.pyx
@@ -1,19 +1,21 @@
# cython: language_level=3
# cython: boundscheck=False
# cython: wraparound=False
# cython: nonecheck=False
# cython: overflowcheck=False

import numpy as np
cimport numpy as cnp
cimport cython

from libc.stdint cimport uint32_t, int32_t, uint16_t, uint64_t
from libc.stdint cimport uint32_t, uint16_t, uint64_t
from cpython cimport PyUnicode_AsUTF8String, PyBytes_GET_SIZE
from cpython.bytes cimport PyBytes_AsString

cdef double GOLDEN_RATIO_APPROX = 1.618033988749895
cdef uint32_t VECTOR_SIZE = 1024

cdef uint16_t djb2_hash(char* byte_array, uint64_t length) nogil:
cdef inline uint16_t djb2_hash(char* byte_array, uint64_t length) nogil:
"""
Hashes a byte array using the djb2 algorithm, designed to be called without
holding the Global Interpreter Lock (GIL).
@@ -25,7 +27,7 @@ cdef uint16_t djb2_hash(char* byte_array, uint64_t length) nogil:
The length of the byte array.
Returns:
uint64_t: The hash value.
uint16_t: The hash value.
"""
cdef uint32_t hash_value = 5381
cdef uint32_t i = 0
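
The body of djb2_hash is collapsed in this view; as a reference point, here is a pure-Python sketch of the classic djb2 loop it names (the final 16-bit truncation is an assumption inferred from the uint16_t return type, it is not visible in this hunk).

def djb2_sketch(byte_array: bytes) -> int:
    # classic djb2: start at 5381, then hash = hash * 33 + byte for every byte
    hash_value = 5381
    for b in byte_array:
        # keep the accumulator within 32 bits, like the uint32_t hash_value above
        hash_value = (hash_value * 33 + b) & 0xFFFFFFFF
    # assumed: truncate to 16 bits to match the declared uint16_t return type
    return hash_value & 0xFFFF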
@@ -48,8 +50,13 @@ def vectorize(list tokens):
if token_size > 1:
hash_1 = djb2_hash(token_bytes, token_size)
hash_2 = <uint16_t>((hash_1 * GOLDEN_RATIO_APPROX)) & (VECTOR_SIZE - 1)
vector[hash_1 & (VECTOR_SIZE - 1)] += 1
vector[hash_2] += 1

hash_1 = hash_1 & (VECTOR_SIZE - 1)
if vector[hash_1] < 65535:
vector[hash_1] += 1

if vector[hash_2] < 65535:
vector[hash_2] += 1

return vector
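
For readers following the change, a rough NumPy sketch of what the updated vectorize does: each token is hashed to two slots of a fixed 1024-wide uint16 count vector, and the new guards saturate the counters at 65535 rather than letting them wrap back to zero. The hash_fn parameter stands in for the compiled djb2_hash.

import numpy as np

VECTOR_SIZE = 1024
GOLDEN_RATIO_APPROX = 1.618033988749895

def vectorize_sketch(tokens, hash_fn):
    # one fixed-size uint16 count vector per document
    vector = np.zeros(VECTOR_SIZE, dtype=np.uint16)
    for token in tokens:
        if len(token) > 1:
            hash_1 = hash_fn(token)
            hash_2 = int(hash_1 * GOLDEN_RATIO_APPROX) & (VECTOR_SIZE - 1)
            hash_1 = hash_1 & (VECTOR_SIZE - 1)
            # saturate at 65535 so the uint16 counters cannot overflow and wrap
            if vector[hash_1] < 65535:
                vector[hash_1] += 1
            if vector[hash_2] < 65535:
                vector[hash_2] += 1
    return vector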

@@ -70,11 +77,28 @@ def possible_match(list query_tokens, cnp.ndarray[cnp.uint16_t, ndim=1] vector):
return True


def possible_match_indices(cnp.ndarray[cnp.uint16_t, ndim=1] indices, cnp.ndarray[cnp.uint16_t, ndim=1] vector):
"""
Check if all specified indices in 'indices' have non-zero values in 'vector'.
Parameters:
indices: cnp.ndarray[cnp.uint16_t, ndim=1]
Array of indices to check in the vector.
vector: cnp.ndarray[cnp.uint16_t, ndim=1]
Array where non-zero values are expected at the indices specified by 'indices'.
Returns:
bool: True if all specified indices have non-zero values, otherwise False.
"""
cdef int i
for i in range(indices.shape[0]):
if vector[indices[i]] == 0:
return False
return True
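
A brief usage sketch of the new function (the query and row strings are invented; the imports mirror the ones this commit adds to string_functions.py below): the query is vectorised once, its non-zero slots become the indices, and each row's vector is then probed only at those positions.

import numpy as np
from opteryx.compiled.functions import possible_match_indices, tokenize_and_remove_punctuation, vectorize
from opteryx.virtual_datasets.stop_words import STOP_WORDS

# one-off preprocessing of the query
query_tokens = tokenize_and_remove_punctuation("machine learning", STOP_WORDS)
query_offsets = np.nonzero(vectorize(query_tokens))[0].astype(np.uint16)

# cheap per-row pre-filter before any exact token comparison
row_vector = vectorize(tokenize_and_remove_punctuation("an introduction to machine learning", STOP_WORDS))
maybe_match = possible_match_indices(query_offsets, row_vector)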


from libc.string cimport strlen, strcpy, strtok, strchr
from libc.stdlib cimport malloc, free
import numpy as np
cimport numpy as cnp

cdef char* strdup(const char* s) nogil:
cdef char* d = <char*>malloc(strlen(s) + 1)
@@ -96,24 +120,26 @@ cpdef list tokenize_and_remove_punctuation(str text, set stop_words):
c_text = strdup(PyBytes_AsString(py_text))

try:
token = strtok(c_text, " ,.!?\n\t")
token = strtok(c_text, " ")
while token != NULL:
word = <char*>malloc(strlen(token) + 1)
i = 0
j = 0
while token[i] != b'\0':
# Check if the character is a lowercase or uppercase letter
if (b'a' <= token[i] <= b'z' or b'A' <= token[i] <= b'Z'):
while token[i] != 0:
if 97 <= token[i] <= 122:
word[j] = token[i]
j += 1
elif 65 <= token[i] <= 90:
# Convert to lowercase if it's uppercase
word[j] = token[i] + 32 if b'A' <= token[i] <= b'Z' else token[i]
word[j] = token[i] + 32
j += 1
i += 1
word[j] = b'\0'
# Ensure the token is longer than one character and not a stop word
if strlen(word) > 1 and word.decode('utf-8') not in stop_words:
tokens.append(word)
word[j] = 0
if j > 1:
if word not in stop_words:
tokens.append(word)
free(word)
token = strtok(NULL, " ,.!?\n\t")
token = strtok(NULL, " ")
finally:
free(c_text)
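
A rough pure-Python equivalent of the revised tokenizer, for orientation (the compiled version works on C strings and returns bytes tokens): the text is now split on spaces only, only ASCII letters are kept with upper case folded to lower, and one-character tokens and stop words are dropped.

def tokenize_sketch(text: str, stop_words: set) -> list:
    tokens = []
    for raw in text.split(" "):
        # keep ASCII letters only, folding upper case to lower, mirroring the 97-122 / 65-90 checks
        word = "".join(
            ch.lower() for ch in raw if ("a" <= ch <= "z") or ("A" <= ch <= "Z")
        )
        # drop one-character tokens and stop words, mirroring the j > 1 and stop_words checks
        if len(word) > 1 and word not in stop_words:
            tokens.append(word)
    return tokens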

2 changes: 0 additions & 2 deletions opteryx/components/logical_planner/logical_planner.py
@@ -729,8 +729,6 @@ def create_node_relation(relation):

def analyze_query(statement) -> LogicalPlan:

print(statement)

root_node = "Analyze"
plan = LogicalPlan()

(another changed file in this commit; its name is not shown in this view)
@@ -541,8 +541,6 @@ def map_access(branch, alias: Optional[List[str]] = None, key=None):

def match_against(branch, alias: Optional[List[str]] = None, key=None):

print(branch)

columns = [identifier(col) for col in branch["columns"]]
match_to = build(branch["match_value"])
mode = branch["opt_search_modifier"]
10 changes: 7 additions & 3 deletions opteryx/functions/string_functions.py
@@ -326,19 +326,23 @@ def match_against(arr, val):
2 indexes)
"""

from opteryx.compiled.functions import possible_match
from opteryx.compiled.functions import possible_match_indices
from opteryx.compiled.functions import tokenize_and_remove_punctuation
from opteryx.compiled.functions import vectorize
from opteryx.virtual_datasets.stop_words import STOP_WORDS

if len(val) == 0:
return []
tokenized_literal = tokenize_and_remove_punctuation(str(val[0]), STOP_WORDS)
literal_offsets = numpy.nonzero(vectorize(tokenized_literal))[0].astype(numpy.uint16)

if len(tokenized_literal) == 0:
return [False] * len(arr)

tokenized_strings = [tokenize_and_remove_punctuation(s, STOP_WORDS) for s in arr]
tokenized_strings = (tokenize_and_remove_punctuation(s, STOP_WORDS) for s in arr)

return [
possible_match(tokenized_literal, vectorize(tok)) and set(tokenized_literal).issubset(tok)
possible_match_indices(literal_offsets, vectorize(tok))
and set(tokenized_literal).issubset(tok)
for tok in tokenized_strings
]
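
A hedged usage sketch of the reworked wrapper; the document and query strings are invented, and the expected result assumes none of the query tokens are stop words.

import numpy
from opteryx.functions.string_functions import match_against

documents = numpy.array(["the cat sat on the mat", "dogs bark at night"], dtype=object)
result = match_against(documents, numpy.array(["cat mat"], dtype=object))
# expected: [True, False], since only the first row contains every query token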
