Skip to content

Commit

Permalink
Python: Standardized integer types over relying on C types (apache#8535)
Browse files Browse the repository at this point in the history
Utilize libc.stdint for the definition of unsigned 64 bit integers.

Change the array.array type to use integers that are at least 64 bits in width.

Changes motiviated by differences in Windows builds
  • Loading branch information
rustyconover authored Sep 9, 2023
1 parent a113e26 commit a45b7ac
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 23 deletions.
16 changes: 10 additions & 6 deletions python/pyiceberg/avro/decoder_basic.c
Original file line number Diff line number Diff line change
Expand Up @@ -17,18 +17,22 @@
under the License.
*/

#include <stdint.h>

/*
Decode an an array of zig-zag encoded longs from a buffer.
Decode an an array of zig-zag encoded integers from a buffer.
The buffer is advanced to the end of the integers.
`count` is the number of integers to decode.
`result` is where the decoded integers are stored.
The result is guaranteed to be 64 bits wide.
*/
static inline void decode_longs(const unsigned char **buffer, unsigned int count, unsigned long *result) {
unsigned int current_index;
static inline void decode_zigzag_ints(const unsigned char **buffer, const uint64_t count, uint64_t *result) {
uint64_t current_index;
const unsigned char *current_position = *buffer;
unsigned long temp;
uint64_t temp;
// The largest shift will always be < 64
unsigned char shift;

Expand All @@ -37,7 +41,7 @@ static inline void decode_longs(const unsigned char **buffer, unsigned int count
temp = *current_position & 0x7F;
while(*current_position & 0x80) {
current_position += 1;
temp |= (unsigned long)(*current_position & 0x7F) << shift;
temp |= (uint64_t)(*current_position & 0x7F) << shift;
shift += 7;
}
result[current_index] = (temp >> 1) ^ (~(temp & 1) + 1);
Expand All @@ -53,7 +57,7 @@ static inline void decode_longs(const unsigned char **buffer, unsigned int count
The buffer is advanced to the end of the integer.
*/
static inline void skip_int(const unsigned char **buffer) {
static inline void skip_zigzag_int(const unsigned char **buffer) {
while(**buffer & 0x80) {
*buffer += 1;
}
Expand Down
35 changes: 18 additions & 17 deletions python/pyiceberg/avro/decoder_fast.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,15 @@ from cython.cimports.cpython import array
from pyiceberg.avro import STRUCT_DOUBLE, STRUCT_FLOAT
from cpython.mem cimport PyMem_Malloc, PyMem_Realloc, PyMem_Free
from libc.string cimport memcpy
from libc.stdint cimport uint64_t, int64_t

import array

cdef extern from "decoder_basic.c":
void decode_longs(const unsigned char **buffer, unsigned int count, unsigned long *result);
void skip_int(const unsigned char **buffer);
void decode_zigzag_ints(const unsigned char **buffer, const uint64_t count, uint64_t *result);
void skip_zigzag_int(const unsigned char **buffer);

unsigned_long_array_template = cython.declare(array.array, array.array('L', []))
unsigned_long_long_array_template = cython.declare(array.array, array.array('Q', []))

@cython.final
cdef class CythonBinaryDecoder:
Expand All @@ -44,7 +45,7 @@ cdef class CythonBinaryDecoder:
cdef const unsigned char *_end

# This is the size of the buffer of the data being parsed.
cdef unsigned int _size
cdef uint64_t _size

def __cinit__(self, input_contents: bytes) -> None:
self._size = len(input_contents)
Expand Down Expand Up @@ -82,33 +83,33 @@ cdef class CythonBinaryDecoder:
self._current += 1;
return self._current[-1] != 0

cpdef inline long read_int(self):
cpdef inline int64_t read_int(self):
"""Reads a value from the stream as an integer.
int/long values are written using variable-length, zigzag coding.
"""
cdef unsigned long result;
cdef uint64_t result;
if self._current >= self._end:
raise EOFError(f"EOF: read 1 bytes")
decode_longs(&self._current, 1, <unsigned long *>&result)
decode_zigzag_ints(&self._current, 1, &result)
return result

def read_ints(self, count: int) -> Tuple[int, ...]:
def read_ints(self, count: int) -> array.array[int]:
"""Reads a list of integers."""
newarray = array.clone(unsigned_long_array_template, count, zero=False)
newarray = array.clone(unsigned_long_long_array_template, count, zero=False)
if self._current >= self._end:
raise EOFError(f"EOF: read 1 bytes")
decode_longs(&self._current, count, newarray.data.as_ulongs)
decode_zigzag_ints(&self._current, count, <uint64_t *>newarray.data.as_ulonglongs)
return newarray

cpdef void read_int_bytes_dict(self, count: int, dest: Dict[int, bytes]):
"""Reads a dictionary of integers for keys and bytes for values into a destination dict."""
cdef unsigned long result[2];
cdef uint64_t result[2];
if self._current >= self._end:
raise EOFError(f"EOF: read 1 bytes")

for _ in range(count):
decode_longs(&self._current, 2, <unsigned long *>&result)
decode_zigzag_ints(&self._current, 2, <uint64_t *>&result)
if result[1] <= 0:
dest[result[0]] = b""
else:
Expand All @@ -117,11 +118,11 @@ cdef class CythonBinaryDecoder:

cpdef inline bytes read_bytes(self):
"""Bytes are encoded as a long followed by that many bytes of data."""
cdef unsigned long length;
cdef uint64_t length;
if self._current >= self._end:
raise EOFError(f"EOF: read 1 bytes")

decode_longs(&self._current, 1, <unsigned long *>&length)
decode_zigzag_ints(&self._current, 1, &length)

if length <= 0:
return b""
Expand Down Expand Up @@ -156,7 +157,7 @@ cdef class CythonBinaryDecoder:
return self.read_bytes().decode("utf-8")

def skip_int(self) -> None:
skip_int(&self._current)
skip_zigzag_int(&self._current)
return

def skip(self, n: int) -> None:
Expand All @@ -172,8 +173,8 @@ cdef class CythonBinaryDecoder:
self._current += 8

def skip_bytes(self) -> None:
cdef long result;
decode_longs(&self._current, 1, <unsigned long *>&result)
cdef uint64_t result;
decode_zigzag_ints(&self._current, 1, &result)
self._current += result

def skip_utf8(self) -> None:
Expand Down

0 comments on commit a45b7ac

Please sign in to comment.