From a45b7ac12f33026d62e04442631ab7424ef3a7a9 Mon Sep 17 00:00:00 2001 From: Rusty Conover Date: Sat, 9 Sep 2023 16:59:48 -0400 Subject: [PATCH] Python: Standardized integer types over relying on C types (#8535) Utilize libc.stdint for the definition of unsigned 64 bit integers. Change the array.array type to use integers that are at least 64 bits in width. Changes motiviated by differences in Windows builds --- python/pyiceberg/avro/decoder_basic.c | 16 +++++++----- python/pyiceberg/avro/decoder_fast.pyx | 35 +++++++++++++------------- 2 files changed, 28 insertions(+), 23 deletions(-) diff --git a/python/pyiceberg/avro/decoder_basic.c b/python/pyiceberg/avro/decoder_basic.c index e75e2d2e1bff..c3954f330d30 100644 --- a/python/pyiceberg/avro/decoder_basic.c +++ b/python/pyiceberg/avro/decoder_basic.c @@ -17,18 +17,22 @@ under the License. */ +#include + /* - Decode an an array of zig-zag encoded longs from a buffer. + Decode an an array of zig-zag encoded integers from a buffer. The buffer is advanced to the end of the integers. `count` is the number of integers to decode. `result` is where the decoded integers are stored. + The result is guaranteed to be 64 bits wide. + */ -static inline void decode_longs(const unsigned char **buffer, unsigned int count, unsigned long *result) { - unsigned int current_index; +static inline void decode_zigzag_ints(const unsigned char **buffer, const uint64_t count, uint64_t *result) { + uint64_t current_index; const unsigned char *current_position = *buffer; - unsigned long temp; + uint64_t temp; // The largest shift will always be < 64 unsigned char shift; @@ -37,7 +41,7 @@ static inline void decode_longs(const unsigned char **buffer, unsigned int count temp = *current_position & 0x7F; while(*current_position & 0x80) { current_position += 1; - temp |= (unsigned long)(*current_position & 0x7F) << shift; + temp |= (uint64_t)(*current_position & 0x7F) << shift; shift += 7; } result[current_index] = (temp >> 1) ^ (~(temp & 1) + 1); @@ -53,7 +57,7 @@ static inline void decode_longs(const unsigned char **buffer, unsigned int count The buffer is advanced to the end of the integer. */ -static inline void skip_int(const unsigned char **buffer) { +static inline void skip_zigzag_int(const unsigned char **buffer) { while(**buffer & 0x80) { *buffer += 1; } diff --git a/python/pyiceberg/avro/decoder_fast.pyx b/python/pyiceberg/avro/decoder_fast.pyx index cff7752c7730..ddc6ff649d45 100644 --- a/python/pyiceberg/avro/decoder_fast.pyx +++ b/python/pyiceberg/avro/decoder_fast.pyx @@ -19,14 +19,15 @@ from cython.cimports.cpython import array from pyiceberg.avro import STRUCT_DOUBLE, STRUCT_FLOAT from cpython.mem cimport PyMem_Malloc, PyMem_Realloc, PyMem_Free from libc.string cimport memcpy +from libc.stdint cimport uint64_t, int64_t import array cdef extern from "decoder_basic.c": - void decode_longs(const unsigned char **buffer, unsigned int count, unsigned long *result); - void skip_int(const unsigned char **buffer); + void decode_zigzag_ints(const unsigned char **buffer, const uint64_t count, uint64_t *result); + void skip_zigzag_int(const unsigned char **buffer); -unsigned_long_array_template = cython.declare(array.array, array.array('L', [])) +unsigned_long_long_array_template = cython.declare(array.array, array.array('Q', [])) @cython.final cdef class CythonBinaryDecoder: @@ -44,7 +45,7 @@ cdef class CythonBinaryDecoder: cdef const unsigned char *_end # This is the size of the buffer of the data being parsed. - cdef unsigned int _size + cdef uint64_t _size def __cinit__(self, input_contents: bytes) -> None: self._size = len(input_contents) @@ -82,33 +83,33 @@ cdef class CythonBinaryDecoder: self._current += 1; return self._current[-1] != 0 - cpdef inline long read_int(self): + cpdef inline int64_t read_int(self): """Reads a value from the stream as an integer. int/long values are written using variable-length, zigzag coding. """ - cdef unsigned long result; + cdef uint64_t result; if self._current >= self._end: raise EOFError(f"EOF: read 1 bytes") - decode_longs(&self._current, 1, &result) + decode_zigzag_ints(&self._current, 1, &result) return result - def read_ints(self, count: int) -> Tuple[int, ...]: + def read_ints(self, count: int) -> array.array[int]: """Reads a list of integers.""" - newarray = array.clone(unsigned_long_array_template, count, zero=False) + newarray = array.clone(unsigned_long_long_array_template, count, zero=False) if self._current >= self._end: raise EOFError(f"EOF: read 1 bytes") - decode_longs(&self._current, count, newarray.data.as_ulongs) + decode_zigzag_ints(&self._current, count, newarray.data.as_ulonglongs) return newarray cpdef void read_int_bytes_dict(self, count: int, dest: Dict[int, bytes]): """Reads a dictionary of integers for keys and bytes for values into a destination dict.""" - cdef unsigned long result[2]; + cdef uint64_t result[2]; if self._current >= self._end: raise EOFError(f"EOF: read 1 bytes") for _ in range(count): - decode_longs(&self._current, 2, &result) + decode_zigzag_ints(&self._current, 2, &result) if result[1] <= 0: dest[result[0]] = b"" else: @@ -117,11 +118,11 @@ cdef class CythonBinaryDecoder: cpdef inline bytes read_bytes(self): """Bytes are encoded as a long followed by that many bytes of data.""" - cdef unsigned long length; + cdef uint64_t length; if self._current >= self._end: raise EOFError(f"EOF: read 1 bytes") - decode_longs(&self._current, 1, &length) + decode_zigzag_ints(&self._current, 1, &length) if length <= 0: return b"" @@ -156,7 +157,7 @@ cdef class CythonBinaryDecoder: return self.read_bytes().decode("utf-8") def skip_int(self) -> None: - skip_int(&self._current) + skip_zigzag_int(&self._current) return def skip(self, n: int) -> None: @@ -172,8 +173,8 @@ cdef class CythonBinaryDecoder: self._current += 8 def skip_bytes(self) -> None: - cdef long result; - decode_longs(&self._current, 1, &result) + cdef uint64_t result; + decode_zigzag_ints(&self._current, 1, &result) self._current += result def skip_utf8(self) -> None: