diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index d6f87344bb28c..88c695a3faf27 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -121,30 +121,30 @@ cdef extern from "parser/tokenizer.h": io_callback cb_io io_cleanup cb_cleanup - size_t chunksize # Number of bytes to prepare for each chunk - char *data # pointer to data to be processed - size_t datalen # amount of data available - size_t datapos + int64_t chunksize # Number of bytes to prepare for each chunk + char *data # pointer to data to be processed + int64_t datalen # amount of data available + int64_t datapos # where to write out tokenized data char *stream - size_t stream_len - size_t stream_cap + int64_t stream_len + int64_t stream_cap # Store words in (potentially ragged) matrix for now, hmm char **words - size_t *word_starts # where we are in the stream - size_t words_len - size_t words_cap + int64_t *word_starts # where we are in the stream + int64_t words_len + int64_t words_cap - char *pword_start # pointer to stream start of current field - size_t word_start # position start of current field + char *pword_start # pointer to stream start of current field + int64_t word_start # position start of current field - size_t *line_start # position in words for start of line - size_t *line_fields # Number of fields in each line - size_t lines # Number of lines observed - size_t file_lines # Number of file lines observed (with bad/skipped) - size_t lines_cap # Vector capacity + int64_t *line_start # position in words for start of line + int64_t *line_fields # Number of fields in each line + int64_t lines # Number of lines observed + int64_t file_lines # Number of file lines observed (with bad/skipped) + int64_t lines_cap # Vector capacity # Tokenizing stuff ParserState state @@ -178,13 +178,13 @@ cdef extern from "parser/tokenizer.h": char thousands int header # Boolean: 1: has header, 0: no header - ssize_t header_start # header row start - ssize_t header_end # header row end + int64_t header_start # header row start + int64_t header_end # header row end void *skipset PyObject *skipfunc int64_t skip_first_N_rows - size_t skipfooter + int64_t skipfooter # pick one, depending on whether the converter requires GIL double (*double_converter_nogil)(const char *, char **, char, char, char, int) nogil @@ -195,12 +195,12 @@ cdef extern from "parser/tokenizer.h": char *warn_msg char *error_msg - size_t skip_empty_lines + int64_t skip_empty_lines ctypedef struct coliter_t: char **words - size_t *line_start - size_t col + int64_t *line_start + int64_t col ctypedef struct uint_state: int seen_sint @@ -210,7 +210,7 @@ cdef extern from "parser/tokenizer.h": void uint_state_init(uint_state *self) int uint64_conflict(uint_state *self) - void coliter_setup(coliter_t *it, parser_t *parser, size_t i, size_t start) nogil + void coliter_setup(coliter_t *it, parser_t *parser, int64_t i, int64_t start) nogil void COLITER_NEXT(coliter_t, const char *) nogil parser_t* parser_new() @@ -289,14 +289,14 @@ cdef class TextReader: object true_values, false_values object handle bint na_filter, verbose, has_usecols, has_mi_columns - size_t parser_start + int64_t parser_start list clocks char *c_encoding kh_str_t *false_set kh_str_t *true_set cdef public: - size_t leading_cols, table_width, skipfooter, buffer_lines + int64_t leading_cols, table_width, skipfooter, buffer_lines object allow_leading_cols object delimiter, converters, delim_whitespace object na_values @@ -731,7 +731,7 @@ cdef class TextReader: char *word object name int status - size_t hr, data_line + int64_t hr, data_line char *errors = "strict" cdef StringPath path = _string_path(self.c_encoding) @@ -950,8 +950,8 @@ cdef class TextReader: cdef _read_rows(self, rows, bint trim): cdef: - size_t buffered_lines - size_t irows, footer = 0 + int64_t buffered_lines + int64_t irows, footer = 0 self._start_clock() @@ -1019,13 +1019,13 @@ cdef class TextReader: def _convert_column_data(self, rows=None, upcast_na=False, footer=0): cdef: - size_t i + int64_t i int nused kh_str_t *na_hashset = NULL - size_t start, end + int64_t start, end object name, na_flist, col_dtype = None bint na_filter = 0 - size_t num_cols + int64_t num_cols start = self.parser_start @@ -1038,7 +1038,7 @@ cdef class TextReader: # if footer > 0: # end -= footer - num_cols = 0 + num_cols = -1 for i in range(self.parser.lines): num_cols = (num_cols < self.parser.line_fields[i]) * \ self.parser.line_fields[i] + \ @@ -1197,7 +1197,7 @@ cdef class TextReader: return col_res, na_count cdef _convert_with_dtype(self, object dtype, Py_ssize_t i, - size_t start, size_t end, + int64_t start, int64_t end, bint na_filter, bint user_dtype, kh_str_t *na_hashset, @@ -1277,7 +1277,7 @@ cdef class TextReader: raise TypeError("the dtype %s is not " "supported for parsing" % dtype) - cdef _string_convert(self, Py_ssize_t i, size_t start, size_t end, + cdef _string_convert(self, Py_ssize_t i, int64_t start, int64_t end, bint na_filter, kh_str_t *na_hashset): cdef StringPath path = _string_path(self.c_encoding) @@ -1338,7 +1338,7 @@ cdef class TextReader: kh_destroy_str(table) cdef _get_column_name(self, Py_ssize_t i, Py_ssize_t nused): - cdef int j + cdef int64_t j if self.has_usecols and self.names is not None: if (not callable(self.usecols) and len(self.names) == len(self.usecols)): @@ -1430,8 +1430,8 @@ cdef inline StringPath _string_path(char *encoding): # ---------------------------------------------------------------------- # Type conversions / inference support code -cdef _string_box_factorize(parser_t *parser, size_t col, - size_t line_start, size_t line_end, +cdef _string_box_factorize(parser_t *parser, int64_t col, + int64_t line_start, int64_t line_end, bint na_filter, kh_str_t *na_hashset): cdef: int error, na_count = 0 @@ -1483,8 +1483,8 @@ cdef _string_box_factorize(parser_t *parser, size_t col, return result, na_count -cdef _string_box_utf8(parser_t *parser, size_t col, - size_t line_start, size_t line_end, +cdef _string_box_utf8(parser_t *parser, int64_t col, + int64_t line_start, int64_t line_end, bint na_filter, kh_str_t *na_hashset): cdef: int error, na_count = 0 @@ -1536,8 +1536,8 @@ cdef _string_box_utf8(parser_t *parser, size_t col, return result, na_count -cdef _string_box_decode(parser_t *parser, size_t col, - size_t line_start, size_t line_end, +cdef _string_box_decode(parser_t *parser, int64_t col, + int64_t line_start, int64_t line_end, bint na_filter, kh_str_t *na_hashset, char *encoding): cdef: @@ -1595,8 +1595,8 @@ cdef _string_box_decode(parser_t *parser, size_t col, @cython.boundscheck(False) -cdef _categorical_convert(parser_t *parser, size_t col, - size_t line_start, size_t line_end, +cdef _categorical_convert(parser_t *parser, int64_t col, + int64_t line_start, int64_t line_end, bint na_filter, kh_str_t *na_hashset, char *encoding): "Convert column data into codes, categories" @@ -1666,8 +1666,8 @@ cdef _categorical_convert(parser_t *parser, size_t col, kh_destroy_str(table) return np.asarray(codes), result, na_count -cdef _to_fw_string(parser_t *parser, size_t col, size_t line_start, - size_t line_end, size_t width): +cdef _to_fw_string(parser_t *parser, int64_t col, int64_t line_start, + int64_t line_end, int64_t width): cdef: Py_ssize_t i coliter_t it @@ -1683,11 +1683,11 @@ cdef _to_fw_string(parser_t *parser, size_t col, size_t line_start, return result -cdef inline void _to_fw_string_nogil(parser_t *parser, size_t col, - size_t line_start, size_t line_end, +cdef inline void _to_fw_string_nogil(parser_t *parser, int64_t col, + int64_t line_start, int64_t line_end, size_t width, char *data) nogil: cdef: - size_t i + int64_t i coliter_t it const char *word = NULL @@ -1702,7 +1702,7 @@ cdef char* cinf = b'inf' cdef char* cposinf = b'+inf' cdef char* cneginf = b'-inf' -cdef _try_double(parser_t *parser, size_t col, size_t line_start, size_t line_end, +cdef _try_double(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, bint na_filter, kh_str_t *na_hashset, object na_flist): cdef: int error, na_count = 0 @@ -1811,7 +1811,7 @@ cdef inline int _try_double_nogil(parser_t *parser, return 0 -cdef _try_uint64(parser_t *parser, size_t col, size_t line_start, size_t line_end, +cdef _try_uint64(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, bint na_filter, kh_str_t *na_hashset): cdef: int error @@ -1845,8 +1845,8 @@ cdef _try_uint64(parser_t *parser, size_t col, size_t line_start, size_t line_en return result -cdef inline int _try_uint64_nogil(parser_t *parser, size_t col, size_t line_start, - size_t line_end, bint na_filter, +cdef inline int _try_uint64_nogil(parser_t *parser, int64_t col, int64_t line_start, + int64_t line_end, bint na_filter, const kh_str_t *na_hashset, uint64_t *data, uint_state *state) nogil: cdef: @@ -1882,7 +1882,7 @@ cdef inline int _try_uint64_nogil(parser_t *parser, size_t col, size_t line_star return 0 -cdef _try_int64(parser_t *parser, size_t col, size_t line_start, size_t line_end, +cdef _try_int64(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, bint na_filter, kh_str_t *na_hashset): cdef: int error, na_count = 0 @@ -1909,8 +1909,8 @@ cdef _try_int64(parser_t *parser, size_t col, size_t line_start, size_t line_end return result, na_count -cdef inline int _try_int64_nogil(parser_t *parser, size_t col, size_t line_start, - size_t line_end, bint na_filter, +cdef inline int _try_int64_nogil(parser_t *parser, int64_t col, int64_t line_start, + int64_t line_end, bint na_filter, const kh_str_t *na_hashset, int64_t NA, int64_t *data, int *na_count) nogil: cdef: @@ -1947,7 +1947,7 @@ cdef inline int _try_int64_nogil(parser_t *parser, size_t col, size_t line_start return 0 -cdef _try_bool(parser_t *parser, size_t col, size_t line_start, size_t line_end, +cdef _try_bool(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, bint na_filter, kh_str_t *na_hashset): cdef: int na_count @@ -1969,8 +1969,8 @@ cdef _try_bool(parser_t *parser, size_t col, size_t line_start, size_t line_end, return None, None return result.view(np.bool_), na_count -cdef inline int _try_bool_nogil(parser_t *parser, size_t col, size_t line_start, - size_t line_end, bint na_filter, +cdef inline int _try_bool_nogil(parser_t *parser, int64_t col, int64_t line_start, + int64_t line_end, bint na_filter, const kh_str_t *na_hashset, uint8_t NA, uint8_t *data, int *na_count) nogil: cdef: @@ -2009,7 +2009,7 @@ cdef inline int _try_bool_nogil(parser_t *parser, size_t col, size_t line_start, data += 1 return 0 -cdef _try_bool_flex(parser_t *parser, size_t col, size_t line_start, size_t line_end, +cdef _try_bool_flex(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, bint na_filter, const kh_str_t *na_hashset, const kh_str_t *true_hashset, const kh_str_t *false_hashset): @@ -2035,8 +2035,8 @@ cdef _try_bool_flex(parser_t *parser, size_t col, size_t line_start, size_t line return None, None return result.view(np.bool_), na_count -cdef inline int _try_bool_flex_nogil(parser_t *parser, size_t col, size_t line_start, - size_t line_end, bint na_filter, +cdef inline int _try_bool_flex_nogil(parser_t *parser, int64_t col, int64_t line_start, + int64_t line_end, bint na_filter, const kh_str_t *na_hashset, const kh_str_t *true_hashset, const kh_str_t *false_hashset, @@ -2254,8 +2254,8 @@ for k in list(na_values): na_values[np.dtype(k)] = na_values[k] -cdef _apply_converter(object f, parser_t *parser, size_t col, - size_t line_start, size_t line_end, +cdef _apply_converter(object f, parser_t *parser, int64_t col, + int64_t line_start, int64_t line_end, char* c_encoding): cdef: int error @@ -2299,7 +2299,7 @@ def _to_structured_array(dict columns, object names, object usecols): object name, fnames, field_type Py_ssize_t i, offset, nfields, length - size_t stride, elsize + int64_t stride, elsize char *buf if names is None: @@ -2347,10 +2347,10 @@ def _to_structured_array(dict columns, object names, object usecols): return recs -cdef _fill_structured_column(char *dst, char* src, size_t elsize, - size_t stride, size_t length, bint incref): +cdef _fill_structured_column(char *dst, char* src, int64_t elsize, + int64_t stride, int64_t length, bint incref): cdef: - size_t i + int64_t i if incref: util.transfer_object_column(dst, src, stride, length) diff --git a/pandas/_libs/src/parser/tokenizer.h b/pandas/_libs/src/parser/tokenizer.h index a1341b37952eb..f293baa3cda12 100644 --- a/pandas/_libs/src/parser/tokenizer.h +++ b/pandas/_libs/src/parser/tokenizer.h @@ -137,30 +137,30 @@ typedef struct parser_t { io_callback cb_io; io_cleanup cb_cleanup; - size_t chunksize; // Number of bytes to prepare for each chunk + int64_t chunksize; // Number of bytes to prepare for each chunk char *data; // pointer to data to be processed - size_t datalen; // amount of data available - size_t datapos; + int64_t datalen; // amount of data available + int64_t datapos; // where to write out tokenized data char *stream; - size_t stream_len; - size_t stream_cap; + int64_t stream_len; + int64_t stream_cap; // Store words in (potentially ragged) matrix for now, hmm char **words; - size_t *word_starts; // where we are in the stream - size_t words_len; - size_t words_cap; + int64_t *word_starts; // where we are in the stream + int64_t words_len; + int64_t words_cap; char *pword_start; // pointer to stream start of current field - size_t word_start; // position start of current field + int64_t word_start; // position start of current field - size_t *line_start; // position in words for start of line - size_t *line_fields; // Number of fields in each line - size_t lines; // Number of (good) lines observed - size_t file_lines; // Number of lines observed (including bad or skipped) - size_t lines_cap; // Vector capacity + int64_t *line_start; // position in words for start of line + int64_t *line_fields; // Number of fields in each line + int64_t lines; // Number of (good) lines observed + int64_t file_lines; // Number of lines observed (including bad or skipped) + int64_t lines_cap; // Vector capacity // Tokenizing stuff ParserState state; @@ -194,8 +194,8 @@ typedef struct parser_t { char thousands; int header; // Boolean: 1: has header, 0: no header - ssize_t header_start; // header row start - ssize_t header_end; // header row end + int64_t header_start; // header row start + int64_t header_end; // header row end void *skipset; PyObject *skipfunc; @@ -216,7 +216,7 @@ typedef struct parser_t { typedef struct coliter_t { char **words; - size_t *line_start; + int64_t *line_start; int col; } coliter_t; @@ -225,7 +225,7 @@ coliter_t *coliter_new(parser_t *self, int i); #define COLITER_NEXT(iter, word) \ do { \ - const size_t i = *iter.line_start++ + iter.col; \ + const int64_t i = *iter.line_start++ + iter.col; \ word = i < *iter.line_start ? iter.words[i] : ""; \ } while (0)