From e04d12a9735707927edebf20501c1176ef5a2859 Mon Sep 17 00:00:00 2001
From: Jeff Knupp
Date: Thu, 20 Jul 2017 18:21:17 -0400
Subject: [PATCH] Switch to use int64_t rather than size_t due to portability
 concerns.

---
 pandas/_libs/parsers.pyx            | 134 ++++++++++++++--------------
 pandas/_libs/src/parser/tokenizer.h |  36 ++++----
 2 files changed, 85 insertions(+), 85 deletions(-)

diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
index d6f87344bb28c..88c695a3faf27 100644
--- a/pandas/_libs/parsers.pyx
+++ b/pandas/_libs/parsers.pyx
@@ -121,30 +121,30 @@ cdef extern from "parser/tokenizer.h":
         io_callback cb_io
         io_cleanup cb_cleanup

-        size_t chunksize  # Number of bytes to prepare for each chunk
-        char *data        # pointer to data to be processed
-        size_t datalen    # amount of data available
-        size_t datapos
+        int64_t chunksize  # Number of bytes to prepare for each chunk
+        char *data         # pointer to data to be processed
+        int64_t datalen    # amount of data available
+        int64_t datapos

         # where to write out tokenized data
         char *stream
-        size_t stream_len
-        size_t stream_cap
+        int64_t stream_len
+        int64_t stream_cap

         # Store words in (potentially ragged) matrix for now, hmm
         char **words
-        size_t *word_starts  # where we are in the stream
-        size_t words_len
-        size_t words_cap
+        int64_t *word_starts  # where we are in the stream
+        int64_t words_len
+        int64_t words_cap

-        char *pword_start    # pointer to stream start of current field
-        size_t word_start    # position start of current field
+        char *pword_start     # pointer to stream start of current field
+        int64_t word_start    # position start of current field

-        size_t *line_start   # position in words for start of line
-        size_t *line_fields  # Number of fields in each line
-        size_t lines         # Number of lines observed
-        size_t file_lines    # Number of file lines observed (with bad/skipped)
-        size_t lines_cap     # Vector capacity
+        int64_t *line_start   # position in words for start of line
+        int64_t *line_fields  # Number of fields in each line
+        int64_t lines         # Number of lines observed
+        int64_t file_lines    # Number of file lines observed (with bad/skipped)
+        int64_t lines_cap     # Vector capacity

         # Tokenizing stuff
         ParserState state
@@ -178,13 +178,13 @@ cdef extern from "parser/tokenizer.h":
         char thousands

         int header  # Boolean: 1: has header, 0: no header
-        ssize_t header_start  # header row start
-        ssize_t header_end    # header row end
+        int64_t header_start  # header row start
+        int64_t header_end    # header row end

         void *skipset
         PyObject *skipfunc
         int64_t skip_first_N_rows
-        size_t skipfooter
+        int64_t skipfooter

         # pick one, depending on whether the converter requires GIL
         double (*double_converter_nogil)(const char *, char **, char, char, char, int) nogil
@@ -195,12 +195,12 @@ cdef extern from "parser/tokenizer.h":
         char *warn_msg
         char *error_msg

-        size_t skip_empty_lines
+        int64_t skip_empty_lines

     ctypedef struct coliter_t:
         char **words
-        size_t *line_start
-        size_t col
+        int64_t *line_start
+        int64_t col

     ctypedef struct uint_state:
         int seen_sint
@@ -210,7 +210,7 @@ cdef extern from "parser/tokenizer.h":
     void uint_state_init(uint_state *self)
    int uint64_conflict(uint_state *self)

-    void coliter_setup(coliter_t *it, parser_t *parser, size_t i, size_t start) nogil
+    void coliter_setup(coliter_t *it, parser_t *parser, int64_t i, int64_t start) nogil
     void COLITER_NEXT(coliter_t, const char *) nogil

     parser_t* parser_new()
@@ -289,14 +289,14 @@ cdef class TextReader:
         object true_values, false_values
         object handle
         bint na_filter, verbose, has_usecols, has_mi_columns
-        size_t parser_start
+        int64_t parser_start
         list clocks
         char *c_encoding
         kh_str_t *false_set
         kh_str_t *true_set

     cdef public:
-        size_t leading_cols, table_width, skipfooter, buffer_lines
+        int64_t leading_cols, table_width, skipfooter, buffer_lines
         object allow_leading_cols
         object delimiter, converters, delim_whitespace
         object na_values
@@ -731,7 +731,7 @@ cdef class TextReader:
             char *word
             object name
            int status
-            size_t hr, data_line
+            int64_t hr, data_line
             char *errors = "strict"
             cdef StringPath path = _string_path(self.c_encoding)

@@ -950,8 +950,8 @@ cdef class TextReader:

     cdef _read_rows(self, rows, bint trim):
         cdef:
-            size_t buffered_lines
-            size_t irows, footer = 0
+            int64_t buffered_lines
+            int64_t irows, footer = 0

         self._start_clock()

@@ -1019,13 +1019,13 @@ cdef class TextReader:

     def _convert_column_data(self, rows=None, upcast_na=False, footer=0):
         cdef:
-            size_t i
+            int64_t i
             int nused
             kh_str_t *na_hashset = NULL
-            size_t start, end
+            int64_t start, end
             object name, na_flist, col_dtype = None
             bint na_filter = 0
-            size_t num_cols
+            int64_t num_cols

         start = self.parser_start

@@ -1038,7 +1038,7 @@ cdef class TextReader:
         # if footer > 0:
         #     end -= footer

-        num_cols = 0
+        num_cols = -1
         for i in range(self.parser.lines):
             num_cols = (num_cols < self.parser.line_fields[i]) * \
                 self.parser.line_fields[i] + \
@@ -1197,7 +1197,7 @@ cdef class TextReader:
         return col_res, na_count

     cdef _convert_with_dtype(self, object dtype, Py_ssize_t i,
-                             size_t start, size_t end,
+                             int64_t start, int64_t end,
                              bint na_filter,
                              bint user_dtype,
                              kh_str_t *na_hashset,
@@ -1277,7 +1277,7 @@ cdef class TextReader:
             raise TypeError("the dtype %s is not "
                             "supported for parsing" % dtype)

-    cdef _string_convert(self, Py_ssize_t i, size_t start, size_t end,
+    cdef _string_convert(self, Py_ssize_t i, int64_t start, int64_t end,
                          bint na_filter, kh_str_t *na_hashset):

         cdef StringPath path = _string_path(self.c_encoding)
@@ -1338,7 +1338,7 @@ cdef class TextReader:
                 kh_destroy_str(table)

     cdef _get_column_name(self, Py_ssize_t i, Py_ssize_t nused):
-        cdef int j
+        cdef int64_t j
         if self.has_usecols and self.names is not None:
             if (not callable(self.usecols) and
                     len(self.names) == len(self.usecols)):
@@ -1430,8 +1430,8 @@ cdef inline StringPath _string_path(char *encoding):
 # ----------------------------------------------------------------------
 # Type conversions / inference support code

-cdef _string_box_factorize(parser_t *parser, size_t col,
-                           size_t line_start, size_t line_end,
+cdef _string_box_factorize(parser_t *parser, int64_t col,
+                           int64_t line_start, int64_t line_end,
                            bint na_filter, kh_str_t *na_hashset):
     cdef:
         int error, na_count = 0
@@ -1483,8 +1483,8 @@ cdef _string_box_factorize(parser_t *parser, size_t col,
     return result, na_count


-cdef _string_box_utf8(parser_t *parser, size_t col,
-                      size_t line_start, size_t line_end,
+cdef _string_box_utf8(parser_t *parser, int64_t col,
+                      int64_t line_start, int64_t line_end,
                       bint na_filter, kh_str_t *na_hashset):
     cdef:
         int error, na_count = 0
@@ -1536,8 +1536,8 @@ cdef _string_box_utf8(parser_t *parser, size_t col,
     return result, na_count


-cdef _string_box_decode(parser_t *parser, size_t col,
-                        size_t line_start, size_t line_end,
+cdef _string_box_decode(parser_t *parser, int64_t col,
+                        int64_t line_start, int64_t line_end,
                         bint na_filter, kh_str_t *na_hashset,
                         char *encoding):
     cdef:
@@ -1595,8 +1595,8 @@ cdef _string_box_decode(parser_t *parser, size_t col,


 @cython.boundscheck(False)
-cdef _categorical_convert(parser_t *parser, size_t col,
-                          size_t line_start, size_t line_end,
+cdef _categorical_convert(parser_t *parser, int64_t col,
+                          int64_t line_start, int64_t line_end,
                           bint na_filter, kh_str_t *na_hashset,
                           char *encoding):
     "Convert column data into codes, categories"
@@ -1666,8 +1666,8 @@ cdef _categorical_convert(parser_t *parser, size_t col,
         kh_destroy_str(table)
     return np.asarray(codes), result, na_count

-cdef _to_fw_string(parser_t *parser, size_t col, size_t line_start,
-                   size_t line_end, size_t width):
+cdef _to_fw_string(parser_t *parser, int64_t col, int64_t line_start,
+                   int64_t line_end, int64_t width):
     cdef:
         Py_ssize_t i
         coliter_t it
@@ -1683,11 +1683,11 @@ cdef _to_fw_string(parser_t *parser, size_t col, size_t line_start,

     return result


-cdef inline void _to_fw_string_nogil(parser_t *parser, size_t col,
-                                     size_t line_start, size_t line_end,
+cdef inline void _to_fw_string_nogil(parser_t *parser, int64_t col,
+                                     int64_t line_start, int64_t line_end,
                                      size_t width, char *data) nogil:
     cdef:
-        size_t i
+        int64_t i
         coliter_t it
         const char *word = NULL
@@ -1702,7 +1702,7 @@ cdef char* cinf = b'inf'
 cdef char* cposinf = b'+inf'
 cdef char* cneginf = b'-inf'

-cdef _try_double(parser_t *parser, size_t col, size_t line_start, size_t line_end,
+cdef _try_double(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end,
                  bint na_filter, kh_str_t *na_hashset, object na_flist):
     cdef:
         int error, na_count = 0
@@ -1811,7 +1811,7 @@ cdef inline int _try_double_nogil(parser_t *parser,
     return 0


-cdef _try_uint64(parser_t *parser, size_t col, size_t line_start, size_t line_end,
+cdef _try_uint64(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end,
                  bint na_filter, kh_str_t *na_hashset):
     cdef:
         int error
@@ -1845,8 +1845,8 @@ cdef _try_uint64(parser_t *parser, size_t col, size_t line_start, size_t line_en
     return result


-cdef inline int _try_uint64_nogil(parser_t *parser, size_t col, size_t line_start,
-                                  size_t line_end, bint na_filter,
+cdef inline int _try_uint64_nogil(parser_t *parser, int64_t col, int64_t line_start,
+                                  int64_t line_end, bint na_filter,
                                   const kh_str_t *na_hashset,
                                   uint64_t *data, uint_state *state) nogil:
     cdef:
@@ -1882,7 +1882,7 @@ cdef inline int _try_uint64_nogil(parser_t *parser, size_t col, size_t line_star
     return 0


-cdef _try_int64(parser_t *parser, size_t col, size_t line_start, size_t line_end,
+cdef _try_int64(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end,
                 bint na_filter, kh_str_t *na_hashset):
     cdef:
         int error, na_count = 0
@@ -1909,8 +1909,8 @@ cdef _try_int64(parser_t *parser, size_t col, size_t line_start, size_t line_end
     return result, na_count


-cdef inline int _try_int64_nogil(parser_t *parser, size_t col, size_t line_start,
-                                 size_t line_end, bint na_filter,
+cdef inline int _try_int64_nogil(parser_t *parser, int64_t col, int64_t line_start,
+                                 int64_t line_end, bint na_filter,
                                  const kh_str_t *na_hashset, int64_t NA,
                                  int64_t *data, int *na_count) nogil:
     cdef:
@@ -1947,7 +1947,7 @@ cdef inline int _try_int64_nogil(parser_t *parser, size_t col, size_t line_start
     return 0


-cdef _try_bool(parser_t *parser, size_t col, size_t line_start, size_t line_end,
+cdef _try_bool(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end,
                bint na_filter, kh_str_t *na_hashset):
     cdef:
         int na_count
@@ -1969,8 +1969,8 @@ cdef _try_bool(parser_t *parser, size_t col, size_t line_start, size_t line_end,
         return None, None
     return result.view(np.bool_), na_count

-cdef inline int _try_bool_nogil(parser_t *parser, size_t col, size_t line_start,
-                                size_t line_end, bint na_filter,
+cdef inline int _try_bool_nogil(parser_t *parser, int64_t col, int64_t line_start,
+                                int64_t line_end, bint na_filter,
                                 const kh_str_t *na_hashset, uint8_t NA,
                                 uint8_t *data, int *na_count) nogil:
     cdef:
@@ -2009,7 +2009,7 @@ cdef inline int _try_bool_nogil(parser_t *parser, size_t col, size_t line_start,
         data += 1
     return 0

-cdef _try_bool_flex(parser_t *parser, size_t col, size_t line_start, size_t line_end,
+cdef _try_bool_flex(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end,
                     bint na_filter, const kh_str_t *na_hashset,
                     const kh_str_t *true_hashset,
                     const kh_str_t *false_hashset):
@@ -2035,8 +2035,8 @@ cdef _try_bool_flex(parser_t *parser, size_t col, size_t line_start, size_t line
         return None, None
     return result.view(np.bool_), na_count

-cdef inline int _try_bool_flex_nogil(parser_t *parser, size_t col, size_t line_start,
-                                     size_t line_end, bint na_filter,
+cdef inline int _try_bool_flex_nogil(parser_t *parser, int64_t col, int64_t line_start,
+                                     int64_t line_end, bint na_filter,
                                      const kh_str_t *na_hashset,
                                      const kh_str_t *true_hashset,
                                      const kh_str_t *false_hashset,
@@ -2254,8 +2254,8 @@ for k in list(na_values):
     na_values[np.dtype(k)] = na_values[k]


-cdef _apply_converter(object f, parser_t *parser, size_t col,
-                      size_t line_start, size_t line_end,
+cdef _apply_converter(object f, parser_t *parser, int64_t col,
+                      int64_t line_start, int64_t line_end,
                       char* c_encoding):
     cdef:
         int error
@@ -2299,7 +2299,7 @@ def _to_structured_array(dict columns, object names, object usecols):
         object name, fnames, field_type
         Py_ssize_t i, offset, nfields, length

-        size_t stride, elsize
+        int64_t stride, elsize
         char *buf

     if names is None:
@@ -2347,10 +2347,10 @@ def _to_structured_array(dict columns, object names, object usecols):
     return recs


-cdef _fill_structured_column(char *dst, char* src, size_t elsize,
-                             size_t stride, size_t length, bint incref):
+cdef _fill_structured_column(char *dst, char* src, int64_t elsize,
+                             int64_t stride, int64_t length, bint incref):
     cdef:
-        size_t i
+        int64_t i

     if incref:
         util.transfer_object_column(dst, src, stride, length)
diff --git a/pandas/_libs/src/parser/tokenizer.h b/pandas/_libs/src/parser/tokenizer.h
index a1341b37952eb..f293baa3cda12 100644
--- a/pandas/_libs/src/parser/tokenizer.h
+++ b/pandas/_libs/src/parser/tokenizer.h
@@ -137,30 +137,30 @@ typedef struct parser_t {
     io_callback cb_io;
     io_cleanup cb_cleanup;

-    size_t chunksize;  // Number of bytes to prepare for each chunk
+    int64_t chunksize;  // Number of bytes to prepare for each chunk
     char *data;  // pointer to data to be processed
-    size_t datalen;  // amount of data available
-    size_t datapos;
+    int64_t datalen;  // amount of data available
+    int64_t datapos;

     // where to write out tokenized data
     char *stream;
-    size_t stream_len;
-    size_t stream_cap;
+    int64_t stream_len;
+    int64_t stream_cap;

     // Store words in (potentially ragged) matrix for now, hmm
     char **words;
-    size_t *word_starts;  // where we are in the stream
-    size_t words_len;
-    size_t words_cap;
+    int64_t *word_starts;  // where we are in the stream
+    int64_t words_len;
+    int64_t words_cap;

     char *pword_start;  // pointer to stream start of current field
-    size_t word_start;  // position start of current field
+    int64_t word_start;  // position start of current field

-    size_t *line_start;   // position in words for start of line
-    size_t *line_fields;  // Number of fields in each line
-    size_t lines;         // Number of (good) lines observed
-    size_t file_lines;    // Number of lines observed (including bad or skipped)
-    size_t lines_cap;     // Vector capacity
+    int64_t *line_start;   // position in words for start of line
+    int64_t *line_fields;  // Number of fields in each line
+    int64_t lines;         // Number of (good) lines observed
+    int64_t file_lines;    // Number of lines observed (including bad or skipped)
+    int64_t lines_cap;     // Vector capacity

     // Tokenizing stuff
     ParserState state;
@@ -194,8 +194,8 @@ typedef struct parser_t {
     char thousands;

     int header;  // Boolean: 1: has header, 0: no header
-    ssize_t header_start;  // header row start
-    ssize_t header_end;    // header row end
+    int64_t header_start;  // header row start
+    int64_t header_end;    // header row end

     void *skipset;
     PyObject *skipfunc;
@@ -216,7 +216,7 @@ typedef struct parser_t {

 typedef struct coliter_t {
     char **words;
-    size_t *line_start;
+    int64_t *line_start;
     int col;
 } coliter_t;

@@ -225,7 +225,7 @@
 coliter_t *coliter_new(parser_t *self, int i);

 #define COLITER_NEXT(iter, word)                           \
     do {                                                   \
-        const size_t i = *iter.line_start++ + iter.col;    \
+        const int64_t i = *iter.line_start++ + iter.col;   \
         word = i < *iter.line_start ? iter.words[i] : "";  \
     } while (0)
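
Reviewer note (not part of the applyable patch): the portability concerns named in the subject are that size_t is unsigned and has a platform-dependent width, while int64_t is signed and exactly 64 bits on every target. The standalone C sketch below is illustrative only (the file name, variable names, and output are assumptions, not code from the pandas tree); it demonstrates both hazards, plus the -1 sentinel that a signed type makes possible, which the patch relies on for header_start/header_end and for seeding num_cols with -1 in _convert_column_data.

/* size_t_vs_int64.c -- illustrative sketch, not from the pandas tree.
 * Build: cc -std=c99 -o demo size_t_vs_int64.c && ./demo
 */
#include <stdint.h>
#include <stdio.h>

int main(void) {
    /* Hazard 1: size_t is unsigned, so arithmetic that dips below zero
     * wraps around to a huge value instead of going negative. A bound
     * computed like `lines - skipfooter` silently becomes SIZE_MAX. */
    size_t lines = 1, skipfooter = 2;
    printf("size_t : 1 - 2 = %zu\n", lines - skipfooter);
    printf("int64_t: 1 - 2 = %lld\n",
           (long long)((int64_t)lines - (int64_t)skipfooter));

    /* Hazard 2: size_t is 4 bytes on 32-bit builds and 8 bytes on
     * 64-bit builds, so stream offsets overflow at different points on
     * different platforms; int64_t is 8 bytes everywhere. */
    printf("sizeof(size_t)  = %zu\n", sizeof(size_t));
    printf("sizeof(int64_t) = %zu\n", sizeof(int64_t));

    /* A signed fixed-width type also permits -1 sentinels. */
    int64_t header_start = -1;  /* "no header row seen yet" */
    printf("header_start sentinel = %lld\n", (long long)header_start);
    return 0;
}

On a 32-bit build the sizeof(size_t) line prints 4, which is exactly the cross-platform inconsistency the switch to int64_t removes.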