From 386ed58f38e4283d351c61223e47f241a24755fa Mon Sep 17 00:00:00 2001
From: gfyoung
Date: Fri, 2 Sep 2016 02:00:42 -0400
Subject: [PATCH] MAINT: flake8 *.pyx files

flake8-ed *.pyx files and fixed the resulting errors. Removed the E226
check because it inhibits pointer declarations (e.g. char*). In addition,
the check is not universally accepted in Python anyway.
---
 ci/lint.sh                   |  10 +-
 pandas/algos.pyx             |  44 +-
 pandas/hashtable.pyx         |   4 +-
 pandas/index.pyx             |  37 +-
 pandas/io/sas/saslib.pyx     |  87 ++--
 pandas/lib.pyx               | 111 +++--
 pandas/msgpack/_packer.pyx   |  61 +--
 pandas/msgpack/_unpacker.pyx |  77 +--
 pandas/parser.pyx            | 240 +++++-----
 pandas/src/inference.pyx     |  99 ++--
 pandas/src/offsets.pyx       |   6 +-
 pandas/src/period.pyx        |  82 ++--
 pandas/src/reduce.pyx        |  82 ++--
 pandas/src/skiplist.pyx      |   1 -
 pandas/src/sparse.pyx        |  25 +-
 pandas/src/testing.pyx       |  30 +-
 pandas/tslib.pyx             | 892 +++++++++++++++++++++--------------
 17 files changed, 1114 insertions(+), 774 deletions(-)

diff --git a/ci/lint.sh b/ci/lint.sh
index 61d74ae28377e..a866b04445f96 100755
--- a/ci/lint.sh
+++ b/ci/lint.sh
@@ -20,15 +20,7 @@ if [ "$LINT" ]; then
     echo "Linting *.py DONE"
 
     echo "Linting *.pyx"
-    for path in 'window.pyx' "src/join.pyx"
-    do
-        echo "linting -> pandas/$path"
-        flake8 pandas/$path --filename '*.pyx' --select=E501,E302,E203,E226,E111,E114,E221,E303,E128,E231,E126
-        if [ $? -ne "0" ]; then
-            RET=1
-        fi
-
-    done
+    flake8 pandas --filename '*.pyx' --select=E501,E302,E203,E111,E114,E221,E303,E128,E231,E126
     echo "Linting *.pyx DONE"
 
     echo "Linting *.pxi.in"
diff --git a/pandas/algos.pyx b/pandas/algos.pyx
index d3e68ad2a5eee..de5c5fc661d4d 100644
--- a/pandas/algos.pyx
+++ b/pandas/algos.pyx
@@ -59,11 +59,11 @@ cdef:
     int TIEBREAK_DENSE = 5
 
 tiebreakers = {
-    'average' : TIEBREAK_AVERAGE,
-    'min' : TIEBREAK_MIN,
-    'max' : TIEBREAK_MAX,
-    'first' : TIEBREAK_FIRST,
-    'dense' : TIEBREAK_DENSE,
+    'average': TIEBREAK_AVERAGE,
+    'min': TIEBREAK_MIN,
+    'max': TIEBREAK_MAX,
+    'first': TIEBREAK_FIRST,
+    'dense': TIEBREAK_DENSE,
 }
 
 
@@ -489,7 +489,6 @@ def rank_1d_generic(object in_arr, bint retry=1, ties_method='average',
         bint keep_na = 0
         float count = 0.0
 
-
     tiebreak = tiebreakers[ties_method]
 
     keep_na = na_option == 'keep'
@@ -578,6 +577,7 @@ class Infinity(object):
     __gt__ = lambda self, other: self is not other
     __ge__ = lambda self, other: True
 
+
 class NegInfinity(object):
     """ provide a negative Infinity comparision method for ranking """
 
@@ -705,7 +705,6 @@ def rank_2d_generic(object in_arr, axis=0, ties_method='average',
 
     # return result
 
-
 cdef inline Py_ssize_t swap(numeric *a, numeric *b) nogil except -1:
     cdef numeric t
 
@@ -747,11 +746,11 @@ cpdef numeric kth_smallest(numeric[:] a, Py_ssize_t k):
 
 cdef inline kth_smallest_c(float64_t* a, Py_ssize_t k, Py_ssize_t n):
     cdef:
-        Py_ssize_t i,j,l,m
+        Py_ssize_t i, j, l, m
        double_t x, t
 
     l = 0
-    m = n-1
+    m = n - 1
     while (l < m):
 
     cdef int64_t **vecs = <int64_t**> malloc(nlevels * sizeof(int64_t*))
 
     for i from 0 <= i < nlevels:
-        # vecs[i] = <int64_t *> (<ndarray> list_of_arrays[i]).data
-        arr = list_of_arrays[i]
-        vecs[i] = <int64_t *> arr.data
-        # assume uniqueness??
+        arr = list_of_arrays[i]
+        vecs[i] = <int64_t*> arr.data
+        # Assume uniqueness??
for i from 1 <= i < n:
         for k from 0 <= k < nlevels:
             cur = vecs[k][i]
-            pre = vecs[k][i-1]
+            pre = vecs[k][i - 1]
             if cur == pre:
                 continue
             elif cur > pre:
@@ -988,7 +989,8 @@ def is_lexsorted(list list_of_arrays):
 
 
 @cython.boundscheck(False)
-def groupby_indices(dict ids, ndarray[int64_t] labels, ndarray[int64_t] counts):
+def groupby_indices(dict ids, ndarray[int64_t] labels,
+                    ndarray[int64_t] counts):
     """
     turn group_labels output into a combined indexer maping the labels to
     indexers
@@ -1020,7 +1022,7 @@ def groupby_indices(dict ids, ndarray[int64_t] labels, ndarray[int64_t] counts):
     for i from 0 <= i < len(counts):
         arr = np.empty(counts[i], dtype=np.int64)
         result[ids[i]] = arr
-        vecs[i] = <int64_t *> arr.data
+        vecs[i] = <int64_t*> arr.data
 
     for i from 0 <= i < n:
         k = labels[i]
@@ -1036,6 +1038,7 @@ def groupby_indices(dict ids, ndarray[int64_t] labels, ndarray[int64_t] counts):
     free(vecs)
     return result
 
+
 @cython.wraparound(False)
 @cython.boundscheck(False)
 def group_labels(ndarray[object] values):
@@ -1116,6 +1119,7 @@ def groupsort_indexer(ndarray[int64_t] index, Py_ssize_t ngroups):
 #----------------------------------------------------------------------
 # first, nth, last
 
+
 @cython.boundscheck(False)
 @cython.wraparound(False)
 def group_nth_object(ndarray[object, ndim=2] out,
@@ -1160,6 +1164,7 @@ def group_nth_object(ndarray[object, ndim=2] out,
             else:
                 out[i, j] = resx[i, j]
 
+
 @cython.boundscheck(False)
 @cython.wraparound(False)
 def group_nth_bin_object(ndarray[object, ndim=2] out,
@@ -1210,6 +1215,7 @@ def group_nth_bin_object(ndarray[object, ndim=2] out,
             else:
                 out[i, j] = resx[i, j]
 
+
 @cython.boundscheck(False)
 @cython.wraparound(False)
 def group_last_object(ndarray[object, ndim=2] out,
@@ -1252,6 +1258,7 @@ def group_last_object(ndarray[object, ndim=2] out,
             else:
                 out[i, j] = resx[i, j]
 
+
 @cython.boundscheck(False)
 @cython.wraparound(False)
 def group_last_bin_object(ndarray[object, ndim=2] out,
@@ -1326,7 +1333,6 @@ cdef inline float64_t _median_linear(float64_t* a, int n):
         a = tmp
 
     n -= na_count
-
     if n % 2:
         result = kth_smallest_c( a, n / 2, n)
     else:
diff --git a/pandas/hashtable.pyx b/pandas/hashtable.pyx
index af694c276b5b7..3bda3f49cb054 100644
--- a/pandas/hashtable.pyx
+++ b/pandas/hashtable.pyx
@@ -192,7 +192,7 @@ def mode_object(ndarray[object] values, ndarray[uint8_t, cast=True] mask):
 
     kh_destroy_pymap(table)
 
-    return modes[:j+1]
+    return modes[:j + 1]
 
 
 @cython.wraparound(False)
@@ -227,7 +227,7 @@ def mode_int64(int64_t[:] values):
 
     kh_destroy_int64(table)
 
-    return modes[:j+1]
+    return modes[:j + 1]
 
 
 @cython.wraparound(False)
diff --git a/pandas/index.pyx b/pandas/index.pyx
index bc985100692fc..2935560a05b6b 100644
--- a/pandas/index.pyx
+++ b/pandas/index.pyx
@@ -54,7 +54,8 @@ cdef inline is_definitely_invalid_key(object val):
 
     # we have a _data, means we are a NDFrame
     return (PySlice_Check(val) or cnp.PyArray_Check(val)
-            or PyList_Check(val) or hasattr(val,'_data'))
+            or PyList_Check(val) or hasattr(val, '_data'))
+
 
 def get_value_at(ndarray arr, object loc):
     if arr.descr.type_num == NPY_DATETIME:
@@ -63,6 +64,7 @@ def get_value_at(ndarray arr, object loc):
         return Timedelta(util.get_value_at(arr, loc))
     return util.get_value_at(arr, loc)
 
+
 def set_value_at(ndarray arr, object loc, object val):
     return util.set_value_at(arr, loc, val)
 
@@ -302,7 +304,7 @@ cdef class IndexEngine:
         else:
             n_alloc = n
 
-        result = np.empty(n_alloc, dtype=np.int64)
+        result = np.empty(n_alloc, dtype=np.int64)
         missing = np.empty(n_t, dtype=np.int64)
 
         # form the set of the results (like ismember)
@@ -311,7 +313,7 @@ cdef class 
IndexEngine: val = util.get_value_1d(values, i) if val in stargets: if val not in d: - d[val] = [] + d[val] = [] d[val].append(i) for i in range(n_t): @@ -322,20 +324,20 @@ cdef class IndexEngine: if val in d: for j in d[val]: - # realloc if needed - if count >= n_alloc: - n_alloc += 10000 - result = np.resize(result, n_alloc) + # realloc if needed + if count >= n_alloc: + n_alloc += 10000 + result = np.resize(result, n_alloc) - result[count] = j - count += 1 + result[count] = j + count += 1 # value not found else: if count >= n_alloc: - n_alloc += 10000 - result = np.resize(result, n_alloc) + n_alloc += 10000 + result = np.resize(result, n_alloc) result[count] = -1 count += 1 missing[count_missing] = i @@ -479,9 +481,9 @@ cdef Py_ssize_t _bin_search(ndarray values, object val) except -1: return mid + 1 _pad_functions = { - 'object' : algos.pad_object, - 'int64' : algos.pad_int64, - 'float64' : algos.pad_float64 + 'object': algos.pad_object, + 'int64': algos.pad_int64, + 'float64': algos.pad_float64 } _backfill_functions = { @@ -606,7 +608,7 @@ cdef class TimedeltaEngine(DatetimeEngine): cpdef convert_scalar(ndarray arr, object value): if arr.descr.type_num == NPY_DATETIME: - if isinstance(value,np.ndarray): + if isinstance(value, np.ndarray): pass elif isinstance(value, Timestamp): return value.value @@ -615,7 +617,7 @@ cpdef convert_scalar(ndarray arr, object value): else: return Timestamp(value).value elif arr.descr.type_num == NPY_TIMEDELTA: - if isinstance(value,np.ndarray): + if isinstance(value, np.ndarray): pass elif isinstance(value, Timedelta): return value.value @@ -639,7 +641,8 @@ cdef inline _to_i8(object val): return get_datetime64_value(val) elif PyDateTime_Check(val): tzinfo = getattr(val, 'tzinfo', None) - ival = _pydatetime_to_dts(val, &dts) # Save the original date value so we can get the utcoffset from it. + # Save the original date value so we can get the utcoffset from it. + ival = _pydatetime_to_dts(val, &dts) if tzinfo is not None and not _is_utc(tzinfo): offset = tslib._get_utcoffset(tzinfo, val) ival -= tslib._delta_to_nanoseconds(offset) diff --git a/pandas/io/sas/saslib.pyx b/pandas/io/sas/saslib.pyx index ac73ae37ca70e..a66d62ea41581 100644 --- a/pandas/io/sas/saslib.pyx +++ b/pandas/io/sas/saslib.pyx @@ -10,12 +10,14 @@ import sas_constants as const # algorithm. 
It is partially documented here: # # https://cran.r-project.org/web/packages/sas7bdat/vignettes/sas7bdat.pdf -cdef np.ndarray[uint8_t, ndim=1] rle_decompress(int result_length, np.ndarray[uint8_t, ndim=1] inbuff): +cdef np.ndarray[uint8_t, ndim=1] rle_decompress( + int result_length, np.ndarray[uint8_t, ndim=1] inbuff): cdef: uint8_t control_byte, x uint8_t [:] result = np.zeros(result_length, np.uint8) - int rpos = 0, ipos = 0, i, nbytes, end_of_first_byte, length = len(inbuff) + int rpos = 0, ipos = 0, length = len(inbuff) + int i, nbytes, end_of_first_byte while ipos < length: control_byte = inbuff[ipos] & 0xF0 @@ -41,13 +43,13 @@ cdef np.ndarray[uint8_t, ndim=1] rle_decompress(int result_length, np.ndarray[ui rpos += 1 ipos += 1 elif control_byte == 0x60: - nbytes = end_of_first_byte*256 + (inbuff[ipos]) + 17 + nbytes = end_of_first_byte * 256 + (inbuff[ipos]) + 17 ipos += 1 for i in range(nbytes): result[rpos] = 0x20 rpos += 1 elif control_byte == 0x70: - nbytes = end_of_first_byte*256 + (inbuff[ipos]) + 17 + nbytes = end_of_first_byte * 256 + (inbuff[ipos]) + 17 ipos += 1 for i in range(nbytes): result[rpos] = 0x00 @@ -109,8 +111,9 @@ cdef np.ndarray[uint8_t, ndim=1] rle_decompress(int result_length, np.ndarray[ui # rdc_decompress decompresses data using the Ross Data Compression algorithm: # -# http://collaboration.cmc.ec.gc.ca/science/rpn/biblio/ddj/Website/articles/CUJ/1992/9210/ross/ross.htm -cdef np.ndarray[uint8_t, ndim=1] rdc_decompress(int result_length, np.ndarray[uint8_t, ndim=1] inbuff): +# http://collaboration.cmc.ec.gc.ca/science/rpn/biblio/ddj/Website/articles/CUJ/1992/9210/ross/ross.htm +cdef np.ndarray[uint8_t, ndim=1] rdc_decompress( + int result_length, np.ndarray[uint8_t, ndim=1] inbuff): cdef: uint8_t cmd @@ -124,7 +127,8 @@ cdef np.ndarray[uint8_t, ndim=1] rdc_decompress(int result_length, np.ndarray[ui ii += 1 ctrl_mask = ctrl_mask >> 1 if ctrl_mask == 0: - ctrl_bits = (inbuff[ipos] << 8) + inbuff[ipos + 1] + ctrl_bits = ((inbuff[ipos] << 8) + + inbuff[ipos + 1]) ipos += 2 ctrl_mask = 0x8000 @@ -219,7 +223,8 @@ cdef class Parser(object): int subheader_pointer_length int current_page_type bint is_little_endian - np.ndarray[uint8_t, ndim=1] (*decompress)(int result_length, np.ndarray[uint8_t, ndim=1] inbuff) + np.ndarray[uint8_t, ndim=1] (*decompress)( + int result_length, np.ndarray[uint8_t, ndim=1] inbuff) object parser def __init__(self, object parser): @@ -252,7 +257,8 @@ cdef class Parser(object): elif column_types[j] == b's': self.column_types[j] = column_type_string else: - raise ValueError("unknown column type: %s" % self.parser.columns[j].ctype) + raise ValueError("unknown column type: " + "%s" % self.parser.columns[j].ctype) # compression if parser.compression == const.rle_compression: @@ -279,7 +285,8 @@ cdef class Parser(object): # update the parser self.parser._current_row_on_page_index = self.current_row_on_page_index - self.parser._current_row_in_chunk_index = self.current_row_in_chunk_index + self.parser._current_row_in_chunk_index =\ + self.current_row_in_chunk_index self.parser._current_row_in_file_index = self.current_row_in_file_index cdef bint read_next_page(self): @@ -299,13 +306,16 @@ cdef class Parser(object): self.current_row_on_page_index = 0 self.current_page_type = self.parser._current_page_type self.current_page_block_count = self.parser._current_page_block_count - self.current_page_data_subheader_pointers_len = len(self.parser._current_page_data_subheader_pointers) - self.current_page_subheaders_count = 
self.parser._current_page_subheaders_count + self.current_page_data_subheader_pointers_len = len( + self.parser._current_page_data_subheader_pointers) + self.current_page_subheaders_count =\ + self.parser._current_page_subheaders_count cdef bint readline(self): cdef: - int offset, bit_offset, align_correction, subheader_pointer_length, mn + int offset, bit_offset, align_correction + int subheader_pointer_length, mn bint done, flag bit_offset = self.bit_offset @@ -321,7 +331,8 @@ cdef class Parser(object): # Loop until a data row is read while True: if self.current_page_type == page_meta_type: - flag = self.current_row_on_page_index >= self.current_page_data_subheader_pointers_len + flag = self.current_row_on_page_index >=\ + self.current_page_data_subheader_pointers_len if flag: done = self.read_next_page() if done: @@ -330,10 +341,12 @@ cdef class Parser(object): current_subheader_pointer = ( self.parser._current_page_data_subheader_pointers[ self.current_row_on_page_index]) - self.process_byte_array_with_data(current_subheader_pointer.offset, - current_subheader_pointer.length) + self.process_byte_array_with_data( + current_subheader_pointer.offset, + current_subheader_pointer.length) return False - elif self.current_page_type == page_mix_types_0 or self.current_page_type == page_mix_types_1: + elif (self.current_page_type == page_mix_types_0 or + self.current_page_type == page_mix_types_1): align_correction = (bit_offset + subheader_pointers_offset + self.current_page_subheaders_count * subheader_pointer_length) @@ -345,18 +358,18 @@ cdef class Parser(object): offset += self.current_row_on_page_index * self.row_length self.process_byte_array_with_data(offset, self.row_length) - mn = min(self.parser.row_count, self.parser._mix_page_row_count) + mn = min(self.parser.row_count, + self.parser._mix_page_row_count) if self.current_row_on_page_index == mn: done = self.read_next_page() if done: return True return False elif self.current_page_type == page_data_type: - self.process_byte_array_with_data(bit_offset + - subheader_pointers_offset + - self.current_row_on_page_index * - self.row_length, - self.row_length) + self.process_byte_array_with_data( + bit_offset + subheader_pointers_offset + + self.current_row_on_page_index * self.row_length, + self.row_length) flag = (self.current_row_on_page_index == self.current_page_block_count) if flag: @@ -371,17 +384,18 @@ cdef class Parser(object): cdef void process_byte_array_with_data(self, int offset, int length): cdef: - Py_ssize_t j - int s, k, m, jb, js, current_row - int64_t lngt, start, ct - np.ndarray[uint8_t, ndim=1] source - int64_t[:] column_types - int64_t[:] lengths - int64_t[:] offsets - uint8_t[:, :] byte_chunk - object[:, :] string_chunk - - source = np.frombuffer(self.cached_page[offset:offset+length], dtype=np.uint8) + Py_ssize_t j + int s, k, m, jb, js, current_row + int64_t lngt, start, ct + np.ndarray[uint8_t, ndim=1] source + int64_t[:] column_types + int64_t[:] lengths + int64_t[:] offsets + uint8_t[:, :] byte_chunk + object[:, :] string_chunk + + source = np.frombuffer( + self.cached_page[offset:offset + length], dtype=np.uint8) if self.decompress != NULL and (length < self.row_length): source = self.decompress(self.row_length, source) @@ -408,11 +422,12 @@ cdef class Parser(object): else: m = s for k in range(lngt): - byte_chunk[jb, m + k] = source[start + k] + byte_chunk[jb, m + k] = source[start + k] jb += 1 elif column_types[j] == column_type_string: # string - string_chunk[js, current_row] = 
source[start:(start+lngt)].tostring().rstrip() + string_chunk[js, current_row] = source[start:( + start + lngt)].tostring().rstrip() js += 1 self.current_row_on_page_index += 1 diff --git a/pandas/lib.pyx b/pandas/lib.pyx index 0473ae79adce5..e7672de5c835e 100644 --- a/pandas/lib.pyx +++ b/pandas/lib.pyx @@ -84,6 +84,7 @@ PyDateTime_IMPORT import_array() import_ufunc() + def values_from_object(object o): """ return my values or the object if we are say an ndarray """ cdef f @@ -159,6 +160,7 @@ def ismember(ndarray arr, set values): return result.view(np.bool_) + def ismember_int64(ndarray[int64_t] arr, set values): """ Checks whether @@ -184,6 +186,7 @@ def ismember_int64(ndarray[int64_t] arr, set values): return result.view(np.bool_) + @cython.wraparound(False) @cython.boundscheck(False) def memory_usage_of_objects(ndarray[object, ndim=1] arr): @@ -217,12 +220,15 @@ cdef inline int64_t gmtime(object date): days = pydate(y, m, 1).toordinal() - _EPOCH_ORD + d - 1 return (( (((days * 24 + h) * 60 + mn))) * 60 + s) * 1000 + cpdef object to_datetime(int64_t timestamp): return pydatetime.utcfromtimestamp(timestamp / 1000.0) + cpdef object to_timestamp(object dt): return gmtime(dt) + def array_to_timestamp(ndarray[object, ndim=1] arr): cdef int i, n cdef ndarray[int64_t, ndim=1] result @@ -235,6 +241,7 @@ def array_to_timestamp(ndarray[object, ndim=1] arr): return result + def time64_to_datetime(ndarray[int64_t, ndim=1] arr): cdef int i, n cdef ndarray[object, ndim=1] result @@ -254,6 +261,7 @@ def time64_to_datetime(ndarray[int64_t, ndim=1] arr): cdef double INF = np.inf cdef double NEGINF = -INF + cpdef checknull(object val): if util.is_float_object(val) or util.is_complex_object(val): return val != val # and val != INF and val != NEGINF @@ -268,6 +276,7 @@ cpdef checknull(object val): else: return _checknull(val) + cpdef checknull_old(object val): if util.is_float_object(val) or util.is_complex_object(val): return val != val or val == INF or val == NEGINF @@ -282,18 +291,21 @@ cpdef checknull_old(object val): else: return util._checknull(val) + cpdef isposinf_scalar(object val): if util.is_float_object(val) and val == INF: return True else: return False + cpdef isneginf_scalar(object val): if util.is_float_object(val) and val == NEGINF: return True else: return False + def isscalar(object val): """ Return True if given value is scalar. @@ -356,6 +368,7 @@ def isnullobj(ndarray arr): result[i] = _check_all_nulls(val) return result.view(np.bool_) + @cython.wraparound(False) @cython.boundscheck(False) def isnullobj_old(ndarray arr): @@ -372,6 +385,7 @@ def isnullobj_old(ndarray arr): result[i] = val is NaT or util._checknull_old(val) return result.view(np.bool_) + @cython.wraparound(False) @cython.boundscheck(False) def isnullobj2d(ndarray arr): @@ -390,6 +404,7 @@ def isnullobj2d(ndarray arr): result[i, j] = 1 return result.view(np.bool_) + @cython.wraparound(False) @cython.boundscheck(False) def isnullobj2d_old(ndarray arr): @@ -413,8 +428,8 @@ def isnullobj2d_old(ndarray arr): @cython.boundscheck(False) cpdef ndarray[object] list_to_object_array(list obj): """ - Convert list to object ndarray. Seriously can\'t believe I had to write this - function + Convert list to object ndarray. Seriously can\'t believe + I had to write this function. 
""" cdef: Py_ssize_t i, n = len(obj) @@ -447,6 +462,7 @@ def fast_unique(ndarray[object] values): return uniques + @cython.wraparound(False) @cython.boundscheck(False) def fast_unique_multiple(list arrays): @@ -473,6 +489,7 @@ def fast_unique_multiple(list arrays): return uniques + @cython.wraparound(False) @cython.boundscheck(False) def fast_unique_multiple_list(list lists): @@ -499,6 +516,7 @@ def fast_unique_multiple_list(list lists): return uniques + @cython.wraparound(False) @cython.boundscheck(False) def fast_unique_multiple_list_gen(object gen, bint sort=True): @@ -538,6 +556,7 @@ def fast_unique_multiple_list_gen(object gen, bint sort=True): return uniques + @cython.wraparound(False) @cython.boundscheck(False) def dicts_to_array(list dicts, list columns): @@ -563,6 +582,7 @@ def dicts_to_array(list dicts, list columns): return result + def fast_zip(list ndarrays): """ For zipping multiple ndarrays into an ndarray of tuples @@ -604,6 +624,7 @@ def fast_zip(list ndarrays): return result + def get_reverse_indexer(ndarray[int64_t] indexer, Py_ssize_t length): """ Reverse indexing operation. @@ -645,6 +666,7 @@ def has_infs_f4(ndarray[float32_t] arr): return True return False + def has_infs_f8(ndarray[float64_t] arr): cdef: Py_ssize_t i, n = len(arr) @@ -659,6 +681,7 @@ def has_infs_f8(ndarray[float64_t] arr): return True return False + def convert_timestamps(ndarray values): cdef: object val, f, result @@ -911,6 +934,7 @@ def scalar_binop(ndarray[object] values, object val, object op): return maybe_convert_bool(result) + @cython.wraparound(False) @cython.boundscheck(False) def vec_binop(ndarray[object] left, ndarray[object] right, object op): @@ -948,18 +972,19 @@ def astype_intsafe(ndarray[object] arr, new_dtype): ndarray result # on 32-bit, 1.6.2 numpy M8[ns] is a subdtype of integer, which is weird - is_datelike = new_dtype in ['M8[ns]','m8[ns]'] + is_datelike = new_dtype in ['M8[ns]', 'm8[ns]'] result = np.empty(n, dtype=new_dtype) for i in range(n): v = arr[i] if is_datelike and checknull(v): - result[i] = NPY_NAT + result[i] = NPY_NAT else: - util.set_value_at(result, i, v) + util.set_value_at(result, i, v) return result + cpdef ndarray[object] astype_unicode(ndarray arr): cdef: Py_ssize_t i, n = arr.size @@ -970,6 +995,7 @@ cpdef ndarray[object] astype_unicode(ndarray arr): return result + cpdef ndarray[object] astype_str(ndarray arr): cdef: Py_ssize_t i, n = arr.size @@ -980,6 +1006,7 @@ cpdef ndarray[object] astype_str(ndarray arr): return result + def clean_index_list(list obj): """ Utility used in pandas.core.index._ensure_index @@ -992,7 +1019,7 @@ def clean_index_list(list obj): for i in range(n): v = obj[i] - if not (PyList_Check(v) or np.PyArray_Check(v) or hasattr(v,'_data')): + if not (PyList_Check(v) or np.PyArray_Check(v) or hasattr(v, '_data')): all_arrays = 0 break @@ -1002,7 +1029,7 @@ def clean_index_list(list obj): converted = np.empty(n, dtype=object) for i in range(n): v = obj[i] - if PyList_Check(v) or np.PyArray_Check(v) or hasattr(v,'_data'): + if PyList_Check(v) or np.PyArray_Check(v) or hasattr(v, '_data'): converted[i] = tuple(v) else: converted[i] = v @@ -1038,10 +1065,16 @@ cpdef Py_ssize_t max_len_string_array(pandas_string[:] arr): return m + @cython.boundscheck(False) @cython.wraparound(False) -def string_array_replace_from_nan_rep(ndarray[object, ndim=1] arr, object nan_rep, object replace = None): - """ replace the values in the array with replacement if they are nan_rep; return the same array """ +def string_array_replace_from_nan_rep( + 
ndarray[object, ndim=1] arr, object nan_rep,
+        object replace=None):
+    """
+    Replace the values in the array with 'replacement' if
+    they are 'nan_rep'. Return the same array.
+    """
 
     cdef int length = arr.shape[0], i = 0
     if replace is None:
@@ -1053,9 +1086,11 @@
 
     return arr
 
+
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def write_csv_rows(list data, ndarray data_index, int nlevels, ndarray cols, object writer):
+def write_csv_rows(list data, ndarray data_index,
+                   int nlevels, ndarray cols, object writer):
 
     cdef int N, j, i, ncols
     cdef list rows
@@ -1066,7 +1101,7 @@ def write_csv_rows(list data, ndarray data_index, int nlevels, ndarray cols, obj
 
     # pre-allocate rows
     ncols = len(cols)
-    rows = [[None]*(nlevels+ncols) for x in range(N)]
+    rows = [[None] * (nlevels + ncols) for x in range(N)]
 
     j = -1
     if nlevels == 1:
@@ -1074,18 +1109,18 @@ def write_csv_rows(list data, ndarray data_index, int nlevels, ndarray cols, obj
             row = rows[j % N]
             row[0] = data_index[j]
             for i in range(ncols):
-                row[1+i] = data[i][j]
+                row[1 + i] = data[i][j]
 
-            if j >= N-1 and j % N == N-1:
+            if j >= N - 1 and j % N == N - 1:
                 writer.writerows(rows)
     elif nlevels > 1:
         for j in range(len(data_index)):
             row = rows[j % N]
             row[:nlevels] = list(data_index[j])
             for i in range(ncols):
-                row[nlevels+i] = data[i][j]
+                row[nlevels + i] = data[i][j]
 
-            if j >= N-1 and j % N == N-1:
+            if j >= N - 1 and j % N == N - 1:
                 writer.writerows(rows)
     else:
         for j in range(len(data_index)):
@@ -1093,15 +1128,15 @@ def write_csv_rows(list data, ndarray data_index, int nlevels, ndarray cols, obj
             for i in range(ncols):
                 row[i] = data[i][j]
 
-            if j >= N-1 and j % N == N-1:
+            if j >= N - 1 and j % N == N - 1:
                 writer.writerows(rows)
 
-    if j >= 0 and (j < N-1 or (j % N) != N-1 ):
-        writer.writerows(rows[:((j+1) % N)])
+    if j >= 0 and (j < N - 1 or (j % N) != N - 1):
+        writer.writerows(rows[:((j + 1) % N)])
 
 
-#-------------------------------------------------------------------------------
-# Groupby-related functions
+#------------------------------------------------------------------------------
+# Groupby-related functions
 
 @cython.boundscheck(False)
 def arrmap(ndarray[object] index, object func):
     cdef int length = index.shape[0]
@@ -1114,6 +1149,7 @@ def arrmap(ndarray[object] index, object func):
 
     return result
 
+
 @cython.wraparound(False)
 @cython.boundscheck(False)
 def is_lexsorted(list list_of_arrays):
@@ -1128,16 +1164,14 @@ def is_lexsorted(list list_of_arrays):
 
     cdef int64_t **vecs = <int64_t**> malloc(nlevels * sizeof(int64_t*))
 
     for i from 0 <= i < nlevels:
-        # vecs[i] = <int64_t *> (<ndarray> list_of_arrays[i]).data
-
         arr = list_of_arrays[i]
         vecs[i] = <int64_t*> arr.data
-        # assume uniqueness??
+        # Assume uniqueness??
 
     for i from 1 <= i < n:
         for k from 0 <= k < nlevels:
             cur = vecs[k][i]
-            pre = vecs[k][i-1]
+            pre = vecs[k][i - 1]
             if cur == pre:
                 continue
             elif cur > pre:
@@ -1148,11 +1182,9 @@ def is_lexsorted(list list_of_arrays):
 
     return True
 
-
 # TODO: could do even better if we know something about the data. eg, index has
 # 1-min data, binner has 5-min data, then bins are just strides in index. This
 # is a general, O(max(len(values), len(binner))) method.
- @cython.boundscheck(False) @cython.wraparound(False) def generate_bins_dt64(ndarray[int64_t] values, ndarray[int64_t] binner, @@ -1182,18 +1214,18 @@ def generate_bins_dt64(ndarray[int64_t] values, ndarray[int64_t] binner, if values[0] < binner[0]: raise ValueError("Values falls before first bin") - if values[lenidx-1] > binner[lenbin-1]: + if values[lenidx - 1] > binner[lenbin - 1]: raise ValueError("Values falls after last bin") bins = np.empty(lenbin - 1, dtype=np.int64) - j = 0 # index into values + j = 0 # index into values bc = 0 # bin count # linear scan if right_closed: for i in range(0, lenbin - 1): - r_bin = binner[i+1] + r_bin = binner[i + 1] # count values in current bin, advance to next bin while j < lenidx and values[j] <= r_bin: j += 1 @@ -1201,7 +1233,7 @@ def generate_bins_dt64(ndarray[int64_t] values, ndarray[int64_t] binner, bc += 1 else: for i in range(0, lenbin - 1): - r_bin = binner[i+1] + r_bin = binner[i + 1] # count values in current bin, advance to next bin while j < lenidx and values[j] < r_bin: j += 1 @@ -1216,8 +1248,6 @@ def generate_bins_dt64(ndarray[int64_t] values, ndarray[int64_t] binner, return bins - - @cython.boundscheck(False) @cython.wraparound(False) def row_bool_subset(ndarray[float64_t, ndim=2] values, @@ -1239,6 +1269,7 @@ def row_bool_subset(ndarray[float64_t, ndim=2] values, return out + @cython.boundscheck(False) @cython.wraparound(False) def row_bool_subset_object(ndarray[object, ndim=2] values, @@ -1260,6 +1291,7 @@ def row_bool_subset_object(ndarray[object, ndim=2] values, return out + @cython.boundscheck(False) @cython.wraparound(False) def get_level_sorter(ndarray[int64_t, ndim=1] label, @@ -1282,6 +1314,7 @@ def get_level_sorter(ndarray[int64_t, ndim=1] label, return out + def group_count(ndarray[int64_t] values, Py_ssize_t size): cdef: Py_ssize_t i, n = len(values) @@ -1292,6 +1325,7 @@ def group_count(ndarray[int64_t] values, Py_ssize_t size): counts[values[i]] += 1 return counts + def lookup_values(ndarray[object] values, dict mapping): cdef: Py_ssize_t i, n = len(values) @@ -1331,6 +1365,7 @@ def count_level_2d(ndarray[uint8_t, ndim=2, cast=True] mask, return counts + cdef class _PandasNull: def __richcmp__(_PandasNull self, object other, int op): @@ -1346,6 +1381,7 @@ cdef class _PandasNull: pandas_null = _PandasNull() + def fast_zip_fillna(list ndarrays, fill_value=pandas_null): """ For zipping multiple ndarrays into an ndarray of tuples @@ -1445,7 +1481,7 @@ def indices_fast(object index, ndarray[int64_t] labels, list keys, tup = PyTuple_New(k) for j in range(k): val = util.get_value_at(keys[j], - sorted_labels[j][i-1]) + sorted_labels[j][i - 1]) PyTuple_SET_ITEM(tup, j, val) Py_INCREF(val) @@ -1574,7 +1610,7 @@ cpdef slice indexer_as_slice(int64_t[:] vals): return None for i in range(2, n): - if vals[i] < 0 or vals[i] - vals[i-1] != d: + if vals[i] < 0 or vals[i] - vals[i - 1] != d: return None start = vals[0] @@ -1645,12 +1681,13 @@ cpdef slice_get_indices_ex(slice slc, Py_ssize_t objlen=PY_SSIZE_T_MAX): if slc is None: raise TypeError("slc should be a slice") - PySlice_GetIndicesEx(slc, objlen, + PySlice_GetIndicesEx(slc, objlen, &start, &stop, &step, &length) return start, stop, step, length -cpdef Py_ssize_t slice_len(slice slc, Py_ssize_t objlen=PY_SSIZE_T_MAX) except -1: +cpdef Py_ssize_t slice_len( + slice slc, Py_ssize_t objlen=PY_SSIZE_T_MAX) except -1: """ Get length of a bounded slice. 
@@ -1668,7 +1705,7 @@ cpdef Py_ssize_t slice_len(slice slc, Py_ssize_t objlen=PY_SSIZE_T_MAX) except -
     if slc is None:
         raise TypeError("slc must be slice")
 
-    PySlice_GetIndicesEx(<PySliceObject *>slc, objlen,
+    PySlice_GetIndicesEx(<PySliceObject *>slc, objlen,
                          &start, &stop, &step, &length)
 
     return length
diff --git a/pandas/msgpack/_packer.pyx b/pandas/msgpack/_packer.pyx
index 5004b9e8e7262..008dbe5541d50 100644
--- a/pandas/msgpack/_packer.pyx
+++ b/pandas/msgpack/_packer.pyx
@@ -23,7 +23,8 @@ cdef extern from "../src/msgpack/pack.h":
     int msgpack_pack_false(msgpack_packer* pk)
     int msgpack_pack_long(msgpack_packer* pk, long d)
     int msgpack_pack_long_long(msgpack_packer* pk, long long d)
-    int msgpack_pack_unsigned_long_long(msgpack_packer* pk, unsigned long long d)
+    int msgpack_pack_unsigned_long_long(msgpack_packer* pk,
+                                        unsigned long long d)
     int msgpack_pack_float(msgpack_packer* pk, float d)
     int msgpack_pack_double(msgpack_packer* pk, double d)
     int msgpack_pack_array(msgpack_packer* pk, size_t l)
@@ -58,8 +59,10 @@ cdef class Packer(object):
     :param bool use_single_float:
         Use single precision float type for float. (default: False)
     :param bool autoreset:
-        Reset buffer after each pack and return it's content as `bytes`. (default: True).
-        If set this to false, use `bytes()` to get content and `.reset()` to clear buffer.
+        Reset buffer after each pack and return it's
+        content as `bytes`. (default: True).
+        If set this to false, use `bytes()` to get
+        content and `.reset()` to clear buffer.
     :param bool use_bin_type:
         Use bin type introduced in msgpack spec 2.0 for bytes.
         It also enable str8 type for unicode.
@@ -74,15 +77,16 @@ cdef class Packer(object):
     cdef bint autoreset
 
     def __cinit__(self):
-        cdef int buf_size = 1024*1024
-        self.pk.buf = <char*> malloc(buf_size);
+        cdef int buf_size = 1024 * 1024
+        self.pk.buf = <char*> malloc(buf_size)
         if self.pk.buf == NULL:
             raise MemoryError("Unable to allocate internal buffer.")
         self.pk.buf_size = buf_size
         self.pk.length = 0
 
-    def __init__(self, default=None, encoding='utf-8', unicode_errors='strict',
-                 use_single_float=False, bint autoreset=1, bint use_bin_type=0):
+    def __init__(self, default=None, encoding='utf-8',
+                 unicode_errors='strict', use_single_float=False,
+                 bint autoreset=1, bint use_bin_type=0):
         """
         """
         self.use_float = use_single_float
@@ -110,7 +114,8 @@ cdef class Packer(object):
     def __dealloc__(self):
         free(self.pk.buf);
 
-    cdef int _pack(self, object o, int nest_limit=DEFAULT_RECURSE_LIMIT) except -1:
+    cdef int _pack(self, object o,
+                   int nest_limit=DEFAULT_RECURSE_LIMIT) except -1:
         cdef long long llval
         cdef unsigned long long ullval
         cdef long longval
@@ -147,14 +152,14 @@ cdef class Packer(object):
                 ret = msgpack_pack_long(&self.pk, longval)
         elif PyFloat_Check(o):
             if self.use_float:
-               fval = o
-               ret = msgpack_pack_float(&self.pk, fval)
+                fval = o
+                ret = msgpack_pack_float(&self.pk, fval)
             else:
-               dval = o
-               ret = msgpack_pack_double(&self.pk, dval)
+                dval = o
+                ret = msgpack_pack_double(&self.pk, dval)
         elif PyBytes_Check(o):
             L = len(o)
-            if L > (2**32)-1:
+            if L > (2**32) - 1:
                 raise ValueError("bytes is too large")
             rawval = o
             ret = msgpack_pack_bin(&self.pk, L)
@@ -162,10 +167,12 @@ cdef class Packer(object):
                 ret = msgpack_pack_raw_body(&self.pk, rawval, L)
         elif PyUnicode_Check(o):
             if not self.encoding:
-                raise TypeError("Can't encode unicode string: no encoding is specified")
-            o = PyUnicode_AsEncodedString(o, self.encoding, self.unicode_errors)
+                raise TypeError("Can't encode unicode string: "
+                                "no encoding is specified")
+            o = PyUnicode_AsEncodedString(o, self.encoding,
self.unicode_errors) L = len(o) - if L > (2**32)-1: + if L > (2**32) - 1: raise ValueError("dict is too large") rawval = o ret = msgpack_pack_raw(&self.pk, len(o)) @@ -174,43 +181,43 @@ cdef class Packer(object): elif PyDict_CheckExact(o): d = o L = len(d) - if L > (2**32)-1: + if L > (2**32) - 1: raise ValueError("dict is too large") ret = msgpack_pack_map(&self.pk, L) if ret == 0: for k, v in d.iteritems(): - ret = self._pack(k, nest_limit-1) + ret = self._pack(k, nest_limit - 1) if ret != 0: break - ret = self._pack(v, nest_limit-1) + ret = self._pack(v, nest_limit - 1) if ret != 0: break elif PyDict_Check(o): L = len(o) - if L > (2**32)-1: + if L > (2**32) - 1: raise ValueError("dict is too large") ret = msgpack_pack_map(&self.pk, L) if ret == 0: for k, v in o.items(): - ret = self._pack(k, nest_limit-1) + ret = self._pack(k, nest_limit - 1) if ret != 0: break - ret = self._pack(v, nest_limit-1) + ret = self._pack(v, nest_limit - 1) if ret != 0: break elif isinstance(o, ExtType): # This should be before Tuple because ExtType is namedtuple. longval = o.code rawval = o.data L = len(o.data) - if L > (2**32)-1: + if L > (2**32) - 1: raise ValueError("EXT data is too large") ret = msgpack_pack_ext(&self.pk, longval, L) ret = msgpack_pack_raw_body(&self.pk, rawval, L) elif PyTuple_Check(o) or PyList_Check(o): L = len(o) - if L > (2**32)-1: + if L > (2**32) - 1: raise ValueError("list is too large") ret = msgpack_pack_array(&self.pk, L) if ret == 0: for v in o: - ret = self._pack(v, nest_limit-1) + ret = self._pack(v, nest_limit - 1) if ret != 0: break elif not default_used and self._default: o = self._default(o) @@ -237,7 +244,7 @@ cdef class Packer(object): msgpack_pack_raw_body(&self.pk, data, len(data)) def pack_array_header(self, size_t size): - if size > (2**32-1): + if size > (2**32) - 1: raise ValueError cdef int ret = msgpack_pack_array(&self.pk, size) if ret == -1: @@ -250,7 +257,7 @@ cdef class Packer(object): return buf def pack_map_header(self, size_t size): - if size > (2**32-1): + if size > (2**32) - 1: raise ValueError cdef int ret = msgpack_pack_map(&self.pk, size) if ret == -1: diff --git a/pandas/msgpack/_unpacker.pyx b/pandas/msgpack/_unpacker.pyx index f68bf3369427c..6f23a24adde6c 100644 --- a/pandas/msgpack/_unpacker.pyx +++ b/pandas/msgpack/_unpacker.pyx @@ -4,18 +4,15 @@ from cpython cimport * cdef extern from "Python.h": ctypedef struct PyObject - cdef int PyObject_AsReadBuffer(object o, const void** buff, Py_ssize_t* buf_len) except -1 + cdef int PyObject_AsReadBuffer(object o, const void** buff, + Py_ssize_t* buf_len) except -1 from libc.stdlib cimport * from libc.string cimport * from libc.limits cimport * -from pandas.msgpack.exceptions import ( - BufferFull, - OutOfData, - UnpackValueError, - ExtraData, - ) +from pandas.msgpack.exceptions import (BufferFull, OutOfData, + UnpackValueError, ExtraData) from pandas.msgpack import ExtType @@ -65,7 +62,8 @@ cdef inline init_ctx(unpack_context *ctx, ctx.user.max_ext_len = max_ext_len if object_hook is not None and object_pairs_hook is not None: - raise TypeError("object_pairs_hook and object_hook are mutually exclusive.") + raise TypeError("object_pairs_hook and object_hook " + "are mutually exclusive.") if object_hook is not None: if not PyCallable_Check(object_hook): @@ -93,8 +91,11 @@ cdef inline init_ctx(unpack_context *ctx, ctx.user.encoding = encoding ctx.user.unicode_errors = unicode_errors + def default_read_extended_type(typecode, data): - raise NotImplementedError("Cannot decode extended type with typecode=%d" 
% typecode) + raise NotImplementedError("Cannot decode extended type " + "with typecode=%d" % typecode) + def unpackb(object packed, object object_hook=None, object list_hook=None, bint use_list=1, encoding=None, unicode_errors="strict", @@ -139,7 +140,8 @@ def unpackb(object packed, object object_hook=None, object list_hook=None, if ret == 1: obj = unpack_data(&ctx) if off < buf_len: - raise ExtraData(obj, PyBytes_FromStringAndSize(buf+off, buf_len-off)) + raise ExtraData(obj, PyBytes_FromStringAndSize( + buf + off, buf_len - off)) return obj else: raise UnpackValueError("Unpack failed: error = %d" % (ret,)) @@ -157,9 +159,9 @@ def unpack(object stream, object object_hook=None, object list_hook=None, See :class:`Unpacker` for options. """ return unpackb(stream.read(), use_list=use_list, - object_hook=object_hook, object_pairs_hook=object_pairs_hook, list_hook=list_hook, - encoding=encoding, unicode_errors=unicode_errors, - ) + object_hook=object_hook, + object_pairs_hook=object_pairs_hook, list_hook=list_hook, + encoding=encoding, unicode_errors=unicode_errors) cdef class Unpacker(object): @@ -169,10 +171,12 @@ cdef class Unpacker(object): :param file_like: File-like object having `.read(n)` method. - If specified, unpacker reads serialized data from it and :meth:`feed()` is not usable. + If specified, unpacker reads serialized data from it and + :meth:`feed()` is not usable. :param int read_size: - Used as `file_like.read(read_size)`. (default: `min(1024**2, max_buffer_size)`) + Used as `file_like.read(read_size)`. (default: + `min(1024**2, max_buffer_size)`) :param bool use_list: If true, unpack msgpack array to Python list. @@ -184,9 +188,8 @@ cdef class Unpacker(object): (See also simplejson) :param callable object_pairs_hook: - When specified, it should be callable. - Unpacker calls it with a list of key-value pairs after unpacking msgpack map. - (See also simplejson) + When specified, it should be callable. Unpacker calls it with a list + of key-value pairs after unpacking msgpack map. (See also simplejson) :param str encoding: Encoding used for decoding msgpack raw. @@ -197,9 +200,10 @@ cdef class Unpacker(object): (default: `'strict'`) :param int max_buffer_size: - Limits size of data waiting unpacked. 0 means system's INT_MAX (default). - Raises `BufferFull` exception when it is insufficient. - You shoud set this parameter when unpacking data from untrasted source. + Limits size of data waiting unpacked. 0 means system's + INT_MAX (default). Raises `BufferFull` exception when it + is insufficient. You shoud set this parameter when unpacking + data from untrasted source. :param int max_str_len: Limits max length of str. 
(default: 2**31-1) @@ -250,9 +254,9 @@ cdef class Unpacker(object): self.buf = NULL def __init__(self, file_like=None, Py_ssize_t read_size=0, bint use_list=1, - object object_hook=None, object object_pairs_hook=None, object list_hook=None, - encoding=None, unicode_errors='strict', int max_buffer_size=0, - object ext_hook=ExtType, + object object_hook=None, object object_pairs_hook=None, + object list_hook=None, encoding=None, unicode_errors='strict', + int max_buffer_size=0, object ext_hook=ExtType, Py_ssize_t max_str_len=2147483647, # 2**32-1 Py_ssize_t max_bin_len=2147483647, Py_ssize_t max_array_len=2147483647, @@ -274,7 +278,8 @@ cdef class Unpacker(object): if not max_buffer_size: max_buffer_size = INT_MAX if read_size > max_buffer_size: - raise ValueError("read_size should be less or equal to max_buffer_size") + raise ValueError("read_size should be less or " + "equal to max_buffer_size") if not read_size: read_size = min(max_buffer_size, 1024**2) self.max_buffer_size = max_buffer_size @@ -313,8 +318,8 @@ cdef class Unpacker(object): """Append `next_bytes` to internal buffer.""" cdef Py_buffer pybuff if self.file_like is not None: - raise AssertionError( - "unpacker.feed() is not be able to use with `file_like`.") + raise AssertionError("unpacker.feed() is not be able " + "to use with `file_like`.") PyObject_GetBuffer(next_bytes, &pybuff, PyBUF_SIMPLE) try: self.append_buffer(pybuff.buf, pybuff.len) @@ -338,10 +343,10 @@ cdef class Unpacker(object): head = 0 else: # expand buffer. - new_size = (tail-head) + _buf_len + new_size = (tail - head) + _buf_len if new_size > self.max_buffer_size: raise BufferFull - new_size = min(new_size*2, self.max_buffer_size) + new_size = min(new_size * 2, self.max_buffer_size) new_buf = malloc(new_size) if new_buf == NULL: # self.buf still holds old buffer and will be freed during @@ -363,15 +368,16 @@ cdef class Unpacker(object): cdef read_from_file(self): next_bytes = self.file_like_read( - min(self.read_size, - self.max_buffer_size - (self.buf_tail - self.buf_head) - )) + min(self.read_size, + self.max_buffer_size - (self.buf_tail - self.buf_head))) if next_bytes: - self.append_buffer(PyBytes_AsString(next_bytes), PyBytes_Size(next_bytes)) + self.append_buffer(PyBytes_AsString(next_bytes), + PyBytes_Size(next_bytes)) else: self.file_like = None - cdef object _unpack(self, execute_fn execute, object write_bytes, bint iter=0): + cdef object _unpack(self, execute_fn execute, + object write_bytes, bint iter=0): cdef int ret cdef object obj cdef size_t prev_head @@ -389,7 +395,8 @@ cdef class Unpacker(object): ret = execute(&self.ctx, self.buf, self.buf_tail, &self.buf_head) if write_bytes is not None: - write_bytes(PyBytes_FromStringAndSize(self.buf + prev_head, self.buf_head - prev_head)) + write_bytes(PyBytes_FromStringAndSize( + self.buf + prev_head, self.buf_head - prev_head)) if ret == 1: obj = unpack_data(&self.ctx) diff --git a/pandas/parser.pyx b/pandas/parser.pyx index 5d8ab7213a7b6..12525c7a9c587 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -106,7 +106,7 @@ cdef extern from "parser/tokenizer.h": enum: ERROR_OVERFLOW ctypedef void* (*io_callback)(void *src, size_t nbytes, size_t *bytes_read, - int *status) + int *status) ctypedef int (*io_cleanup)(void *src) ctypedef struct parser_t: @@ -410,7 +410,6 @@ cdef class TextReader: self._set_quoting(quotechar, quoting) - dtype_order = ['int64', 'float64', 'bool', 'object'] if quoting == QUOTE_NONNUMERIC: # consistent with csv module semantics, cast all to float @@ -517,7 +516,7 @@ cdef 
class TextReader: # need to artifically skip the final line # which is still a header line header = list(header) - header.append(header[-1]+1) + header.append(header[-1] + 1) self.parser.header_start = header[0] self.parser.header_end = header[-1] @@ -663,7 +662,8 @@ cdef class TextReader: if ptr == NULL: if not os.path.exists(source): - raise compat.FileNotFoundError('File %s does not exist' % source) + raise compat.FileNotFoundError( + 'File %s does not exist' % source) raise IOError('Initializing from file failed') self.parser.source = ptr @@ -689,7 +689,7 @@ cdef class TextReader: # header is now a list of lists, so field_count should use header[0] cdef: - size_t i, start, data_line, field_count, passed_count, hr, unnamed_count + size_t i, start, data_line, field_count, passed_count, hr, unnamed_count # noqa char *word object name int status @@ -716,10 +716,12 @@ cdef class TextReader: # e.g., if header=3 and file only has 2 lines elif self.parser.lines < hr + 1: msg = self.orig_header - if isinstance(msg,list): - msg = "[%s], len of %d," % (','.join([ str(m) for m in msg ]),len(msg)) - raise CParserError('Passed header=%s but only %d lines in file' - % (msg, self.parser.lines)) + if isinstance(msg, list): + msg = "[%s], len of %d," % ( + ','.join([ str(m) for m in msg ]), len(msg)) + raise CParserError( + 'Passed header=%s but only %d lines in file' + % (msg, self.parser.lines)) else: field_count = self.parser.line_fields[hr] @@ -740,13 +742,14 @@ cdef class TextReader: if name == '': if self.has_mi_columns: - name = 'Unnamed: %d_level_%d' % (i,level) + name = 'Unnamed: %d_level_%d' % (i, level) else: name = 'Unnamed: %d' % i unnamed_count += 1 count = counts.get(name, 0) - if count > 0 and self.mangle_dupe_cols and not self.has_mi_columns: + if (count > 0 and self.mangle_dupe_cols + and not self.has_mi_columns): this_header.append('%s.%d' % (name, count)) else: this_header.append(name) @@ -754,12 +757,13 @@ cdef class TextReader: if self.has_mi_columns: - # if we have grabbed an extra line, but its not in our format - # so save in the buffer, and create an blank extra line for the rest of the - # parsing code + # If we have grabbed an extra line, but it's not in our + # format, save in the buffer, and create an blank extra + # line for the rest of the parsing code. 
if hr == self.header[-1]: lc = len(this_header) - ic = len(self.index_col) if self.index_col is not None else 0 + ic = (len(self.index_col) if self.index_col + is not None else 0) if lc != unnamed_count and lc - ic > unnamed_count: hr -= 1 self.parser_start -= 1 @@ -993,20 +997,15 @@ cdef class TextReader: # if footer > 0: # end -= footer - #print >> sys.stderr, self.table_width - #print >> sys.stderr, self.leading_cols - #print >> sys.stderr, self.parser.lines - #print >> sys.stderr, start - #print >> sys.stderr, end - #print >> sys.stderr, self.header - #print >> sys.stderr, "index" num_cols = -1 for i in range(self.parser.lines): - num_cols = (num_cols < self.parser.line_fields[i]) * self.parser.line_fields[i] +\ + num_cols = (num_cols < self.parser.line_fields[i]) * \ + self.parser.line_fields[i] + \ (num_cols >= self.parser.line_fields[i]) * num_cols if self.table_width - self.leading_cols > num_cols: - raise CParserError("Too many columns specified: expected %s and found %s" % + raise CParserError( + "Too many columns specified: expected %s and found %s" % (self.table_width - self.leading_cols, num_cols)) results = {} @@ -1045,8 +1044,8 @@ cdef class TextReader: continue # Should return as the desired dtype (inferred or specified) - col_res, na_count = self._convert_tokens(i, start, end, name, - na_filter, na_hashset, na_flist) + col_res, na_count = self._convert_tokens( + i, start, end, name, na_filter, na_hashset, na_flist) if na_filter: self._free_na_set(na_hashset) @@ -1054,8 +1053,10 @@ cdef class TextReader: if upcast_na and na_count > 0: col_res = _maybe_upcast(col_res) - if issubclass(col_res.dtype.type, np.integer) and self.compact_ints: - col_res = lib.downcast_int64(col_res, na_values, self.use_unsigned) + if issubclass(col_res.dtype.type, + np.integer) and self.compact_ints: + col_res = lib.downcast_int64(col_res, na_values, + self.use_unsigned) if col_res is None: raise CParserError('Unable to parse column %d' % i) @@ -1087,10 +1088,12 @@ cdef class TextReader: col_dtype = self.dtype if col_dtype is not None: - col_res, na_count = self._convert_with_dtype(col_dtype, i, start, end, - na_filter, 1, na_hashset, na_flist) + col_res, na_count = self._convert_with_dtype( + col_dtype, i, start, end, na_filter, + 1, na_hashset, na_flist) - # fallback on the parse (e.g. we requested int dtype, but its actually a float) + # Fallback on the parse (e.g. we requested int dtype, + # but its actually a float). if col_res is not None: return col_res, na_count @@ -1104,7 +1107,8 @@ cdef class TextReader: dt, i, start, end, na_filter, 0, na_hashset, na_flist) except OverflowError: col_res, na_count = self._convert_with_dtype( - np.dtype('object'), i, start, end, na_filter, 0, na_hashset, na_flist) + np.dtype('object'), i, start, end, na_filter, + 0, na_hashset, na_flist) if col_res is not None: break @@ -1113,7 +1117,7 @@ cdef class TextReader: # only allow safe casts, eg. 
with a nan you cannot safely cast to int if col_res is not None and col_dtype is not None: try: - col_res = col_res.astype(col_dtype,casting='safe') + col_res = col_res.astype(col_dtype, casting='safe') except TypeError: # float -> int conversions can fail the above @@ -1121,12 +1125,13 @@ cdef class TextReader: col_res_orig = col_res col_res = col_res.astype(col_dtype) if (col_res != col_res_orig).any(): - raise ValueError("cannot safely convert passed user dtype of " - "{col_dtype} for {col_res} dtyped data in " - "column {column}".format(col_dtype=col_dtype, - col_res=col_res_orig.dtype.name, - column=i)) - + raise ValueError( + "cannot safely convert passed user dtype of " + "{col_dtype} for {col_res} dtyped data in " + "column {column}".format( + col_dtype=col_dtype, + col_res=col_res_orig.dtype.name, + column=i)) return col_res, na_count @@ -1137,8 +1142,8 @@ cdef class TextReader: kh_str_t *na_hashset, object na_flist): if is_integer_dtype(dtype): - result, na_count = _try_int64(self.parser, i, start, end, na_filter, - na_hashset) + result, na_count = _try_int64(self.parser, i, start, + end, na_filter, na_hashset) if user_dtype and na_count is not None: if na_count > 0: raise ValueError("Integer column has NA values in " @@ -1175,15 +1180,16 @@ cdef class TextReader: elif dtype.kind == 'U': width = dtype.itemsize if width > 0: - raise TypeError("the dtype %s is not supported for parsing" % dtype) + raise TypeError("the dtype %s is not " + "supported for parsing" % dtype) # unicode variable width return self._string_convert(i, start, end, na_filter, na_hashset) elif is_categorical_dtype(dtype): - codes, cats, na_count = _categorical_convert(self.parser, i, start, - end, na_filter, na_hashset, - self.c_encoding) + codes, cats, na_count = _categorical_convert( + self.parser, i, start, end, na_filter, + na_hashset, self.c_encoding) # sort categories and recode if necessary cats = Index(cats) if not cats.is_monotonic_increasing: @@ -1198,10 +1204,12 @@ cdef class TextReader: return self._string_convert(i, start, end, na_filter, na_hashset) elif is_datetime64_dtype(dtype): - raise TypeError("the dtype %s is not supported for parsing, " - "pass this column using parse_dates instead" % dtype) + raise TypeError("the dtype %s is not supported " + "for parsing, pass this column " + "using parse_dates instead" % dtype) else: - raise TypeError("the dtype %s is not supported for parsing" % dtype) + raise TypeError("the dtype %s is not " + "supported for parsing" % dtype) cdef _string_convert(self, Py_ssize_t i, int start, int end, bint na_filter, kh_str_t *na_hashset): @@ -1218,7 +1226,6 @@ cdef class TextReader: return _string_box_factorize(self.parser, i, start, end, na_filter, na_hashset) - def _get_converter(self, i, name): if self.converters is None: return None @@ -1330,9 +1337,9 @@ def _maybe_upcast(arr): return arr cdef enum StringPath: - CSTRING - UTF8 - ENCODED + CSTRING + UTF8 + ENCODED # factored out logic to pick string converter cdef inline StringPath _string_path(char *encoding): @@ -1445,7 +1452,7 @@ cdef _string_box_utf8(parser_t *parser, int col, pyval = PyUnicode_FromString(word) k = kh_put_strbox(table, word, &ret) - table.vals[k] = pyval + table.vals[k] = pyval result[i] = pyval @@ -1503,7 +1510,7 @@ cdef _string_box_decode(parser_t *parser, int col, pyval = PyUnicode_Decode(word, size, encoding, errors) k = kh_put_strbox(table, word, &ret) - table.vals[k] = pyval + table.vals[k] = pyval result[i] = pyval @@ -1511,6 +1518,7 @@ cdef _string_box_decode(parser_t *parser, int 
col, return result, na_count + @cython.boundscheck(False) cdef _categorical_convert(parser_t *parser, int col, int line_start, int line_end, @@ -1570,7 +1578,8 @@ cdef _categorical_convert(parser_t *parser, int col, for k in range(table.n_buckets): if kh_exist_str(table, k): size = strlen(table.keys[k]) - result[table.vals[k]] = PyUnicode_Decode(table.keys[k], size, encoding, errors) + result[table.vals[k]] = PyUnicode_Decode( + table.keys[k], size, encoding, errors) elif path == UTF8: for k in range(table.n_buckets): if kh_exist_str(table, k): @@ -1600,8 +1609,9 @@ cdef _to_fw_string(parser_t *parser, int col, int line_start, return result -cdef inline void _to_fw_string_nogil(parser_t *parser, int col, int line_start, - int line_end, size_t width, char *data) nogil: +cdef inline void _to_fw_string_nogil(parser_t *parser, int col, + int line_start, int line_end, + size_t width, char *data) nogil: cdef: Py_ssize_t i coliter_t it @@ -1639,17 +1649,20 @@ cdef _try_double(parser_t *parser, int col, int line_start, int line_end, na_fset = kset_float64_from_list(na_flist) with nogil: error = _try_double_nogil(parser, col, line_start, line_end, - na_filter, na_hashset, use_na_flist, na_fset, NA, data, &na_count) + na_filter, na_hashset, use_na_flist, + na_fset, NA, data, &na_count) kh_destroy_float64(na_fset) if error != 0: return None, None return result, na_count -cdef inline int _try_double_nogil(parser_t *parser, int col, int line_start, int line_end, - bint na_filter, kh_str_t *na_hashset, bint use_na_flist, +cdef inline int _try_double_nogil(parser_t *parser, int col, + int line_start, int line_end, + bint na_filter, kh_str_t *na_hashset, + bint use_na_flist, const kh_float64_t *na_flist, - double NA, - double *data, int *na_count) nogil: + double NA, double *data, + int *na_count) nogil: cdef: int error, size_t i @@ -1674,15 +1687,17 @@ cdef inline int _try_double_nogil(parser_t *parser, int col, int line_start, int na_count[0] += 1 data[0] = NA else: - data[0] = parser.converter(word, &p_end, parser.decimal, parser.sci, - parser.thousands, 1) + data[0] = parser.converter(word, &p_end, parser.decimal, + parser.sci, parser.thousands, 1) if errno != 0 or p_end[0] or p_end == word: - if strcasecmp(word, cinf) == 0 or strcasecmp(word, cposinf) == 0: + if (strcasecmp(word, cinf) == 0 or + strcasecmp(word, cposinf) == 0): data[0] = INF elif strcasecmp(word, cneginf) == 0: data[0] = NEGINF else: - # Just return a non-zero value since the errno is never consumed. + # Just return a non-zero value since + # the errno is never consumed. return 1 if use_na_flist: k64 = kh_get_float64(na_flist, data[0]) @@ -1693,15 +1708,17 @@ cdef inline int _try_double_nogil(parser_t *parser, int col, int line_start, int else: for i in range(lines): COLITER_NEXT(it, word) - data[0] = parser.converter(word, &p_end, parser.decimal, parser.sci, - parser.thousands, 1) + data[0] = parser.converter(word, &p_end, parser.decimal, + parser.sci, parser.thousands, 1) if errno != 0 or p_end[0] or p_end == word: - if strcasecmp(word, cinf) == 0 or strcasecmp(word, cposinf) == 0: + if (strcasecmp(word, cinf) == 0 or + strcasecmp(word, cposinf) == 0): data[0] = INF elif strcasecmp(word, cneginf) == 0: data[0] = NEGINF else: - # Just return a non-zero value since the errno is never consumed. + # Just return a non-zero value since + # the errno is never consumed. 
return 1 data += 1 @@ -1724,7 +1741,8 @@ cdef _try_int64(parser_t *parser, int col, int line_start, int line_end, data = result.data coliter_setup(&it, parser, col, line_start) with nogil: - error = _try_int64_nogil(parser, col, line_start, line_end, na_filter, na_hashset, NA, data, &na_count) + error = _try_int64_nogil(parser, col, line_start, line_end, + na_filter, na_hashset, NA, data, &na_count) if error != 0: if error == ERROR_OVERFLOW: # Can't get the word variable @@ -1733,9 +1751,10 @@ cdef _try_int64(parser_t *parser, int col, int line_start, int line_end, return result, na_count -cdef inline int _try_int64_nogil(parser_t *parser, int col, int line_start, int line_end, - bint na_filter, const kh_str_t *na_hashset, int64_t NA, int64_t *data, - int *na_count) nogil: +cdef inline int _try_int64_nogil(parser_t *parser, int col, int line_start, + int line_end, bint na_filter, + const kh_str_t *na_hashset, int64_t NA, + int64_t *data, int *na_count) nogil: cdef: int error size_t i @@ -1785,14 +1804,18 @@ cdef _try_bool(parser_t *parser, int col, int line_start, int line_end, data = result.data with nogil: - error = _try_bool_nogil(parser, col, line_start, line_end, na_filter, na_hashset, NA, data, &na_count) + error = _try_bool_nogil(parser, col, line_start, + line_end, na_filter, + na_hashset, NA, data, + &na_count) if error != 0: return None, None return result.view(np.bool_), na_count -cdef inline int _try_bool_nogil(parser_t *parser, int col, int line_start, int line_end, - bint na_filter, const kh_str_t *na_hashset, uint8_t NA, uint8_t *data, - int *na_count) nogil: +cdef inline int _try_bool_nogil(parser_t *parser, int col, int line_start, + int line_end, bint na_filter, + const kh_str_t *na_hashset, uint8_t NA, + uint8_t *data, int *na_count) nogil: cdef: int error size_t lines = line_end - line_start @@ -1832,7 +1855,8 @@ cdef inline int _try_bool_nogil(parser_t *parser, int col, int line_start, int l cdef _try_bool_flex(parser_t *parser, int col, int line_start, int line_end, bint na_filter, const kh_str_t *na_hashset, - const kh_str_t *true_hashset, const kh_str_t *false_hashset): + const kh_str_t *true_hashset, + const kh_str_t *false_hashset): cdef: int error, na_count = 0 size_t i, lines @@ -1848,16 +1872,20 @@ cdef _try_bool_flex(parser_t *parser, int col, int line_start, int line_end, result = np.empty(lines, dtype=np.uint8) data = result.data with nogil: - error = _try_bool_flex_nogil(parser, col, line_start, line_end, na_filter, na_hashset, - true_hashset, false_hashset, NA, data, &na_count) + error = _try_bool_flex_nogil(parser, col, line_start, line_end, + na_filter, na_hashset, true_hashset, + false_hashset, NA, data, &na_count) if error != 0: return None, None return result.view(np.bool_), na_count -cdef inline int _try_bool_flex_nogil(parser_t *parser, int col, int line_start, int line_end, - bint na_filter, const kh_str_t *na_hashset, - const kh_str_t *true_hashset, const kh_str_t *false_hashset, - uint8_t NA, uint8_t *data, int *na_count) nogil: +cdef inline int _try_bool_flex_nogil(parser_t *parser, int col, int line_start, + int line_end, bint na_filter, + const kh_str_t *na_hashset, + const kh_str_t *true_hashset, + const kh_str_t *false_hashset, + uint8_t NA, uint8_t *data, + int *na_count) nogil: cdef: int error = 0 size_t i @@ -2016,14 +2044,15 @@ def _concatenate_chunks(list chunks): if warning_columns: warning_names = ','.join(warning_columns) - warning_message = " ".join(["Columns (%s) have mixed types." 
% warning_names, + warning_message = " ".join([ + "Columns (%s) have mixed types." % warning_names, "Specify dtype option on import or set low_memory=False." ]) warnings.warn(warning_message, DtypeWarning, stacklevel=8) return result -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- # NA values def _compute_na_values(): int64info = np.iinfo(np.int64) @@ -2035,17 +2064,17 @@ def _compute_na_values(): uint16info = np.iinfo(np.uint16) uint8info = np.iinfo(np.uint8) na_values = { - np.float64 : np.nan, - np.int64 : int64info.min, - np.int32 : int32info.min, - np.int16 : int16info.min, - np.int8 : int8info.min, - np.uint64 : uint64info.max, - np.uint32 : uint32info.max, - np.uint16 : uint16info.max, - np.uint8 : uint8info.max, - np.bool_ : uint8info.max, - np.object_ : np.nan # oof + np.float64: np.nan, + np.int64: int64info.min, + np.int32: int32info.min, + np.int16: int16info.min, + np.int8: int8info.min, + np.uint64: uint64info.max, + np.uint32: uint32info.max, + np.uint16: uint16info.max, + np.uint8: uint8info.max, + np.bool_: uint8info.max, + np.object_: np.nan # oof } return na_values @@ -2128,22 +2157,13 @@ def _to_structured_array(dict columns, object names, object usecols): stride = dt.itemsize - # start = time.time() - - # we own the data + # We own the data. buf = malloc(length * stride) recs = util.sarr_from_data(dt, length, buf) assert(recs.flags.owndata) - # buf = recs.data - # end = time.time() - # print 'took %.4f' % (end - start) - for i in range(nfields): - # start = time.clock() - # name = names[i] - # XXX field_type = fields[fnames[i]] @@ -2156,9 +2176,6 @@ def _to_structured_array(dict columns, object names, object usecols): elsize, stride, length, field_type[0] == np.object_) - # print 'Transfer of %s took %.4f' % (str(field_type), - # time.clock() - start) - return recs cdef _fill_structured_column(char *dst, char* src, int elsize, @@ -2175,7 +2192,6 @@ cdef _fill_structured_column(char *dst, char* src, int elsize, src += elsize - def _maybe_encode(values): if values is None: return [] diff --git a/pandas/src/inference.pyx b/pandas/src/inference.pyx index 62555dc7f178c..4fa730eac0fd1 100644 --- a/pandas/src/inference.pyx +++ b/pandas/src/inference.pyx @@ -21,15 +21,20 @@ cdef extern from "headers/stdint.h": enum: INT64_MIN # core.common import for fast inference checks + + def is_float(object obj): return util.is_float_object(obj) + def is_integer(object obj): return util.is_integer_object(obj) + def is_bool(object obj): return util.is_bool_object(obj) + def is_complex(object obj): return util.is_complex_object(obj) @@ -38,33 +43,33 @@ cpdef bint is_period(object val): return util.is_period_object(val) _TYPE_MAP = { - 'categorical' : 'categorical', - 'category' : 'categorical', + 'categorical': 'categorical', + 'category': 'categorical', 'int8': 'integer', 'int16': 'integer', 'int32': 'integer', 'int64': 'integer', - 'i' : 'integer', + 'i': 'integer', 'uint8': 'integer', 'uint16': 'integer', 'uint32': 'integer', 'uint64': 'integer', - 'u' : 'integer', + 'u': 'integer', 'float32': 'floating', 'float64': 'floating', - 'f' : 'floating', + 'f': 'floating', 'complex128': 'complex', - 'c' : 'complex', + 'c': 'complex', 'string': 'string' if PY2 else 'bytes', - 'S' : 'string' if PY2 else 'bytes', + 'S': 'string' if PY2 else 'bytes', 'unicode': 'unicode' if PY2 else 'string', - 'U' : 'unicode' if PY2 else 'string', + 'U': 'unicode' if PY2 else 'string', 'bool': 'boolean', - 'b' : 
'boolean',
- 'datetime64[ns]' : 'datetime64',
- 'M' : 'datetime64',
- 'timedelta64[ns]' : 'timedelta64',
- 'm' : 'timedelta64',
+ 'b': 'boolean',
+ 'datetime64[ns]': 'datetime64',
+ 'M': 'datetime64',
+ 'timedelta64[ns]': 'timedelta64',
+ 'm': 'timedelta64',
 }
 # types only exist on certain platform
@@ -88,12 +93,13 @@ cdef _try_infer_map(v):
 """ if it's in our map, just return the dtype """
 cdef:
 object attr, val
- for attr in ['name','kind','base']:
- val = getattr(v.dtype,attr)
+ for attr in ['name', 'kind', 'base']:
+ val = getattr(v.dtype, attr)
 if val in _TYPE_MAP:
 return _TYPE_MAP[val]
 return None
+
 def infer_dtype(object _values):
 """
 we are coercing to an ndarray here
@@ -107,12 +113,13 @@ def infer_dtype(object _values):
 if isinstance(_values, np.ndarray):
 values = _values
- elif hasattr(_values,'dtype'):
+ elif hasattr(_values, 'dtype'):
 # this will handle ndarray-like
 # e.g. categoricals
 try:
- values = getattr(_values, '_values', getattr(_values, 'values', _values))
+ values = getattr(_values, '_values', getattr(
+ _values, 'values', _values))
 except:
 val = _try_infer_map(_values)
 if val is not None:
@@ -242,20 +249,21 @@ def is_possible_datetimelike_array(object arr):
 for i in range(n):
 v = arr[i]
 if util.is_string_object(v):
- continue
+ continue
 elif util._checknull(v):
- continue
+ continue
 elif is_datetime(v):
- seen_datetime=1
+ seen_datetime=1
 elif is_timedelta(v):
- seen_timedelta=1
+ seen_timedelta=1
 else:
- return False
+ return False
 return seen_datetime or seen_timedelta
 cdef inline bint is_null_datetimelike(v):
- # determine if we have a null for a timedelta/datetime (or integer versions)x
+ # determine if we have a null for a timedelta/datetime (or integer
+ # versions)
 if util._checknull(v):
 return True
 elif v is NaT:
@@ -315,6 +323,7 @@ cdef inline bint is_time(object o):
 cdef inline bint is_timedelta(object o):
 return PyDelta_Check(o) or util.is_timedelta64_object(o)
+
 def is_bool_array(ndarray values):
 cdef:
 Py_ssize_t i, n = len(values)
@@ -335,9 +344,11 @@ def is_bool_array(ndarray values):
 else:
 return False
+
 def is_integer(object o):
 return util.is_integer_object(o)
+
 def is_integer_array(ndarray values):
 cdef:
 Py_ssize_t i, n = len(values)
@@ -358,6 +369,7 @@ def is_integer_array(ndarray values):
 else:
 return False
+
 def is_integer_float_array(ndarray values):
 cdef:
 Py_ssize_t i, n = len(values)
@@ -380,6 +392,7 @@ def is_integer_float_array(ndarray values):
 else:
 return False
+
 def is_float_array(ndarray values):
 cdef:
 Py_ssize_t i, n = len(values)
@@ -400,6 +413,7 @@ def is_float_array(ndarray values):
 else:
 return False
+
 def is_string_array(ndarray values):
 cdef:
 Py_ssize_t i, n = len(values)
@@ -421,6 +435,7 @@ def is_string_array(ndarray values):
 else:
 return False
+
 def is_unicode_array(ndarray values):
 cdef:
 Py_ssize_t i, n = len(values)
@@ -475,11 +490,12 @@ def is_datetime_array(ndarray[object] values):
 if is_null_datetime64(v):
 # we are a regular null
 if util._checknull(v):
- null_count += 1
+ null_count += 1
 elif not is_datetime(v):
 return False
 return null_count != n
+
 def is_datetime64_array(ndarray values):
 cdef Py_ssize_t i, null_count = 0, n = len(values)
 cdef object v
@@ -619,6 +635,7 @@ cdef extern from "parse_helper.h":
 cdef int64_t iINT64_MAX = INT64_MAX
 cdef int64_t iINT64_MIN = INT64_MIN
+
 def maybe_convert_numeric(object[:] values, set na_values,
 bint convert_empty=True, bint coerce_numeric=False):
 """
@@ -772,7 +789,8 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
 seen_float = 1
 elif
util.is_datetime64_object(val): if convert_datetime: - idatetimes[i] = convert_to_tsobject(val, None, None, 0, 0).value + idatetimes[i] = convert_to_tsobject( + val, None, None, 0, 0).value seen_datetime = 1 else: seen_object = 1 @@ -807,7 +825,8 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, break else: seen_datetime = 1 - idatetimes[i] = convert_to_tsobject(val, None, None, 0, 0).value + idatetimes[i] = convert_to_tsobject( + val, None, None, 0, 0).value else: seen_object = 1 break @@ -857,7 +876,8 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, return floats elif seen_int: return ints - elif not seen_datetime and not seen_numeric and not seen_timedelta: + elif (not seen_datetime and not seen_numeric + and not seen_timedelta): return bools.view(np.bool_) else: @@ -887,7 +907,8 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, return floats elif seen_int: return ints - elif not seen_datetime and not seen_numeric and not seen_timedelta: + elif (not seen_datetime and not seen_numeric + and not seen_timedelta): return bools.view(np.bool_) return objects @@ -896,8 +917,9 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, def convert_sql_column(x): return maybe_convert_objects(x, try_float=1) + def try_parse_dates(ndarray[object] values, parser=None, - dayfirst=False,default=None): + dayfirst=False, default=None): cdef: Py_ssize_t i, n ndarray[object] result @@ -907,12 +929,12 @@ def try_parse_dates(ndarray[object] values, parser=None, if parser is None: if default is None: # GH2618 - date=datetime.now() - default=datetime(date.year,date.month,1) + date=datetime.now() + default=datetime(date.year, date.month, 1) try: from dateutil.parser import parse - parse_date = lambda x: parse(x, dayfirst=dayfirst,default=default) + parse_date = lambda x: parse(x, dayfirst=dayfirst, default=default) except ImportError: # pragma: no cover def parse_date(s): try: @@ -944,9 +966,10 @@ def try_parse_dates(ndarray[object] values, parser=None, return result + def try_parse_date_and_time(ndarray[object] dates, ndarray[object] times, date_parser=None, time_parser=None, - dayfirst=False,default=None): + dayfirst=False, default=None): cdef: Py_ssize_t i, n ndarray[object] result @@ -960,8 +983,8 @@ def try_parse_date_and_time(ndarray[object] dates, ndarray[object] times, if date_parser is None: if default is None: # GH2618 - date=datetime.now() - default=datetime(date.year,date.month,1) + date=datetime.now() + default=datetime(date.year, date.month, 1) try: from dateutil.parser import parse @@ -1016,6 +1039,7 @@ def try_parse_year_month_day(ndarray[object] years, ndarray[object] months, return result + def try_parse_datetime_components(ndarray[object] years, ndarray[object] months, ndarray[object] days, @@ -1052,6 +1076,7 @@ def try_parse_datetime_components(ndarray[object] years, return result + def sanitize_objects(ndarray[object] values, set na_values, convert_empty=True): cdef: @@ -1075,6 +1100,7 @@ def sanitize_objects(ndarray[object] values, set na_values, return na_count + def maybe_convert_bool(ndarray[object] arr, true_values=None, false_values=None): cdef: @@ -1166,6 +1192,7 @@ def map_infer_mask(ndarray arr, object f, ndarray[uint8_t] mask, return result + def map_infer(ndarray arr, object f, bint convert=1): """ Substitute for np.vectorize with pandas-friendly dtype inference @@ -1246,6 +1273,7 @@ def to_object_array(list rows, int min_width=0): return result + def tuples_to_object_array(ndarray[object] tuples): 
cdef: Py_ssize_t i, j, n, k, tmp @@ -1262,6 +1290,7 @@ def tuples_to_object_array(ndarray[object] tuples): return result + def to_object_array_tuples(list rows): cdef: Py_ssize_t i, j, n, k, tmp diff --git a/pandas/src/offsets.pyx b/pandas/src/offsets.pyx index 096198c8a05fa..c963e256d0aa5 100644 --- a/pandas/src/offsets.pyx +++ b/pandas/src/offsets.pyx @@ -162,7 +162,7 @@ cdef class YearOffset(_Offset): cpdef prev(self): cdef int64_t days - days = 365 + is_leapyear(self.y - (1-self.ly)) + days = 365 + is_leapyear(self.y - (1 - self.ly)) self.t -= days * us_in_day self.y -= 1 @@ -204,8 +204,8 @@ cdef class MonthOffset(_Offset): self.t = ts.value + (self.dayoffset * us_in_day) # for day counting - self.m = ts.dts.month - 1 - self.y = ts.dts.year + self.m = ts.dts.month - 1 + self.y = ts.dts.year self.ly = is_leapyear(self.y) if self.biz != 0: diff --git a/pandas/src/period.pyx b/pandas/src/period.pyx index bb0108fcb141c..5565f25937394 100644 --- a/pandas/src/period.pyx +++ b/pandas/src/period.pyx @@ -80,17 +80,21 @@ cdef extern from "period_helper.h": ctypedef int64_t (*freq_conv_func)(int64_t, char, asfreq_info*) void initialize_daytime_conversion_factor_matrix() - int64_t asfreq(int64_t dtordinal, int freq1, int freq2, char relation) except INT32_MIN + int64_t asfreq(int64_t dtordinal, int freq1, int freq2, + char relation) except INT32_MIN freq_conv_func get_asfreq_func(int fromFreq, int toFreq) void get_asfreq_info(int fromFreq, int toFreq, asfreq_info *af_info) int64_t get_period_ordinal(int year, int month, int day, - int hour, int minute, int second, int microseconds, int picoseconds, - int freq) nogil except INT32_MIN + int hour, int minute, int second, + int microseconds, int picoseconds, + int freq) nogil except INT32_MIN - int64_t get_python_ordinal(int64_t period_ordinal, int freq) except INT32_MIN + int64_t get_python_ordinal(int64_t period_ordinal, + int freq) except INT32_MIN - int get_date_info(int64_t ordinal, int freq, date_info *dinfo) nogil except INT32_MIN + int get_date_info(int64_t ordinal, int freq, + date_info *dinfo) nogil except INT32_MIN double getAbsTime(int, int64_t, int64_t) int pyear(int64_t ordinal, int freq) except INT32_MIN @@ -134,6 +138,7 @@ cdef inline int64_t remove_mult(int64_t period_ord_w_mult, int64_t mult): return period_ord_w_mult * mult + 1; + @cython.wraparound(False) @cython.boundscheck(False) def dt64arr_to_periodarr(ndarray[int64_t] dtarr, int freq, tz=None): @@ -158,11 +163,13 @@ def dt64arr_to_periodarr(ndarray[int64_t] dtarr, int freq, tz=None): continue pandas_datetime_to_datetimestruct(dtarr[i], PANDAS_FR_ns, &dts) out[i] = get_period_ordinal(dts.year, dts.month, dts.day, - dts.hour, dts.min, dts.sec, dts.us, dts.ps, freq) + dts.hour, dts.min, dts.sec, + dts.us, dts.ps, freq) else: out = localize_dt64arr_to_period(dtarr, freq, tz) return out + @cython.wraparound(False) @cython.boundscheck(False) def periodarr_to_dt64arr(ndarray[int64_t] periodarr, int freq): @@ -212,6 +219,7 @@ cpdef int64_t period_asfreq(int64_t period_ordinal, int freq1, int freq2, return retval + def period_asfreq_arr(ndarray[int64_t] arr, int freq1, int freq2, bint end): """ Convert int64-array of period ordinals from one frequency to another, and @@ -254,7 +262,9 @@ def period_asfreq_arr(ndarray[int64_t] arr, int freq1, int freq2, bint end): return result -def period_ordinal(int y, int m, int d, int h, int min, int s, int us, int ps, int freq): + +def period_ordinal(int y, int m, int d, int h, int min, + int s, int us, int ps, int freq): cdef: int64_t ordinal @@ 
-284,6 +294,7 @@ cpdef int64_t period_ordinal_to_dt64(int64_t ordinal, int freq) nogil: return pandas_datetimestruct_to_datetime(PANDAS_FR_ns, &dts) + def period_format(int64_t value, int freq, object fmt=None): cdef: int freq_group @@ -332,7 +343,8 @@ cdef list extra_fmts = [(b"%q", b"^`AB`^"), (b"%u", b"^`IJ`^"), (b"%n", b"^`KL`^")] -cdef list str_extra_fmts = ["^`AB`^", "^`CD`^", "^`EF`^", "^`GH`^", "^`IJ`^", "^`KL`^"] +cdef list str_extra_fmts = ["^`AB`^", "^`CD`^", "^`EF`^", + "^`GH`^", "^`IJ`^", "^`KL`^"] cdef object _period_strftime(int64_t value, int freq, object fmt): import sys @@ -390,6 +402,7 @@ cdef object _period_strftime(int64_t value, int freq, object fmt): ctypedef int (*accessor)(int64_t ordinal, int freq) except INT32_MIN + def get_period_field(int code, int64_t value, int freq): cdef accessor f = _get_accessor_func(code) if f is NULL: @@ -398,6 +411,7 @@ def get_period_field(int code, int64_t value, int freq): return np.nan return f(value, freq) + def get_period_field_arr(int code, ndarray[int64_t] arr, int freq): cdef: Py_ssize_t i, sz @@ -420,7 +434,6 @@ def get_period_field_arr(int code, ndarray[int64_t] arr, int freq): return out - cdef accessor _get_accessor_func(int code): if code == 0: return &pyear @@ -571,7 +584,7 @@ cdef _reso_local(ndarray[int64_t] stamps, object tz): pos = _pos # statictzinfo - if typ not in ['pytz','dateutil']: + if typ not in ['pytz', 'dateutil']: for i in range(n): if stamps[i] == NPY_NAT: continue @@ -613,7 +626,8 @@ cdef ndarray[int64_t] localize_dt64arr_to_period(ndarray[int64_t] stamps, continue pandas_datetime_to_datetimestruct(stamps[i], PANDAS_FR_ns, &dts) result[i] = get_period_ordinal(dts.year, dts.month, dts.day, - dts.hour, dts.min, dts.sec, dts.us, dts.ps, freq) + dts.hour, dts.min, dts.sec, + dts.us, dts.ps, freq) elif _is_tzlocal(tz): for i in range(n): @@ -628,7 +642,8 @@ cdef ndarray[int64_t] localize_dt64arr_to_period(ndarray[int64_t] stamps, pandas_datetime_to_datetimestruct(stamps[i] + delta, PANDAS_FR_ns, &dts) result[i] = get_period_ordinal(dts.year, dts.month, dts.day, - dts.hour, dts.min, dts.sec, dts.us, dts.ps, freq) + dts.hour, dts.min, dts.sec, + dts.us, dts.ps, freq) else: # Adjust datetime64 timestamp, recompute datetimestruct trans, deltas, typ = _get_dst_info(tz) @@ -639,7 +654,7 @@ cdef ndarray[int64_t] localize_dt64arr_to_period(ndarray[int64_t] stamps, pos = _pos # statictzinfo - if typ not in ['pytz','dateutil']: + if typ not in ['pytz', 'dateutil']: for i in range(n): if stamps[i] == NPY_NAT: result[i] = NPY_NAT @@ -647,7 +662,8 @@ cdef ndarray[int64_t] localize_dt64arr_to_period(ndarray[int64_t] stamps, pandas_datetime_to_datetimestruct(stamps[i] + deltas[0], PANDAS_FR_ns, &dts) result[i] = get_period_ordinal(dts.year, dts.month, dts.day, - dts.hour, dts.min, dts.sec, dts.us, dts.ps, freq) + dts.hour, dts.min, dts.sec, + dts.us, dts.ps, freq) else: for i in range(n): if stamps[i] == NPY_NAT: @@ -656,13 +672,15 @@ cdef ndarray[int64_t] localize_dt64arr_to_period(ndarray[int64_t] stamps, pandas_datetime_to_datetimestruct(stamps[i] + deltas[pos[i]], PANDAS_FR_ns, &dts) result[i] = get_period_ordinal(dts.year, dts.month, dts.day, - dts.hour, dts.min, dts.sec, dts.us, dts.ps, freq) + dts.hour, dts.min, dts.sec, + dts.us, dts.ps, freq) return result _DIFFERENT_FREQ = "Input has different freq={1} from Period(freq={0})" -_DIFFERENT_FREQ_INDEX = "Input has different freq={1} from PeriodIndex(freq={0})" +_DIFFERENT_FREQ_INDEX = ("Input has different freq={1} " + "from PeriodIndex(freq={0})") class 
IncompatibleFrequency(ValueError): @@ -675,7 +693,7 @@ cdef class _Period(object): int64_t ordinal object freq - _comparables = ['name','freqstr'] + _comparables = ['name', 'freqstr'] _typ = 'period' @classmethod @@ -695,7 +713,9 @@ cdef class _Period(object): @classmethod def _from_ordinal(cls, ordinal, freq): - """ fast creation from an ordinal and freq that are already validated! """ + """ + Fast creation from an ordinal and freq that are already validated! + """ if ordinal == tslib.iNaT: return tslib.NaT else: @@ -727,7 +747,8 @@ cdef class _Period(object): return hash((self.ordinal, self.freqstr)) def _add_delta(self, other): - if isinstance(other, (timedelta, np.timedelta64, offsets.Tick, Timedelta)): + if isinstance(other, (timedelta, np.timedelta64, + offsets.Tick, Timedelta)): offset = frequencies.to_offset(self.freq.rule_code) if isinstance(offset, offsets.Tick): nanos = tslib._delta_to_nanoseconds(other) @@ -752,7 +773,8 @@ cdef class _Period(object): def __add__(self, other): if isinstance(self, Period): if isinstance(other, (timedelta, np.timedelta64, - offsets.Tick, offsets.DateOffset, Timedelta)): + offsets.Tick, offsets.DateOffset, + Timedelta)): return self._add_delta(other) elif other is tslib.NaT: return tslib.NaT @@ -769,7 +791,8 @@ cdef class _Period(object): def __sub__(self, other): if isinstance(self, Period): if isinstance(other, (timedelta, np.timedelta64, - offsets.Tick, offsets.DateOffset, Timedelta)): + offsets.Tick, offsets.DateOffset, + Timedelta)): neg_other = -other return self + neg_other elif lib.is_integer(other): @@ -1138,8 +1161,9 @@ class Period(_Period): raise ValueError('Must supply freq for ordinal value') elif value is None: - if (year is None and month is None and quarter is None and - day is None and hour is None and minute is None and second is None): + if (year is None and month is None and + quarter is None and day is None and + hour is None and minute is None and second is None): ordinal = tslib.iNaT else: if freq is None: @@ -1157,7 +1181,8 @@ class Period(_Period): elif isinstance(value, Period): other = value - if freq is None or frequencies.get_freq_code(freq) == frequencies.get_freq_code(other.freq): + if freq is None or frequencies.get_freq_code( + freq) == frequencies.get_freq_code(other.freq): ordinal = other.ordinal freq = other.freq else: @@ -1177,7 +1202,8 @@ class Period(_Period): try: freq = frequencies.Resolution.get_freq(reso) except KeyError: - raise ValueError("Invalid frequency or could not infer: %s" % reso) + raise ValueError( + "Invalid frequency or could not infer: %s" % reso) elif isinstance(value, datetime): dt = value @@ -1210,7 +1236,8 @@ def _ordinal_from_fields(year, month, quarter, day, if quarter is not None: year, month = _quarter_to_myear(year, quarter, freq) - return get_period_ordinal(year, month, day, hour, minute, second, 0, 0, base) + return get_period_ordinal(year, month, day, hour, + minute, second, 0, 0, base) def _quarter_to_myear(year, quarter, freq): @@ -1218,7 +1245,8 @@ def _quarter_to_myear(year, quarter, freq): if quarter <= 0 or quarter > 4: raise ValueError('Quarter must be 1 <= q <= 4') - mnum = frequencies._month_numbers[frequencies._get_rule_month(freq)] + 1 + mnum = frequencies._month_numbers[ + frequencies._get_rule_month(freq)] + 1 month = (mnum + (quarter - 1) * 3) % 12 + 1 if month > mnum: year -= 1 diff --git a/pandas/src/reduce.pyx b/pandas/src/reduce.pyx index c3f8bdfbfd0a6..1cd3e53494a72 100644 --- a/pandas/src/reduce.pyx +++ b/pandas/src/reduce.pyx @@ -46,11 +46,11 @@ cdef class 
Reducer:
 self.chunksize = k
 self.increment = k * arr.dtype.itemsize
-
 self.f = f
 self.arr = arr
 self.labels = labels
- self.dummy, self.typ, self.index, self.ityp = self._check_dummy(dummy=dummy)
+ self.dummy, self.typ, self.index, self.ityp = self._check_dummy(
+ dummy=dummy)
 def _check_dummy(self, dummy=None):
 cdef object index=None, typ=None, ityp=None
@@ -65,16 +65,17 @@ cdef class Reducer:
 else:
 # we passed a series-like
- if hasattr(dummy,'values'):
+ if hasattr(dummy, 'values'):
 typ = type(dummy)
- index = getattr(dummy,'index',None)
+ index = getattr(dummy, 'index', None)
 dummy = dummy.values
 if dummy.dtype != self.arr.dtype:
 raise ValueError('Dummy array must be same dtype')
 if len(dummy) != self.chunksize:
- raise ValueError('Dummy array must be length %d' % self.chunksize)
+ raise ValueError('Dummy array must be length %d' %
+ self.chunksize)
 return dummy, typ, index, ityp
@@ -111,15 +112,16 @@ cdef class Reducer:
 if self.typ is not None:
- # recreate with the index if supplied
- if has_index:
+ # recreate with the index if supplied
+ if has_index:
- cached_typ = self.typ(chunk, index=self.index, name=name)
+ cached_typ = self.typ(
+ chunk, index=self.index, name=name)
- else:
+ else:
- # use the passsed typ, sans index
- cached_typ = self.typ(chunk, name=name)
+ # use the passed typ, sans index
+ cached_typ = self.typ(chunk, name=name)
 # use the cached_typ if possible
 if cached_typ is not None:
@@ -127,13 +129,15 @@ cdef class Reducer:
 if has_index:
 object.__setattr__(cached_typ, 'index', self.index)
- object.__setattr__(cached_typ._data._block, 'values', chunk)
+ object.__setattr__(
+ cached_typ._data._block, 'values', chunk)
 object.__setattr__(cached_typ, 'name', name)
 res = self.f(cached_typ)
 else:
 res = self.f(chunk)
- if hasattr(res,'values') and isinstance(res.values, np.ndarray):
+ if hasattr(res, 'values') and isinstance(
+ res.values, np.ndarray):
 res = res.values
 if i == 0:
 result = _get_result_array(res,
@@ -167,7 +171,8 @@ cdef class SeriesBinGrouper:
 bint passed_dummy
 cdef public:
- object arr, index, dummy_arr, dummy_index, values, f, bins, typ, ityp, name
+ object arr, index, dummy_arr, dummy_index
+ object values, f, bins, typ, ityp, name
 def __init__(self, object series, object f, object bins, object dummy):
 n = len(series)
@@ -182,7 +187,7 @@ cdef class SeriesBinGrouper:
 self.typ = series._constructor
 self.ityp = series.index._constructor
 self.index = series.index.values
- self.name = getattr(series,'name',None)
+ self.name = getattr(series, 'name', None)
 self.dummy_arr, self.dummy_index = self._check_dummy(dummy)
 self.passed_dummy = dummy is not None
@@ -205,7 +210,7 @@ cdef class SeriesBinGrouper:
 raise ValueError('Dummy array must be same dtype')
 if not values.flags.contiguous:
 values = values.copy()
- index = dummy.index.values
+ index = dummy.index.values
 if not index.flags.contiguous:
 index = index.copy()
@@ -227,9 +232,9 @@ cdef class SeriesBinGrouper:
 counts[0] = self.bins[0]
 for i in range(1, self.ngroups):
 if i == self.ngroups - 1:
- counts[i] = len(self.arr) - self.bins[i-1]
+ counts[i] = len(self.arr) - self.bins[i - 1]
 else:
- counts[i] = self.bins[i] - self.bins[i-1]
+ counts[i] = self.bins[i] - self.bins[i - 1]
 group_size = 0
 n = len(self.arr)
@@ -252,7 +257,8 @@ cdef class SeriesBinGrouper:
 else:
 object.__setattr__(cached_ityp, '_data', islider.buf)
 cached_ityp._engine.clear_mapping()
- object.__setattr__(cached_typ._data._block, 'values', vslider.buf)
+ object.__setattr__(
+ cached_typ._data._block, 'values', vslider.buf)
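 # NB: the sliding index/value buffers (islider/vslider) are swapped into
 # the cached Index/Series objects in place, so no new Series has to be
 # allocated for each group.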
object.__setattr__(cached_typ, '_index', cached_ityp) object.__setattr__(cached_typ, 'name', name) @@ -293,7 +299,8 @@ cdef class SeriesGrouper: bint passed_dummy cdef public: - object arr, index, dummy_arr, dummy_index, f, labels, values, typ, ityp, name + object arr, index, dummy_arr, dummy_index + object f, labels, values, typ, ityp, name def __init__(self, object series, object f, object labels, Py_ssize_t ngroups, object dummy): @@ -309,7 +316,7 @@ cdef class SeriesGrouper: self.typ = series._constructor self.ityp = series.index._constructor self.index = series.index.values - self.name = getattr(series,'name',None) + self.name = getattr(series, 'name', None) self.dummy_arr, self.dummy_index = self._check_dummy(dummy) self.passed_dummy = dummy is not None @@ -320,14 +327,14 @@ cdef class SeriesGrouper: if dummy is None: values = np.empty(0, dtype=self.arr.dtype) - index = None + index = None else: values = dummy.values if dummy.dtype != self.arr.dtype: raise ValueError('Dummy array must be same dtype') if not values.flags.contiguous: values = values.copy() - index = dummy.index.values + index = dummy.index.values if not index.flags.contiguous: index = index.copy() @@ -375,7 +382,8 @@ cdef class SeriesGrouper: else: object.__setattr__(cached_ityp, '_data', islider.buf) cached_ityp._engine.clear_mapping() - object.__setattr__(cached_typ._data._block, 'values', vslider.buf) + object.__setattr__( + cached_typ._data._block, 'values', vslider.buf) object.__setattr__(cached_typ, '_index', cached_ityp) object.__setattr__(cached_typ, 'name', name) @@ -411,14 +419,14 @@ cdef class SeriesGrouper: cdef inline _extract_result(object res): """ extract the result object, it might be a 0-dim ndarray or a len-1 0-dim, or a scalar """ - if hasattr(res,'values'): - res = res.values + if hasattr(res, 'values'): + res = res.values if not np.isscalar(res): - if isinstance(res, np.ndarray): - if res.ndim == 0: - res = res.item() - elif res.ndim == 1 and len(res) == 1: - res = res[0] + if isinstance(res, np.ndarray): + if res.ndim == 0: + res = res.item() + elif res.ndim == 1 and len(res) == 1: + res = res[0] return res cdef class Slider: @@ -467,9 +475,11 @@ cdef class Slider: self.buf.data = self.orig_data self.buf.strides[0] = self.orig_stride + class InvalidApply(Exception): pass + def apply_frame_axis0(object frame, object f, object names, ndarray[int64_t] starts, ndarray[int64_t] ends): cdef: @@ -482,7 +492,6 @@ def apply_frame_axis0(object frame, object f, object names, if frame.index._has_complex_internals: raise InvalidApply('Cannot modify frame index internals') - results = [] # Need to infer if our low-level mucking is going to cause a segfault @@ -496,7 +505,6 @@ def apply_frame_axis0(object frame, object f, object names, except: raise InvalidApply('Let this error raise above us') - slider = BlockSlider(frame) mutated = False @@ -550,7 +558,8 @@ cdef class BlockSlider: util.set_array_not_contiguous(x) self.nblocks = len(self.blocks) - self.idx_slider = Slider(self.frame.index.values, self.dummy.index.values) + self.idx_slider = Slider( + self.frame.index.values, self.dummy.index.values) self.base_ptrs = malloc(sizeof(char*) * len(self.blocks)) for i, block in enumerate(self.blocks): @@ -574,7 +583,7 @@ cdef class BlockSlider: # move and set the index self.idx_slider.move(start, end) - object.__setattr__(self.index,'_data',self.idx_slider.buf) + object.__setattr__(self.index, '_data', self.idx_slider.buf) self.index._engine.clear_mapping() cdef reset(self): @@ -589,6 +598,7 @@ cdef class 
BlockSlider: arr.data = self.base_ptrs[i] arr.shape[1] = 0 + def reduce(arr, f, axis=0, dummy=None, labels=None): """ @@ -606,7 +616,7 @@ def reduce(arr, f, axis=0, dummy=None, labels=None): raise Exception('Cannot use shortcut') # pass as an ndarray - if hasattr(labels,'values'): + if hasattr(labels, 'values'): labels = labels.values reducer = Reducer(arr, f, axis=axis, dummy=dummy, labels=labels) diff --git a/pandas/src/skiplist.pyx b/pandas/src/skiplist.pyx index e7db7bd5a4a02..3017931e25115 100644 --- a/pandas/src/skiplist.pyx +++ b/pandas/src/skiplist.pyx @@ -75,7 +75,6 @@ cdef class IndexableSkiplist: i -= node.width[level] node = node.next[level] - return node.value cpdef insert(self, double value): diff --git a/pandas/src/sparse.pyx b/pandas/src/sparse.pyx index 88eb4cf13815b..7ab29414499fc 100644 --- a/pandas/src/sparse.pyx +++ b/pandas/src/sparse.pyx @@ -20,7 +20,7 @@ _np_version_under1p11 = LooseVersion(_np_version) < '1.11' np.import_array() np.import_ufunc() -#------------------------------------------------------------------------------- +# ----------------------------------------------------------------------------- # Preamble stuff cdef float64_t NaN = np.NaN @@ -29,7 +29,7 @@ cdef float64_t INF = np.inf cdef inline int int_max(int a, int b): return a if a >= b else b cdef inline int int_min(int a, int b): return a if a <= b else b -#------------------------------------------------------------------------------- +# ----------------------------------------------------------------------------- cdef class SparseIndex: @@ -112,7 +112,8 @@ cdef class IntIndex(SparseIndex): xindices = self.indices yindices = y.indices - new_indices = np.empty(min(len(xindices), len(yindices)), dtype=np.int32) + new_indices = np.empty(min( + len(xindices), len(yindices)), dtype=np.int32) for xi from 0 <= xi < self.npoints: xind = xindices[xi] @@ -171,7 +172,8 @@ cdef class IntIndex(SparseIndex): return -1 @cython.wraparound(False) - cpdef ndarray[int32_t] lookup_array(self, ndarray[int32_t, ndim=1] indexer): + cpdef ndarray[int32_t] lookup_array(self, ndarray[ + int32_t, ndim=1] indexer): """ Vectorized lookup, returns ndarray[int32_t] """ @@ -279,7 +281,7 @@ cpdef get_blocks(ndarray[int32_t, ndim=1] indices): lens = lens[:result_indexer] return locs, lens -#------------------------------------------------------------------------------- +# ----------------------------------------------------------------------------- # BlockIndex cdef class BlockIndex(SparseIndex): @@ -350,7 +352,7 @@ cdef class BlockIndex(SparseIndex): for i from 0 <= i < self.nblocks: if i > 0: - if blocs[i] <= blocs[i-1]: + if blocs[i] <= blocs[i - 1]: raise ValueError('Locations not in ascending order') if i < self.nblocks - 1: @@ -524,7 +526,8 @@ cdef class BlockIndex(SparseIndex): return -1 @cython.wraparound(False) - cpdef ndarray[int32_t] lookup_array(self, ndarray[int32_t, ndim=1] indexer): + cpdef ndarray[int32_t] lookup_array(self, ndarray[ + int32_t, ndim=1] indexer): """ Vectorized lookup, returns ndarray[int32_t] """ @@ -642,7 +645,8 @@ cdef class BlockUnion(BlockMerge): cdef _make_merged_blocks(self): cdef: - ndarray[int32_t, ndim=1] xstart, xend, ystart, yend, out_bloc, out_blen + ndarray[int32_t, ndim=1] xstart, xend, ystart + ndarray[int32_t, ndim=1] yend, out_bloc, out_blen int32_t nstart, nend, diff Py_ssize_t max_len, result_indexer = 0 @@ -752,14 +756,13 @@ cdef class BlockUnion(BlockMerge): return self._find_next_block_end(1 - mode) 
-#------------------------------------------------------------------------------- +# ----------------------------------------------------------------------------- # Sparse arithmetic include "sparse_op_helper.pxi" - -#------------------------------------------------------------------------------- +# ----------------------------------------------------------------------------- # Indexing operations def get_reindexer(ndarray[object, ndim=1] values, dict index_map): diff --git a/pandas/src/testing.pyx b/pandas/src/testing.pyx index e9563d9168206..cda21ba9c4ce1 100644 --- a/pandas/src/testing.pyx +++ b/pandas/src/testing.pyx @@ -68,13 +68,14 @@ cpdef assert_almost_equal(a, b, b : object check_less_precise : bool or int, default False Specify comparison precision. - 5 digits (False) or 3 digits (True) after decimal points are compared. - If an integer, then this will be the number of decimal points to compare + 5 digits (False) or 3 digits (True) after decimal points are + compared. If an integer, then this will be the number of decimal + points to compare check_dtype: bool, default True check dtype if both a and b are np.ndarray obj : str, default None - Specify object name being compared, internally used to show appropriate - assertion message + Specify object name being compared, internally used to show + appropriate assertion message lobj : str, default None Specify left object name being compared, internally used to show appropriate assertion message @@ -129,8 +130,9 @@ cpdef assert_almost_equal(a, b, na, nb = a.size, b.size if a.shape != b.shape: from pandas.util.testing import raise_assert_detail - raise_assert_detail(obj, '{0} shapes are different'.format(obj), - a.shape, b.shape) + raise_assert_detail( + obj, '{0} shapes are different'.format(obj), + a.shape, b.shape) if check_dtype and not is_dtype_equal(a, b): from pandas.util.testing import assert_attr_equal @@ -148,7 +150,7 @@ cpdef assert_almost_equal(a, b, from pandas.util.testing import raise_assert_detail # if we have a small diff set, print it - if abs(na-nb) < 10: + if abs(na - nb) < 10: r = list(set(a) ^ set(b)) else: r = None @@ -158,14 +160,16 @@ cpdef assert_almost_equal(a, b, for i in xrange(len(a)): try: - assert_almost_equal(a[i], b[i], check_less_precise=check_less_precise) + assert_almost_equal(a[i], b[i], + check_less_precise=check_less_precise) except AssertionError: is_unequal = True diff += 1 if is_unequal: from pandas.util.testing import raise_assert_detail - msg = '{0} values are different ({1} %)'.format(obj, np.round(diff * 100.0 / na, 5)) + msg = '{0} values are different ({1} %)'.format( + obj, np.round(diff * 100.0 / na, 5)) raise_assert_detail(obj, msg, lobj, robj) return True @@ -198,12 +202,12 @@ cpdef assert_almost_equal(a, b, # case for zero if abs(fa) < 1e-5: if not decimal_almost_equal(fa, fb, decimal): - assert False, ( - '(very low values) expected %.5f but got %.5f, with decimal %d' % (fb, fa, decimal) - ) + assert False, ('(very low values) expected %.5f but ' + 'got %.5f, with decimal %d' % (fb, fa, decimal)) else: if not decimal_almost_equal(1, fb / fa, decimal): - assert False, 'expected %.5f but got %.5f, with decimal %d' % (fb, fa, decimal) + assert False, ('expected %.5f but got %.5f, ' + 'with decimal %d' % (fb, fa, decimal)) return True raise AssertionError("{0} != {1}".format(a, b)) diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index c9e85c5741410..9073ad0abd535 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -36,7 +36,8 @@ cdef extern from "datetime_helper.h": from datetime 
cimport cmp_pandas_datetimestruct from libc.stdlib cimport free -from util cimport is_integer_object, is_float_object, is_datetime64_object, is_timedelta64_object +from util cimport (is_integer_object, is_float_object, is_datetime64_object, + is_timedelta64_object) cimport util from datetime cimport * @@ -49,8 +50,10 @@ from datetime import time as datetime_time import re # dateutil compat -from dateutil.tz import (tzoffset, tzlocal as _dateutil_tzlocal, tzfile as _dateutil_tzfile, - tzutc as _dateutil_tzutc, tzstr as _dateutil_tzstr) +from dateutil.tz import (tzoffset, tzlocal as _dateutil_tzlocal, + tzfile as _dateutil_tzfile, + tzutc as _dateutil_tzutc, + tzstr as _dateutil_tzstr) from pandas.compat import is_platform_windows if is_platform_windows(): @@ -61,7 +64,8 @@ from dateutil.relativedelta import relativedelta from dateutil.parser import DEFAULTPARSER from pytz.tzinfo import BaseTzInfo as _pytz_BaseTzInfo -from pandas.compat import parse_date, string_types, iteritems, StringIO, callable +from pandas.compat import (parse_date, string_types, iteritems, + StringIO, callable) import operator import collections @@ -89,8 +93,10 @@ try: except NameError: # py3 basestring = str -cdef inline object create_timestamp_from_ts(int64_t value, pandas_datetimestruct dts, - object tz, object freq): + +cdef inline object create_timestamp_from_ts( + int64_t value, pandas_datetimestruct dts, + object tz, object freq): cdef _Timestamp ts_base ts_base = _Timestamp.__new__(Timestamp, dts.year, dts.month, dts.day, dts.hour, dts.min, @@ -101,13 +107,17 @@ cdef inline object create_timestamp_from_ts(int64_t value, pandas_datetimestruct return ts_base -cdef inline object create_datetime_from_ts(int64_t value, pandas_datetimestruct dts, - object tz, object freq): + +cdef inline object create_datetime_from_ts( + int64_t value, pandas_datetimestruct dts, + object tz, object freq): return datetime(dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec, dts.us, tz) + def ints_to_pydatetime(ndarray[int64_t] arr, tz=None, freq=None, box=False): - # convert an i8 repr to an ndarray of datetimes or Timestamp (if box == True) + # convert an i8 repr to an ndarray of datetimes or Timestamp (if box == + # True) cdef: Py_ssize_t i, n = len(arr) @@ -133,7 +143,8 @@ def ints_to_pydatetime(ndarray[int64_t] arr, tz=None, freq=None, box=False): if value == NPY_NAT: result[i] = NaT else: - pandas_datetime_to_datetimestruct(value, PANDAS_FR_ns, &dts) + pandas_datetime_to_datetimestruct( + value, PANDAS_FR_ns, &dts) result[i] = func_create(value, dts, tz, freq) elif _is_tzlocal(tz) or _is_fixed_offset(tz): for i in range(n): @@ -141,7 +152,8 @@ def ints_to_pydatetime(ndarray[int64_t] arr, tz=None, freq=None, box=False): if value == NPY_NAT: result[i] = NaT else: - pandas_datetime_to_datetimestruct(value, PANDAS_FR_ns, &dts) + pandas_datetime_to_datetimestruct( + value, PANDAS_FR_ns, &dts) dt = create_datetime_from_ts(value, dts, tz, freq) dt = dt + tz.utcoffset(dt) if box: @@ -163,10 +175,12 @@ def ints_to_pydatetime(ndarray[int64_t] arr, tz=None, freq=None, box=False): # find right representation of dst etc in pytz timezone new_tz = tz._tzinfos[tz._transition_info[pos]] else: - # no zone-name change for dateutil tzs - dst etc represented in single object. + # no zone-name change for dateutil tzs - dst etc + # represented in single object. 
new_tz = tz
- pandas_datetime_to_datetimestruct(value + deltas[pos], PANDAS_FR_ns, &dts)
+ pandas_datetime_to_datetimestruct(
+ value + deltas[pos], PANDAS_FR_ns, &dts)
 result[i] = func_create(value, dts, new_tz, freq)
 else:
 for i in range(n):
@@ -180,8 +194,10 @@ def ints_to_pydatetime(ndarray[int64_t] arr, tz=None, freq=None, box=False):
 return result
+
 def ints_to_pytimedelta(ndarray[int64_t] arr, box=False):
- # convert an i8 repr to an ndarray of timedelta or Timedelta (if box == True)
+ # convert an i8 repr to an ndarray of timedelta or Timedelta (if box ==
+ # True)
 cdef:
 Py_ssize_t i, n = len(arr)
@@ -197,7 +213,7 @@ def ints_to_pytimedelta(ndarray[int64_t] arr, box=False):
 if box:
 result[i] = Timedelta(value)
 else:
- result[i] = timedelta(microseconds=int(value)/1000)
+ result[i] = timedelta(microseconds=int(value) / 1000)
 return result
@@ -205,6 +221,7 @@ def ints_to_pytimedelta(ndarray[int64_t] arr, box=False):
 cdef inline bint _is_tzlocal(object tz):
 return isinstance(tz, _dateutil_tzlocal)
+
 cdef inline bint _is_fixed_offset(object tz):
 if _treat_tz_as_dateutil(tz):
 if len(tz._trans_idx) == 0 and len(tz._trans_list) == 0:
@@ -212,7 +229,8 @@ cdef inline bint _is_fixed_offset(object tz):
 else:
 return 0
 elif _treat_tz_as_pytz(tz):
- if len(tz._transition_info) == 0 and len(tz._utc_transition_times) == 0:
+ if (len(tz._transition_info) == 0
+ and len(tz._utc_transition_times) == 0):
 return 1
 else:
 return 0
@@ -223,6 +241,8 @@ _no_input = object()
 # Python front end to C extension type _Timestamp
 # This serves as the box for datetime64
+
+
 class Timestamp(_Timestamp):
 """TimeStamp is the pandas equivalent of python's Datetime
 and is interchangeable with it in most cases. It's the type used
@@ -281,7 +301,8 @@ class Timestamp(_Timestamp):
 offset : str, DateOffset
 Deprecated, use freq
 """
- return cls(datetime.fromordinal(ordinal), freq=freq, tz=tz, offset=offset)
+ return cls(datetime.fromordinal(ordinal),
+ freq=freq, tz=tz, offset=offset)
 @classmethod
 def now(cls, tz=None):
@@ -370,13 +391,16 @@ class Timestamp(_Timestamp):
 if ts_input is _no_input:
 # User passed keyword arguments.
return Timestamp(datetime(year, month, day, hour or 0, - minute or 0, second or 0, microsecond or 0, tzinfo), - tz=tzinfo) + minute or 0, second or 0, + microsecond or 0, tzinfo), + tz=tzinfo) elif is_integer_object(freq): # User passed positional arguments: - # Timestamp(year, month, day[, hour[, minute[, second[, microsecond[, tzinfo]]]]]) + # Timestamp(year, month, day[, hour[, minute[, second[, + # microsecond[, tzinfo]]]]]) return Timestamp(datetime(ts_input, freq, tz, unit or 0, - year or 0, month or 0, day or 0, hour), tz=hour) + year or 0, month or 0, day or 0, + hour), tz=hour) ts = convert_to_tsobject(ts_input, tz, unit, 0, 0) @@ -399,7 +423,6 @@ class Timestamp(_Timestamp): return ts_base - def _round(self, freq, rounder): cdef int64_t unit @@ -411,7 +434,7 @@ class Timestamp(_Timestamp): value = self.tz_localize(None).value else: value = self.value - result = Timestamp(unit*rounder(value/float(unit)),unit='ns') + result = Timestamp(unit * rounder(value / float(unit)), unit='ns') if self.tz is not None: result = result.tz_localize(self.tz) return result @@ -493,7 +516,8 @@ class Timestamp(_Timestamp): @property def weekday_name(self): - out = get_date_name_field(np.array([self.value], dtype=np.int64), 'weekday_name') + out = get_date_name_field( + np.array([self.value], dtype=np.int64), 'weekday_name') return out[0] @property @@ -592,8 +616,8 @@ class Timestamp(_Timestamp): # tz naive, localize tz = maybe_get_tz(tz) if not isinstance(ambiguous, basestring): - ambiguous = [ambiguous] - value = tz_localize_to_utc(np.array([self.value],dtype='i8'), tz, + ambiguous = [ambiguous] + value = tz_localize_to_utc(np.array([self.value], dtype='i8'), tz, ambiguous=ambiguous, errors=errors)[0] return Timestamp(value, tz=tz) else: @@ -605,7 +629,6 @@ class Timestamp(_Timestamp): raise TypeError('Cannot localize tz-aware Timestamp, use ' 'tz_convert for conversions') - def tz_convert(self, tz): """ Convert tz-aware Timestamp to another time zone. @@ -677,25 +700,26 @@ class Timestamp(_Timestamp): year -= 1 month += 12 return (day + - np.fix((153*month - 457)/5) + - 365*year + + np.fix((153 * month - 457) / 5) + + 365 * year + np.floor(year / 4) - np.floor(year / 100) + np.floor(year / 400) + 1721118.5 + (self.hour + - self.minute/60.0 + - self.second/3600.0 + - self.microsecond/3600.0/1e+6 + - self.nanosecond/3600.0/1e+9 - )/24.0) + self.minute / 60.0 + + self.second / 3600.0 + + self.microsecond / 3600.0 / 1e+6 + + self.nanosecond / 3600.0 / 1e+9 + ) / 24.0) def normalize(self): """ Normalize Timestamp to midnight, preserving tz information. 
""" - normalized_value = date_normalize(np.array([self.value], dtype='i8'), tz=self.tz)[0] + normalized_value = date_normalize( + np.array([self.value], dtype='i8'), tz=self.tz)[0] return Timestamp(normalized_value).tz_localize(self.tz) def __radd__(self, other): @@ -704,7 +728,9 @@ class Timestamp(_Timestamp): return self + other -_nat_strings = set(['NaT','nat','NAT','nan','NaN','NAN']) +_nat_strings = set(['NaT', 'nat', 'NAT', 'nan', 'NaN', 'NAN']) + + class NaTType(_NaT): """(N)ot-(A)-(T)ime, the time equivalent of NaN""" @@ -762,7 +788,6 @@ class NaTType(_NaT): return NotImplemented - fields = ['year', 'quarter', 'month', 'day', 'hour', 'minute', 'second', 'millisecond', 'microsecond', 'nanosecond', 'week', 'dayofyear', 'days_in_month', 'daysinmonth', 'dayofweek', @@ -771,20 +796,23 @@ for field in fields: prop = property(fget=lambda self: np.nan) setattr(NaTType, field, prop) -# GH9513 NaT methods (except to_datetime64) to raise, return np.nan, or return NaT -# create functions that raise, for binding to NaTType + +# GH9513 NaT methods (except to_datetime64) to raise, return np.nan, or +# return NaT create functions that raise, for binding to NaTType def _make_error_func(func_name): def f(*args, **kwargs): raise ValueError("NaTType does not support " + func_name) f.__name__ = func_name return f + def _make_nat_func(func_name): def f(*args, **kwargs): return NaT f.__name__ = func_name return f + def _make_nan_func(func_name): def f(*args, **kwargs): return np.nan @@ -813,7 +841,9 @@ for _maybe_method_name in dir(NaTType): if (callable(_maybe_method) and not _maybe_method_name.startswith("_") and _maybe_method_name not in _implemented_methods): - setattr(NaTType, _maybe_method_name, _make_error_func(_maybe_method_name)) + setattr(NaTType, _maybe_method_name, + _make_error_func(_maybe_method_name)) + def __nat_unpickle(*args): # return constant defined in the module @@ -1028,9 +1058,11 @@ cdef class _Timestamp(datetime): pass tz = ", tz='{0}'".format(zone) if zone is not None else "" - freq = ", freq='{0}'".format(self.freq.freqstr) if self.freq is not None else "" + freq = ", freq='{0}'".format( + self.freq.freqstr) if self.freq is not None else "" - return "Timestamp('{stamp}'{tz}{freq})".format(stamp=stamp, tz=tz, freq=freq) + return "Timestamp('{stamp}'{tz}{freq})".format( + stamp=stamp, tz=tz, freq=freq) cdef bint _compare_outside_nanorange(_Timestamp self, datetime other, int op) except -1: @@ -1101,7 +1133,8 @@ cdef class _Timestamp(datetime): if is_timedelta64_object(other): other_int = other.astype('timedelta64[ns]').view('i8') - return Timestamp(self.value + other_int, tz=self.tzinfo, freq=self.freq) + return Timestamp(self.value + other_int, + tz=self.tzinfo, freq=self.freq) elif is_integer_object(other): if self is NaT: @@ -1114,7 +1147,8 @@ cdef class _Timestamp(datetime): elif isinstance(other, timedelta) or hasattr(other, 'delta'): nanos = _delta_to_nanoseconds(other) - result = Timestamp(self.value + nanos, tz=self.tzinfo, freq=self.freq) + result = Timestamp(self.value + nanos, + tz=self.tzinfo, freq=self.freq) if getattr(other, 'normalize', False): result = Timestamp(normalize_date(result)) return result @@ -1148,21 +1182,27 @@ cdef class _Timestamp(datetime): return NaT # coerce if necessary if we are a Timestamp-like - if isinstance(self, datetime) and (isinstance(other, datetime) or is_datetime64_object(other)): + if (isinstance(self, datetime) + and (isinstance(other, datetime) + or is_datetime64_object(other))): self = Timestamp(self) other = Timestamp(other) # 
 if get_timezone(self.tzinfo) != get_timezone(other.tzinfo):
- raise TypeError("Timestamp subtraction must have the same timezones or no timezones")
+ raise TypeError(
+ "Timestamp subtraction must have the "
+ "same timezones or no timezones")
- # scalar Timestamp/datetime - Timestamp/datetime -> yields a Timedelta
+ # scalar Timestamp/datetime - Timestamp/datetime -> yields a
+ # Timedelta
 try:
- return Timedelta(self.value-other.value)
+ return Timedelta(self.value - other.value)
 except (OverflowError, OutOfBoundsDatetime):
 pass
- # scalar Timestamp/datetime - Timedelta -> yields a Timestamp (with same timezone if specified)
+ # scalar Timestamp/datetime - Timedelta -> yields a Timestamp (with
+ # same timezone if specified)
 return datetime.__sub__(self, other)
 cpdef _get_field(self, field):
@@ -1170,9 +1210,12 @@ cdef class _Timestamp(datetime):
 return int(out[0])
 cpdef _get_start_end_field(self, field):
- month_kw = self.freq.kwds.get('startingMonth', self.freq.kwds.get('month', 12)) if self.freq else 12
+ month_kw = self.freq.kwds.get(
+ 'startingMonth', self.freq.kwds.get(
+ 'month', 12)) if self.freq else 12
 freqstr = self.freqstr if self.freq else None
- out = get_start_end_field(np.array([self.value], dtype=np.int64), field, freqstr, month_kw)
+ out = get_start_end_field(
+ np.array([self.value], dtype=np.int64), field, freqstr, month_kw)
 return out[0]
 property _repr_base:
@@ -1361,19 +1404,20 @@ cdef convert_to_tsobject(object ts, object tz, object unit,
 obj.value = NPY_NAT
 else:
 obj.value = _get_datetime64_nanos(ts)
- pandas_datetime_to_datetimestruct(obj.value, PANDAS_FR_ns, &obj.dts)
+ pandas_datetime_to_datetimestruct(
+ obj.value, PANDAS_FR_ns, &obj.dts)
 elif is_integer_object(ts):
 if ts == NPY_NAT:
 obj.value = NPY_NAT
 else:
- ts = ts * cast_from_unit(None,unit)
+ ts = ts * cast_from_unit(None, unit)
 obj.value = ts
 pandas_datetime_to_datetimestruct(ts, PANDAS_FR_ns, &obj.dts)
 elif util.is_float_object(ts):
 if ts != ts or ts == NPY_NAT:
 obj.value = NPY_NAT
 else:
- ts = cast_from_unit(ts,unit)
+ ts = cast_from_unit(ts, unit)
 obj.value = ts
 pandas_datetime_to_datetimestruct(ts, PANDAS_FR_ns, &obj.dts)
 elif PyDateTime_Check(ts):
@@ -1424,7 +1468,9 @@ cdef convert_to_tsobject(object ts, object tz, object unit,
 ts = datetime.combine(ts, datetime_time())
 return convert_to_tsobject(ts, tz, None, 0, 0)
 elif getattr(ts, '_typ', None) == 'period':
- raise ValueError("Cannot convert Period to Timestamp unambiguously. Use to_timestamp")
+ raise ValueError(
+ "Cannot convert Period to Timestamp "
+ "unambiguously.
Use to_timestamp") else: raise TypeError('Cannot convert input to Timestamp') @@ -1465,7 +1511,8 @@ cpdef convert_str_to_tsobject(object ts, object tz, object unit, else: try: _string_to_dts(ts, &obj.dts, &out_local, &out_tzoffset) - obj.value = pandas_datetimestruct_to_datetime(PANDAS_FR_ns, &obj.dts) + obj.value = pandas_datetimestruct_to_datetime( + PANDAS_FR_ns, &obj.dts) _check_dts_bounds(&obj.dts) if out_local == 1: obj.tzinfo = pytz.FixedOffset(out_tzoffset) @@ -1483,12 +1530,14 @@ cpdef convert_str_to_tsobject(object ts, object tz, object unit, ts = tz_convert_single(ts, tz, 'UTC') except ValueError: try: - ts = parse_datetime_string(ts, dayfirst=dayfirst, yearfirst=yearfirst) + ts = parse_datetime_string( + ts, dayfirst=dayfirst, yearfirst=yearfirst) except Exception: raise ValueError("could not convert string to Timestamp") return convert_to_tsobject(ts, tz, unit, dayfirst, yearfirst) + def _test_parse_iso8601(object ts): """ TESTING ONLY: Parse string into Timestamp using iso8601 parser. Used @@ -1534,7 +1583,6 @@ cdef inline void _localize_tso(_TSObject obj, object tz): pos = trans.searchsorted(obj.value, side='right') - 1 - # static/pytz/dateutil specific code if _is_fixed_offset(tz): # statictzinfo @@ -1542,7 +1590,8 @@ cdef inline void _localize_tso(_TSObject obj, object tz): pandas_datetime_to_datetimestruct(obj.value + deltas[0], PANDAS_FR_ns, &obj.dts) else: - pandas_datetime_to_datetimestruct(obj.value, PANDAS_FR_ns, &obj.dts) + pandas_datetime_to_datetimestruct( + obj.value, PANDAS_FR_ns, &obj.dts) obj.tzinfo = tz elif _treat_tz_as_pytz(tz): inf = tz._transition_info[pos] @@ -1591,21 +1640,29 @@ cdef inline bint _is_utc(object tz): cdef inline object _get_zone(object tz): """ We need to do several things here: - 1/ Distinguish between pytz and dateutil timezones - 2/ Not be over-specific (e.g. US/Eastern with/without DST is same *zone* but a different tz object) - 3/ Provide something to serialize when we're storing a datetime object in pytables. - - We return a string prefaced with dateutil if it's a dateutil tz, else just the tz name. It needs to be a - string so that we can serialize it with UJSON/pytables. maybe_get_tz (below) is the inverse of this process. + 1) Distinguish between pytz and dateutil timezones + 2) Not be over-specific (e.g. US/Eastern with/without DST is same *zone* + but a different tz object) + 3) Provide something to serialize when we're storing a datetime object + in pytables. + + We return a string prefaced with dateutil if it's a dateutil tz, else just + the tz name. It needs to be a string so that we can serialize it with + UJSON/pytables. maybe_get_tz (below) is the inverse of this process. """ if _is_utc(tz): return 'UTC' else: if _treat_tz_as_dateutil(tz): if '.tar.gz' in tz._filename: - raise ValueError('Bad tz filename. Dateutil on python 3 on windows has a bug which causes tzfile._filename to be the same for all ' - 'timezone files. Please construct dateutil timezones implicitly by passing a string like "dateutil/Europe/London" ' - 'when you construct your pandas objects instead of passing a timezone object. See https://github.com/pydata/pandas/pull/7362') + raise ValueError( + 'Bad tz filename. Dateutil on python 3 on windows has a ' + 'bug which causes tzfile._filename to be the same for all ' + 'timezone files. Please construct dateutil timezones ' + 'implicitly by passing a string like "dateutil/Europe' + '/London" when you construct your pandas objects instead ' + 'of passing a timezone object. 
See '
+ 'https://github.com/pydata/pandas/pull/7362')
 return 'dateutil/' + tz._filename
 else:
 # tz is a pytz timezone or unknown.
@@ -1620,8 +1677,8 @@ cdef inline object _get_zone(object tz):
 cpdef inline object maybe_get_tz(object tz):
 """
- (Maybe) Construct a timezone object from a string. If tz is a string, use it to construct a timezone object.
- Otherwise, just return tz.
+ (Maybe) Construct a timezone object from a string. If tz is a string, use
+ it to construct a timezone object. Otherwise, just return tz.
 """
 if isinstance(tz, string_types):
 if tz == 'tzlocal()':
@@ -1639,7 +1696,6 @@ cpdef inline object maybe_get_tz(object tz):
 return tz
-
 class OutOfBoundsDatetime(ValueError):
 pass
@@ -1659,7 +1715,8 @@ cdef inline _check_dts_bounds(pandas_datetimestruct *dts):
 dts.day, dts.hour,
 dts.min, dts.sec)
- raise OutOfBoundsDatetime('Out of bounds nanosecond timestamp: %s' % fmt)
+ raise OutOfBoundsDatetime(
+ 'Out of bounds nanosecond timestamp: %s' % fmt)
 def datetime_to_datetime64(ndarray[object] values):
@@ -1689,7 +1746,8 @@ def datetime_to_datetime64(ndarray[object] values):
 _check_dts_bounds(&_ts.dts)
 else:
 if inferred_tz is not None:
- raise ValueError('Cannot mix tz-aware with tz-naive values')
+ raise ValueError(
+ 'Cannot mix tz-aware with tz-naive values')
 iresult[i] = _pydatetime_to_dts(val, &dts)
 _check_dts_bounds(&dts)
 else:
@@ -1698,7 +1756,7 @@ def datetime_to_datetime64(ndarray[object] values):
 return result, inferred_tz
 cdef:
- set _not_datelike_strings = set(['a','A','m','M','p','P','t','T'])
+ set _not_datelike_strings = set(['a', 'A', 'm', 'M', 'p', 'P', 't', 'T'])
 cpdef bint _does_string_look_like_datetime(object date_string):
 if date_string.startswith('0'):
@@ -1742,7 +1800,7 @@ def format_array_from_datetime(ndarray[int64_t] values, object tz=None,
 pandas_datetimestruct dts
 if na_rep is None:
- na_rep = 'NaT'
+ na_rep = 'NaT'
 # if we don't have a format nor tz, then choose
 # a format based on precision
@@ -1780,7 +1838,7 @@ def format_array_from_datetime(ndarray[int64_t] values, object tz=None,
 elif show_us:
 res += '.%.6d' % dts.us
 elif show_ms:
- res += '.%.3d' % (dts.us/1000)
+ res += '.%.3d' % (dts.us / 1000)
 result[i] = res
@@ -1810,7 +1868,6 @@ cdef object _TIMEPAT = re.compile(r'^([01]?[0-9]|2[0-3]):([0-5][0-9])')
 def parse_datetime_string(object date_string, object freq=None,
 dayfirst=False, yearfirst=False, **kwargs):
-
 """parse datetime string, only returns datetime. Also handles
 special matching for time patterns.
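For context, a minimal sketch of the string/tzinfo round trip implemented by _get_zone and maybe_get_tz above (illustrative only; assumes the pandas.tslib module layout of this era, and the zone names are examples):

    from pandas import tslib

    eastern = tslib.maybe_get_tz('US/Eastern')             # string -> pytz zone
    assert tslib.maybe_get_tz(eastern) is eastern          # tzinfo passes through
    london = tslib.maybe_get_tz('dateutil/Europe/London')  # 'dateutil/' prefix -> dateutil zone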
@@ -1913,23 +1970,27 @@ cdef inline object _parse_dateabbr_string(object date_string, object default, i = date_string.index('Q', 1, 6) if i == 1: quarter = int(date_string[0]) - if date_len == 4 or (date_len == 5 and date_string[i + 1] == '-'): + if date_len == 4 or (date_len == 5 + and date_string[i + 1] == '-'): # r'(\d)Q-?(\d\d)') year = 2000 + int(date_string[-2:]) - elif date_len == 6 or (date_len == 7 and date_string[i + 1] == '-'): + elif date_len == 6 or (date_len == 7 + and date_string[i + 1] == '-'): # r'(\d)Q-?(\d\d\d\d)') year = int(date_string[-4:]) else: raise ValueError elif i == 2 or i == 3: # r'(\d\d)-?Q(\d)' - if date_len == 4 or (date_len == 5 and date_string[i - 1] == '-'): + if date_len == 4 or (date_len == 5 + and date_string[i - 1] == '-'): quarter = int(date_string[-1]) year = 2000 + int(date_string[:2]) else: raise ValueError elif i == 4 or i == 5: - if date_len == 6 or (date_len == 7 and date_string[i - 1] == '-'): + if date_len == 6 or (date_len == 7 + and date_string[i - 1] == '-'): # r'(\d\d\d\d)-?Q(\d)' quarter = int(date_string[-1]) year = int(date_string[:4]) @@ -1937,7 +1998,8 @@ cdef inline object _parse_dateabbr_string(object date_string, object default, raise ValueError if not (1 <= quarter <= 4): - msg = 'Incorrect quarterly string is given, quarter must be between 1 and 4: {0}' + msg = ('Incorrect quarterly string is given, quarter must be ' + 'between 1 and 4: {0}') raise DateParseError(msg.format(date_string)) if freq is not None: @@ -1945,7 +2007,8 @@ cdef inline object _parse_dateabbr_string(object date_string, object default, try: mnum = _MONTH_NUMBERS[_get_rule_month(freq)] + 1 except (KeyError, ValueError): - msg = 'Unable to retrieve month information from given freq: {0}'.format(freq) + msg = ('Unable to retrieve month information from given ' + 'freq: {0}').format(freq) raise DateParseError(msg) month = (mnum + (quarter - 1) * 3) % 12 + 1 @@ -1962,7 +2025,8 @@ cdef inline object _parse_dateabbr_string(object date_string, object default, except ValueError: pass - if date_len == 6 and (freq == 'M' or getattr(freq, 'rule_code', None) == 'M'): + if date_len == 6 and (freq == 'M' or getattr( + freq, 'rule_code', None) == 'M'): year = int(date_string[:4]) month = int(date_string[4:6]) try: @@ -2048,7 +2112,8 @@ def dateutil_parse(object timestr, object default, ignoretz=False, # const for parsers -_DEFAULT_DATETIME = datetime(1, 1, 1).replace(hour=0, minute=0, second=0, microsecond=0) +_DEFAULT_DATETIME = datetime(1, 1, 1).replace( + hour=0, minute=0, second=0, microsecond=0) _MONTHS = ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC'] _MONTH_NUMBERS = dict((k, i) for i, k in enumerate(_MONTHS)) @@ -2092,7 +2157,9 @@ cpdef array_with_unit_to_datetime(ndarray values, unit, errors='coerce'): int64_t m ndarray[float64_t] fvalues ndarray mask - bint is_ignore=errors=='ignore', is_coerce=errors=='coerce', is_raise=errors=='raise' + bint is_ignore = errors=='ignore' + bint is_coerce = errors=='coerce' + bint is_raise = errors=='raise' bint need_to_iterate=True ndarray[int64_t] iresult ndarray[object] oresult @@ -2123,9 +2190,11 @@ cpdef array_with_unit_to_datetime(ndarray values, unit, errors='coerce'): # check the bounds if not need_to_iterate: - if (fvalues < _NS_LOWER_BOUND).any() or (fvalues > _NS_UPPER_BOUND).any(): - raise OutOfBoundsDatetime("cannot convert input with unit '{0}'".format(unit)) - result = (iresult*m).astype('M8[ns]') + if ((fvalues < _NS_LOWER_BOUND).any() + or (fvalues > _NS_UPPER_BOUND).any()): + 
raise OutOfBoundsDatetime(
+                    "cannot convert input with unit '{0}'".format(unit))
+            result = (iresult * m).astype('M8[ns]')
             iresult = result.view('i8')
             iresult[mask] = iNaT
             return result
@@ -2149,10 +2218,9 @@ cpdef array_with_unit_to_datetime(ndarray values, unit, errors='coerce'):
                     iresult[i] = cast_from_unit(val, unit)
                 except OverflowError:
                     if is_raise:
-                        raise OutOfBoundsDatetime("cannot convert input {0}"
-                                                  "with the unit '{1}'".format(
-                                                      val,
-                                                      unit))
+                        raise OutOfBoundsDatetime(
+                            "cannot convert input {0} with the unit "
+                            "'{1}'".format(val, unit))
                     elif is_ignore:
                         raise AssertionError
                     iresult[i] = NPY_NAT
@@ -2166,19 +2234,17 @@ cpdef array_with_unit_to_datetime(ndarray values, unit, errors='coerce'):
                         iresult[i] = cast_from_unit(float(val), unit)
                     except ValueError:
                         if is_raise:
-                            raise ValueError("non convertible value {0}"
-                                             "with the unit '{1}'".format(
-                                                 val,
-                                                 unit))
+                            raise ValueError(
+                                "non convertible value {0} with the unit "
+                                "'{1}'".format(val, unit))
                         elif is_ignore:
                             raise AssertionError
                         iresult[i] = NPY_NAT
                 except:
                     if is_raise:
-                        raise OutOfBoundsDatetime("cannot convert input {0}"
-                                                  "with the unit '{1}'".format(
-                                                      val,
-                                                      unit))
+                        raise OutOfBoundsDatetime(
+                            "cannot convert input {0} with the unit "
+                            "'{1}'".format(val, unit))
                     elif is_ignore:
                         raise AssertionError
                     iresult[i] = NPY_NAT
@@ -2240,8 +2306,13 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise',
         ndarray[int64_t] iresult
         ndarray[object] oresult
         pandas_datetimestruct dts
-        bint utc_convert = bool(utc), seen_integer=0, seen_string=0, seen_datetime=0
-        bint is_raise=errors=='raise', is_ignore=errors=='ignore', is_coerce=errors=='coerce'
+        bint utc_convert = bool(utc)
+        bint seen_integer = 0
+        bint seen_string = 0
+        bint seen_datetime = 0
+        bint is_raise = errors=='raise'
+        bint is_ignore = errors=='ignore'
+        bint is_coerce = errors=='coerce'
         _TSObject _ts
         int out_local=0, out_tzoffset=0

@@ -2340,7 +2411,8 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise',
                     seen_string=1
                     _string_to_dts(val, &dts, &out_local, &out_tzoffset)
-                    value = pandas_datetimestruct_to_datetime(PANDAS_FR_ns, &dts)
+                    value = pandas_datetimestruct_to_datetime(
+                        PANDAS_FR_ns, &dts)
                     if out_local == 1:
                         tz = pytz.FixedOffset(out_tzoffset)
                         value = tz_convert_single(value, tz, 'UTC')
@@ -2353,8 +2425,9 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise',
                         iresult[i] = NPY_NAT
                         continue
                     elif is_raise:
-                        raise ValueError("time data %r doesn't match format specified" %
-                                         (val,))
+                        raise ValueError(
+                            "time data %r doesn't match format "
+                            "specified" % (val,))
                     else:
                         return values
@@ -2398,7 +2471,8 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise',
                 if is_integer_object(val) or is_float_object(val):
                     result[i] = NPY_NAT
                 elif is_raise:
-                    raise ValueError("mixed datetimes and integers in passed array")
+                    raise ValueError(
+                        "mixed datetimes and integers in passed array")
                 else:
                     raise TypeError
@@ -2440,7 +2514,7 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise',
                 try:
                     oresult[i] = parse_datetime_string(val, dayfirst=dayfirst,
-                                                        yearfirst=yearfirst)
+                                                       yearfirst=yearfirst)
                     _pydatetime_to_dts(oresult[i], &dts)
                     _check_dts_bounds(&dts)
                 except Exception:
@@ -2456,11 +2530,10 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise',
     return oresult

-# Similar to Timestamp/datetime, this is a construction requirement for timedeltas
-# we need to do object instantiation in python
-# This will serve as a C extension type that
-# shadows the python class, where we do any heavy lifting.
-
+# Similar to Timestamp/datetime, this is a construction requirement for
+# timedeltas that we need to do object instantiation in python. This will
+# serve as a C extension type that shadows the Python class, where we do any
+# heavy lifting.
 cdef class _Timedelta(timedelta):

     cdef readonly:
@@ -2526,14 +2599,14 @@ cdef class _Timedelta(timedelta):
             return

         # put frac in seconds
-        frac = ivalue/(1000*1000*1000)
+        frac = ivalue / (1000 * 1000 * 1000)
         if frac < 0:
             self._sign = -1

             # even fraction
             if (-frac % 86400) != 0:
-                self._d = -frac/86400 + 1
-                frac += 86400*self._d
+                self._d = -frac / 86400 + 1
+                frac += 86400 * self._d
             else:
                 frac = -frac
         else:
@@ -2542,37 +2615,38 @@
         if frac >= 86400:
             self._d += frac / 86400
-            frac -= self._d * 86400
+            frac -= self._d * 86400

         if frac >= 3600:
-            self._h = frac / 3600
-            frac -= self._h * 3600
+            self._h = frac / 3600
+            frac -= self._h * 3600
         else:
             self._h = 0

         if frac >= 60:
             self._m = frac / 60
-            frac -= self._m * 60
+            frac -= self._m * 60
         else:
             self._m = 0

         if frac >= 0:
             self._s = frac
-            frac -= self._s
+            frac -= self._s
         else:
             self._s = 0

-        sfrac = (self._h*3600 + self._m*60 + self._s)*(1000*1000*1000)
+        sfrac = (self._h * 3600 + self._m * 60 +
+                 self._s) * (1000 * 1000 * 1000)

         if self._sign < 0:
-            ifrac = ivalue + self._d*DAY_NS - sfrac
+            ifrac = ivalue + self._d * DAY_NS - sfrac
         else:
-            ifrac = ivalue - (self._d*DAY_NS + sfrac)
+            ifrac = ivalue - (self._d * DAY_NS + sfrac)

         if ifrac != 0:
-            self._ms = ifrac/(1000*1000)
-            ifrac -= self._ms*1000*1000
-            self._us = ifrac/1000
-            ifrac -= self._us*1000
+            self._ms = ifrac / (1000 * 1000)
+            ifrac -= self._ms * 1000 * 1000
+            self._us = ifrac / 1000
+            ifrac -= self._us * 1000
             self._ns = ifrac
         else:
             self._ms = 0
@@ -2586,16 +2660,20 @@ cdef class _Timedelta(timedelta):
        return an actual datetime.timedelta object
        note: we lose nanosecond resolution if any
        """
-        return timedelta(microseconds=int(self.value)/1000)
+        return timedelta(microseconds=int(self.value) / 1000)

    cpdef bint _has_ns(self):
        return self.value % 1000 != 0

 # components named tuple
-Components = collections.namedtuple('Components',['days','hours','minutes','seconds','milliseconds','microseconds','nanoseconds'])
+Components = collections.namedtuple('Components', [
+    'days', 'hours', 'minutes', 'seconds',
+    'milliseconds', 'microseconds', 'nanoseconds'])

 # Python front end to C extension type _Timedelta
 # This serves as the box for timedelta64
+
+
 class Timedelta(_Timedelta):
     """
     Represents a duration, the difference between two dates or times.
@@ -2608,7 +2686,8 @@ class Timedelta(_Timedelta):
     value : Timedelta, timedelta, np.timedelta64, string, or integer
     unit : string, [D,h,m,s,ms,us,ns]
         Denote the unit of the input, if input is an integer. Default 'ns'.
-    days, seconds, microseconds, milliseconds, minutes, hours, weeks : numeric, optional
+    days, seconds, microseconds,
+    milliseconds, minutes, hours, weeks : numeric, optional
         Values for construction in compat with datetime.timedelta.
         np ints and floats will be coerced to python ints and floats.
@@ -2623,43 +2702,52 @@ class Timedelta(_Timedelta):

         if value is _no_input:
             if not len(kwargs):
-                raise ValueError("cannot construct a Timedelta without a value/unit or descriptive keywords (days,seconds....)")
+                raise ValueError(
+                    "cannot construct a Timedelta without a value/unit or "
+                    "descriptive keywords (days,seconds....)")

             def _to_py_int_float(v):
                 if is_integer_object(v):
                     return int(v)
                 elif is_float_object(v):
                     return float(v)
-                raise TypeError("Invalid type {0}.
Must be int or float.".format(type(v)))
+                raise TypeError(
+                    "Invalid type {0}. Must be int or float.".format(type(v)))

-            kwargs = dict([ (k, _to_py_int_float(v)) for k, v in iteritems(kwargs) ])
+            kwargs = dict([ (k, _to_py_int_float(v))
+                            for k, v in iteritems(kwargs) ])

             try:
-                nano = kwargs.pop('nanoseconds',0)
-                value = convert_to_timedelta64(timedelta(**kwargs),'ns') + nano
+                nano = kwargs.pop('nanoseconds', 0)
+                value = convert_to_timedelta64(
+                    timedelta(**kwargs), 'ns') + nano
             except TypeError as e:
-                raise ValueError("cannot construct a Timedelta from the passed arguments, allowed keywords are "
-                                 "[weeks, days, hours, minutes, seconds, milliseconds, microseconds, nanoseconds]")
+                raise ValueError("cannot construct a Timedelta from the "
+                                 "passed arguments, allowed keywords are "
+                                 "[weeks, days, hours, minutes, seconds, "
+                                 "milliseconds, microseconds, nanoseconds]")

         if isinstance(value, Timedelta):
             value = value.value
         elif util.is_string_object(value):
             value = np.timedelta64(parse_timedelta_string(value))
         elif isinstance(value, timedelta):
-            value = convert_to_timedelta64(value,'ns')
+            value = convert_to_timedelta64(value, 'ns')
         elif isinstance(value, np.timedelta64):
             if unit is not None:
                 value = value.astype('timedelta64[{0}]'.format(unit))
             value = value.astype('timedelta64[ns]')
-        elif hasattr(value,'delta'):
-            value = np.timedelta64(_delta_to_nanoseconds(value.delta),'ns')
+        elif hasattr(value, 'delta'):
+            value = np.timedelta64(_delta_to_nanoseconds(value.delta), 'ns')
         elif is_integer_object(value) or util.is_float_object(value):
             # unit=None is de-facto 'ns'
-            value = convert_to_timedelta64(value,unit)
+            value = convert_to_timedelta64(value, unit)
         elif _checknull_with_nat(value):
             return NaT
         else:
-            raise ValueError("Value must be Timedelta, string, integer, float, timedelta or convertible")
+            raise ValueError(
+                "Value must be Timedelta, string, integer, "
+                "float, timedelta or convertible")

         if isinstance(value, np.timedelta64):
             value = value.view('i8')
@@ -2669,7 +2757,7 @@ class Timedelta(_Timedelta):
             return NaT

         # make timedelta happy
-        td_base = _Timedelta.__new__(cls, microseconds=int(value)/1000)
+        td_base = _Timedelta.__new__(cls, microseconds=int(value) / 1000)
         td_base.value = value
         td_base.is_populated = 0
         return td_base
@@ -2690,19 +2778,19 @@ class Timedelta(_Timedelta):
         self._ensure_components()

         if self._ns:
-            return "N"
+            return "N"
         elif self._us:
-            return "U"
+            return "U"
         elif self._ms:
-            return "L"
+            return "L"
         elif self._s:
-            return "S"
+            return "S"
         elif self._m:
-            return "T"
+            return "T"
         elif self._h:
-            return "H"
+            return "H"
         else:
-            return "D"
+            return "D"

     def _round(self, freq, rounder):

@@ -2710,8 +2798,8 @@ class Timedelta(_Timedelta):
         from pandas.tseries.frequencies import to_offset
         unit = to_offset(freq).nanos
-        result = unit*rounder(self.value/float(unit))
-        return Timedelta(result,unit='ns')
+        result = unit * rounder(self.value / float(unit))
+        return Timedelta(result, unit='ns')

     def round(self, freq):
         """
@@ -2768,43 +2856,49 @@ class Timedelta(_Timedelta):
         self._ensure_components()

         if self._sign < 0:
-            sign_pretty = "-"
-            sign2_pretty = " +"
+            sign_pretty = "-"
+            sign2_pretty = " +"
         else:
-            sign_pretty = ""
-            sign2_pretty = " "
+            sign_pretty = ""
+            sign2_pretty = " "

         # show everything
         if format == 'all':
-            seconds_pretty = "%02d.%03d%03d%03d" % (self._s, self._ms, self._us, self._ns)
-            return "%s%d days%s%02d:%02d:%s" % (sign_pretty, self._d, sign2_pretty, self._h, self._m, seconds_pretty)
+            seconds_pretty = "%02d.%03d%03d%03d" % (
+                self._s,
self._ms, self._us, self._ns)
+            return "%s%d days%s%02d:%02d:%s" % (sign_pretty, self._d,
+                                                sign2_pretty, self._h,
+                                                self._m, seconds_pretty)

         # by default not showing nano
         if self._ms or self._us or self._ns:
-            seconds_pretty = "%02d.%03d%03d" % (self._s, self._ms, self._us)
+            seconds_pretty = "%02d.%03d%03d" % (self._s, self._ms, self._us)
         else:
-            seconds_pretty = "%02d" % self._s
+            seconds_pretty = "%02d" % self._s

         # if we have a partial day
-        subs = self._h or self._m or self._s or self._ms or self._us or self._ns
+        subs = (self._h or self._m or self._s or
+                self._ms or self._us or self._ns)

         if format == 'even_day':
-            if not subs:
-                return "%s%d days" % (sign_pretty, self._d)
+            if not subs:
+                return "%s%d days" % (sign_pretty, self._d)

         elif format == 'sub_day':
-            if not self._d:
+            if not self._d:

-                # degenerate, don't need the extra space
-                if self._sign > 0:
-                    sign2_pretty = ""
-                return "%s%s%02d:%02d:%s" % (sign_pretty, sign2_pretty, self._h, self._m, seconds_pretty)
+                # degenerate, don't need the extra space
+                if self._sign > 0:
+                    sign2_pretty = ""
+                return "%s%s%02d:%02d:%s" % (sign_pretty, sign2_pretty,
+                                             self._h, self._m, seconds_pretty)

         if subs or format=='long':
-            return "%s%d days%s%02d:%02d:%s" % (sign_pretty, self._d, sign2_pretty, self._h, self._m, seconds_pretty)
+            return "%s%d days%s%02d:%02d:%s" % (sign_pretty, self._d,
+                                                sign2_pretty, self._h,
+                                                self._m, seconds_pretty)
         return "%s%d days" % (sign_pretty, self._d)

-
     def __repr__(self):
         return "Timedelta('{0}')".format(self._repr_base(format='long'))

     def __str__(self):
@@ -2815,10 +2909,12 @@ class Timedelta(_Timedelta):
         """ Return a Components NamedTuple-like """
         self._ensure_components()
         if self._sign < 0:
-            return Components(-self._d,self._h,self._m,self._s,self._ms,self._us,self._ns)
+            return Components(-self._d, self._h, self._m, self._s,
+                              self._ms, self._us, self._ns)

         # return the named tuple
-        return Components(self._d,self._h,self._m,self._s,self._ms,self._us,self._ns)
+        return Components(self._d, self._h, self._m, self._s,
+                          self._ms, self._us, self._ns)

     @property
     def days(self):
@@ -2829,7 +2925,7 @@
         """
         self._ensure_components()
         if self._sign < 0:
-            return -1*self._d
+            return -1 * self._d
         return self._d

     @property
@@ -2840,7 +2936,7 @@
         .components will return the shown components
         """
         self._ensure_components()
-        return self._h*3600 + self._m*60 + self._s
+        return self._h * 3600 + self._m * 60 + self._s

     @property
     def microseconds(self):
@@ -2850,7 +2946,7 @@
         .components will return the shown components
         """
         self._ensure_components()
-        return self._ms*1000 + self._us
+        return self._ms * 1000 + self._us

     @property
     def nanoseconds(self):
@@ -2866,7 +2962,7 @@
         """
         Total duration of timedelta in seconds (to ns precision)
         """
-        return 1e-9*self.value
+        return 1e-9 * self.value

     def __setstate__(self, state):
         (value) = state
@@ -2887,13 +2983,13 @@ class Timedelta(_Timedelta):
     def _validate_ops_compat(self, other):
         # return True if we are compat with operating
         if _checknull_with_nat(other):
-            return True
+            return True
         elif isinstance(other, (Timedelta, timedelta, np.timedelta64)):
-            return True
+            return True
         elif util.is_string_object(other):
-            return True
-        elif hasattr(other,'delta'):
-            return True
+            return True
+        elif hasattr(other, 'delta'):
+            return True
         return False

     # higher than np.ndarray and np.matrix
@@ -2952,9 +3048,9 @@ class Timedelta(_Timedelta):
         # only integers and floats allowed
         if not
(is_integer_object(other) or is_float_object(other)):
-            return NotImplemented
+            return NotImplemented

-        return Timedelta(other*self.value, unit='ns')
+        return Timedelta(other * self.value, unit='ns')

     __rmul__ = __mul__
@@ -2965,7 +3061,7 @@ class Timedelta(_Timedelta):

         # integers or floats
         if is_integer_object(other) or is_float_object(other):
-            return Timedelta(self.value/other, unit='ns')
+            return Timedelta(self.value / other, unit='ns')

         if not self._validate_ops_compat(other):
             return NotImplemented
@@ -2973,7 +3069,7 @@ class Timedelta(_Timedelta):
         other = Timedelta(other)
         if other is NaT:
             return np.nan
-        return self.value/float(other.value)
+        return self.value / float(other.value)

     def __rtruediv__(self, other):
         if hasattr(other, 'dtype'):
@@ -2988,13 +3084,13 @@ class Timedelta(_Timedelta):
         return float(other.value) / self.value

     if not PY3:
-        __div__ = __truediv__
-        __rdiv__ = __rtruediv__
+        __div__ = __truediv__
+        __rdiv__ = __rtruediv__

     def _not_implemented(self, *args, **kwargs):
         return NotImplemented

-    __floordiv__ = _not_implemented
+    __floordiv__ = _not_implemented
     __rfloordiv__ = _not_implemented

     def _op_unary_method(func, name):
@@ -3010,14 +3106,16 @@ class Timedelta(_Timedelta):
     __abs__ = _op_unary_method(lambda x: abs(x), '__abs__')

 # resolution in ns
-Timedelta.min = Timedelta(np.iinfo(np.int64).min+1)
+Timedelta.min = Timedelta(np.iinfo(np.int64).min + 1)
 Timedelta.max = Timedelta(np.iinfo(np.int64).max)

 cdef PyTypeObject* td_type = Timedelta

+
 cdef inline bint is_timedelta(object o):
     return Py_TYPE(o) == td_type # isinstance(o, Timedelta)

+
 cpdef array_to_timedelta64(ndarray[object] values, unit='ns', errors='raise'):
     """
     Convert an ndarray to an array of timedeltas. If errors == 'coerce',
@@ -3054,37 +3152,37 @@ cpdef array_to_timedelta64(ndarray[object] values, unit='ns', errors='raise'):

     return iresult

-cdef dict timedelta_abbrevs = { 'D' : 'd',
-                                'd' : 'd',
-                                'days' : 'd',
-                                'day' : 'd',
-                                'hours' : 'h',
-                                'hour' : 'h',
-                                'hr' : 'h',
-                                'h' : 'h',
-                                'm' : 'm',
-                                'minute' : 'm',
-                                'min' : 'm',
-                                'minutes' : 'm',
-                                's' : 's',
-                                'seconds' : 's',
-                                'sec' : 's',
-                                'second' : 's',
-                                'ms' : 'ms',
-                                'milliseconds' : 'ms',
-                                'millisecond' : 'ms',
-                                'milli' : 'ms',
-                                'millis' : 'ms',
-                                'us' : 'us',
-                                'microseconds' : 'us',
-                                'microsecond' : 'us',
-                                'micro' : 'us',
-                                'micros' : 'us',
-                                'ns' : 'ns',
-                                'nanoseconds' : 'ns',
-                                'nano' : 'ns',
-                                'nanos' : 'ns',
-                                'nanosecond' : 'ns',
+cdef dict timedelta_abbrevs = { 'D': 'd',
+                                'd': 'd',
+                                'days': 'd',
+                                'day': 'd',
+                                'hours': 'h',
+                                'hour': 'h',
+                                'hr': 'h',
+                                'h': 'h',
+                                'm': 'm',
+                                'minute': 'm',
+                                'min': 'm',
+                                'minutes': 'm',
+                                's': 's',
+                                'seconds': 's',
+                                'sec': 's',
+                                'second': 's',
+                                'ms': 'ms',
+                                'milliseconds': 'ms',
+                                'millisecond': 'ms',
+                                'milli': 'ms',
+                                'millis': 'ms',
+                                'us': 'us',
+                                'microseconds': 'us',
+                                'microsecond': 'us',
+                                'micro': 'us',
+                                'micros': 'us',
+                                'ns': 'ns',
+                                'nanoseconds': 'ns',
+                                'nano': 'ns',
+                                'nanos': 'ns',
+                                'nanosecond': 'ns',
                                 }

 timedelta_abbrevs_map = timedelta_abbrevs
@@ -3134,7 +3232,8 @@ cdef inline parse_timedelta_string(object ts):
         list number=[], frac=[], unit=[]

     # neg : tracks if we have a leading negative for the value
-    # have_dot : tracks if we are processing a dot (either post hhmmss or inside an expression)
+    # have_dot : tracks if we are processing a dot (either post hhmmss or
+    #            inside an expression)
     # have_value : track if we have at least 1 leading unit
     # have_hhmmss : tracks if we have a regular format hh:mm:ss
@@ -3250,11 +3349,11 @@ cdef inline parse_timedelta_string(object
ts):
        raise ValueError("no units specified")

    if len(frac) > 0 and len(frac) <= 3:
-        m = 10**(3-len(frac)) * 1000L * 1000L
+        m = 10**(3 - len(frac)) * 1000L * 1000L
    elif len(frac) > 3 and len(frac) <= 6:
-        m = 10**(6-len(frac)) * 1000L
+        m = 10**(6 - len(frac)) * 1000L
    else:
-        m = 10**(9-len(frac))
+        m = 10**(9 - len(frac))

    r = int(''.join(frac)) * m
    result += timedelta_as_neg(r, neg)
@@ -3320,7 +3419,7 @@ cpdef convert_to_timedelta64(object ts, object unit):
    else:
        if util.is_array(ts):
            ts = ts.astype('int64').item()
-        if unit in ['Y','M','W']:
+        if unit in ['Y', 'M', 'W']:
            ts = np.timedelta64(ts, unit)
        else:
            ts = cast_from_unit(ts, unit)
@@ -3328,15 +3427,15 @@ cpdef convert_to_timedelta64(object ts, object unit):
    elif is_float_object(ts):
        if util.is_array(ts):
            ts = ts.astype('int64').item()
-        if unit in ['Y','M','W']:
+        if unit in ['Y', 'M', 'W']:
            ts = np.timedelta64(int(ts), unit)
        else:
            ts = cast_from_unit(ts, unit)
            ts = np.timedelta64(ts)
    elif util.is_string_object(ts):
        ts = np.timedelta64(parse_timedelta_string(ts))
-    elif hasattr(ts,'delta'):
-        ts = np.timedelta64(_delta_to_nanoseconds(ts),'ns')
+    elif hasattr(ts, 'delta'):
+        ts = np.timedelta64(_delta_to_nanoseconds(ts), 'ns')

    if isinstance(ts, timedelta):
        ts = np.timedelta64(ts)
@@ -3345,7 +3444,9 @@ cpdef convert_to_timedelta64(object ts, object unit):
                        "scalar: %s" % type(ts))
    return ts.astype('timedelta64[ns]')

-def array_strptime(ndarray[object] values, object fmt, bint exact=True, errors='raise'):
+
+def array_strptime(ndarray[object] values, object fmt,
+                   bint exact=True, errors='raise'):
    """
    Parameters
    ----------
@@ -3364,7 +3465,9 @@ def array_strptime(ndarray[object] values, object fmt, bint exact=True, errors='
        int64_t us, ns
        object val, group_key, ampm, found
        dict found_key
-        bint is_raise=errors=='raise', is_ignore=errors=='ignore', is_coerce=errors=='coerce'
+        bint is_raise = errors=='raise'
+        bint is_ignore = errors=='ignore'
+        bint is_coerce = errors=='coerce'

    assert is_raise or is_ignore or is_coerce
@@ -3442,8 +3545,8 @@ def array_strptime(ndarray[object] values, object fmt, bint exact=True, errors='
                if is_coerce:
                    iresult[i] = NPY_NAT
                    continue
-                raise ValueError("time data %r does not match format %r (match)" %
-                                 (values[i], fmt))
+                raise ValueError("time data %r does not match "
+                                 "format %r (match)" % (values[i], fmt))
            if len(val) != found.end():
                if is_coerce:
                    iresult[i] = NPY_NAT
                    continue
@@ -3458,8 +3561,8 @@ def array_strptime(ndarray[object] values, object fmt, bint exact=True, errors='
                if is_coerce:
                    iresult[i] = NPY_NAT
                    continue
-                raise ValueError("time data %r does not match format %r (search)" %
-                                 (values[i], fmt))
+                raise ValueError("time data %r does not match format "
+                                 "%r (search)" % (values[i], fmt))

        year = 1900
        month = day = 1
@@ -3563,7 +3666,8 @@ def array_strptime(ndarray[object] values, object fmt, bint exact=True, errors='
                        # same and yet time.daylight is true; too ambiguous to
                        # be able to tell what timezone has daylight savings
                        if (time.tzname[0] == time.tzname[1] and
-                                time.daylight and found_zone not in ("utc", "gmt")):
+                                time.daylight and found_zone not in (
+                                    "utc", "gmt")):
                            break
                    else:
                        tz = value
@@ -3579,9 +3683,10 @@ def array_strptime(ndarray[object] values, object fmt, bint exact=True, errors='
        # calculation.
        try:
            if julian == -1:
-                # Need to add 1 to result since first day of the year is 1, not 0.
+                # Need to add 1 to result since first day of the year is 1, not
+                # 0.
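+                # (e.g. for Jan 1 both toordinal() calls below agree, so the
+                # difference is 0 and julian comes out as 1, as expected)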
julian = datetime_date(year, month, day).toordinal() - \
-                    datetime_date(year, 1, 1).toordinal() + 1
+                datetime_date(year, 1, 1).toordinal() + 1
            else:  # Assume that if they bothered to include Julian day it will
                   # be accurate.
                datetime_result = datetime_date.fromordinal(
@@ -3590,10 +3695,10 @@ def array_strptime(ndarray[object] values, object fmt, bint exact=True, errors='
                month = datetime_result.month
                day = datetime_result.day
        except ValueError:
-                if is_coerce:
-                    iresult[i] = NPY_NAT
-                    continue
-                raise
+            if is_coerce:
+                iresult[i] = NPY_NAT
+                continue
+            raise

        if weekday == -1:
            weekday = datetime_date(year, month, day).weekday()
@@ -3672,10 +3777,11 @@ cpdef inline int64_t cast_from_unit(object ts, object unit) except? -1:
    # cast the unit, multiply base/frac separately
    # to avoid precision issues from float -> int
    base = ts
-    frac = ts-base
+    frac = ts - base
    if p:
-        frac = round(frac,p)
-    return (base*m) + (frac*m)
+        frac = round(frac, p)
+    return (base * m) + (frac * m)
+

def cast_to_nanoseconds(ndarray arr):
    cdef:
@@ -3721,6 +3827,7 @@ def pydt_to_i8(object pydt):

    return ts.value

+
def i8_to_pydt(int64_t i8, object tzinfo = None):
    """
    Inverse of pydt_to_i8
@@ -3737,6 +3844,7 @@ try:
except:
    have_pytz = False

+
def tz_convert(ndarray[int64_t] vals, object tz1, object tz2):
    cdef:
        ndarray[int64_t] utc_dates, tt, result, trans, deltas
@@ -3803,7 +3911,8 @@ def tz_convert(ndarray[int64_t] vals, object tz1, object tz2):
                pandas_datetime_to_datetimestruct(v, PANDAS_FR_ns, &dts)
                dt = datetime(dts.year, dts.month, dts.day, dts.hour,
                              dts.min, dts.sec, dts.us, tz2)
-                delta = int(total_seconds(_get_utcoffset(tz2, dt))) * 1000000000
+                delta = int(total_seconds(
+                    _get_utcoffset(tz2, dt))) * 1000000000
                result[i] = v + delta
            return result
@@ -3836,6 +3945,7 @@ def tz_convert(ndarray[int64_t] vals, object tz1, object tz2):
            result[i] = v + offset
    return result

+
def tz_convert_single(int64_t val, object tz1, object tz2):
    cdef:
        ndarray[int64_t] trans, deltas
@@ -3889,7 +3999,8 @@ def tz_convert_single(int64_t val, object tz1, object tz2):
dst_cache = {}

cdef inline bint _treat_tz_as_pytz(object tz):
-    return hasattr(tz, '_utc_transition_times') and hasattr(tz, '_transition_info')
+    return hasattr(tz, '_utc_transition_times') and hasattr(
+        tz, '_transition_info')

cdef inline bint _treat_tz_as_dateutil(object tz):
    return hasattr(tz, '_trans_list') and hasattr(tz, '_trans_idx')
@@ -3902,24 +4013,32 @@ def _p_tz_cache_key(tz):

cdef inline object _tz_cache_key(object tz):
    """
-    Return the key in the cache for the timezone info object or None if unknown.
+    Return the key in the cache for the timezone info object or None
+    if unknown.

-    The key is currently the tz string for pytz timezones, the filename for dateutil timezones.
+    The key is currently the tz string for pytz timezones, the filename for
+    dateutil timezones.

    Notes
    =====
-    This cannot just be the hash of a timezone object. Unfortunately, the hashes of two dateutil tz objects
-    which represent the same timezone are not equal (even though the tz objects will compare equal and
-    represent the same tz file).
-    Also, pytz objects are not always hashable so we use str(tz) instead.
+    This cannot just be the hash of a timezone object. Unfortunately, the
+    hashes of two dateutil tz objects which represent the same timezone are
+    not equal (even though the tz objects will compare equal and represent
+    the same tz file). Also, pytz objects are not always hashable so we use
+    str(tz) instead.
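+    (e.g. the cache key for pytz.timezone('US/Eastern') is simply its
+    zone string, 'US/Eastern')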
""" if isinstance(tz, _pytz_BaseTzInfo): return tz.zone elif isinstance(tz, _dateutil_tzfile): if '.tar.gz' in tz._filename: - raise ValueError('Bad tz filename. Dateutil on python 3 on windows has a bug which causes tzfile._filename to be the same for all ' - 'timezone files. Please construct dateutil timezones implicitly by passing a string like "dateutil/Europe/London" ' - 'when you construct your pandas objects instead of passing a timezone object. See https://github.com/pydata/pandas/pull/7362') + raise ValueError('Bad tz filename. Dateutil on python 3 on ' + 'windows has a bug which causes tzfile._filename ' + 'to be the same for all timezone files. Please ' + 'construct dateutil timezones implicitly by ' + 'passing a string like "dateutil/Europe/London" ' + 'when you construct your pandas objects instead ' + 'of passing a timezone object. See ' + 'https://github.com/pydata/pandas/pull/7362') return 'dateutil' + tz._filename else: return None @@ -3956,26 +4075,29 @@ cdef object _get_dst_info(object tz): if len(tz._trans_list): # get utc trans times trans_list = _get_utc_trans_times_from_dateutil_tz(tz) - trans = np.hstack([np.array([0], dtype='M8[s]'), # place holder for first item - np.array(trans_list, dtype='M8[s]')]).astype('M8[ns]') # all trans listed + trans = np.hstack([ + np.array([0], dtype='M8[s]'), # place holder for first item + np.array(trans_list, dtype='M8[s]')]).astype( + 'M8[ns]') # all trans listed trans = trans.view('i8') trans[0] = NPY_NAT + 1 # deltas - deltas = np.array([v.offset for v in (tz._ttinfo_before,) + tz._trans_idx], dtype='i8') # + (tz._ttinfo_std,) + deltas = np.array([v.offset for v in ( + tz._ttinfo_before,) + tz._trans_idx], dtype='i8') deltas *= 1000000000 typ = 'dateutil' elif _is_fixed_offset(tz): trans = np.array([NPY_NAT + 1], dtype=np.int64) - deltas = np.array([tz._ttinfo_std.offset], dtype='i8') * 1000000000 + deltas = np.array([tz._ttinfo_std.offset], + dtype='i8') * 1000000000 typ = 'fixed' else: trans = np.array([], dtype='M8[ns]') deltas = np.array([], dtype='i8') typ = None - else: # static tzinfo trans = np.array([NPY_NAT + 1], dtype=np.int64) @@ -3989,8 +4111,9 @@ cdef object _get_dst_info(object tz): cdef object _get_utc_trans_times_from_dateutil_tz(object tz): """ - Transition times in dateutil timezones are stored in local non-dst time. This code - converts them to UTC. It's the reverse of the code in dateutil.tz.tzfile.__init__. + Transition times in dateutil timezones are stored in local non-dst + time. This code converts them to UTC. It's the reverse of the code + in dateutil.tz.tzfile.__init__. 
""" new_trans = list(tz._trans_list) last_std_offset = 0 @@ -4000,6 +4123,7 @@ cdef object _get_utc_trans_times_from_dateutil_tz(object tz): new_trans[i] = trans - last_std_offset return new_trans + def tot_seconds(td): return total_seconds(td) @@ -4069,7 +4193,8 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz, object ambiguous=None, elif hasattr(ambiguous, '__iter__'): is_dst = True if len(ambiguous) != len(vals): - raise ValueError("Length of ambiguous bool-array must be the same size as vals") + raise ValueError( + "Length of ambiguous bool-array must be the same size as vals") trans, deltas, typ = _get_dst_info(tz) @@ -4082,7 +4207,8 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz, object ambiguous=None, result_b.fill(NPY_NAT) # left side - idx_shifted = (np.maximum(0, trans.searchsorted(vals - DAY_NS, side='right') - 1)).astype(np.int64) + idx_shifted = (np.maximum(0, trans.searchsorted( + vals - DAY_NS, side='right') - 1)).astype(np.int64) for i in range(n): v = vals[i] - deltas[idx_shifted[i]] @@ -4093,7 +4219,8 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz, object ambiguous=None, result_a[i] = v # right side - idx_shifted = (np.maximum(0, trans.searchsorted(vals + DAY_NS, side='right') - 1)).astype(np.int64) + idx_shifted = (np.maximum(0, trans.searchsorted( + vals + DAY_NS, side='right') - 1)).astype(np.int64) for i in range(n): v = vals[i] - deltas[idx_shifted[i]] @@ -4110,36 +4237,39 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz, object ambiguous=None, # Get the ambiguous hours (given the above, these are the hours # where result_a != result_b and neither of them are NAT) both_nat = np.logical_and(result_a != NPY_NAT, result_b != NPY_NAT) - both_eq = result_a == result_b + both_eq = result_a == result_b trans_idx = np.squeeze(np.nonzero(np.logical_and(both_nat, ~both_eq))) if trans_idx.size == 1: stamp = Timestamp(vals[trans_idx]) - raise pytz.AmbiguousTimeError("Cannot infer dst time from %s as" - "there are no repeated times" % stamp) + raise pytz.AmbiguousTimeError( + "Cannot infer dst time from %s as there " + "are no repeated times" % stamp) # Split the array into contiguous chunks (where the difference between - # indices is 1). These are effectively dst transitions in different years - # which is useful for checking that there is not an ambiguous transition - # in an individual year. + # indices is 1). These are effectively dst transitions in different + # years which is useful for checking that there is not an ambiguous + # transition in an individual year. 
if trans_idx.size > 0:
-            one_diff = np.where(np.diff(trans_idx)!=1)[0]+1
+            one_diff = np.where(np.diff(trans_idx) != 1)[0] + 1
            trans_grp = np.array_split(trans_idx, one_diff)

-            # Iterate through each day, if there are no hours where the delta is negative
-            # (indicates a repeat of hour) the switch cannot be inferred
+            # Iterate through each day, if there are no hours where the
+            # delta is negative (indicates a repeat of hour) the switch
+            # cannot be inferred
            for grp in trans_grp:

                delta = np.diff(result_a[grp])
-                if grp.size == 1 or np.all(delta>0):
+                if grp.size == 1 or np.all(delta > 0):
                    stamp = Timestamp(vals[grp[0]])
                    raise pytz.AmbiguousTimeError(stamp)

-                # Find the index for the switch and pull from a for dst and b for standard
-                switch_idx = (delta<=0).nonzero()[0]
+                # Find the index for the switch and pull from a for dst and b
+                # for standard
+                switch_idx = (delta <= 0).nonzero()[0]
                if switch_idx.size > 1:
-                    raise pytz.AmbiguousTimeError("There are %i dst switches "
-                                                  "when there should only be 1."
-                                                  % switch_idx.size)
-                switch_idx = switch_idx[0]+1 # Pull the only index and adjust
+                    raise pytz.AmbiguousTimeError(
+                        "There are %i dst switches when "
+                        "there should only be 1." % switch_idx.size)
+                switch_idx = switch_idx[0] + 1  # Pull the only index and adjust
                a_idx = grp[:switch_idx]
                b_idx = grp[switch_idx:]
                dst_hours[grp] = np.hstack((result_a[a_idx], result_b[b_idx]))
@@ -4164,9 +4294,9 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz, object ambiguous=None,
                result[i] = NPY_NAT
            else:
                stamp = Timestamp(vals[i])
-                raise pytz.AmbiguousTimeError("Cannot infer dst time from %r, "\
-                                              "try using the 'ambiguous' argument"
-                                              % stamp)
+                raise pytz.AmbiguousTimeError(
+                    "Cannot infer dst time from %r, try using the "
+                    "'ambiguous' argument" % stamp)
        elif left != NPY_NAT:
            result[i] = left
        elif right != NPY_NAT:
@@ -4246,6 +4376,7 @@ def build_field_sarray(ndarray[int64_t] dtindex):

    return out

+
def get_time_micros(ndarray[int64_t] dtindex):
    """
    Datetime as int64 representation to a structured array of fields
@@ -4284,7 +4415,7 @@ def get_date_field(ndarray[int64_t] dtindex, object field):
    _month_offset = np.array(
        [[ 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365 ],
         [ 0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366 ]],
-        dtype=np.int32 )
+        dtype=np.int32 )

    count = len(dtindex)
    out = np.empty(count, dtype='i4')
@@ -4294,7 +4425,8 @@ def get_date_field(ndarray[int64_t] dtindex, object field):
        for i in range(count):
            if dtindex[i] == NPY_NAT: out[i] = -1; continue

-            pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts)
+            pandas_datetime_to_datetimestruct(
+                dtindex[i], PANDAS_FR_ns, &dts)
            out[i] = dts.year
        return out
@@ -4303,7 +4435,8 @@ def get_date_field(ndarray[int64_t] dtindex, object field):
        for i in range(count):
            if dtindex[i] == NPY_NAT: out[i] = -1; continue

-            pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts)
+            pandas_datetime_to_datetimestruct(
+                dtindex[i], PANDAS_FR_ns, &dts)
            out[i] = dts.month
        return out
@@ -4312,7 +4445,8 @@ def get_date_field(ndarray[int64_t] dtindex, object field):
        for i in range(count):
            if dtindex[i] == NPY_NAT: out[i] = -1; continue

-            pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts)
+            pandas_datetime_to_datetimestruct(
+                dtindex[i], PANDAS_FR_ns, &dts)
            out[i] = dts.day
        return out
@@ -4321,7 +4455,8 @@ def get_date_field(ndarray[int64_t] dtindex, object field):
        for i in range(count):
            if dtindex[i] == NPY_NAT: out[i] = -1; continue

-            pandas_datetime_to_datetimestruct(dtindex[i],
PANDAS_FR_ns, &dts)
+            pandas_datetime_to_datetimestruct(
+                dtindex[i], PANDAS_FR_ns, &dts)
            out[i] = dts.hour
        return out
@@ -4330,7 +4465,8 @@ def get_date_field(ndarray[int64_t] dtindex, object field):
        for i in range(count):
            if dtindex[i] == NPY_NAT: out[i] = -1; continue

-            pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts)
+            pandas_datetime_to_datetimestruct(
+                dtindex[i], PANDAS_FR_ns, &dts)
            out[i] = dts.min
        return out
@@ -4339,7 +4475,8 @@ def get_date_field(ndarray[int64_t] dtindex, object field):
        for i in range(count):
            if dtindex[i] == NPY_NAT: out[i] = -1; continue

-            pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts)
+            pandas_datetime_to_datetimestruct(
+                dtindex[i], PANDAS_FR_ns, &dts)
            out[i] = dts.sec
        return out
@@ -4348,7 +4485,8 @@ def get_date_field(ndarray[int64_t] dtindex, object field):
        for i in range(count):
            if dtindex[i] == NPY_NAT: out[i] = -1; continue

-            pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts)
+            pandas_datetime_to_datetimestruct(
+                dtindex[i], PANDAS_FR_ns, &dts)
            out[i] = dts.us
        return out
@@ -4357,7 +4495,8 @@ def get_date_field(ndarray[int64_t] dtindex, object field):
        for i in range(count):
            if dtindex[i] == NPY_NAT: out[i] = -1; continue

-            pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts)
+            pandas_datetime_to_datetimestruct(
+                dtindex[i], PANDAS_FR_ns, &dts)
            out[i] = dts.ps / 1000
        return out
    elif field == 'doy':
@@ -4365,9 +4504,10 @@ def get_date_field(ndarray[int64_t] dtindex, object field):
        for i in range(count):
            if dtindex[i] == NPY_NAT: out[i] = -1; continue

-            pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts)
+            pandas_datetime_to_datetimestruct(
+                dtindex[i], PANDAS_FR_ns, &dts)
            isleap = is_leapyear(dts.year)
-            out[i] = _month_offset[isleap, dts.month-1] + dts.day
+            out[i] = _month_offset[isleap, dts.month - 1] + dts.day
        return out

    elif field == 'dow':
@@ -4375,7 +4515,8 @@ def get_date_field(ndarray[int64_t] dtindex, object field):
        for i in range(count):
            if dtindex[i] == NPY_NAT: out[i] = -1; continue

-            pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts)
+            pandas_datetime_to_datetimestruct(
+                dtindex[i], PANDAS_FR_ns, &dts)
            out[i] = dayofweek(dts.year, dts.month, dts.day)
        return out
@@ -4384,7 +4525,8 @@ def get_date_field(ndarray[int64_t] dtindex, object field):
        for i in range(count):
            if dtindex[i] == NPY_NAT: out[i] = -1; continue

-            pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts)
+            pandas_datetime_to_datetimestruct(
+                dtindex[i], PANDAS_FR_ns, &dts)
            isleap = is_leapyear(dts.year)
            isleap_prev = is_leapyear(dts.year - 1)
            mo_off = _month_offset[isleap, dts.month - 1]
@@ -4414,7 +4556,8 @@ def get_date_field(ndarray[int64_t] dtindex, object field):
        for i in range(count):
            if dtindex[i] == NPY_NAT: out[i] = -1; continue

-            pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts)
+            pandas_datetime_to_datetimestruct(
+                dtindex[i], PANDAS_FR_ns, &dts)
            out[i] = dts.month
            out[i] = ((out[i] - 1) / 3) + 1
        return out
@@ -4424,7 +4567,8 @@ def get_date_field(ndarray[int64_t] dtindex, object field):
        for i in range(count):
            if dtindex[i] == NPY_NAT: out[i] = -1; continue

-            pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts)
+            pandas_datetime_to_datetimestruct(
+                dtindex[i], PANDAS_FR_ns, &dts)
            out[i] = days_in_month(dts)
        return out
    elif field == 'is_leap_year':
@@ -4434,7 +4578,8 @@

 @cython.wraparound(False)
-def get_start_end_field(ndarray[int64_t] dtindex, object field, object
freqstr=None, int month_kw=12):
+def get_start_end_field(ndarray[int64_t] dtindex, object field,
+                        object freqstr=None, int month_kw=12):
    """
    Given an int64-based datetime index return array of indicators
    of whether timestamps are at the start/end of the month/quarter/year
@@ -4456,21 +4601,24 @@ def get_start_end_field(ndarray[int64_t] dtindex, object field, object freqstr=N
    _month_offset = np.array(
        [[ 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365 ],
         [ 0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366 ]],
-        dtype=np.int32 )
+        dtype=np.int32 )

    count = len(dtindex)
    out = np.zeros(count, dtype='int8')

    if freqstr:
        if freqstr == 'C':
-            raise ValueError("Custom business days is not supported by %s" % field)
+            raise ValueError(
+                "Custom business days is not supported by %s" % field)
        is_business = freqstr[0] == 'B'

-        # YearBegin(), BYearBegin() use month = starting month of year
-        # QuarterBegin(), BQuarterBegin() use startingMonth = starting month of year
-        # other offests use month, startingMonth as ending month of year.
+        # YearBegin(), BYearBegin() use month = starting month of year.
+        # QuarterBegin(), BQuarterBegin() use startingMonth = starting
+        # month of year. Other offsets use month, startingMonth as ending
+        # month of year.

-        if (freqstr[0:2] in ['MS', 'QS', 'AS']) or (freqstr[1:3] in ['MS', 'QS', 'AS']):
+        if (freqstr[0:2] in ['MS', 'QS', 'AS']) or (
+                freqstr[1:3] in ['MS', 'QS', 'AS']):
            end_month = 12 if month_kw == 1 else month_kw - 1
            start_month = month_kw
        else:
@@ -4485,7 +4633,8 @@ def get_start_end_field(ndarray[int64_t] dtindex, object field, object freqstr=N
            for i in range(count):
                if dtindex[i] == NPY_NAT: out[i] = -1; continue

-                pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts)
+                pandas_datetime_to_datetimestruct(
+                    dtindex[i], PANDAS_FR_ns, &dts)
                ts = convert_to_tsobject(dtindex[i], None, None, 0, 0)
                dom = dts.day
                dow = ts_dayofweek(ts)
@@ -4497,7 +4646,8 @@ def get_start_end_field(ndarray[int64_t] dtindex, object field, object freqstr=N
            for i in range(count):
                if dtindex[i] == NPY_NAT: out[i] = -1; continue

-                pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts)
+                pandas_datetime_to_datetimestruct(
+                    dtindex[i], PANDAS_FR_ns, &dts)
                dom = dts.day

                if dom == 1:
@@ -4509,7 +4659,8 @@ def get_start_end_field(ndarray[int64_t] dtindex, object field, object freqstr=N
            for i in range(count):
                if dtindex[i] == NPY_NAT: out[i] = -1; continue

-                pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts)
+                pandas_datetime_to_datetimestruct(
+                    dtindex[i], PANDAS_FR_ns, &dts)
                ts = convert_to_tsobject(dtindex[i], None, None, 0, 0)
                isleap = is_leapyear(dts.year)
                mo_off = _month_offset[isleap, dts.month - 1]
@@ -4518,14 +4669,16 @@ def get_start_end_field(ndarray[int64_t] dtindex, object field, object freqstr=N
                ldom = _month_offset[isleap, dts.month]
                dow = ts_dayofweek(ts)

-                if (ldom == doy and dow < 5) or (dow == 4 and (ldom - doy <= 2)):
+                if (ldom == doy and dow < 5) or (
+                        dow == 4 and (ldom - doy <= 2)):
                    out[i] = 1
            return out.view(bool)
        else:
            for i in range(count):
                if dtindex[i] == NPY_NAT: out[i] = -1; continue

-                pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts)
+                pandas_datetime_to_datetimestruct(
+                    dtindex[i], PANDAS_FR_ns, &dts)
                isleap = is_leapyear(dts.year)
                mo_off = _month_offset[isleap, dts.month - 1]
                dom = dts.day
@@ -4541,19 +4694,22 @@ def get_start_end_field(ndarray[int64_t] dtindex, object field, object freqstr=N
            for i in range(count):
                if dtindex[i] == NPY_NAT: out[i] = -1; continue

-
pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + pandas_datetime_to_datetimestruct( + dtindex[i], PANDAS_FR_ns, &dts) ts = convert_to_tsobject(dtindex[i], None, None, 0, 0) dom = dts.day dow = ts_dayofweek(ts) - if ((dts.month - start_month) % 3 == 0) and ((dom == 1 and dow < 5) or (dom <= 3 and dow == 0)): + if ((dts.month - start_month) % 3 == 0) and ( + (dom == 1 and dow < 5) or (dom <= 3 and dow == 0)): out[i] = 1 return out.view(bool) else: for i in range(count): if dtindex[i] == NPY_NAT: out[i] = -1; continue - pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + pandas_datetime_to_datetimestruct( + dtindex[i], PANDAS_FR_ns, &dts) dom = dts.day if ((dts.month - start_month) % 3 == 0) and dom == 1: @@ -4565,7 +4721,8 @@ def get_start_end_field(ndarray[int64_t] dtindex, object field, object freqstr=N for i in range(count): if dtindex[i] == NPY_NAT: out[i] = -1; continue - pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + pandas_datetime_to_datetimestruct( + dtindex[i], PANDAS_FR_ns, &dts) ts = convert_to_tsobject(dtindex[i], None, None, 0, 0) isleap = is_leapyear(dts.year) mo_off = _month_offset[isleap, dts.month - 1] @@ -4574,14 +4731,17 @@ def get_start_end_field(ndarray[int64_t] dtindex, object field, object freqstr=N ldom = _month_offset[isleap, dts.month] dow = ts_dayofweek(ts) - if ((dts.month - end_month) % 3 == 0) and ((ldom == doy and dow < 5) or (dow == 4 and (ldom - doy <= 2))): + if ((dts.month - end_month) % 3 == 0) and ( + (ldom == doy and dow < 5) or ( + dow == 4 and (ldom - doy <= 2))): out[i] = 1 return out.view(bool) else: for i in range(count): if dtindex[i] == NPY_NAT: out[i] = -1; continue - pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + pandas_datetime_to_datetimestruct( + dtindex[i], PANDAS_FR_ns, &dts) isleap = is_leapyear(dts.year) mo_off = _month_offset[isleap, dts.month - 1] dom = dts.day @@ -4597,19 +4757,22 @@ def get_start_end_field(ndarray[int64_t] dtindex, object field, object freqstr=N for i in range(count): if dtindex[i] == NPY_NAT: out[i] = -1; continue - pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + pandas_datetime_to_datetimestruct( + dtindex[i], PANDAS_FR_ns, &dts) ts = convert_to_tsobject(dtindex[i], None, None, 0, 0) dom = dts.day dow = ts_dayofweek(ts) - if (dts.month == start_month) and ((dom == 1 and dow < 5) or (dom <= 3 and dow == 0)): + if (dts.month == start_month) and ( + (dom == 1 and dow < 5) or (dom <= 3 and dow == 0)): out[i] = 1 return out.view(bool) else: for i in range(count): if dtindex[i] == NPY_NAT: out[i] = -1; continue - pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + pandas_datetime_to_datetimestruct( + dtindex[i], PANDAS_FR_ns, &dts) dom = dts.day if (dts.month == start_month) and dom == 1: @@ -4621,7 +4784,8 @@ def get_start_end_field(ndarray[int64_t] dtindex, object field, object freqstr=N for i in range(count): if dtindex[i] == NPY_NAT: out[i] = -1; continue - pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + pandas_datetime_to_datetimestruct( + dtindex[i], PANDAS_FR_ns, &dts) ts = convert_to_tsobject(dtindex[i], None, None, 0, 0) isleap = is_leapyear(dts.year) dom = dts.day @@ -4630,14 +4794,17 @@ def get_start_end_field(ndarray[int64_t] dtindex, object field, object freqstr=N dow = ts_dayofweek(ts) ldom = _month_offset[isleap, dts.month] - if (dts.month == end_month) and ((ldom == doy and dow < 5) or (dow == 4 and (ldom - doy <= 2))): + if (dts.month == end_month) and ( + (ldom == doy and 
dow < 5) or (
+                        dow == 4 and (ldom - doy <= 2)):
                    out[i] = 1
            return out.view(bool)
        else:
            for i in range(count):
                if dtindex[i] == NPY_NAT: out[i] = -1; continue

-                pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts)
+                pandas_datetime_to_datetimestruct(
+                    dtindex[i], PANDAS_FR_ns, &dts)
                ts = convert_to_tsobject(dtindex[i], None, None, 0, 0)
                isleap = is_leapyear(dts.year)
                mo_off = _month_offset[isleap, dts.month - 1]
@@ -4651,6 +4818,7 @@ def get_start_end_field(ndarray[int64_t] dtindex, object field, object freqstr=N

    raise ValueError("Field %s not supported" % field)

+
@cython.wraparound(False)
@cython.boundscheck(False)
def get_date_name_field(ndarray[int64_t] dtindex, object field):
@@ -4666,8 +4834,9 @@ def get_date_name_field(ndarray[int64_t] dtindex, object field):
        int dow

    _dayname = np.array(
-        ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
-        dtype=np.object_ )
+        ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
+         'Friday', 'Saturday', 'Sunday'],
+        dtype=np.object_ )

    count = len(dtindex)
    out = np.empty(count, dtype=object)
@@ -4710,11 +4879,13 @@ def date_normalize(ndarray[int64_t] stamps, tz=None):
            if stamps[i] == NPY_NAT:
                result[i] = NPY_NAT
                continue
-            pandas_datetime_to_datetimestruct(stamps[i], PANDAS_FR_ns, &dts)
+            pandas_datetime_to_datetimestruct(
+                stamps[i], PANDAS_FR_ns, &dts)
            result[i] = _normalized_stamp(&dts)

    return result

+
@cython.wraparound(False)
@cython.boundscheck(False)
cdef _normalize_local(ndarray[int64_t] stamps, object tz):
@@ -4730,15 +4901,15 @@ cdef _normalize_local(ndarray[int64_t] stamps, object tz):
            if stamps[i] == NPY_NAT:
                result[i] = NPY_NAT
                continue
-            pandas_datetime_to_datetimestruct(stamps[i], PANDAS_FR_ns, &dts)
+            pandas_datetime_to_datetimestruct(
+                stamps[i], PANDAS_FR_ns, &dts)
            result[i] = _normalized_stamp(&dts)
    elif _is_tzlocal(tz):
        for i in range(n):
            if stamps[i] == NPY_NAT:
                result[i] = NPY_NAT
                continue
-            pandas_datetime_to_datetimestruct(stamps[i], PANDAS_FR_ns,
-                                              &dts)
+            pandas_datetime_to_datetimestruct(stamps[i], PANDAS_FR_ns, &dts)
            dt = datetime(dts.year, dts.month, dts.day, dts.hour,
                          dts.min, dts.sec, dts.us, tz)
            delta = int(total_seconds(_get_utcoffset(tz, dt))) * 1000000000
@@ -4755,7 +4926,7 @@ cdef _normalize_local(ndarray[int64_t] stamps, object tz):
            pos = _pos

        # statictzinfo
-        if typ not in ['pytz','dateutil']:
+        if typ not in ['pytz', 'dateutil']:
            for i in range(n):
                if stamps[i] == NPY_NAT:
                    result[i] = NPY_NAT
@@ -4840,7 +5011,7 @@ def monthrange(int64_t year, int64_t month):
    if month < 1 or month > 12:
        raise ValueError("bad month number 0; must be 1-12")

-    days = days_per_month_table[is_leapyear(year)][month-1]
+    days = days_per_month_table[is_leapyear(year)][month - 1]

    return (dayofweek(year, month, 1), days)

@@ -4848,7 +5019,7 @@ cdef inline int64_t ts_dayofweek(_TSObject ts):
    return dayofweek(ts.dts.year, ts.dts.month, ts.dts.day)

cdef inline int days_in_month(pandas_datetimestruct dts) nogil:
-    return days_per_month_table[is_leapyear(dts.year)][dts.month-1]
+    return days_per_month_table[is_leapyear(dts.year)][dts.month - 1]

cpdef normalize_date(object dt):
    """
@@ -4874,10 +5045,14 @@ cdef inline int _year_add_months(pandas_datetimestruct dts,

cdef inline int _month_add_months(pandas_datetimestruct dts,
                                  int months) nogil:
-    """new month number after shifting pandas_datetimestruct number of months"""
+    """
+    New month number after shifting pandas_datetimestruct
+    number of months.
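+    (e.g. month 11 shifted by 3 months wraps to (11 + 3) % 12 == 2)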
+ """ cdef int new_month = (dts.month + months) % 12 return 12 if new_month == 0 else new_month + @cython.wraparound(False) @cython.boundscheck(False) def shift_months(int64_t[:] dtindex, int months, object day=None): @@ -4902,7 +5077,8 @@ def shift_months(int64_t[:] dtindex, int months, object day=None): with nogil: for i in range(count): if dtindex[i] == NPY_NAT: out[i] = NPY_NAT; continue - pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + pandas_datetime_to_datetimestruct( + dtindex[i], PANDAS_FR_ns, &dts) dts.year = _year_add_months(dts, months) dts.month = _month_add_months(dts, months) @@ -4916,7 +5092,8 @@ def shift_months(int64_t[:] dtindex, int months, object day=None): with nogil: for i in range(count): if dtindex[i] == NPY_NAT: out[i] = NPY_NAT; continue - pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + pandas_datetime_to_datetimestruct( + dtindex[i], PANDAS_FR_ns, &dts) months_to_roll = months # offset semantics - if on the anchor point and going backwards @@ -4937,7 +5114,8 @@ def shift_months(int64_t[:] dtindex, int months, object day=None): with nogil: for i in range(count): if dtindex[i] == NPY_NAT: out[i] = NPY_NAT; continue - pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + pandas_datetime_to_datetimestruct( + dtindex[i], PANDAS_FR_ns, &dts) months_to_roll = months # similar semantics - when adding shift forward by one @@ -4992,10 +5170,12 @@ except: __all__ = [] + def _getlang(): # Figure out what the current language is set to. return locale.getlocale(locale.LC_TIME) + class LocaleTime(object): """Stores and handles locale-specific information related to time. @@ -5075,8 +5255,9 @@ class LocaleTime(object): # magical; just happened to have used it everywhere else where a # static date was needed. am_pm = [] - for hour in (01,22): - time_tuple = time.struct_time((1999,3,17,hour,44,55,2,76,0)) + for hour in (01, 22): + time_tuple = time.struct_time( + (1999, 3, 17, hour, 44, 55, 2, 76, 0)) am_pm.append(time.strftime("%p", time_tuple).lower()) self.am_pm = am_pm @@ -5088,22 +5269,23 @@ class LocaleTime(object): # overloaded numbers is minimized. The order in which searches for # values within the format string is very important; it eliminates # possible ambiguity for what something represents. - time_tuple = time.struct_time((1999,3,17,22,44,55,2,76,0)) + time_tuple = time.struct_time((1999, 3, 17, 22, 44, 55, 2, 76, 0)) date_time = [None, None, None] date_time[0] = time.strftime("%c", time_tuple).lower() date_time[1] = time.strftime("%x", time_tuple).lower() date_time[2] = time.strftime("%X", time_tuple).lower() replacement_pairs = [('%', '%%'), (self.f_weekday[2], '%A'), - (self.f_month[3], '%B'), (self.a_weekday[2], '%a'), - (self.a_month[3], '%b'), (self.am_pm[1], '%p'), - ('1999', '%Y'), ('99', '%y'), ('22', '%H'), - ('44', '%M'), ('55', '%S'), ('76', '%j'), - ('17', '%d'), ('03', '%m'), ('3', '%m'), - # '3' needed for when no leading zero. - ('2', '%w'), ('10', '%I')] + (self.f_month[3], + '%B'), (self.a_weekday[2], '%a'), + (self.a_month[3], '%b'), (self.am_pm[1], '%p'), + ('1999', '%Y'), ('99', '%y'), ('22', '%H'), + ('44', '%M'), ('55', '%S'), ('76', '%j'), + ('17', '%d'), ('03', '%m'), ('3', '%m'), + # '3' needed for when no leading zero. 
+                             ('2', '%w'), ('10', '%I')]
        replacement_pairs.extend([(tz, "%Z") for tz_values in self.timezone
                                  for tz in tz_values])
-        for offset,directive in ((0,'%c'), (1,'%x'), (2,'%X')):
+        for offset, directive in ((0, '%c'), (1, '%x'), (2, '%X')):
            current_format = date_time[offset]
            for old, new in replacement_pairs:
                # Must deal with possible lack of locale info
@@ -5115,7 +5297,7 @@ class LocaleTime(object):
        # If %W is used, then Sunday, 2005-01-03 will fall on week 0 since
        # 2005-01-03 occurs before the first Monday of the year. Otherwise
        # %U is used.
-        time_tuple = time.struct_time((1999,1,3,1,1,1,6,3,0))
+        time_tuple = time.struct_time((1999, 1, 3, 1, 1, 1, 6, 3, 0))
        if '00' in time.strftime(directive, time_tuple):
            U_W = '%W'
        else:
@@ -5161,7 +5343,8 @@ class TimeRE(dict):
            'f': r"(?P<f>[0-9]{1,9})",
            'H': r"(?P<H>2[0-3]|[0-1]\d|\d)",
            'I': r"(?P<I>1[0-2]|0[1-9]|[1-9])",
-            'j': r"(?P<j>36[0-6]|3[0-5]\d|[1-2]\d\d|0[1-9]\d|00[1-9]|[1-9]\d|0[1-9]|[1-9])",
+            'j': (r"(?P<j>36[0-6]|3[0-5]\d|[1-2]\d\d|0[1-9]\d|00[1-9]|"
+                  r"[1-9]\d|0[1-9]|[1-9])"),
            'm': r"(?P<m>1[0-2]|0[1-9]|[1-9])",
            'M': r"(?P<M>[0-5]\d|\d)",
            'S': r"(?P<S>6[0-1]|[0-5]\d|\d)",
@@ -5221,11 +5404,11 @@ class TimeRE(dict):
        whitespace_replacement = re_compile(r'\s+')
        format = whitespace_replacement.sub(r'\\s+', format)
        while '%' in format:
-            directive_index = format.index('%')+1
+            directive_index = format.index('%') + 1
            processed_format = "%s%s%s" % (processed_format,
-                                           format[:directive_index-1],
+                                           format[:directive_index - 1],
                                           self[format[directive_index]])
-            format = format[directive_index+1:]
+            format = format[directive_index + 1:]
        return "%s%s" % (processed_format, format)

    def compile(self, format):
@@ -5239,7 +5422,8 @@
_TimeRE_cache = TimeRE()
_CACHE_MAX_SIZE = 5 # Max number of regexes stored in _regex_cache
_regex_cache = {}

-cdef _calc_julian_from_U_or_W(int year, int week_of_year, int day_of_week, int week_starts_Mon):
+cdef _calc_julian_from_U_or_W(int year, int week_of_year,
+                              int day_of_week, int week_starts_Mon):
    """Calculate the Julian day based on the year, week of the year, and day of
    the week, with week_start_day representing whether the week of the year
    assumes the week starts on Sunday or Monday (6 or 0)."""