Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CLN: use float64_t consistently instead of double, double_t #23583

Merged
merged 19 commits into from
Nov 11, 2018
Merged
Show file tree
Hide file tree
Changes from 17 commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 0 additions & 3 deletions pandas/_libs/algos.pxd
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
from util cimport numeric


cpdef numeric kth_smallest(numeric[:] a, Py_ssize_t k) nogil


cdef inline Py_ssize_t swap(numeric *a, numeric *b) nogil:
cdef:
numeric t
Expand Down
19 changes: 9 additions & 10 deletions pandas/_libs/algos.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,7 @@ from numpy cimport (ndarray,
NPY_FLOAT32, NPY_FLOAT64,
NPY_OBJECT,
int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t,
uint32_t, uint64_t, float32_t, float64_t,
double_t)
uint32_t, uint64_t, float32_t, float64_t)
cnp.import_array()


Expand All @@ -32,10 +31,9 @@ import missing

cdef float64_t FP_ERR = 1e-13

cdef double NaN = <double>np.NaN
cdef double nan = NaN
cdef float64_t NaN = <float64_t>np.NaN

cdef int64_t iNaT = get_nat()
cdef int64_t NPY_NAT = get_nat()

tiebreakers = {
'average': TIEBREAK_AVERAGE,
Expand Down Expand Up @@ -197,9 +195,10 @@ def groupsort_indexer(ndarray[int64_t] index, Py_ssize_t ngroups):
return result, counts


# TODO: redundant with groupby.kth_smallest_c
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it's not actually

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK. The typing is more specific in the groupby version, but the code itself is nearly identical

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

try to remove and you will see why it’s here

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll get rid of the comment, but am kind of surprised: you're usually Holy Crusader against redundant code, and the body of this function is very nearly copy/pasted

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

my point is i don’t think it’s easy to remove
you are more than welcome to try

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No you're right. I removed the comment.

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef numeric kth_smallest(numeric[:] a, Py_ssize_t k) nogil:
def kth_smallest(numeric[:] a, Py_ssize_t k) -> numeric:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this will have a big performance slowdown

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's never used in Cython, and nogil isn't allowed for def functions. There is a with nogil block just below this.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please test — I once changed this (and tried to remove it) and the results were all negative.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Indistinguishable:

master:

In [3]: arr = np.arange(10000, dtype=np.int64)

In [4]: %timeit pd._libs.algos.kth_smallest(arr, 4)
The slowest run took 165.65 times longer than the fastest. This could mean that an intermediate result is being cached.
100000 loops, best of 3: 9.42 µs per loop

In [5]: %timeit pd._libs.algos.kth_smallest(arr, 4)
The slowest run took 4.39 times longer than the fastest. This could mean that an intermediate result is being cached.
100000 loops, best of 3: 10.3 µs per loop

In [6]: %timeit pd._libs.algos.kth_smallest(arr, 4)
The slowest run took 4.21 times longer than the fastest. This could mean that an intermediate result is being cached.
100000 loops, best of 3: 11.2 µs per loop

PR:

In [3]: arr = np.arange(10000, dtype=np.int64)

In [4]: %timeit pd._libs.algos.kth_smallest(arr, 4)
The slowest run took 12.06 times longer than the fastest. This could mean that an intermediate result is being cached.
100000 loops, best of 3: 9.71 µs per loop

In [5]: %timeit pd._libs.algos.kth_smallest(arr, 4)
The slowest run took 9.48 times longer than the fastest. This could mean that an intermediate result is being cached.
100000 loops, best of 3: 9.6 µs per loop

In [6]: %timeit pd._libs.algos.kth_smallest(arr, 4)
The slowest run took 6.23 times longer than the fastest. This could mean that an intermediate result is being cached.
100000 loops, best of 3: 9.95 µs per loop

Similar for other dtypes

cdef:
Py_ssize_t i, j, l, m, n = a.shape[0]
numeric x
Expand Down Expand Up @@ -812,23 +811,23 @@ def is_monotonic(ndarray[algos_t, ndim=1] arr, bint timelike):
n = len(arr)

if n == 1:
if arr[0] != arr[0] or (timelike and <int64_t>arr[0] == iNaT):
if arr[0] != arr[0] or (timelike and <int64_t>arr[0] == NPY_NAT):
# single value is NaN
return False, False, True
else:
return True, True, True
elif n < 2:
return True, True, True

if timelike and <int64_t>arr[0] == iNaT:
if timelike and <int64_t>arr[0] == NPY_NAT:
return False, False, True

if algos_t is not object:
with nogil:
prev = arr[0]
for i in range(1, n):
cur = arr[i]
if timelike and <int64_t>cur == iNaT:
if timelike and <int64_t>cur == NPY_NAT:
is_monotonic_inc = 0
is_monotonic_dec = 0
break
Expand All @@ -853,7 +852,7 @@ def is_monotonic(ndarray[algos_t, ndim=1] arr, bint timelike):
prev = arr[0]
for i in range(1, n):
cur = arr[i]
if timelike and <int64_t>cur == iNaT:
if timelike and <int64_t>cur == NPY_NAT:
is_monotonic_inc = 0
is_monotonic_dec = 0
break
Expand Down
4 changes: 2 additions & 2 deletions pandas/_libs/algos_common_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -84,9 +84,9 @@ def put2d_{{name}}_{{dest_name}}(ndarray[{{c_type}}, ndim=2, cast=True] values,

{{endfor}}

#----------------------------------------------------------------------
# ----------------------------------------------------------------------
# ensure_dtype
#----------------------------------------------------------------------
# ----------------------------------------------------------------------

cdef int PLATFORM_INT = (<ndarray>np.arange(0, dtype=np.intp)).descr.type_num

Expand Down
10 changes: 5 additions & 5 deletions pandas/_libs/algos_rank_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -74,9 +74,9 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average',
{{elif dtype == 'float64'}}
mask = np.isnan(values)
{{elif dtype == 'int64'}}
mask = values == iNaT
mask = values == NPY_NAT

# create copy in case of iNaT
# create copy in case of NPY_NAT
# values are mutated inplace
if mask.any():
values = values.copy()
Expand Down Expand Up @@ -149,7 +149,7 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average',
{{if dtype != 'uint64'}}
isnan = sorted_mask[i]
if isnan and keep_na:
ranks[argsorted[i]] = nan
ranks[argsorted[i]] = NaN
continue
{{endif}}

Expand Down Expand Up @@ -257,7 +257,7 @@ def rank_2d_{{dtype}}(object in_arr, axis=0, ties_method='average',
{{elif dtype == 'float64'}}
mask = np.isnan(values)
{{elif dtype == 'int64'}}
mask = values == iNaT
mask = values == NPY_NAT
{{endif}}

np.putmask(values, mask, nan_value)
Expand Down Expand Up @@ -317,7 +317,7 @@ def rank_2d_{{dtype}}(object in_arr, axis=0, ties_method='average',
{{else}}
if (val == nan_value) and keep_na:
{{endif}}
ranks[i, argsorted[i, j]] = nan
ranks[i, argsorted[i, j]] = NaN

{{if dtype == 'object'}}
infs += 1
Expand Down
4 changes: 2 additions & 2 deletions pandas/_libs/algos_take_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@ Template for each `dtype` helper function for take
WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
"""

#----------------------------------------------------------------------
# ----------------------------------------------------------------------
# take_1d, take_2d
#----------------------------------------------------------------------
# ----------------------------------------------------------------------

{{py:

Expand Down
34 changes: 16 additions & 18 deletions pandas/_libs/groupby.pyx
Original file line number Diff line number Diff line change
@@ -1,14 +1,13 @@
# -*- coding: utf-8 -*-

cimport cython
from cython cimport Py_ssize_t
import cython
from cython import Py_ssize_t

from libc.stdlib cimport malloc, free

import numpy as np
cimport numpy as cnp
from numpy cimport (ndarray,
double_t,
int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t,
uint32_t, uint64_t, float32_t, float64_t)
cnp.import_array()
Expand All @@ -20,10 +19,9 @@ from algos cimport (swap, TiebreakEnumType, TIEBREAK_AVERAGE, TIEBREAK_MIN,
TIEBREAK_MAX, TIEBREAK_FIRST, TIEBREAK_DENSE)
from algos import take_2d_axis1_float64_float64, groupsort_indexer, tiebreakers

cdef int64_t iNaT = get_nat()
cdef int64_t NPY_NAT = get_nat()

cdef double NaN = <double>np.NaN
cdef double nan = NaN
cdef float64_t NaN = <float64_t>np.NaN


cdef inline float64_t median_linear(float64_t* a, int n) nogil:
Expand Down Expand Up @@ -67,13 +65,13 @@ cdef inline float64_t median_linear(float64_t* a, int n) nogil:
return result


# TODO: Is this redundant with algos.kth_smallest?
# TODO: Is this redundant with algos.kth_smallest
cdef inline float64_t kth_smallest_c(float64_t* a,
Py_ssize_t k,
Py_ssize_t n) nogil:
cdef:
Py_ssize_t i, j, l, m
double_t x, t
float64_t x, t

l = 0
m = n - 1
Expand Down Expand Up @@ -109,7 +107,7 @@ def group_median_float64(ndarray[float64_t, ndim=2] out,
cdef:
Py_ssize_t i, j, N, K, ngroups, size
ndarray[int64_t] _counts
ndarray data
ndarray[float64_t, ndim=2] data
float64_t* ptr

assert min_count == -1, "'min_count' only used in add and prod"
Expand Down Expand Up @@ -139,8 +137,8 @@ def group_median_float64(ndarray[float64_t, ndim=2] out,
@cython.boundscheck(False)
@cython.wraparound(False)
def group_cumprod_float64(float64_t[:, :] out,
float64_t[:, :] values,
int64_t[:] labels,
const float64_t[:, :] values,
const int64_t[:] labels,
bint is_datetimelike,
bint skipna=True):
"""
Expand Down Expand Up @@ -177,7 +175,7 @@ def group_cumprod_float64(float64_t[:, :] out,
@cython.wraparound(False)
def group_cumsum(numeric[:, :] out,
numeric[:, :] values,
int64_t[:] labels,
const int64_t[:] labels,
is_datetimelike,
bint skipna=True):
"""
Expand Down Expand Up @@ -217,7 +215,7 @@ def group_cumsum(numeric[:, :] out,

@cython.boundscheck(False)
@cython.wraparound(False)
def group_shift_indexer(ndarray[int64_t] out, ndarray[int64_t] labels,
def group_shift_indexer(int64_t[:] out, const int64_t[:] labels,
int ngroups, int periods):
cdef:
Py_ssize_t N, i, j, ii
Expand Down Expand Up @@ -291,7 +289,7 @@ def group_fillna_indexer(ndarray[int64_t] out, ndarray[int64_t] labels,
"""
cdef:
Py_ssize_t i, N
ndarray[int64_t] sorted_labels
int64_t[:] sorted_labels
int64_t idx, curr_fill_idx=-1, filled_vals=0

N = len(out)
Expand Down Expand Up @@ -327,10 +325,10 @@ def group_fillna_indexer(ndarray[int64_t] out, ndarray[int64_t] labels,

@cython.boundscheck(False)
@cython.wraparound(False)
def group_any_all(ndarray[uint8_t] out,
ndarray[int64_t] labels,
ndarray[uint8_t] values,
ndarray[uint8_t] mask,
def group_any_all(uint8_t[:] out,
const int64_t[:] labels,
const uint8_t[:] values,
const uint8_t[:] mask,
object val_test,
bint skipna):
"""Aggregated boolean values to show truthfulness of group elements
Expand Down
22 changes: 11 additions & 11 deletions pandas/_libs/groupby_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
"""

cdef extern from "numpy/npy_math.h":
double NAN "NPY_NAN"
float64_t NAN "NPY_NAN"
_int64_max = np.iinfo(np.int64).max

# ----------------------------------------------------------------------
Expand Down Expand Up @@ -268,16 +268,16 @@ def group_ohlc_{{name}}(ndarray[{{c_type}}, ndim=2] out,

{{endfor}}

#----------------------------------------------------------------------
# ----------------------------------------------------------------------
# group_nth, group_last, group_rank
#----------------------------------------------------------------------
# ----------------------------------------------------------------------

{{py:

# name, c_type, nan_val
dtypes = [('float64', 'float64_t', 'NAN'),
('float32', 'float32_t', 'NAN'),
('int64', 'int64_t', 'iNaT'),
('int64', 'int64_t', 'NPY_NAT'),
('object', 'object', 'NAN')]

def get_dispatch(dtypes):
Expand Down Expand Up @@ -527,7 +527,7 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out,
# to the result where appropriate
if keep_na and mask[_as[i]]:
for j in range(i - dups + 1, i + 1):
out[_as[j], 0] = nan
out[_as[j], 0] = NaN
grp_na_count = dups
elif tiebreak == TIEBREAK_AVERAGE:
for j in range(i - dups + 1, i + 1):
Expand Down Expand Up @@ -630,7 +630,7 @@ def group_max(ndarray[groupby_t, ndim=2] out,
if groupby_t is int64_t:
# Note: evaluated at compile-time
maxx[:] = -_int64_max
nan_val = iNaT
nan_val = NPY_NAT
else:
maxx[:] = -np.inf
nan_val = NAN
Expand Down Expand Up @@ -692,7 +692,7 @@ def group_min(ndarray[groupby_t, ndim=2] out,
minx = np.empty_like(out)
if groupby_t is int64_t:
minx[:] = _int64_max
nan_val = iNaT
nan_val = NPY_NAT
else:
minx[:] = np.inf
nan_val = NAN
Expand Down Expand Up @@ -762,8 +762,8 @@ def group_cummin(ndarray[groupby_t, ndim=2] out,

# val = nan
if groupby_t is int64_t:
if is_datetimelike and val == iNaT:
out[i, j] = iNaT
if is_datetimelike and val == NPY_NAT:
out[i, j] = NPY_NAT
else:
mval = accum[lab, j]
if val < mval:
Expand Down Expand Up @@ -809,8 +809,8 @@ def group_cummax(ndarray[groupby_t, ndim=2] out,
val = values[i, j]

if groupby_t is int64_t:
if is_datetimelike and val == iNaT:
out[i, j] = iNaT
if is_datetimelike and val == NPY_NAT:
out[i, j] = NPY_NAT
else:
mval = accum[lab, j]
if val > mval:
Expand Down
8 changes: 3 additions & 5 deletions pandas/_libs/hashtable.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,11 @@ from libc.stdlib cimport malloc, free

import numpy as np
cimport numpy as cnp
from numpy cimport ndarray, uint8_t, uint32_t
from numpy cimport ndarray, uint8_t, uint32_t, float64_t
cnp.import_array()

cdef extern from "numpy/npy_math.h":
double NAN "NPY_NAN"
float64_t NAN "NPY_NAN"


from khash cimport (
Expand Down Expand Up @@ -42,9 +42,7 @@ cimport util
from missing cimport checknull


nan = np.nan

cdef int64_t iNaT = util.get_nat()
cdef int64_t NPY_NAT = util.get_nat()
_SIZE_HINT_LIMIT = (1 << 20) + 7


Expand Down
4 changes: 2 additions & 2 deletions pandas/_libs/hashtable_class_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -251,9 +251,9 @@ cdef class HashTable:
{{py:

# name, dtype, float_group, default_na_value
dtypes = [('Float64', 'float64', True, 'nan'),
dtypes = [('Float64', 'float64', True, 'np.nan'),
('UInt64', 'uint64', False, 0),
('Int64', 'int64', False, 'iNaT')]
('Int64', 'int64', False, 'NPY_NAT')]

}}

Expand Down
6 changes: 3 additions & 3 deletions pandas/_libs/index.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ from pandas._libs import algos, hashtable as _hash
from pandas._libs.tslibs import Timestamp, Timedelta, period as periodlib
from pandas._libs.missing import checknull

cdef int64_t iNaT = util.get_nat()
cdef int64_t NPY_NAT = util.get_nat()


cdef inline bint is_definitely_invalid_key(object val):
Expand Down Expand Up @@ -520,7 +520,7 @@ cpdef convert_scalar(ndarray arr, object value):
elif isinstance(value, (datetime, np.datetime64, date)):
return Timestamp(value).value
elif value is None or value != value:
return iNaT
return NPY_NAT
elif util.is_string_object(value):
return Timestamp(value).value
raise ValueError("cannot set a Timestamp with a non-timestamp")
Expand All @@ -531,7 +531,7 @@ cpdef convert_scalar(ndarray arr, object value):
elif isinstance(value, timedelta):
return Timedelta(value).value
elif value is None or value != value:
return iNaT
return NPY_NAT
elif util.is_string_object(value):
return Timedelta(value).value
raise ValueError("cannot set a Timedelta with a non-timedelta")
Expand Down
Loading