Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/master' into depr_concat_join_…
Browse files Browse the repository at this point in the history
…axes
  • Loading branch information
h-vetinari committed Sep 5, 2018
2 parents d2aa496 + a5fe9cf commit 144ef86
Show file tree
Hide file tree
Showing 46 changed files with 733 additions and 499 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ build: clean_pyc
python setup.py build_ext --inplace

lint-diff:
git diff master --name-only -- "*.py" | grep "pandas" | xargs flake8
git diff master --name-only -- "*.py" | grep -E "pandas|scripts" | xargs flake8

develop: build
-python setup.py develop
Expand Down
7 changes: 6 additions & 1 deletion ci/lint.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,11 @@ if [ "$LINT" ]; then
if [ $? -ne "0" ]; then
RET=1
fi

flake8 scripts/tests --filename=*.py
if [ $? -ne "0" ]; then
RET=1
fi
echo "Linting *.py DONE"

echo "Linting setup.py"
Expand Down Expand Up @@ -175,7 +180,7 @@ if [ "$LINT" ]; then
RET=1
fi
echo "Check for old-style classes DONE"

echo "Check for backticks incorrectly rendering because of missing spaces"
grep -R --include="*.rst" -E "[a-zA-Z0-9]\`\`?[a-zA-Z0-9]" doc/source/

Expand Down
2 changes: 2 additions & 0 deletions ci/script_single.sh
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ elif [ "$COVERAGE" ]; then
echo pytest -s -m "single" -r xXs --strict --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=/tmp/single.xml $TEST_ARGS pandas
pytest -s -m "single" -r xXs --strict --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=/tmp/single.xml $TEST_ARGS pandas

echo pytest -s -r xXs --strict scripts
pytest -s -r xXs --strict scripts
else
echo pytest -m "single" -r xXs --junitxml=/tmp/single.xml --strict $TEST_ARGS pandas
pytest -m "single" -r xXs --junitxml=/tmp/single.xml --strict $TEST_ARGS pandas # TODO: doctest
Expand Down
2 changes: 0 additions & 2 deletions doc/source/groupby.rst
Original file line number Diff line number Diff line change
Expand Up @@ -103,8 +103,6 @@ consider the following ``DataFrame``:

.. note::

.. versionadded:: 0.20

A string passed to ``groupby`` may refer to either a column or an index level.
If a string matches both a column name and an index level name, a
``ValueError`` will be raised.
Expand Down
2 changes: 1 addition & 1 deletion doc/source/text.rst
Original file line number Diff line number Diff line change
Expand Up @@ -306,7 +306,7 @@ The same alignment can be used when ``others`` is a ``DataFrame``:
Concatenating a Series and many objects into a Series
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

All one-dimensional list-likes can be arbitrarily combined in a list-like container (including iterators, ``dict``-views, etc.):
All one-dimensional list-likes can be combined in a list-like container (including iterators, ``dict``-views, etc.):

.. ipython:: python
Expand Down
2 changes: 1 addition & 1 deletion doc/source/timeseries.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2228,7 +2228,7 @@ To remove timezone from tz-aware ``DatetimeIndex``, use ``tz_localize(None)`` or
didx.tz_convert(None)
# tz_convert(None) is identical with tz_convert('UTC').tz_localize(None)
didx.tz_convert('UCT').tz_localize(None)
didx.tz_convert('UTC').tz_localize(None)
.. _timeseries.timezone_ambiguous:

Expand Down
15 changes: 11 additions & 4 deletions doc/source/whatsnew/v0.24.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -507,14 +507,16 @@ Other API Changes
Deprecations
~~~~~~~~~~~~

- :meth:`DataFrame.to_stata`, :meth:`read_stata`, :class:`StataReader` and :class:`StataWriter` have deprecated the ``encoding`` argument. The encoding of a Stata dta file is determined by the file type and cannot be changed (:issue:`21244`).
- :meth:`MultiIndex.to_hierarchical` is deprecated and will be removed in a future version (:issue:`21613`)
- :meth:`DataFrame.to_stata`, :meth:`read_stata`, :class:`StataReader` and :class:`StataWriter` have deprecated the ``encoding`` argument. The encoding of a Stata dta file is determined by the file type and cannot be changed (:issue:`21244`)
- :meth:`MultiIndex.to_hierarchical` is deprecated and will be removed in a future version (:issue:`21613`)
- :meth:`Series.ptp` is deprecated. Use ``numpy.ptp`` instead (:issue:`21614`)
- :meth:`Series.compress` is deprecated. Use ``Series[condition]`` instead (:issue:`18262`)
- The signature of :meth:`Series.to_csv` has been uniformed to that of :meth:`DataFrame.to_csv`: the name of the first argument is now ``path_or_buf``, the order of subsequent arguments has changed, and the ``header`` argument now defaults to ``True``. (:issue:`19715`)
- :meth:`Categorical.from_codes` has deprecated providing float values for the ``codes`` argument. (:issue:`21767`)
- :meth:`pandas.concat` has deprecated the ``join_axes``-keyword. Instead, use :meth:`DataFrame.reindex` or :meth:`DataFrame.reindex_like` on the result (:issue:`21951`)
- :func:`pandas.read_table` is deprecated. Instead, use :func:`pandas.read_csv` passing ``sep='\t'`` if necessary (:issue:`21948`)
- :meth:`Series.str.cat` has deprecated using arbitrary list-likes *within* list-likes. A list-like container may still contain
many ``Series``, ``Index`` or 1-dimensional ``np.ndarray``, or alternatively, only scalar values. (:issue:`21950`)

.. _whatsnew_0240.prior_deprecations:

Expand All @@ -526,6 +528,7 @@ Removal of prior version deprecations/changes
- Several private functions were removed from the (non-public) module ``pandas.core.common`` (:issue:`22001`)
- Removal of the previously deprecated module ``pandas.core.datetools`` (:issue:`14105`, :issue:`14094`)
- Strings passed into :meth:`DataFrame.groupby` that refer to both column and index levels will raise a ``ValueError`` (:issue:`14432`)
- :meth:`Index.repeat` and :meth:`MultiIndex.repeat` have renamed the ``n`` argument to ``repeats`` (:issue:`14645`)
-

.. _whatsnew_0240.performance:
Expand Down Expand Up @@ -583,6 +586,7 @@ Datetimelike
- Bug in :class:`DataFrame` comparisons against ``Timestamp``-like objects failing to raise ``TypeError`` for inequality checks with mismatched types (:issue:`8932`,:issue:`22163`)
- Bug in :class:`DataFrame` with mixed dtypes including ``datetime64[ns]`` incorrectly raising ``TypeError`` on equality comparisons (:issue:`13128`,:issue:`22163`)
- Bug in :meth:`DataFrame.eq` comparison against ``NaT`` incorrectly returning ``True`` or ``NaN`` (:issue:`15697`,:issue:`22163`)
- Bug in :class:`DatetimeIndex` subtraction that incorrectly failed to raise ``OverflowError`` (:issue:`22492`, :issue:`22508`)

Timedelta
^^^^^^^^^
Expand Down Expand Up @@ -664,6 +668,7 @@ Indexing
- Fixed ``DataFrame[np.nan]`` when columns are non-unique (:issue:`21428`)
- Bug when indexing :class:`DatetimeIndex` with nanosecond resolution dates and timezones (:issue:`11679`)
- Bug where indexing with a Numpy array containing negative values would mutate the indexer (:issue:`21867`)
- Bug where mixed indexes wouldn't allow integers for ``.at`` (:issue:`19860`)
- ``Float64Index.get_loc`` now raises ``KeyError`` when boolean key passed. (:issue:`19087`)

Missing
Expand Down Expand Up @@ -704,6 +709,7 @@ Groupby/Resample/Rolling
- Multiple bugs in :func:`pandas.core.Rolling.min` with ``closed='left'`` and a
datetime-like index leading to incorrect results and also segfault. (:issue:`21704`)
- Bug in :meth:`Resampler.apply` when passing positional arguments to applied func (:issue:`14615`).
- Bug in :meth:`Series.resample` when passing ``numpy.timedelta64`` to ``loffset`` kwarg (:issue:`7687`).

Sparse
^^^^^^
Expand Down Expand Up @@ -736,9 +742,10 @@ Build Changes
Other
^^^^^

- :meth: `~pandas.io.formats.style.Styler.background_gradient` now takes a ``text_color_threshold`` parameter to automatically lighten the text color based on the luminance of the background color. This improves readability with dark background colors without the need to limit the background colormap range. (:issue:`21258`)
- :meth:`~pandas.io.formats.style.Styler.background_gradient` now takes a ``text_color_threshold`` parameter to automatically lighten the text color based on the luminance of the background color. This improves readability with dark background colors without the need to limit the background colormap range. (:issue:`21258`)
- Require at least 0.28.2 version of ``cython`` to support read-only memoryviews (:issue:`21688`)
- :meth: `~pandas.io.formats.style.Styler.background_gradient` now also supports tablewise application (in addition to rowwise and columnwise) with ``axis=None`` (:issue:`15204`)
- :meth:`~pandas.io.formats.style.Styler.background_gradient` now also supports tablewise application (in addition to rowwise and columnwise) with ``axis=None`` (:issue:`15204`)
- :meth:`~pandas.io.formats.style.Styler.bar` now also supports tablewise application (in addition to rowwise and columnwise) with ``axis=None`` and setting clipping range with ``vmin`` and ``vmax`` (:issue:`21548` and :issue:`21526`). ``NaN`` values are also handled properly.
-
-
-
5 changes: 3 additions & 2 deletions pandas/_libs/algos_common_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,9 @@ cpdef map_indices_{{name}}(ndarray[{{c_type}}] index):

Better to do this with Cython because of the enormous speed boost.
"""
cdef Py_ssize_t i, length
cdef dict result = {}
cdef:
Py_ssize_t i, length
dict result = {}

length = len(index)

Expand Down
5 changes: 1 addition & 4 deletions pandas/_libs/index.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -104,10 +104,7 @@ cdef class IndexEngine:
loc = self.get_loc(key)
value = convert_scalar(arr, value)

if PySlice_Check(loc) or util.is_array(loc):
arr[loc] = value
else:
util.set_value_at(arr, loc, value)
arr[loc] = value

cpdef get_loc(self, object val):
if is_definitely_invalid_key(val):
Expand Down
12 changes: 3 additions & 9 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -492,9 +492,7 @@ def astype_intsafe(ndarray[object] arr, new_dtype):
if is_datelike and checknull(v):
result[i] = NPY_NAT
else:
# we can use the unsafe version because we know `result` is mutable
# since it was created from `np.empty`
util.set_value_at_unsafe(result, i, v)
result[i] = v

return result

Expand All @@ -505,9 +503,7 @@ cpdef ndarray[object] astype_unicode(ndarray arr):
ndarray[object] result = np.empty(n, dtype=object)

for i in range(n):
# we can use the unsafe version because we know `result` is mutable
# since it was created from `np.empty`
util.set_value_at_unsafe(result, i, unicode(arr[i]))
result[i] = unicode(arr[i])

return result

Expand All @@ -518,9 +514,7 @@ cpdef ndarray[object] astype_str(ndarray arr):
ndarray[object] result = np.empty(n, dtype=object)

for i in range(n):
# we can use the unsafe version because we know `result` is mutable
# since it was created from `np.empty`
util.set_value_at_unsafe(result, i, str(arr[i]))
result[i] = str(arr[i])

return result

Expand Down
41 changes: 22 additions & 19 deletions pandas/_libs/parsers.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ cdef extern from "Python.h":

import numpy as np
cimport numpy as cnp
from numpy cimport ndarray, uint8_t, uint64_t, int64_t
from numpy cimport ndarray, uint8_t, uint64_t, int64_t, float64_t
cnp.import_array()

from util cimport UINT64_MAX, INT64_MAX, INT64_MIN
Expand Down Expand Up @@ -694,7 +694,7 @@ cdef class TextReader:
if ptr == NULL:
if not os.path.exists(source):
raise compat.FileNotFoundError(
'File %s does not exist' % source)
'File {source} does not exist'.format(source=source))
raise IOError('Initializing from file failed')

self.parser.source = ptr
Expand Down Expand Up @@ -772,9 +772,10 @@ cdef class TextReader:

if name == '':
if self.has_mi_columns:
name = 'Unnamed: %d_level_%d' % (i, level)
name = ('Unnamed: {i}_level_{lvl}'
.format(i=i, lvl=level))
else:
name = 'Unnamed: %d' % i
name = 'Unnamed: {i}'.format(i=i)
unnamed_count += 1

count = counts.get(name, 0)
Expand Down Expand Up @@ -849,8 +850,8 @@ cdef class TextReader:
# 'data has %d fields'
# % (passed_count, field_count))

if self.has_usecols and self.allow_leading_cols and \
not callable(self.usecols):
if (self.has_usecols and self.allow_leading_cols and
not callable(self.usecols)):
nuse = len(self.usecols)
if nuse == passed_count:
self.leading_cols = 0
Expand Down Expand Up @@ -1027,17 +1028,19 @@ cdef class TextReader:

if self.table_width - self.leading_cols > num_cols:
raise ParserError(
"Too many columns specified: expected %s and found %s" %
(self.table_width - self.leading_cols, num_cols))
"Too many columns specified: expected {expected} and "
"found {found}"
.format(expected=self.table_width - self.leading_cols,
found=num_cols))

results = {}
nused = 0
for i in range(self.table_width):
if i < self.leading_cols:
# Pass through leading columns always
name = i
elif self.usecols and not callable(self.usecols) and \
nused == len(self.usecols):
elif (self.usecols and not callable(self.usecols) and
nused == len(self.usecols)):
# Once we've gathered all requested columns, stop. GH5766
break
else:
Expand Down Expand Up @@ -1103,7 +1106,7 @@ cdef class TextReader:
col_res = _maybe_upcast(col_res)

if col_res is None:
raise ParserError('Unable to parse column %d' % i)
raise ParserError('Unable to parse column {i}'.format(i=i))

results[i] = col_res

Expand Down Expand Up @@ -1222,8 +1225,8 @@ cdef class TextReader:
elif dtype.kind == 'U':
width = dtype.itemsize
if width > 0:
raise TypeError("the dtype %s is not "
"supported for parsing" % dtype)
raise TypeError("the dtype {dtype} is not "
"supported for parsing".format(dtype=dtype))

# unicode variable width
return self._string_convert(i, start, end, na_filter,
Expand All @@ -1241,12 +1244,12 @@ cdef class TextReader:
return self._string_convert(i, start, end, na_filter,
na_hashset)
elif is_datetime64_dtype(dtype):
raise TypeError("the dtype %s is not supported "
raise TypeError("the dtype {dtype} is not supported "
"for parsing, pass this column "
"using parse_dates instead" % dtype)
"using parse_dates instead".format(dtype=dtype))
else:
raise TypeError("the dtype %s is not "
"supported for parsing" % dtype)
raise TypeError("the dtype {dtype} is not "
"supported for parsing".format(dtype=dtype))

cdef _string_convert(self, Py_ssize_t i, int64_t start, int64_t end,
bint na_filter, kh_str_t *na_hashset):
Expand Down Expand Up @@ -2058,7 +2061,7 @@ cdef kh_float64_t* kset_float64_from_list(values) except NULL:
khiter_t k
kh_float64_t *table
int ret = 0
cnp.float64_t val
float64_t val
object value

table = kh_init_float64()
Expand Down Expand Up @@ -2101,7 +2104,7 @@ cdef raise_parser_error(object base, parser_t *parser):
Py_XDECREF(type)
raise old_exc

message = '%s. C error: ' % base
message = '{base}. C error: '.format(base=base)
if parser.error_msg != NULL:
if PY3:
message += parser.error_msg.decode('utf-8')
Expand Down
7 changes: 2 additions & 5 deletions pandas/_libs/reduction.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,6 @@ cnp.import_array()
cimport util
from lib import maybe_convert_objects

is_numpy_prior_1_6_2 = LooseVersion(np.__version__) < '1.6.2'


cdef _get_result_array(object obj, Py_ssize_t size, Py_ssize_t cnt):

Expand Down Expand Up @@ -282,8 +280,7 @@ cdef class SeriesBinGrouper:
result = _get_result_array(res,
self.ngroups,
len(self.dummy_arr))

util.assign_value_1d(result, i, res)
result[i] = res

islider.advance(group_size)
vslider.advance(group_size)
Expand Down Expand Up @@ -408,7 +405,7 @@ cdef class SeriesGrouper:
self.ngroups,
len(self.dummy_arr))

util.assign_value_1d(result, lab, res)
result[lab] = res
counts[lab] = group_size
islider.advance(group_size)
vslider.advance(group_size)
Expand Down
31 changes: 0 additions & 31 deletions pandas/_libs/src/numpy_helper.h

This file was deleted.

7 changes: 3 additions & 4 deletions pandas/_libs/tslib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,7 @@ from tslibs.nattype cimport checknull_with_nat, NPY_NAT

from tslibs.offsets cimport to_offset

from tslibs.timestamps cimport (create_timestamp_from_ts,
_NS_UPPER_BOUND, _NS_LOWER_BOUND)
from tslibs.timestamps cimport create_timestamp_from_ts
from tslibs.timestamps import Timestamp


Expand Down Expand Up @@ -350,8 +349,8 @@ cpdef array_with_unit_to_datetime(ndarray values, unit, errors='coerce'):
# check the bounds
if not need_to_iterate:

if ((fvalues < _NS_LOWER_BOUND).any()
or (fvalues > _NS_UPPER_BOUND).any()):
if ((fvalues < Timestamp.min.value).any()
or (fvalues > Timestamp.max.value).any()):
raise OutOfBoundsDatetime("cannot convert input with unit "
"'{unit}'".format(unit=unit))
result = (iresult * m).astype('M8[ns]')
Expand Down
2 changes: 0 additions & 2 deletions pandas/_libs/tslibs/timestamps.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,3 @@ from np_datetime cimport npy_datetimestruct
cdef object create_timestamp_from_ts(int64_t value,
npy_datetimestruct dts,
object tz, object freq)

cdef int64_t _NS_UPPER_BOUND, _NS_LOWER_BOUND
Loading

0 comments on commit 144ef86

Please sign in to comment.