Merge branch 'master' into fixturize_frame_tests_1
h-vetinari authored Oct 1, 2018
2 parents 733b889 + 2f1b842 commit 16a5297
Showing 63 changed files with 749 additions and 254 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -56,8 +56,8 @@
<tr>
<td></td>
<td>
<a href="https://ci.appveyor.com/project/pandas-dev/pandas">
<img src="https://ci.appveyor.com/api/projects/status/86vn83mxgnl4xf1s/branch/master?svg=true" alt="appveyor build status" />
<a href="https://dev.azure.com/pandas-dev/pandas/_build/latest?definitionId=1&branch=master">
<img src="https://dev.azure.com/pandas-dev/pandas/_apis/build/status/pandas-dev.pandas?branch=master" alt="Azure Pipelines build status" />
</a>
</td>
</tr>
2 changes: 1 addition & 1 deletion asv_bench/benchmarks/algorithms.py
@@ -9,7 +9,7 @@
try:
hashing = import_module(imp)
break
except:
except (ImportError, TypeError, ValueError):
pass

from .pandas_vb_common import setup # noqa
13 changes: 10 additions & 3 deletions asv_bench/benchmarks/frame_methods.py
@@ -505,14 +505,21 @@ class NSort(object):
param_names = ['keep']

def setup(self, keep):
self.df = DataFrame(np.random.randn(1000, 3), columns=list('ABC'))
self.df = DataFrame(np.random.randn(100000, 3),
columns=list('ABC'))

def time_nlargest(self, keep):
def time_nlargest_one_column(self, keep):
self.df.nlargest(100, 'A', keep=keep)

def time_nsmallest(self, keep):
def time_nlargest_two_columns(self, keep):
self.df.nlargest(100, ['A', 'B'], keep=keep)

def time_nsmallest_one_column(self, keep):
self.df.nsmallest(100, 'A', keep=keep)

def time_nsmallest_two_columns(self, keep):
self.df.nsmallest(100, ['A', 'B'], keep=keep)


class Describe(object):

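For reference, the split benchmarks above exercise both call forms of ``nlargest``/``nsmallest``. A minimal sketch of those calls outside the ASV harness (the frame size here is illustrative, not the benchmark's 100000 rows):

    import numpy as np
    from pandas import DataFrame

    # Small stand-in for the benchmark frame.
    df = DataFrame(np.random.randn(1000, 3), columns=list('ABC'))

    # One-column form: the 100 rows with the largest / smallest values in 'A'.
    df.nlargest(100, 'A', keep='first')
    df.nsmallest(100, 'A', keep='first')

    # Two-column form: ties in 'A' are broken by 'B', matching the new
    # *_two_columns benchmark cases.
    df.nlargest(100, ['A', 'B'], keep='first')
    df.nsmallest(100, ['A', 'B'], keep='first')
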
6 changes: 2 additions & 4 deletions asv_bench/benchmarks/io/csv.py
@@ -1,11 +1,9 @@
import random
import timeit
import string

import numpy as np
import pandas.util.testing as tm
from pandas import DataFrame, Categorical, date_range, read_csv
from pandas.compat import PY2
from pandas.compat import cStringIO as StringIO

from ..pandas_vb_common import setup, BaseIO # noqa
@@ -181,8 +179,8 @@ def time_read_csv(self, sep, decimal, float_precision):
names=list('abc'), float_precision=float_precision)

def time_read_csv_python_engine(self, sep, decimal, float_precision):
read_csv(self.data(self.StringIO_input), sep=sep, header=None, engine='python',
float_precision=None, names=list('abc'))
read_csv(self.data(self.StringIO_input), sep=sep, header=None,
engine='python', float_precision=None, names=list('abc'))


class ReadCSVCategorical(BaseIO):
2 changes: 1 addition & 1 deletion asv_bench/benchmarks/join_merge.py
@@ -29,7 +29,7 @@ def setup(self):
try:
with warnings.catch_warnings(record=True):
self.mdf1.consolidate(inplace=True)
except:
except (AttributeError, TypeError):
pass
self.mdf2 = self.mdf1.copy()
self.mdf2.index = self.df2.index
5 changes: 2 additions & 3 deletions asv_bench/benchmarks/pandas_vb_common.py
@@ -2,14 +2,13 @@
from importlib import import_module

import numpy as np
from pandas import Panel

# Compatibility import for lib
for imp in ['pandas._libs.lib', 'pandas.lib']:
try:
lib = import_module(imp)
break
except:
except (ImportError, TypeError, ValueError):
pass

numeric_dtypes = [np.int64, np.int32, np.uint32, np.uint64, np.float32,
@@ -34,7 +33,7 @@ def remove(self, f):
"""Remove created files"""
try:
os.remove(f)
except:
except OSError:
# On Windows, attempting to remove a file that is in use
# causes an exception to be raised
pass
4 changes: 2 additions & 2 deletions asv_bench/benchmarks/stat_ops.py
@@ -18,7 +18,7 @@ def setup(self, op, dtype, axis, use_bottleneck):
df = pd.DataFrame(np.random.randn(100000, 4)).astype(dtype)
try:
pd.options.compute.use_bottleneck = use_bottleneck
except:
except TypeError:
from pandas.core import nanops
nanops._USE_BOTTLENECK = use_bottleneck
self.df_func = getattr(df, op)
@@ -56,7 +56,7 @@ def setup(self, op, dtype, use_bottleneck):
s = pd.Series(np.random.randn(100000)).astype(dtype)
try:
pd.options.compute.use_bottleneck = use_bottleneck
except:
except TypeError:
from pandas.core import nanops
nanops._USE_BOTTLENECK = use_bottleneck
self.s_func = getattr(s, op)
1 change: 0 additions & 1 deletion asv_bench/benchmarks/timeseries.py
@@ -1,4 +1,3 @@
import warnings
from datetime import timedelta

import numpy as np
2 changes: 1 addition & 1 deletion ci/doctests.sh
@@ -21,7 +21,7 @@ if [ "$DOCTEST" ]; then

# DataFrame / Series docstrings
pytest --doctest-modules -v pandas/core/frame.py \
-k"-axes -combine -isin -itertuples -join -nlargest -nsmallest -nunique -pivot_table -quantile -query -reindex -reindex_axis -replace -round -set_index -stack -to_dict -to_stata"
-k"-axes -combine -itertuples -join -nlargest -nsmallest -nunique -pivot_table -quantile -query -reindex -reindex_axis -replace -round -set_index -stack -to_dict -to_stata"

if [ $? -ne "0" ]; then
RET=1
4 changes: 2 additions & 2 deletions ci/requirements-optional-pip.txt
@@ -14,7 +14,7 @@ lxml
matplotlib
nbsphinx
numexpr
openpyxl=2.5.5
openpyxl==2.5.5
pyarrow
pymysql
tables
@@ -28,4 +28,4 @@ statsmodels
xarray
xlrd
xlsxwriter
xlwt
xlwt
Binary file added doc/cheatsheet/Pandas_Cheat_Sheet_JA.pdf
Binary file not shown.
Binary file added doc/cheatsheet/Pandas_Cheat_Sheet_JA.pptx
Binary file not shown.
Binary file removed doc/cheatsheet/Pandas_Cheat_Sheet_JP.pdf
Binary file not shown.
Binary file removed doc/cheatsheet/Pandas_Cheat_Sheet_JP.pptx
Binary file not shown.
6 changes: 3 additions & 3 deletions doc/make.py
@@ -233,10 +233,10 @@ def _sphinx_build(self, kind):
'-b{}'.format(kind),
'-{}'.format(
'v' * self.verbosity) if self.verbosity else '',
'-d{}'.format(os.path.join(BUILD_PATH, 'doctrees')),
'-d"{}"'.format(os.path.join(BUILD_PATH, 'doctrees')),
'-Dexclude_patterns={}'.format(self.exclude_patterns),
SOURCE_PATH,
os.path.join(BUILD_PATH, kind))
'"{}"'.format(SOURCE_PATH),
'"{}"'.format(os.path.join(BUILD_PATH, kind)))

def _open_browser(self):
base_url = os.path.join('file://', DOC_PATH, 'build', 'html')
9 changes: 9 additions & 0 deletions doc/source/api.rst
@@ -2603,3 +2603,12 @@ objects.
generated/pandas.Series.ix
generated/pandas.Series.imag
generated/pandas.Series.real


.. Can't convince sphinx to generate toctree for this class attribute.
.. So we do it manually to avoid a warning
.. toctree::
:hidden:

generated/pandas.api.extensions.ExtensionDtype.na_value
2 changes: 1 addition & 1 deletion doc/source/basics.rst
@@ -1935,7 +1935,7 @@ NumPy's type-system for a few cases.
* :ref:`Categorical <categorical>`
* :ref:`Datetime with Timezone <timeseries.timezone_series>`
* :ref:`Period <timeseries.periods>`
* :ref:`Interval <advanced.indexing.intervallindex>`
* :ref:`Interval <indexing.intervallindex>`

Pandas uses the ``object`` dtype for storing strings.

15 changes: 15 additions & 0 deletions doc/source/computation.rst
@@ -153,6 +153,21 @@ Like ``cov``, ``corr`` also supports the optional ``min_periods`` keyword:
frame.corr(min_periods=12)
.. versionadded:: 0.24.0

The ``method`` argument can also be a callable for a generic correlation
calculation. In this case, it should be a single function
that produces a single value from two ndarray inputs. Suppose we wanted to
compute the correlation based on histogram intersection:

.. ipython:: python
# histogram intersection
histogram_intersection = lambda a, b: np.minimum(
np.true_divide(a, a.sum()), np.true_divide(b, b.sum())
).sum()
frame.corr(method=histogram_intersection)
A related method :meth:`~DataFrame.corrwith` is implemented on DataFrame to
compute the correlation between like-labeled Series contained in different
DataFrame objects.
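The new ``corr`` documentation above relies on ``method`` accepting a callable (added in 0.24.0, per the ``versionadded`` directive). A self-contained sketch of the same histogram-intersection computation, with ``frame`` replaced by an illustrative stand-in:

    import numpy as np
    import pandas as pd

    # Stand-in for the `frame` used in computation.rst.
    frame = pd.DataFrame(np.random.randn(1000, 5),
                         columns=['a', 'b', 'c', 'd', 'e'])

    def histogram_intersection(a, b):
        # Normalize each column to sum to 1, then sum the element-wise minima.
        return np.minimum(np.true_divide(a, a.sum()),
                          np.true_divide(b, b.sum())).sum()

    # The callable receives two ndarrays and must return a single value.
    frame.corr(method=histogram_intersection)
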
6 changes: 2 additions & 4 deletions doc/source/cookbook.rst
@@ -505,13 +505,11 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to
.. ipython:: python
df = pd.DataFrame({'A' : [1, 1, 2, 2], 'B' : [1, -1, 1, 2]})
gb = df.groupby('A')
def replace(g):
mask = g < 0
g.loc[mask] = g[~mask].mean()
return g
mask = g < 0
return g.where(mask, g[~mask].mean())
gb.transform(replace)
8 changes: 4 additions & 4 deletions doc/source/ecosystem.rst
@@ -73,8 +73,8 @@ large data to thin clients.
`seaborn <https://seaborn.pydata.org>`__
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Seaborn is a Python visualization library based on `matplotlib
<http://matplotlib.org>`__. It provides a high-level, dataset-oriented
Seaborn is a Python visualization library based on
`matplotlib <http://matplotlib.org>`__. It provides a high-level, dataset-oriented
interface for creating attractive statistical graphics. The plotting functions
in seaborn understand pandas objects and leverage pandas grouping operations
internally to support concise specification of complex visualizations. Seaborn
@@ -140,7 +140,7 @@ which are utilized by Jupyter Notebook for displaying
(Note: HTML tables may or may not be
compatible with non-HTML Jupyter output formats.)

See :ref:`Options and Settings <options>` and :ref:`<options.available>`
See :ref:`Options and Settings <options>` and :ref:`options.available <available>`
for pandas ``display.`` settings.

`quantopian/qgrid <https://github.com/quantopian/qgrid>`__
@@ -169,7 +169,7 @@ or the clipboard into a new pandas DataFrame via a sophisticated import wizard.
Most pandas classes, methods and data attributes can be autocompleted in
Spyder's `Editor <https://docs.spyder-ide.org/editor.html>`__ and
`IPython Console <https://docs.spyder-ide.org/ipythonconsole.html>`__,
and Spyder's `Help pane<https://docs.spyder-ide.org/help.html>`__ can retrieve
and Spyder's `Help pane <https://docs.spyder-ide.org/help.html>`__ can retrieve
and render Numpydoc documentation on pandas objects in rich text with Sphinx
both automatically and on-demand.

29 changes: 13 additions & 16 deletions doc/source/io.rst
@@ -66,16 +66,13 @@ The pandas I/O API is a set of top level ``reader`` functions accessed like
CSV & Text files
----------------

The two workhorse functions for reading text files (a.k.a. flat files) are
:func:`read_csv` and :func:`read_table`. They both use the same parsing code to
intelligently convert tabular data into a ``DataFrame`` object. See the
:ref:`cookbook<cookbook.csv>` for some advanced strategies.
The workhorse function for reading text files (a.k.a. flat files) is
:func:`read_csv`. See the :ref:`cookbook<cookbook.csv>` for some advanced strategies.

Parsing options
'''''''''''''''

The functions :func:`read_csv` and :func:`read_table` accept the following
common arguments:
:func:`read_csv` accepts the following common arguments:

Basic
+++++
@@ -780,8 +777,8 @@ Date Handling
Specifying Date Columns
+++++++++++++++++++++++

To better facilitate working with datetime data, :func:`read_csv` and
:func:`read_table` use the keyword arguments ``parse_dates`` and ``date_parser``
To better facilitate working with datetime data, :func:`read_csv`
uses the keyword arguments ``parse_dates`` and ``date_parser``
to allow users to specify a variety of columns and date/time formats to turn the
input text data into ``datetime`` objects.

@@ -1434,7 +1431,7 @@ Suppose you have data indexed by two columns:
print(open('data/mindex_ex.csv').read())
The ``index_col`` argument to ``read_csv`` and ``read_table`` can take a list of
The ``index_col`` argument to ``read_csv`` can take a list of
column numbers to turn multiple columns into a ``MultiIndex`` for the index of the
returned object:

@@ -1505,8 +1502,8 @@ class of the csv module. For this, you have to specify ``sep=None``.
.. ipython:: python
print(open('tmp2.sv').read())
pd.read_csv('tmp2.sv', sep=None, engine='python')
print(open('tmp2.sv').read())
pd.read_csv('tmp2.sv', sep=None, engine='python')
.. _io.multiple_files:

@@ -1528,16 +1525,16 @@ rather than reading the entire file into memory, such as the following:
.. ipython:: python
print(open('tmp.sv').read())
table = pd.read_table('tmp.sv', sep='|')
table = pd.read_csv('tmp.sv', sep='|')
table
By specifying a ``chunksize`` to ``read_csv`` or ``read_table``, the return
By specifying a ``chunksize`` to ``read_csv``, the return
value will be an iterable object of type ``TextFileReader``:

.. ipython:: python
reader = pd.read_table('tmp.sv', sep='|', chunksize=4)
reader = pd.read_csv('tmp.sv', sep='|', chunksize=4)
reader
for chunk in reader:
@@ -1548,7 +1545,7 @@ Specifying ``iterator=True`` will also return the ``TextFileReader`` object:

.. ipython:: python
reader = pd.read_table('tmp.sv', sep='|', iterator=True)
reader = pd.read_csv('tmp.sv', sep='|', iterator=True)
reader.get_chunk(5)
.. ipython:: python
@@ -3067,7 +3064,7 @@ Clipboard

A handy way to grab data is to use the :meth:`~DataFrame.read_clipboard` method,
which takes the contents of the clipboard buffer and passes them to the
``read_table`` method. For instance, you can copy the following text to the
``read_csv`` method. For instance, you can copy the following text to the
clipboard (CTRL-C on many operating systems):

.. code-block:: python
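The io.rst edits above consistently replace ``read_table`` with ``read_csv`` plus an explicit ``sep``. A minimal sketch of the chunked-reading pattern they describe (the file and its contents are made up for the example):

    import pandas as pd

    # Write a small '|'-separated file so the example is self-contained.
    with open('tmp.sv', 'w') as fh:
        fh.write('A|B|C\n1|2|3\n4|5|6\n7|8|9\n10|11|12\n')

    # read_csv with an explicit separator covers the old read_table use case.
    reader = pd.read_csv('tmp.sv', sep='|', chunksize=2)  # iterable TextFileReader
    for chunk in reader:
        print(chunk)

    # iterator=True also returns a TextFileReader; chunks are pulled on demand.
    reader = pd.read_csv('tmp.sv', sep='|', iterator=True)
    print(reader.get_chunk(3))
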
5 changes: 3 additions & 2 deletions doc/source/text.rst
@@ -312,14 +312,15 @@ All one-dimensional list-likes can be combined in a list-like container (includi
s
u
s.str.cat([u.values, ['A', 'B', 'C', 'D'], map(str, u.index)], na_rep='-')
s.str.cat([u.values,
u.index.astype(str).values], na_rep='-')
All elements must match in length to the calling ``Series`` (or ``Index``), except those having an index if ``join`` is not None:

.. ipython:: python
v
s.str.cat([u, v, ['A', 'B', 'C', 'D']], join='outer', na_rep='-')
s.str.cat([u, v], join='outer', na_rep='-')
If using ``join='right'`` on a list of ``others`` that contains different indexes,
the union of these indexes will be used as the basis for the final concatenation:
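The updated ``str.cat`` examples above lean on index alignment when ``join`` is given. A short sketch with made-up data (``s``, ``u`` and ``v`` here are stand-ins, not the Series defined in text.rst):

    import pandas as pd

    s = pd.Series(['a', 'b', 'c', 'd'])
    u = pd.Series(['x', 'y', 'z', 'w'], index=[3, 2, 1, 0])
    v = pd.Series(['1', '2', '3'], index=[0, 2, 4])

    # Equal-length array-likes without an index are concatenated positionally.
    s.str.cat([u.values, u.index.astype(str).values], na_rep='-')

    # With join='outer', the union of the indexes becomes the result index and
    # positions missing after alignment are filled with na_rep.
    s.str.cat([u, v], join='outer', na_rep='-')
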