Merge branch 'master' into excel-tables-pandas-dev#24862

tdamsma · Jan 27, 2019 · 5f9d664 · 5f9d664
2 parents 32f10e5 + 2b16e2e
commit 5f9d664
Show file tree

Hide file tree

Showing 90 changed files with 3,303 additions and 1,635 deletions.
diff --git a/.gitignore b/.gitignore
@@ -101,14 +101,14 @@ asv_bench/pandas/
 # Documentation generated files #
 #################################
 doc/source/generated
-doc/source/api/generated
+doc/source/user_guide/styled.xlsx
+doc/source/reference/api
 doc/source/_static
 doc/source/vbench
 doc/source/vbench.rst
 doc/source/index.rst
 doc/build/html/index.html
 # Windows specific leftover:
 doc/tmp.sv
-doc/source/styled.xlsx
 env/
 doc/source/savefig/
diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py
@@ -223,12 +223,19 @@ class CategoricalSlicing(object):
 
     def setup(self, index):
         N = 10**6
-        values = list('a' * N + 'b' * N + 'c' * N)
-        indices = {
-            'monotonic_incr': pd.Categorical(values),
-            'monotonic_decr': pd.Categorical(reversed(values)),
-            'non_monotonic': pd.Categorical(list('abc' * N))}
-        self.data = indices[index]
+        categories = ['a', 'b', 'c']
+        values = [0] * N + [1] * N + [2] * N
+        if index == 'monotonic_incr':
+            self.data = pd.Categorical.from_codes(values,
+                                                  categories=categories)
+        elif index == 'monotonic_decr':
+            self.data = pd.Categorical.from_codes(list(reversed(values)),
+                                                  categories=categories)
+        elif index == 'non_monotonic':
+            self.data = pd.Categorical.from_codes([0, 1, 2] * N,
+                                                  categories=categories)
+        else:
+            raise ValueError('Invalid index param: {}'.format(index))
 
         self.scalar = 10000
         self.list = list(range(10000))

diff --git a/asv_bench/benchmarks/ctors.py b/asv_bench/benchmarks/ctors.py
@@ -72,7 +72,7 @@ class SeriesDtypesConstructors(object):
 
     def setup(self):
         N = 10**4
-        self.arr = np.random.randn(N, N)
+        self.arr = np.random.randn(N)
         self.arr_str = np.array(['foo', 'bar', 'baz'], dtype=object)
         self.s = Series([Timestamp('20110101'), Timestamp('20120101'),
                          Timestamp('20130101')] * N * 10)

diff --git a/asv_bench/benchmarks/index_object.py b/asv_bench/benchmarks/index_object.py
@@ -138,7 +138,8 @@ def setup(self, dtype):
         self.sorted = self.idx.sort_values()
         half = N // 2
         self.non_unique = self.idx[:half].append(self.idx[:half])
-        self.non_unique_sorted = self.sorted[:half].append(self.sorted[:half])
+        self.non_unique_sorted = (self.sorted[:half].append(self.sorted[:half])
+                                  .sort_values())
         self.key = self.sorted[N // 4]
 
     def time_boolean_array(self, dtype):

diff --git a/doc/make.py b/doc/make.py
@@ -53,7 +53,7 @@ def __init__(self, num_jobs=0, include_api=True, single_doc=None,
         if single_doc and single_doc.endswith('.rst'):
             self.single_doc_html = os.path.splitext(single_doc)[0] + '.html'
         elif single_doc:
-            self.single_doc_html = 'api/generated/pandas.{}.html'.format(
+            self.single_doc_html = 'reference/api/pandas.{}.html'.format(
                 single_doc)
 
     def _process_single_doc(self, single_doc):
@@ -63,7 +63,7 @@ def _process_single_doc(self, single_doc):
 
         For example, categorical.rst or pandas.DataFrame.head. For the latter,
         return the corresponding file path
-        (e.g. generated/pandas.DataFrame.head.rst).
+        (e.g. reference/api/pandas.DataFrame.head.rst).
         """
         base_name, extension = os.path.splitext(single_doc)
         if extension in ('.rst', '.ipynb'):
@@ -121,8 +121,6 @@ def _sphinx_build(self, kind):
             raise ValueError('kind must be html or latex, '
                              'not {}'.format(kind))
 
-        self.clean()
-
         cmd = ['sphinx-build', '-b', kind]
         if self.num_jobs:
             cmd += ['-j', str(self.num_jobs)]
@@ -260,7 +258,7 @@ def clean():
         Clean documentation generated files.
         """
         shutil.rmtree(BUILD_PATH, ignore_errors=True)
-        shutil.rmtree(os.path.join(SOURCE_PATH, 'api', 'generated'),
+        shutil.rmtree(os.path.join(SOURCE_PATH, 'reference', 'api'),
                       ignore_errors=True)
 
     def zip_html(self):

diff --git a/doc/redirects.csv b/doc/redirects.csv
diff --git a/doc/source/comparison_with_r.rst → ..._started/comparison/comparison_with_r.rst b/doc/source/comparison_with_r.rst → ..._started/comparison/comparison_with_r.rst
diff --git a/doc/source/comparison_with_sas.rst → ...tarted/comparison/comparison_with_sas.rst b/doc/source/comparison_with_sas.rst → ...tarted/comparison/comparison_with_sas.rst
diff --git a/doc/source/comparison_with_sql.rst → ...tarted/comparison/comparison_with_sql.rst b/doc/source/comparison_with_sql.rst → ...tarted/comparison/comparison_with_sql.rst
diff --git a/doc/source/comparison_with_stata.rst → ...rted/comparison/comparison_with_stata.rst b/doc/source/comparison_with_stata.rst → ...rted/comparison/comparison_with_stata.rst
diff --git a/doc/source/getting_started/comparison/index.rst b/doc/source/getting_started/comparison/index.rst
@@ -0,0 +1,15 @@
+{{ header }}
+
+.. _comparison:
+
+===========================
+Comparison with other tools
+===========================
+
+.. toctree::
+    :maxdepth: 2
+
+    comparison_with_r
+    comparison_with_sql
+    comparison_with_sas
+    comparison_with_stata
diff --git a/doc/source/getting_started/index.rst b/doc/source/getting_started/index.rst
@@ -13,4 +13,5 @@ Getting started
     10min
     basics
     dsintro
+    comparison/index
     tutorials
diff --git a/doc/source/getting_started/overview.rst b/doc/source/getting_started/overview.rst
@@ -6,25 +6,80 @@
 Package overview
 ****************
 
-:mod:`pandas` is an open source, BSD-licensed library providing high-performance,
-easy-to-use data structures and data analysis tools for the `Python <https://www.python.org/>`__
-programming language.
-
-:mod:`pandas` consists of the following elements:
-
-* A set of labeled array data structures, the primary of which are
-  Series and DataFrame.
-* Index objects enabling both simple axis indexing and multi-level /
-  hierarchical axis indexing.
-* An integrated group by engine for aggregating and transforming data sets.
-* Date range generation (date_range) and custom date offsets enabling the
-  implementation of customized frequencies.
-* Input/Output tools: loading tabular data from flat files (CSV, delimited,
-  Excel 2003), and saving and loading pandas objects from the fast and
-  efficient PyTables/HDF5 format.
-* Memory-efficient "sparse" versions of the standard data structures for storing
-  data that is mostly missing or mostly constant (some fixed value).
-* Moving window statistics (rolling mean, rolling standard deviation, etc.).
+**pandas** is a `Python <https://www.python.org>`__ package providing fast,
+flexible, and expressive data structures designed to make working with
+"relational" or "labeled" data both easy and intuitive. It aims to be the
+fundamental high-level building block for doing practical, **real world** data
+analysis in Python. Additionally, it has the broader goal of becoming **the
+most powerful and flexible open source data analysis / manipulation tool
+available in any language**. It is already well on its way toward this goal.
+
+pandas is well suited for many different kinds of data:
+
+  - Tabular data with heterogeneously-typed columns, as in an SQL table or
+    Excel spreadsheet
+  - Ordered and unordered (not necessarily fixed-frequency) time series data
+  - Arbitrary matrix data (homogeneously typed or heterogeneous) with row and
+    column labels
+  - Any other form of observational / statistical data sets. The data actually
+    need not be labeled at all to be placed into a pandas data structure
+
+The two primary data structures of pandas, :class:`Series` (1-dimensional)
+and :class:`DataFrame` (2-dimensional), handle the vast majority of typical use
+cases in finance, statistics, social science, and many areas of
+engineering. For R users, :class:`DataFrame` provides everything that R's
+``data.frame`` provides and much more. pandas is built on top of `NumPy
+<https://www.numpy.org>`__ and is intended to integrate well within a scientific
+computing environment with many other 3rd party libraries.
+
+Here are just a few of the things that pandas does well:
+
+  - Easy handling of **missing data** (represented as NaN) in floating point as
+    well as non-floating point data
+  - Size mutability: columns can be **inserted and deleted** from DataFrame and
+    higher dimensional objects
+  - Automatic and explicit **data alignment**: objects can be explicitly
+    aligned to a set of labels, or the user can simply ignore the labels and
+    let ``Series``, ``DataFrame``, etc. automatically align the data for you in
+    computations
+  - Powerful, flexible **group by** functionality to perform
+    split-apply-combine operations on data sets, for both aggregating and
+    transforming data
+  - Make it **easy to convert** ragged, differently-indexed data in other
+    Python and NumPy data structures into DataFrame objects
+  - Intelligent label-based **slicing**, **fancy indexing**, and **subsetting**
+    of large data sets
+  - Intuitive **merging** and **joining** data sets
+  - Flexible **reshaping** and pivoting of data sets
+  - **Hierarchical** labeling of axes (possible to have multiple labels per
+    tick)
+  - Robust IO tools for loading data from **flat files** (CSV and delimited),
+    Excel files, databases, and saving / loading data from the ultrafast **HDF5
+    format**
+  - **Time series**-specific functionality: date range generation and frequency
+    conversion, moving window statistics, moving window linear regressions,
+    date shifting and lagging, etc.
+
+Many of these principles are here to address the shortcomings frequently
+experienced using other languages / scientific research environments. For data
+scientists, working with data is typically divided into multiple stages:
+munging and cleaning data, analyzing / modeling it, then organizing the results
+of the analysis into a form suitable for plotting or tabular display. pandas
+is the ideal tool for all of these tasks.
+
+Some other notes:
+
+ - pandas is **fast**. Many of the low-level algorithmic bits have been
+   extensively tweaked in `Cython <https://cython.org>`__ code. However, as with
+   anything else, generalization usually sacrifices performance. So if you focus
+   on one feature for your application you may be able to create a faster
+   specialized tool.
+
+ - pandas is a dependency of `statsmodels
+   <https://www.statsmodels.org/stable/index.html>`__, making it an important part of the
+   statistical computing ecosystem in Python.
+
+ - pandas has been used extensively in production in financial applications.
 
 Data Structures
 ---------------

diff --git a/doc/source/index.rst.template b/doc/source/index.rst.template
@@ -1,141 +1,52 @@
 .. pandas documentation master file, created by
 
+.. module:: pandas
+
 *********************************************
 pandas: powerful Python data analysis toolkit
 *********************************************
 
-`PDF Version <pandas.pdf>`__
-
-`Zipped HTML <pandas.zip>`__
-
-.. module:: pandas
-
 **Date**: |today| **Version**: |version|
 
-**Binary Installers:** https://pypi.org/project/pandas
-
-**Source Repository:** https://github.com/pandas-dev/pandas
-
-**Issues & Ideas:** https://github.com/pandas-dev/pandas/issues
-
-**Q&A Support:** https://stackoverflow.com/questions/tagged/pandas
-
-**Developer Mailing List:** https://groups.google.com/forum/#!forum/pydata
-
-**pandas** is a `Python <https://www.python.org>`__ package providing fast,
-flexible, and expressive data structures designed to make working with
-"relational" or "labeled" data both easy and intuitive. It aims to be the
-fundamental high-level building block for doing practical, **real world** data
-analysis in Python. Additionally, it has the broader goal of becoming **the
-most powerful and flexible open source data analysis / manipulation tool
-available in any language**. It is already well on its way toward this goal.
-
-pandas is well suited for many different kinds of data:
-
-  - Tabular data with heterogeneously-typed columns, as in an SQL table or
-    Excel spreadsheet
-  - Ordered and unordered (not necessarily fixed-frequency) time series data.
-  - Arbitrary matrix data (homogeneously typed or heterogeneous) with row and
-    column labels
-  - Any other form of observational / statistical data sets. The data actually
-    need not be labeled at all to be placed into a pandas data structure
-
-The two primary data structures of pandas, :class:`Series` (1-dimensional)
-and :class:`DataFrame` (2-dimensional), handle the vast majority of typical use
-cases in finance, statistics, social science, and many areas of
-engineering. For R users, :class:`DataFrame` provides everything that R's
-``data.frame`` provides and much more. pandas is built on top of `NumPy
-<https://www.numpy.org>`__ and is intended to integrate well within a scientific
-computing environment with many other 3rd party libraries.
-
-Here are just a few of the things that pandas does well:
-
-  - Easy handling of **missing data** (represented as NaN) in floating point as
-    well as non-floating point data
-  - Size mutability: columns can be **inserted and deleted** from DataFrame and
-    higher dimensional objects
-  - Automatic and explicit **data alignment**: objects can be explicitly
-    aligned to a set of labels, or the user can simply ignore the labels and
-    let `Series`, `DataFrame`, etc. automatically align the data for you in
-    computations
-  - Powerful, flexible **group by** functionality to perform
-    split-apply-combine operations on data sets, for both aggregating and
-    transforming data
-  - Make it **easy to convert** ragged, differently-indexed data in other
-    Python and NumPy data structures into DataFrame objects
-  - Intelligent label-based **slicing**, **fancy indexing**, and **subsetting**
-    of large data sets
-  - Intuitive **merging** and **joining** data sets
-  - Flexible **reshaping** and pivoting of data sets
-  - **Hierarchical** labeling of axes (possible to have multiple labels per
-    tick)
-  - Robust IO tools for loading data from **flat files** (CSV and delimited),
-    Excel files, databases, and saving / loading data from the ultrafast **HDF5
-    format**
-  - **Time series**-specific functionality: date range generation and frequency
-    conversion, moving window statistics, moving window linear regressions,
-    date shifting and lagging, etc.
-
-Many of these principles are here to address the shortcomings frequently
-experienced using other languages / scientific research environments. For data
-scientists, working with data is typically divided into multiple stages:
-munging and cleaning data, analyzing / modeling it, then organizing the results
-of the analysis into a form suitable for plotting or tabular display. pandas
-is the ideal tool for all of these tasks.
-
-Some other notes
-
- - pandas is **fast**. Many of the low-level algorithmic bits have been
-   extensively tweaked in `Cython <https://cython.org>`__ code. However, as with
-   anything else generalization usually sacrifices performance. So if you focus
-   on one feature for your application you may be able to create a faster
-   specialized tool.
-
- - pandas is a dependency of `statsmodels
-   <https://www.statsmodels.org/stable/index.html>`__, making it an important part of the
-   statistical computing ecosystem in Python.
-
- - pandas has been used extensively in production in financial applications.
-
-.. note::
+**Download documentation**: `PDF Version <pandas.pdf>`__ | `Zipped HTML <pandas.zip>`__
 
-   This documentation assumes general familiarity with NumPy. If you haven't
-   used NumPy much or at all, do invest some time in `learning about NumPy
-   <https://docs.scipy.org>`__ first.
+**Useful links**:
+`Binary Installers <https://pypi.org/project/pandas>`__ |
+`Source Repository <https://github.com/pandas-dev/pandas>`__ |
+`Issues & Ideas <https://github.com/pandas-dev/pandas/issues>`__ |
+`Q&A Support <https://stackoverflow.com/questions/tagged/pandas>`__ |
+`Mailing List <https://groups.google.com/forum/#!forum/pydata>`__
 
-See the package overview for more detail about what's in the library.
+:mod:`pandas` is an open source, BSD-licensed library providing high-performance,
+easy-to-use data structures and data analysis tools for the `Python <https://www.python.org/>`__
+programming language.
 
+See the :ref:`overview` for more detail about what's in the library.
 
 {% if single_doc and single_doc.endswith('.rst') -%}
 .. toctree::
-    :maxdepth: 4
+    :maxdepth: 2
 
     {{ single_doc[:-4] }}
 {% elif single_doc %}
 .. autosummary::
-    :toctree: api/generated/
+    :toctree: reference/api/
 
     {{ single_doc }}
 {% else -%}
 .. toctree::
-    :maxdepth: 4
+    :maxdepth: 2
 {% endif %}
 
     {% if not single_doc -%}
-    What's New <whatsnew/v0.24.0>
+    What's New in 0.25.0 <whatsnew/v0.25.0>
     install
     getting_started/index
-    cookbook
     user_guide/index
-    r_interface
     ecosystem
-    comparison_with_r
-    comparison_with_sql
-    comparison_with_sas
-    comparison_with_stata
     {% endif -%}
     {% if include_api -%}
-    api/index
+    reference/index
     {% endif -%}
     {% if not single_doc -%}
     development/index
-Original file line number
+Diff line change
@@ Expand Up / @@ -13,4 +13,5 @@ Getting started @@
        10min
         basics
         dsintro
+        comparison/index
         tutorials