CI,STYLE: add spell check? #38776

Merged: 2 commits, Dec 29, 2020
8 changes: 7 additions & 1 deletion .pre-commit-config.yaml
@@ -1,4 +1,4 @@
-minimum_pre_commit_version: '2.9.2'
+minimum_pre_commit_version: 2.9.2
repos:
- repo: https://github.com/python/black
rev: 20.8b1
@@ -168,3 +168,9 @@ repos:
exclude: ^LICENSES/|\.(html|csv|txt|svg|py)$
- id: trailing-whitespace
exclude: \.(html|svg)$
+- repo: https://github.com/codespell-project/codespell
+  rev: v2.0.0
+  hooks:
+  - id: codespell
+    types_or: [python, rst, markdown]
+    files: ^pandas/core/
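For anyone trying this locally: the hook can be exercised outside of a commit with `pre-commit run codespell --all-files`, or codespell can be invoked directly. A minimal sketch, assuming codespell v2.0.0 is installed and the working directory is the repo root:

```python
import subprocess

# Check the same tree the hook's `files` pattern targets; each finding
# is reported as "path:line: misspelling ==> suggestion".
result = subprocess.run(
    ["codespell", "pandas/core/"],
    capture_output=True,
    text=True,
)
print(result.stdout)
# codespell exits non-zero when it finds misspellings, which is what
# makes the pre-commit hook fail the commit.
```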
2 changes: 1 addition & 1 deletion pandas/core/arrays/_mixins.py
@@ -275,7 +275,7 @@ def fillna(
if method is not None:
func = missing.get_fill_func(method)
new_values = func(self._ndarray.copy(), limit=limit, mask=mask)
-# TODO: PandasArray didnt used to copy, need tests for this
+# TODO: PandasArray didn't used to copy, need tests for this
new_values = self._from_backing_data(new_values)
else:
# fill with value
4 changes: 2 additions & 2 deletions pandas/core/arrays/datetimelike.py
@@ -741,7 +741,7 @@ def isin(self, values) -> np.ndarray:
return np.zeros(self.shape, dtype=bool)

if not isinstance(values, type(self)):
-inferrable = [
+inferable = [
"timedelta",
"timedelta64",
"datetime",
@@ -751,7 +751,7 @@ def isin(self, values) -> np.ndarray:
]
if values.dtype == object:
inferred = lib.infer_dtype(values, skipna=False)
-if inferred not in inferrable:
+if inferred not in inferable:
if inferred == "string":
pass
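For context, `lib.infer_dtype` labels the contents of an object-dtype array, and the `inferable` list above names the labels for which a cast to this array's type is worth attempting. A small illustration (a sketch against pandas' internal `lib`, so subject to change):

```python
import numpy as np
from pandas._libs import lib

# infer_dtype returns a string describing the contents of object-dtype data.
datetimes = np.array([np.datetime64("2020-12-29")], dtype=object)
strings = np.array(["a", "b"], dtype=object)

print(lib.infer_dtype(datetimes, skipna=False))  # 'datetime64' -> in the list
print(lib.infer_dtype(strings, skipna=False))    # 'string' -> handled separately
```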

2 changes: 1 addition & 1 deletion pandas/core/arrays/floating.py
@@ -175,7 +175,7 @@ class FloatingArray(NumericArray):
.. warning::

FloatingArray is currently experimental, and its API or internal
-implementation may change without warning. Expecially the behaviour
+implementation may change without warning. Especially the behaviour
regarding NaN (distinct from NA missing values) is subject to change.

We represent a FloatingArray with 2 numpy arrays:
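To make the two-array representation concrete, a short sketch (`_data` and `_mask` are internal attributes, exactly the kind of detail the warning above says may change):

```python
import pandas as pd

arr = pd.array([0.1, None, 2.5], dtype="Float64")  # constructs a FloatingArray
print(arr)        # <FloatingArray> [0.1, <NA>, 2.5]

# One float64 ndarray holds the values (the masked slot's content is
# unspecified), and a boolean ndarray marks which entries are missing.
print(arr._data)
print(arr._mask)  # [False  True False]
```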
2 changes: 1 addition & 1 deletion pandas/core/arrays/sparse/array.py
@@ -975,7 +975,7 @@ def _concat_same_type(

else:
# when concatenating block indices, we don't claim that you'll
-# get an identical index as concating the values and then
+# get an identical index as concatenating the values and then
# creating a new index. We don't want to spend the time trying
# to merge blocks across arrays in `to_concat`, so the resulting
# BlockIndex may have more blocks.
2 changes: 1 addition & 1 deletion pandas/core/arrays/sparse/dtype.py
@@ -371,7 +371,7 @@ def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]:
fill_value = fill_values[0]

# np.nan isn't a singleton, so we may end up with multiple
-# NaNs here, so we ignore tha all NA case too.
+# NaNs here, so we ignore the all NA case too.
if not (len(set(fill_values)) == 1 or isna(fill_values).all()):
warnings.warn(
"Concatenating sparse arrays with multiple fill "
4 changes: 2 additions & 2 deletions pandas/core/arrays/string_arrow.py
@@ -467,7 +467,7 @@ def __setitem__(self, key: Union[int, np.ndarray], value: Any) -> None:
elif not isinstance(value, str):
raise ValueError("Scalar must be NA or str")

-# Slice data and insert inbetween
+# Slice data and insert in-between
new_data = [
*self._data[0:key].chunks,
pa.array([value], type=pa.string()),
@@ -616,7 +616,7 @@ def value_counts(self, dropna: bool = True) -> Series:

# Index cannot hold ExtensionArrays yet
index = Index(type(self)(vc.field(0)).astype(object))
-# No missings, so we can adhere to the interface and return a numpy array.
+# No missing values so we can adhere to the interface and return a numpy array.
counts = np.array(vc.field(1))

if dropna and self._data.null_count > 0:
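For reference, the pyarrow calls this snippet relies on behave as follows (a sketch assuming a reasonably recent pyarrow; `ChunkedArray.value_counts` returns a struct array whose first field holds the distinct values and whose second holds their counts):

```python
import numpy as np
import pyarrow as pa

ca = pa.chunked_array([["a", "b", "a", None]])
vc = ca.value_counts()

values = vc.field(0)            # distinct values as an Arrow array
counts = np.array(vc.field(1))  # counts converted to a plain numpy array
print(values)
print(counts)
```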
2 changes: 1 addition & 1 deletion pandas/core/computation/parsing.py
@@ -35,7 +35,7 @@ def create_valid_python_identifier(name: str) -> str:

# Create a dict with the special characters and their replacement string.
# EXACT_TOKEN_TYPES contains these special characters
-# toke.tok_name contains a readable description of the replacement string.
+# token.tok_name contains a readable description of the replacement string.
special_characters_replacements = {
char: f"_{token.tok_name[tokval]}_"
# The ignore here is because of a bug in mypy that is resolved in 0.740
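A standalone look at the mapping being built (on Python 3.8+, where `EXACT_TOKEN_TYPES` is available in the `token` module):

```python
import token

# Special character -> token number -> readable token name.
tokval = token.EXACT_TOKEN_TYPES["("]
print(token.tok_name[tokval])         # 'LPAR'
print(f"_{token.tok_name[tokval]}_")  # '_LPAR_', the replacement substring
```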
8 changes: 4 additions & 4 deletions pandas/core/generic.py
@@ -5559,7 +5559,7 @@ def _is_mixed_type(self) -> bool_t:
return False

if self._mgr.any_extension_types:
-# Even if they have the same dtype, we cant consolidate them,
+# Even if they have the same dtype, we can't consolidate them,
# so we pretend this is "mixed'"
return True

@@ -10626,7 +10626,7 @@ def _add_numeric_operations(cls):
"""
Add the operations to the cls; evaluate the doc strings again
"""
-axis_descr, name1, name2 = _doc_parms(cls)
+axis_descr, name1, name2 = _doc_params(cls)

@doc(
_bool_doc,
@@ -11186,8 +11186,8 @@ def last_valid_index(self):
return self._find_valid_index("last")


-def _doc_parms(cls):
-    """Return a tuple of the doc parms."""
+def _doc_params(cls):
+    """Return a tuple of the doc params."""
axis_descr = (
f"{{{', '.join(f'{a} ({i})' for i, a in enumerate(cls._AXIS_ORDERS))}}}"
)
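As a concrete rendering: for DataFrame, `cls._AXIS_ORDERS` is `["index", "columns"]`, so the expression above produces the following (a standalone restatement):

```python
axis_orders = ["index", "columns"]  # DataFrame._AXIS_ORDERS
axis_descr = (
    f"{{{', '.join(f'{a} ({i})' for i, a in enumerate(axis_orders))}}}"
)
print(axis_descr)  # {index (0), columns (1)}
```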
2 changes: 1 addition & 1 deletion pandas/core/groupby/ops.py
@@ -495,7 +495,7 @@ def _ea_wrap_cython_operation(
If we have an ExtensionArray, unwrap, call _cython_operation, and
re-wrap if appropriate.
"""
-# TODO: general case implementation overrideable by EAs.
+# TODO: general case implementation overridable by EAs.
orig_values = values

if is_datetime64tz_dtype(values.dtype) or is_period_dtype(values.dtype):
4 changes: 2 additions & 2 deletions pandas/core/indexes/base.py
@@ -4842,7 +4842,7 @@ def argsort(self, *args, **kwargs) -> np.ndarray:
>>> idx[order]
Index(['a', 'b', 'c', 'd'], dtype='object')
"""
-# This works for either ndarray or EA, is overriden
+# This works for either ndarray or EA, is overridden
# by RangeIndex, MultIIndex
return self._data.argsort(*args, **kwargs)

@@ -4974,7 +4974,7 @@ def get_indexer_non_unique(self, target):
return self._get_indexer_non_comparable(target, method=None, unique=False)

if not is_dtype_equal(self.dtype, target.dtype):
-# TODO: if object, could use infer_dtype to pre-empt costly
+# TODO: if object, could use infer_dtype to preempt costly
# conversion if still non-comparable?
dtype = find_common_type([self.dtype, target.dtype])
if (
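The common-dtype reconciliation mentioned here uses pandas' internal helper; a small sketch of its behavior:

```python
import numpy as np
from pandas.core.dtypes.cast import find_common_type

# Mismatched dtypes are widened to a common one before retrying the lookup.
print(find_common_type([np.dtype("int64"), np.dtype("float64")]))  # float64
print(find_common_type([np.dtype("int64"), np.dtype("object")]))   # object
```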
6 changes: 3 additions & 3 deletions pandas/core/indexes/datetimelike.py
@@ -164,12 +164,12 @@ def equals(self, other: object) -> bool:
return False
elif not isinstance(other, type(self)):
should_try = False
-inferrable = self._data._infer_matches
+inferable = self._data._infer_matches
if other.dtype == object:
-should_try = other.inferred_type in inferrable
+should_try = other.inferred_type in inferable
elif is_categorical_dtype(other.dtype):
other = cast("CategoricalIndex", other)
-should_try = other.categories.inferred_type in inferrable
+should_try = other.categories.inferred_type in inferable

if should_try:
try:
2 changes: 1 addition & 1 deletion pandas/core/indexes/range.py
@@ -643,7 +643,7 @@ def difference(self, other, sort=None):
if len(overlap) == len(self):
return self[:0].rename(res_name)
if not isinstance(overlap, RangeIndex):
-# We wont end up with RangeIndex, so fall back
+# We won't end up with RangeIndex, so fall back
return super().difference(other, sort=sort)
if overlap.step != first.step:
# In some cases we might be able to get a RangeIndex back,
6 changes: 3 additions & 3 deletions pandas/core/internals/blocks.py
@@ -1816,7 +1816,7 @@ def _slice(self, slicer):
# return same dims as we currently have
if not isinstance(slicer, tuple) and self.ndim == 2:
# reached via getitem_block via _slice_take_blocks_ax0
-# TODO(EA2D): wont be necessary with 2D EAs
+# TODO(EA2D): won't be necessary with 2D EAs
slicer = (slicer, slice(None))

if isinstance(slicer, tuple) and len(slicer) == 2:
@@ -1826,7 +1826,7 @@
"invalid slicing for a 1-ndim ExtensionArray", first
)
# GH#32959 only full-slicers along fake-dim0 are valid
-# TODO(EA2D): wont be necessary with 2D EAs
+# TODO(EA2D): won't be necessary with 2D EAs
new_locs = self.mgr_locs[first]
if len(new_locs):
# effectively slice(None)
@@ -2289,7 +2289,7 @@ def _check_ndim(self, values, ndim):
"""
ndim inference and validation.

-This is overriden by the DatetimeTZBlock to check the case of 2D
+This is overridden by the DatetimeTZBlock to check the case of 2D
data (values.ndim == 2), which should only be allowed if ndim is
also 2.
The case of 1D array is still allowed with both ndim of 1 or 2, as
2 changes: 1 addition & 1 deletion pandas/core/internals/construction.py
@@ -746,7 +746,7 @@ def _convert_object_array(
content: List[Scalar], dtype: Optional[DtypeObj] = None
) -> List[Scalar]:
"""
-Internal function ot convert object array.
+Internal function to convert object array.

Parameters
----------
2 changes: 1 addition & 1 deletion pandas/core/internals/managers.py
@@ -1355,7 +1355,7 @@ def _slice_take_blocks_ax0(
blk = self.blocks[0]

if sl_type in ("slice", "mask"):
-# GH#32959 EABlock would fail since we cant make 0-width
+# GH#32959 EABlock would fail since we can't make 0-width
# TODO(EA2D): special casing unnecessary with 2D EAs
if sllen == 0:
return []
16 changes: 8 additions & 8 deletions pandas/core/nanops.py
@@ -1221,33 +1221,33 @@ def nankurt(

with np.errstate(invalid="ignore", divide="ignore"):
adj = 3 * (count - 1) ** 2 / ((count - 2) * (count - 3))
-numer = count * (count + 1) * (count - 1) * m4
-denom = (count - 2) * (count - 3) * m2 ** 2
+numerator = count * (count + 1) * (count - 1) * m4
+denominator = (count - 2) * (count - 3) * m2 ** 2

# floating point error
#
# #18044 in _libs/windows.pyx calc_kurt follow this behavior
# to fix the fperr to treat denom <1e-14 as zero
-numer = _zero_out_fperr(numer)
-denom = _zero_out_fperr(denom)
+numerator = _zero_out_fperr(numerator)
+denominator = _zero_out_fperr(denominator)

-if not isinstance(denom, np.ndarray):
+if not isinstance(denominator, np.ndarray):
# if ``denom`` is a scalar, check these corner cases first before
# doing division
if count < 4:
return np.nan
-if denom == 0:
+if denominator == 0:
return 0

with np.errstate(invalid="ignore", divide="ignore"):
-result = numer / denom - adj
+result = numerator / denominator - adj

dtype = values.dtype
if is_float_dtype(dtype):
result = result.astype(dtype)

if isinstance(result, np.ndarray):
-result = np.where(denom == 0, 0, result)
+result = np.where(denominator == 0, 0, result)
result[count < 4] = np.nan

return result
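The renamed `numerator`/`denominator` pair implements the standard bias-corrected sample excess kurtosis (G2); a standalone check of the same formula on a plain 1-D array, with scipy used only for comparison:

```python
import numpy as np
from scipy import stats

x = np.array([1.0, 2.0, 4.0, 8.0, 16.0])
n = x.size
m2 = ((x - x.mean()) ** 2).sum()  # sum of squared deviations
m4 = ((x - x.mean()) ** 4).sum()  # sum of fourth-power deviations

adj = 3 * (n - 1) ** 2 / ((n - 2) * (n - 3))
numerator = n * (n + 1) * (n - 1) * m4
denominator = (n - 2) * (n - 3) * m2 ** 2
print(numerator / denominator - adj)
print(stats.kurtosis(x, fisher=True, bias=False))  # same value
```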
4 changes: 2 additions & 2 deletions pandas/core/reshape/pivot.py
@@ -740,8 +740,8 @@ def _build_names_mapper(
A row or column name is replaced if it is duplicate among the rows of the inputs,
among the columns of the inputs or between the rows and the columns.

-Paramters
----------
+Parameters
+----------

Review comment (Contributor): need to fix the underline

Review comment (Contributor): i guess this is not public but still let's fix
rownames: list[str]
colnames: list[str]

2 changes: 1 addition & 1 deletion pandas/core/window/rolling.py
@@ -1864,7 +1864,7 @@ def _get_corr(a, b):
window=window, min_periods=self.min_periods, center=self.center
)
# GH 31286: Through using var instead of std we can avoid numerical
-# issues when the result of var is withing floating proint precision
+# issues when the result of var is within floating proint precision
# while std is not.
return a.cov(b, **kwargs) / (a.var(**kwargs) * b.var(**kwargs)) ** 0.5
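The identity in play is corr = cov / sqrt(var_a * var_b). Taking one square root of the product, rather than multiplying two separately rounded std values, keeps |corr| from drifting just past 1.0 for near-identical inputs (the GH 31286 symptom). A sketch of the same computation via public API:

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
s1 = pd.Series(rng.normal(size=100))
s2 = s1 + 1e-12  # nearly identical series, the numerically delicate case

a = s1.rolling(window=10)
b = s2.rolling(window=10)
corr = a.cov(s2) / (a.var() * b.var()) ** 0.5
print(corr.dropna().abs().max())  # stays <= 1.0 within floating point precision
```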

3 changes: 3 additions & 0 deletions setup.cfg
@@ -63,6 +63,9 @@ filterwarnings =
error:The SparseArray:FutureWarning
junit_family=xunit2

+[codespell]
+ignore-words-list=ba,blocs,coo,datas,fo,hist,nd,ser
Review comment (Contributor): these here, let's open an issue and try to fix some of these
maybe 'fo' rest might be ok
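To see the setting in action, a hedged sketch (the CLI flag `--ignore-words-list` mirrors this config key; the assumption, implied by the list itself, is that codespell would otherwise flag short pandas idioms such as `ser`):

```python
import subprocess

# Without the ignore list, codespell may flag identifiers like `ser`
# (a conventional abbreviation for Series) as misspellings.
subprocess.run(["codespell", "pandas/core/"])

# The CLI equivalent of the [codespell] section above:
subprocess.run([
    "codespell",
    "--ignore-words-list=ba,blocs,coo,datas,fo,hist,nd,ser",
    "pandas/core/",
])
```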


[coverage:run]
branch = False
omit =