diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 1fd95b8103a41..d78c2bacc4e44 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,4 +1,4 @@
-minimum_pre_commit_version: '2.9.2'
+minimum_pre_commit_version: 2.9.2
 repos:
 - repo: https://github.com/python/black
   rev: 20.8b1
@@ -168,3 +168,9 @@ repos:
     exclude: ^LICENSES/|\.(html|csv|txt|svg|py)$
   - id: trailing-whitespace
     exclude: \.(html|svg)$
+- repo: https://github.com/codespell-project/codespell
+  rev: v2.0.0
+  hooks:
+  - id: codespell
+    types_or: [python, rst, markdown]
+    files: ^pandas/core/
diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py
index b6938931e86af..2f292c8db025e 100644
--- a/pandas/core/arrays/_mixins.py
+++ b/pandas/core/arrays/_mixins.py
@@ -275,7 +275,7 @@ def fillna(
         if method is not None:
             func = missing.get_fill_func(method)
             new_values = func(self._ndarray.copy(), limit=limit, mask=mask)
-            # TODO: PandasArray didnt used to copy, need tests for this
+            # TODO: PandasArray didn't used to copy, need tests for this
             new_values = self._from_backing_data(new_values)
         else:
             # fill with value
diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py
index b2050bf54cad6..088c2fd89c244 100644
--- a/pandas/core/arrays/datetimelike.py
+++ b/pandas/core/arrays/datetimelike.py
@@ -741,7 +741,7 @@ def isin(self, values) -> np.ndarray:
             return np.zeros(self.shape, dtype=bool)

         if not isinstance(values, type(self)):
-            inferrable = [
+            inferable = [
                 "timedelta",
                 "timedelta64",
                 "datetime",
@@ -751,7 +751,7 @@ def isin(self, values) -> np.ndarray:
             ]
             if values.dtype == object:
                 inferred = lib.infer_dtype(values, skipna=False)
-                if inferred not in inferrable:
+                if inferred not in inferable:
                     if inferred == "string":
                         pass
diff --git a/pandas/core/arrays/floating.py b/pandas/core/arrays/floating.py
index 1e9024e32c5b7..1ac23d7893fbf 100644
--- a/pandas/core/arrays/floating.py
+++ b/pandas/core/arrays/floating.py
@@ -175,7 +175,7 @@ class FloatingArray(NumericArray):
     .. warning::

        FloatingArray is currently experimental, and its API or internal
-       implementation may change without warning. Expecially the behaviour
+       implementation may change without warning. Especially the behaviour
        regarding NaN (distinct from NA missing values) is subject to change.

     We represent a FloatingArray with 2 numpy arrays:
diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py
index 575ae7531de2c..fa648157d7678 100644
--- a/pandas/core/arrays/sparse/array.py
+++ b/pandas/core/arrays/sparse/array.py
@@ -975,7 +975,7 @@ def _concat_same_type(

         else:
             # when concatenating block indices, we don't claim that you'll
-            # get an identical index as concating the values and then
+            # get an identical index as concatenating the values and then
             # creating a new index. We don't want to spend the time trying
             # to merge blocks across arrays in `to_concat`, so the resulting
             # BlockIndex may have more blocks.
diff --git a/pandas/core/arrays/sparse/dtype.py b/pandas/core/arrays/sparse/dtype.py
index c0662911d40da..14bdd063fa41a 100644
--- a/pandas/core/arrays/sparse/dtype.py
+++ b/pandas/core/arrays/sparse/dtype.py
@@ -371,7 +371,7 @@ def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]:
         fill_value = fill_values[0]

         # np.nan isn't a singleton, so we may end up with multiple
-        # NaNs here, so we ignore tha all NA case too.
+        # NaNs here, so we ignore the all NA case too.
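Note: the comment fixed in pandas/core/arrays/sparse/dtype.py above is easy to misread, so here is a minimal sketch of the behaviour it describes (the snippet is illustrative annotation, not part of the patch; the name fill_values simply mirrors the source). Because np.nan is not a singleton, two NA fill values can be distinct objects that compare unequal, which is why the all-NA check is needed alongside the set-length check:

```python
import numpy as np
import pandas as pd

# Two NaNs that are distinct objects: set() keeps both, because the
# identity check fails and NaN != NaN.
fill_values = [np.nan, float("nan")]
print(len(set(fill_values)))       # 2, even though both values are NA
print(pd.isna(fill_values).all())  # True, so the all-NA branch suppresses the warning
```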
         if not (len(set(fill_values)) == 1 or isna(fill_values).all()):
             warnings.warn(
                 "Concatenating sparse arrays with multiple fill "
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index 184fbc050036b..3a351bf497662 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -467,7 +467,7 @@ def __setitem__(self, key: Union[int, np.ndarray], value: Any) -> None:
             elif not isinstance(value, str):
                 raise ValueError("Scalar must be NA or str")

-            # Slice data and insert inbetween
+            # Slice data and insert in-between
             new_data = [
                 *self._data[0:key].chunks,
                 pa.array([value], type=pa.string()),
@@ -616,7 +616,7 @@ def value_counts(self, dropna: bool = True) -> Series:
         # Index cannot hold ExtensionArrays yet
         index = Index(type(self)(vc.field(0)).astype(object))

-        # No missings, so we can adhere to the interface and return a numpy array.
+        # No missing values, so we can adhere to the interface and return a numpy array.
         counts = np.array(vc.field(1))

         if dropna and self._data.null_count > 0:
diff --git a/pandas/core/computation/parsing.py b/pandas/core/computation/parsing.py
index a1bebc92046ae..ef79c2b77e4e5 100644
--- a/pandas/core/computation/parsing.py
+++ b/pandas/core/computation/parsing.py
@@ -35,7 +35,7 @@ def create_valid_python_identifier(name: str) -> str:

     # Create a dict with the special characters and their replacement string.
     # EXACT_TOKEN_TYPES contains these special characters
-    # toke.tok_name contains a readable description of the replacement string.
+    # token.tok_name contains a readable description of the replacement string.
     special_characters_replacements = {
         char: f"_{token.tok_name[tokval]}_"
         # The ignore here is because of a bug in mypy that is resolved in 0.740
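Note: for readers unfamiliar with the tokenize machinery referenced in pandas/core/computation/parsing.py above, the snippet below shows the token.tok_name lookup that the corrected comment describes. It assumes Python 3.8+, where EXACT_TOKEN_TYPES lives in the token module; the resulting identifier is an example of the scheme, not output copied from pandas:

```python
import token

# EXACT_TOKEN_TYPES maps a special character to its token type, and
# token.tok_name gives that type's readable name, e.g. "-" -> "MINUS".
print(token.tok_name[token.EXACT_TOKEN_TYPES["-"]])  # MINUS

# Under this scheme a backtick-quoted column such as `col-1` in
# DataFrame.query() becomes the valid identifier "col_MINUS_1".
```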
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index b34d23bee8b8a..5d9313148fb3d 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -5559,7 +5559,7 @@ def _is_mixed_type(self) -> bool_t:
             return False

         if self._mgr.any_extension_types:
-            # Even if they have the same dtype, we cant consolidate them,
+            # Even if they have the same dtype, we can't consolidate them,
             # so we pretend this is "mixed'"
             return True

@@ -10626,7 +10626,7 @@ def _add_numeric_operations(cls):
         """
         Add the operations to the cls; evaluate the doc strings again
         """
-        axis_descr, name1, name2 = _doc_parms(cls)
+        axis_descr, name1, name2 = _doc_params(cls)

         @doc(
             _bool_doc,
@@ -11186,8 +11186,8 @@ def last_valid_index(self):
         return self._find_valid_index("last")


-def _doc_parms(cls):
-    """Return a tuple of the doc parms."""
+def _doc_params(cls):
+    """Return a tuple of the doc params."""
     axis_descr = (
         f"{{{', '.join(f'{a} ({i})' for i, a in enumerate(cls._AXIS_ORDERS))}}}"
     )
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index d1a4fc6fc74e5..06d01d46b64f7 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -495,7 +495,7 @@ def _ea_wrap_cython_operation(
         If we have an ExtensionArray, unwrap, call _cython_operation, and
         re-wrap if appropriate.
         """
-        # TODO: general case implementation overrideable by EAs.
+        # TODO: general case implementation overridable by EAs.
         orig_values = values

         if is_datetime64tz_dtype(values.dtype) or is_period_dtype(values.dtype):
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index 275c977e9b37b..eaeaf103c17ab 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -4842,7 +4842,7 @@ def argsort(self, *args, **kwargs) -> np.ndarray:
         >>> idx[order]
         Index(['a', 'b', 'c', 'd'], dtype='object')
         """
-        # This works for either ndarray or EA, is overriden
+        # This works for either ndarray or EA, is overridden
         # by RangeIndex, MultIIndex
         return self._data.argsort(*args, **kwargs)

@@ -4974,7 +4974,7 @@ def get_indexer_non_unique(self, target):
             return self._get_indexer_non_comparable(target, method=None, unique=False)

         if not is_dtype_equal(self.dtype, target.dtype):
-            # TODO: if object, could use infer_dtype to pre-empt costly
+            # TODO: if object, could use infer_dtype to preempt costly
             # conversion if still non-comparable?
             dtype = find_common_type([self.dtype, target.dtype])
             if (
diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py
index d673d1b43f729..249e9707be328 100644
--- a/pandas/core/indexes/datetimelike.py
+++ b/pandas/core/indexes/datetimelike.py
@@ -164,12 +164,12 @@ def equals(self, other: object) -> bool:
             return False
         elif not isinstance(other, type(self)):
             should_try = False
-            inferrable = self._data._infer_matches
+            inferable = self._data._infer_matches
             if other.dtype == object:
-                should_try = other.inferred_type in inferrable
+                should_try = other.inferred_type in inferable
             elif is_categorical_dtype(other.dtype):
                 other = cast("CategoricalIndex", other)
-                should_try = other.categories.inferred_type in inferrable
+                should_try = other.categories.inferred_type in inferable

         if should_try:
             try:
diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py
index 5e5280934dff4..029c4a30a6b22 100644
--- a/pandas/core/indexes/range.py
+++ b/pandas/core/indexes/range.py
@@ -643,7 +643,7 @@ def difference(self, other, sort=None):
         if len(overlap) == len(self):
             return self[:0].rename(res_name)
         if not isinstance(overlap, RangeIndex):
-            # We wont end up with RangeIndex, so fall back
+            # We won't end up with RangeIndex, so fall back
             return super().difference(other, sort=sort)
         if overlap.step != first.step:
             # In some cases we might be able to get a RangeIndex back,
diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
index ea1b8259eeadd..3ec8355c89aab 100644
--- a/pandas/core/internals/blocks.py
+++ b/pandas/core/internals/blocks.py
@@ -1816,7 +1816,7 @@ def _slice(self, slicer):
         # return same dims as we currently have
         if not isinstance(slicer, tuple) and self.ndim == 2:
             # reached via getitem_block via _slice_take_blocks_ax0
-            # TODO(EA2D): wont be necessary with 2D EAs
+            # TODO(EA2D): won't be necessary with 2D EAs
             slicer = (slicer, slice(None))

         if isinstance(slicer, tuple) and len(slicer) == 2:
@@ -1826,7 +1826,7 @@ def _slice(self, slicer):
                     "invalid slicing for a 1-ndim ExtensionArray", first
                 )
             # GH#32959 only full-slicers along fake-dim0 are valid
-            # TODO(EA2D): wont be necessary with 2D EAs
+            # TODO(EA2D): won't be necessary with 2D EAs
             new_locs = self.mgr_locs[first]
             if len(new_locs):
                 # effectively slice(None)
@@ -2289,7 +2289,7 @@ def _check_ndim(self, values, ndim):
         """
         ndim inference and validation.

-        This is overriden by the DatetimeTZBlock to check the case of 2D
+        This is overridden by the DatetimeTZBlock to check the case of 2D
         data (values.ndim == 2), which should only be allowed if ndim is
         also 2.
         The case of 1D array is still allowed with both ndim of 1 or 2, as
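Note: the equals() hunk in pandas/core/indexes/datetimelike.py above is what lets a datetime-like index compare equal to an object-dtype index holding the same values. A small sketch of the observable behaviour, as of pandas 1.x (exact output may vary by version):

```python
import pandas as pd

dti = pd.date_range("2021-01-01", periods=3)
obj = pd.Index(dti, dtype=object)  # same timestamps, object dtype

# obj.inferred_type is "datetime", which is in the inferable list, so
# equals() attempts a conversion instead of returning False outright.
print(dti.equals(obj))                        # True
print(dti.equals(pd.Index(["a", "b", "c"])))  # False: inferred type is "string"
```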
diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py
index d9db728f66754..d59cfc436f13d 100644
--- a/pandas/core/internals/construction.py
+++ b/pandas/core/internals/construction.py
@@ -746,7 +746,7 @@ def _convert_object_array(
     content: List[Scalar], dtype: Optional[DtypeObj] = None
 ) -> List[Scalar]:
     """
-    Internal function ot convert object array.
+    Internal function to convert object array.

     Parameters
     ----------
diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
index 3ccdd287dd502..7dde952636a79 100644
--- a/pandas/core/internals/managers.py
+++ b/pandas/core/internals/managers.py
@@ -1355,7 +1355,7 @@ def _slice_take_blocks_ax0(
             blk = self.blocks[0]
             if sl_type in ("slice", "mask"):
-                # GH#32959 EABlock would fail since we cant make 0-width
+                # GH#32959 EABlock would fail since we can't make 0-width
                 # TODO(EA2D): special casing unnecessary with 2D EAs
                 if sllen == 0:
                     return []
diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py
index 88662a4fabed8..9c37a0f2b521f 100644
--- a/pandas/core/nanops.py
+++ b/pandas/core/nanops.py
@@ -1221,33 +1221,33 @@ def nankurt(
     with np.errstate(invalid="ignore", divide="ignore"):
         adj = 3 * (count - 1) ** 2 / ((count - 2) * (count - 3))
-        numer = count * (count + 1) * (count - 1) * m4
-        denom = (count - 2) * (count - 3) * m2 ** 2
+        numerator = count * (count + 1) * (count - 1) * m4
+        denominator = (count - 2) * (count - 3) * m2 ** 2

     # floating point error
     #
     # #18044 in _libs/windows.pyx calc_kurt follow this behavior
     # to fix the fperr to treat denom <1e-14 as zero
-    numer = _zero_out_fperr(numer)
-    denom = _zero_out_fperr(denom)
+    numerator = _zero_out_fperr(numerator)
+    denominator = _zero_out_fperr(denominator)

-    if not isinstance(denom, np.ndarray):
+    if not isinstance(denominator, np.ndarray):
         # if ``denom`` is a scalar, check these corner cases first before
         # doing division
         if count < 4:
             return np.nan
-        if denom == 0:
+        if denominator == 0:
             return 0

     with np.errstate(invalid="ignore", divide="ignore"):
-        result = numer / denom - adj
+        result = numerator / denominator - adj

     dtype = values.dtype
     if is_float_dtype(dtype):
         result = result.astype(dtype)

     if isinstance(result, np.ndarray):
-        result = np.where(denom == 0, 0, result)
+        result = np.where(denominator == 0, 0, result)
         result[count < 4] = np.nan

     return result
diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py
index 40496a5b8671b..8e4b000a56a3d 100644
--- a/pandas/core/reshape/pivot.py
+++ b/pandas/core/reshape/pivot.py
@@ -740,8 +740,8 @@ def _build_names_mapper(
     A row or column name is replaced if it is duplicate among the rows of the inputs,
     among the columns of the inputs or between the rows and the columns.

-    Paramters
-    ---------
+    Parameters
+    ----------
     rownames: list[str]
     colnames: list[str]

diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py
index 053c960cc5cbd..70a9367dc2150 100644
--- a/pandas/core/window/rolling.py
+++ b/pandas/core/window/rolling.py
@@ -1864,7 +1864,7 @@ def _get_corr(a, b):
                 window=window, min_periods=self.min_periods, center=self.center
             )
             # GH 31286: Through using var instead of std we can avoid numerical
-            # issues when the result of var is withing floating proint precision
+            # issues when the result of var is within floating point precision
             # while std is not.
             return a.cov(b, **kwargs) / (a.var(**kwargs) * b.var(**kwargs)) ** 0.5
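Note: the comment corrected in pandas/core/window/rolling.py above refers to the identity corr = cov / sqrt(var_a * var_b). A minimal sketch on made-up data showing that the var-based formula agrees with rolling corr (default ddof assumed):

```python
import numpy as np
import pandas as pd

a = pd.Series([1.0, 2.0, 3.0, 4.0, 5.0])
b = pd.Series([5.0, 3.0, 4.0, 1.0, 2.0])

# Dividing by sqrt(var_a * var_b) instead of std_a * std_b avoids numerical
# issues when var is within floating point error of zero while std is not.
manual = a.rolling(3).cov(b) / (a.rolling(3).var() * b.rolling(3).var()) ** 0.5
print(np.allclose(manual.dropna(), a.rolling(3).corr(b).dropna()))  # True
```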
diff --git a/setup.cfg b/setup.cfg
index a91cd18694c33..56b2fa190ac99 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -63,6 +63,9 @@ filterwarnings =
     error:The SparseArray:FutureWarning
 junit_family=xunit2

+[codespell]
+ignore-words-list=ba,blocs,coo,datas,fo,hist,nd,ser
+
 [coverage:run]
 branch = False
 omit =
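Note: the ignore-words-list above suppresses codespell false positives on tokens that are legitimate in the pandas code base. A sketch of two of them (requires scipy; variable names are illustrative): "ser" is conventional shorthand for a Series, and "coo" is scipy's COOrdinate sparse format.

```python
import pandas as pd

# "ser" trips codespell's dictionary but is idiomatic shorthand for a Series ...
ser = pd.Series([0.0, 0.0, 1.0], dtype="Sparse[float64]")

# ... and "coo" names the scipy COO sparse matrix that pandas converts to.
coo = ser.to_frame("x").sparse.to_coo()
print(type(coo))  # a scipy.sparse COO matrix
```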