From 7d734fbdd86019c902f875f281fc2a5bbb84ffaa Mon Sep 17 00:00:00 2001 From: Mortada Mehyar Date: Mon, 16 Mar 2015 20:03:45 -0700 Subject: [PATCH] ENH: add StringMethods (.str accessor) to Index, fixes #9068 --- doc/source/text.rst | 11 +++++++++-- doc/source/whatsnew/v0.16.1.txt | 19 ++++++++++++++++++- pandas/core/base.py | 19 +++++++++++++++++++ pandas/core/series.py | 16 ---------------- pandas/core/strings.py | 23 +++++++++++++++++------ pandas/tests/test_index.py | 31 +++++++++++++++++++++++++++++++ pandas/tests/test_series.py | 13 +++++++++++++ 7 files changed, 107 insertions(+), 25 deletions(-) diff --git a/doc/source/text.rst b/doc/source/text.rst index a98153e277fae..ee91ea3c166b6 100644 --- a/doc/source/text.rst +++ b/doc/source/text.rst @@ -17,10 +17,10 @@ Working with Text Data .. _text.string_methods: -Series is equipped with a set of string processing methods +Series and Index are equipped with a set of string processing methods that make it easy to operate on each element of the array. Perhaps most importantly, these methods exclude missing/NA values automatically. These are -accessed via the Series's ``str`` attribute and generally have names matching +accessed via the ``str`` attribute and generally have names matching the equivalent (scalar) built-in string methods: .. ipython:: python @@ -30,6 +30,13 @@ the equivalent (scalar) built-in string methods: s.str.upper() s.str.len() +.. 
ipython:: python + + idx = Index([' jack', 'jill ', ' jesse ', 'frank']) + idx.str.strip() + idx.str.lstrip() + idx.str.rstrip() + Splitting and Replacing Strings ------------------------------- diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt index e2ed86c7fb143..7d6ce1ad071b2 100644 --- a/doc/source/whatsnew/v0.16.1.txt +++ b/doc/source/whatsnew/v0.16.1.txt @@ -18,11 +18,28 @@ Enhancements ~~~~~~~~~~~~ - Added ``StringMethods.capitalize()`` and ``swapcase`` which behave as the same as standard ``str`` (:issue:`9766`) +- Added ``StringMethods`` (``.str`` accessor) to ``Index`` (:issue:`9068`) -- ``DataFrame.mask()`` and ``Series.mask()`` now support same keywords as ``where`` (:issue:`8801`) + The ``.str`` accessor is now available for both ``Series`` and ``Index``. + + .. ipython:: python + idx = Index([' jack', 'jill ', ' jesse ', 'frank']) + idx.str.strip() + One special case for the ``.str`` accessor on ``Index`` is that if a string method returns ``bool`` values, the ``.str`` accessor + will return a ``np.ndarray`` instead of a boolean ``Index`` (:issue:`8875`). This enables the following expression + to work naturally: + .. ipython:: python + + idx = Index(['a1', 'a2', 'b1', 'b2']) + s = Series(range(4), index=idx) + s + idx.str.startswith('a') + s[s.index.str.startswith('a')] + +- ``DataFrame.mask()`` and ``Series.mask()`` now support same keywords as ``where`` (:issue:`8801`) - ``drop`` function can now accept ``errors`` keyword to suppress ValueError raised when any of label does not exist in the target data. (:issue:`6736`) .. 
ipython:: python diff --git a/pandas/core/base.py b/pandas/core/base.py index dde2e74132c4b..a3d3c3791e20c 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -10,6 +10,7 @@ import pandas.tslib as tslib import pandas.lib as lib from pandas.util.decorators import Appender, cache_readonly +from pandas.core.strings import StringMethods _shared_docs = dict() @@ -497,6 +498,24 @@ def searchsorted(self, key, side='left'): #### needs tests/doc-string return self.values.searchsorted(key, side=side) + # string methods + def _make_str_accessor(self): + from pandas.core.series import Series + from pandas.core.index import Index + if isinstance(self, Series) and not com.is_object_dtype(self.dtype): + # this really should exclude all series with any non-string values, + # but that isn't practical for performance reasons until we have a + # str dtype (GH 9343) + raise AttributeError("Can only use .str accessor with string " + "values, which use np.object_ dtype in " + "pandas") + elif isinstance(self, Index) and self.inferred_type != 'string': + raise AttributeError("Can only use .str accessor with string " + "values (i.e. 
inferred_type is 'string')") + return StringMethods(self) + + str = AccessorProperty(StringMethods, _make_str_accessor) + _shared_docs['drop_duplicates'] = ( """Return %(klass)s with duplicate values removed diff --git a/pandas/core/series.py b/pandas/core/series.py index 68f3a6032402f..b71c269468d62 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -28,7 +28,6 @@ from pandas.core import generic, base from pandas.core.internals import SingleBlockManager from pandas.core.categorical import Categorical, CategoricalAccessor -from pandas.core.strings import StringMethods from pandas.tseries.common import (maybe_to_datetimelike, CombinedDatetimelikeProperties) from pandas.tseries.index import DatetimeIndex @@ -2494,21 +2493,6 @@ def to_period(self, freq=None, copy=True): return self._constructor(new_values, index=new_index).__finalize__(self) - #------------------------------------------------------------------------------ - # string methods - - def _make_str_accessor(self): - if not com.is_object_dtype(self.dtype): - # this really should exclude all series with any non-string values, - # but that isn't practical for performance reasons until we have a - # str dtype (GH 9343) - raise AttributeError("Can only use .str accessor with string " - "values, which use np.object_ dtype in " - "pandas") - return StringMethods(self) - - str = base.AccessorProperty(StringMethods, _make_str_accessor) - #------------------------------------------------------------------------------ # Datetimelike delegation methods diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 4ef341c481a60..d04c1bd94d5a9 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -1,7 +1,7 @@ import numpy as np from pandas.compat import zip -from pandas.core.common import isnull, _values_from_object +from pandas.core.common import isnull, _values_from_object, is_bool_dtype import pandas.compat as compat from pandas.util.decorators import Appender import re @@ -632,9 
+632,9 @@ def str_split(arr, pat=None, n=None, return_type='series'): pat : string, default None String or regular expression to split on. If None, splits on whitespace n : int, default None (all) - return_type : {'series', 'frame'}, default 'series + return_type : {'series', 'frame'}, default 'series' If frame, returns a DataFrame (elements are strings) - If series, returns an Series (elements are lists of strings). + If series, returns a Series (elements are lists of strings). Notes ----- @@ -646,9 +646,13 @@ def str_split(arr, pat=None, n=None, return_type='series'): """ from pandas.core.series import Series from pandas.core.frame import DataFrame + from pandas.core.index import Index if return_type not in ('series', 'frame'): raise ValueError("return_type must be {'series', 'frame'}") + if return_type == 'frame' and isinstance(arr, Index): + raise ValueError("return_type='frame' is not supported for string " + "methods on Index") if pat is None: if n is None or n == 0: n = -1 @@ -928,9 +932,9 @@ def do_copy(target): class StringMethods(object): """ - Vectorized string functions for Series. NAs stay NA unless handled - otherwise by a particular method. Patterned after Python's string methods, - with some inspiration from R's stringr package. + Vectorized string functions for Series and Index. NAs stay NA unless + handled otherwise by a particular method. Patterned after Python's string + methods, with some inspiration from R's stringr package. 
Examples -------- @@ -959,11 +963,18 @@ def __iter__(self): def _wrap_result(self, result): from pandas.core.series import Series from pandas.core.frame import DataFrame + from pandas.core.index import Index if not hasattr(result, 'ndim'): return result elif result.ndim == 1: name = getattr(result, 'name', None) + if isinstance(self.series, Index): + # if result is a boolean np.array, return the np.array + # instead of wrapping it into a boolean Index (GH 8875) + if is_bool_dtype(result): + return result + return Index(result, name=name or self.series.name) return Series(result, index=self.series.index, name=name or self.series.name) else: diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index 61cb337880c00..85377ab6065ef 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -1197,6 +1197,37 @@ def test_join_self(self): for kind in kinds: joined = res.join(res, how=kind) self.assertIs(res, joined) + def test_str_attribute(self): + # GH9068 + methods = ['strip', 'rstrip', 'lstrip'] + idx = Index([' jack', 'jill ', ' jesse ', 'frank']) + for method in methods: + expected = Index([getattr(str, method)(x) for x in idx.values]) + tm.assert_index_equal(getattr(Index.str, method)(idx.str), expected) + + # create a few instances that are not able to use .str accessor + indices = [Index(range(5)), + tm.makeDateIndex(10), + MultiIndex.from_tuples([('foo', '1'), ('bar', '3')]), + PeriodIndex(start='2000', end='2010', freq='A')] + for idx in indices: + with self.assertRaisesRegexp(AttributeError, 'only use .str accessor'): + idx.str.repeat(2) + + idx = Index(['a b c', 'd e', 'f']) + expected = Index([['a', 'b', 'c'], ['d', 'e'], ['f']]) + tm.assert_index_equal(idx.str.split(), expected) + with self.assertRaisesRegexp(ValueError, 'not supported'): + idx.str.split(return_type='frame') + + # test boolean case, should return np.array instead of boolean Index + idx = Index(['a1', 'a2', 'b1', 'b2']) + expected = np.array([True, True, False, 
False]) + self.assert_array_equal(idx.str.startswith('a'), expected) + self.assertIsInstance(idx.str.startswith('a'), np.ndarray) + s = Series(range(4), index=idx) + expected = Series(range(2), index=['a1', 'a2']) + tm.assert_series_equal(s[s.index.str.startswith('a')], expected) def test_indexing_doesnt_change_class(self): idx = Index([1, 2, 3, 'a', 'b', 'c']) diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index f044fe540ea24..8e468f8ee46ef 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -4933,6 +4933,19 @@ def test_to_csv_path_is_none(self): csv_str = s.to_csv(path=None) self.assertIsInstance(csv_str, str) + def test_str_attribute(self): + # GH9068 + methods = ['strip', 'rstrip', 'lstrip'] + s = Series([' jack', 'jill ', ' jesse ', 'frank']) + for method in methods: + expected = Series([getattr(str, method)(x) for x in s.values]) + assert_series_equal(getattr(Series.str, method)(s.str), expected) + + # str accessor only valid with string values + s = Series(range(5)) + with self.assertRaisesRegexp(AttributeError, 'only use .str accessor'): + s.str.repeat(2) + def test_clip(self): val = self.ts.median()