diff --git a/doc/source/text.rst b/doc/source/text.rst
index 2d46b37853cee..fe86c416cecbb 100644
--- a/doc/source/text.rst
+++ b/doc/source/text.rst
@@ -17,10 +17,10 @@ Working with Text Data
 
 .. _text.string_methods:
 
-Series is equipped with a set of string processing methods
+Series and Index are equipped with a set of string processing methods
 that make it easy to operate on each element of the array. Perhaps most
 importantly, these methods exclude missing/NA values automatically. These are
-accessed via the Series's ``str`` attribute and generally have names matching
+accessed via the ``str`` attribute and generally have names matching
 the equivalent (scalar) built-in string methods:
 
 .. ipython:: python
@@ -30,6 +30,13 @@ the equivalent (scalar) built-in string methods:
    s.str.upper()
    s.str.len()
 
+.. ipython:: python
+
+   idx = Index([' jack', 'jill ', ' jesse ', 'frank'])
+   idx.str.strip()
+   idx.str.lstrip()
+   idx.str.rstrip()
+
 Splitting and Replacing Strings
 -------------------------------
 
diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt
index f0210698e2828..a27c1bea2e511 100644
--- a/doc/source/whatsnew/v0.16.1.txt
+++ b/doc/source/whatsnew/v0.16.1.txt
@@ -18,6 +18,7 @@ Enhancements
 ~~~~~~~~~~~~
 
 - Added ``StringMethods.capitalize()`` and ``swapcase`` which behave as the same as standard ``str`` (:issue:`9766`)
+- Added ``StringMethods`` (.str accessor) to ``Index`` (:issue:`9068`)
 
 
 
diff --git a/pandas/core/base.py b/pandas/core/base.py
index dde2e74132c4b..54f2a664c6876 100644
--- a/pandas/core/base.py
+++ b/pandas/core/base.py
@@ -10,6 +10,7 @@
 import pandas.tslib as tslib
 import pandas.lib as lib
 from pandas.util.decorators import Appender, cache_readonly
+from pandas.core.strings import StringMethods
 
 
 _shared_docs = dict()
@@ -497,6 +498,19 @@ def searchsorted(self, key, side='left'):
         #### needs tests/doc-string
         return self.values.searchsorted(key, side=side)
 
+    # string methods
+    def _make_str_accessor(self):
+        if not com.is_object_dtype(self.dtype):
+            # this really should exclude all series with any non-string values,
+            # but that isn't practical for performance reasons until we have a
+            # str dtype (GH 9343)
+            raise AttributeError("Can only use .str accessor with string "
+                                 "values, which use np.object_ dtype in "
+                                 "pandas")
+        return StringMethods(self)
+
+    str = AccessorProperty(StringMethods, _make_str_accessor)
+
     _shared_docs['drop_duplicates'] = (
         """Return %(klass)s with duplicate values removed
 
diff --git a/pandas/core/series.py b/pandas/core/series.py
index 68f3a6032402f..b71c269468d62 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -28,7 +28,6 @@
 from pandas.core import generic, base
 from pandas.core.internals import SingleBlockManager
 from pandas.core.categorical import Categorical, CategoricalAccessor
-from pandas.core.strings import StringMethods
 from pandas.tseries.common import (maybe_to_datetimelike,
                                    CombinedDatetimelikeProperties)
 from pandas.tseries.index import DatetimeIndex
@@ -2494,21 +2493,6 @@ def to_period(self, freq=None, copy=True):
         return self._constructor(new_values,
                                  index=new_index).__finalize__(self)
 
-    #------------------------------------------------------------------------------
-    # string methods
-
-    def _make_str_accessor(self):
-        if not com.is_object_dtype(self.dtype):
-            # this really should exclude all series with any non-string values,
-            # but that isn't practical for performance reasons until we have a
-            # str dtype (GH 9343)
-            raise AttributeError("Can only use .str accessor with string "
-                                 "values, which use np.object_ dtype in "
-                                 "pandas")
-        return StringMethods(self)
-
-    str = base.AccessorProperty(StringMethods, _make_str_accessor)
-
     #------------------------------------------------------------------------------
     # Datetimelike delegation methods
 
diff --git a/pandas/core/strings.py b/pandas/core/strings.py
index 97f6752fb5851..4f93166b1265a 100644
--- a/pandas/core/strings.py
+++ b/pandas/core/strings.py
@@ -926,9 +926,9 @@ def do_copy(target):
 class StringMethods(object):
 
     """
-    Vectorized string functions for Series. NAs stay NA unless handled
-    otherwise by a particular method. Patterned after Python's string methods,
-    with some inspiration from R's stringr package.
+    Vectorized string functions for Series and Index. NAs stay NA unless
+    handled otherwise by a particular method. Patterned after Python's string
+    methods, with some inspiration from R's stringr package.
 
     Examples
     --------
@@ -957,11 +957,14 @@ def __iter__(self):
     def _wrap_result(self, result):
         from pandas.core.series import Series
         from pandas.core.frame import DataFrame
+        from pandas.core.index import Index
 
         if not hasattr(result, 'ndim'):
             return result
         elif result.ndim == 1:
             name = getattr(result, 'name', None)
+            if isinstance(self.series, Index):
+                return Index(result, name=name or self.series.name)
             return Series(result, index=self.series.index,
                           name=name or self.series.name)
         else:
diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py
index 39db387045f12..92927ed4dc951 100644
--- a/pandas/tests/test_index.py
+++ b/pandas/tests/test_index.py
@@ -1174,6 +1174,21 @@ def test_join_self(self):
             for kind in kinds:
                 joined = res.join(res, how=kind)
                 self.assertIs(res, joined)
+    def test_str_attribute(self):
+        # GH9068
+        methods = ['strip', 'rstrip', 'lstrip']
+        idx = Index([' jack', 'jill ', ' jesse ', 'frank'])
+        for method in methods:
+            expected = Index([getattr(str, method)(x) for x in idx.values])
+            tm.assert_index_equal(getattr(Index.str, method)(idx.str), expected)
+
+        # create a few instances that are not able to use .str accessor
+        indices = [Index(range(5)),
+                   tm.makeDateIndex(10),
+                   PeriodIndex(start='2000', end='2010', freq='A')]
+        for idx in indices:
+            with self.assertRaisesRegexp(AttributeError, 'only use .str accessor'):
+                idx.str.repeat(2)
 
     def test_indexing_doesnt_change_class(self):
         idx = Index([1, 2, 3, 'a', 'b', 'c'])
diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py
index 9b5e36974553b..2cbd1d87976d1 100644
--- a/pandas/tests/test_series.py
+++ b/pandas/tests/test_series.py
@@ -4855,6 +4855,19 @@ def test_to_csv_path_is_none(self):
         csv_str = s.to_csv(path=None)
         self.assertIsInstance(csv_str, str)
 
+    def test_str_attribute(self):
+        # GH9068
+        methods = ['strip', 'rstrip', 'lstrip']
+        s = Series([' jack', 'jill ', ' jesse ', 'frank'])
+        for method in methods:
+            expected = Series([getattr(str, method)(x) for x in s.values])
+            assert_series_equal(getattr(Series.str, method)(s.str), expected)
+
+        # str accessor only valid with string values
+        s = Series(range(5))
+        with self.assertRaisesRegexp(AttributeError, 'only use .str accessor'):
+            s.str.repeat(2)
+
     def test_clip(self):
         val = self.ts.median()
 