Skip to content

Commit

Permalink
ENH: add StringMethods (.str accessor) to Index, fixes #9068
Browse files Browse the repository at this point in the history
  • Loading branch information
mortada committed Apr 1, 2015
1 parent 10c933b commit 00e6aea
Show file tree
Hide file tree
Showing 7 changed files with 58 additions and 21 deletions.
11 changes: 9 additions & 2 deletions doc/source/text.rst
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,10 @@ Working with Text Data

.. _text.string_methods:

Series is equipped with a set of string processing methods
Series and Index are equipped with a set of string processing methods
that make it easy to operate on each element of the array. Perhaps most
importantly, these methods exclude missing/NA values automatically. These are
accessed via the Series's ``str`` attribute and generally have names matching
accessed via the ``str`` attribute and generally have names matching
the equivalent (scalar) built-in string methods:

.. ipython:: python
Expand All @@ -30,6 +30,13 @@ the equivalent (scalar) built-in string methods:
s.str.upper()
s.str.len()
.. ipython:: python
idx = Index([' jack', 'jill ', ' jesse ', 'frank'])
idx.str.strip()
idx.str.lstrip()
idx.str.rstrip()
Splitting and Replacing Strings
-------------------------------

Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.16.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ Enhancements
~~~~~~~~~~~~

- Added ``StringMethods.capitalize()`` and ``swapcase`` which behave the same as the standard ``str`` methods (:issue:`9766`)
- Added ``StringMethods`` (.str accessor) to ``Index`` (:issue:`9068`)



Expand Down
14 changes: 14 additions & 0 deletions pandas/core/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import pandas.tslib as tslib
import pandas.lib as lib
from pandas.util.decorators import Appender, cache_readonly
from pandas.core.strings import StringMethods


_shared_docs = dict()
Expand Down Expand Up @@ -497,6 +498,19 @@ def searchsorted(self, key, side='left'):
#### needs tests/doc-string
return self.values.searchsorted(key, side=side)

# string methods
def _make_str_accessor(self):
    """Build the ``.str`` accessor for this object.

    Returns
    -------
    StringMethods
        Accessor wrapping ``self``.

    Raises
    ------
    AttributeError
        If ``self.dtype`` is not an object dtype.
    """
    if com.is_object_dtype(self.dtype):
        return StringMethods(self)
    # Ideally any object holding a non-string value would be rejected as
    # well, but that isn't practical for performance reasons until we have
    # a str dtype (GH 9343).
    raise AttributeError("Can only use .str accessor with string "
                         "values, which use np.object_ dtype in "
                         "pandas")

str = AccessorProperty(StringMethods, _make_str_accessor)

_shared_docs['drop_duplicates'] = (
"""Return %(klass)s with duplicate values removed
Expand Down
16 changes: 0 additions & 16 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@
from pandas.core import generic, base
from pandas.core.internals import SingleBlockManager
from pandas.core.categorical import Categorical, CategoricalAccessor
from pandas.core.strings import StringMethods
from pandas.tseries.common import (maybe_to_datetimelike,
CombinedDatetimelikeProperties)
from pandas.tseries.index import DatetimeIndex
Expand Down Expand Up @@ -2494,21 +2493,6 @@ def to_period(self, freq=None, copy=True):
return self._constructor(new_values,
index=new_index).__finalize__(self)

#------------------------------------------------------------------------------
# string methods

def _make_str_accessor(self):
    """Return a ``StringMethods`` accessor wrapping ``self``.

    Raises ``AttributeError`` when ``self.dtype`` is not an object dtype.
    """
    has_object_dtype = com.is_object_dtype(self.dtype)
    if not has_object_dtype:
        # The dtype check is only a proxy: series with any non-string
        # values should really be excluded too, but that isn't practical
        # for performance reasons until we have a str dtype (GH 9343).
        message = ("Can only use .str accessor with string "
                   "values, which use np.object_ dtype in "
                   "pandas")
        raise AttributeError(message)
    return StringMethods(self)

str = base.AccessorProperty(StringMethods, _make_str_accessor)

#------------------------------------------------------------------------------
# Datetimelike delegation methods

Expand Down
9 changes: 6 additions & 3 deletions pandas/core/strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -926,9 +926,9 @@ def do_copy(target):
class StringMethods(object):

"""
Vectorized string functions for Series. NAs stay NA unless handled
otherwise by a particular method. Patterned after Python's string methods,
with some inspiration from R's stringr package.
Vectorized string functions for Series and Index. NAs stay NA unless
handled otherwise by a particular method. Patterned after Python's string
methods, with some inspiration from R's stringr package.
Examples
--------
Expand Down Expand Up @@ -957,11 +957,14 @@ def __iter__(self):
def _wrap_result(self, result):
from pandas.core.series import Series
from pandas.core.frame import DataFrame
from pandas.core.index import Index

if not hasattr(result, 'ndim'):
return result
elif result.ndim == 1:
name = getattr(result, 'name', None)
if isinstance(self.series, Index):
return Index(result, name=name or self.series.name)
return Series(result, index=self.series.index,
name=name or self.series.name)
else:
Expand Down
15 changes: 15 additions & 0 deletions pandas/tests/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -1174,6 +1174,21 @@ def test_join_self(self):
for kind in kinds:
joined = res.join(res, how=kind)
self.assertIs(res, joined)
def test_str_attribute(self):
    """Check the ``.str`` accessor on Index (GH9068).

    The strip-family methods must agree with the builtin ``str`` methods
    applied element by element, and indexes that are not object dtype
    must raise ``AttributeError`` when ``.str`` is accessed.
    """
    idx = Index([' jack', 'jill ', ' jesse ', 'frank'])
    for name in ('strip', 'rstrip', 'lstrip'):
        builtin_method = getattr(str, name)
        accessor_method = getattr(Index.str, name)
        expected = Index([builtin_method(value) for value in idx.values])
        tm.assert_index_equal(accessor_method(idx.str), expected)

    # create a few instances that are not able to use .str accessor
    non_string_indexes = [
        Index(range(5)),
        tm.makeDateIndex(10),
        PeriodIndex(start='2000', end='2010', freq='A'),
    ]
    for bad_idx in non_string_indexes:
        with self.assertRaisesRegexp(AttributeError,
                                     'only use .str accessor'):
            bad_idx.str.repeat(2)

def test_indexing_doesnt_change_class(self):
idx = Index([1, 2, 3, 'a', 'b', 'c'])
Expand Down
13 changes: 13 additions & 0 deletions pandas/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -4855,6 +4855,19 @@ def test_to_csv_path_is_none(self):
csv_str = s.to_csv(path=None)
self.assertIsInstance(csv_str, str)

def test_str_attribute(self):
    """Check the ``.str`` accessor on Series (GH9068).

    The strip-family methods must agree with the builtin ``str`` methods
    applied element by element, and a Series built from ``range(5)`` must
    raise ``AttributeError`` when ``.str`` is accessed.
    """
    strings = Series([' jack', 'jill ', ' jesse ', 'frank'])
    for name in ('strip', 'rstrip', 'lstrip'):
        builtin_method = getattr(str, name)
        expected = Series([builtin_method(value)
                           for value in strings.values])
        result = getattr(Series.str, name)(strings.str)
        assert_series_equal(result, expected)

    # str accessor only valid with string values
    numbers = Series(range(5))
    with self.assertRaisesRegexp(AttributeError, 'only use .str accessor'):
        numbers.str.repeat(2)

def test_clip(self):
val = self.ts.median()

Expand Down

0 comments on commit 00e6aea

Please sign in to comment.