Skip to content

Commit

Permalink
ENH: add StringMethods (.str accessor) to Index, fixes #9068
Browse files Browse the repository at this point in the history
  • Loading branch information
mortada committed Apr 6, 2015
1 parent 10c933b commit 4b68a72
Show file tree
Hide file tree
Showing 7 changed files with 104 additions and 24 deletions.
11 changes: 9 additions & 2 deletions doc/source/text.rst
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,10 @@ Working with Text Data

.. _text.string_methods:

Series is equipped with a set of string processing methods
Series and Index are equipped with a set of string processing methods
that make it easy to operate on each element of the array. Perhaps most
importantly, these methods exclude missing/NA values automatically. These are
accessed via the Series's ``str`` attribute and generally have names matching
accessed via the ``str`` attribute and generally have names matching
the equivalent (scalar) built-in string methods:

.. ipython:: python
Expand All @@ -30,6 +30,13 @@ the equivalent (scalar) built-in string methods:
s.str.upper()
s.str.len()
.. ipython:: python
idx = Index([' jack', 'jill ', ' jesse ', 'frank'])
idx.str.strip()
idx.str.lstrip()
idx.str.rstrip()
Splitting and Replacing Strings
-------------------------------

Expand Down
15 changes: 15 additions & 0 deletions doc/source/whatsnew/v0.16.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,26 @@ Enhancements
~~~~~~~~~~~~

- Added ``StringMethods.capitalize()`` and ``swapcase`` which behave the same as standard ``str`` (:issue:`9766`)
- Added ``StringMethods`` (.str accessor) to ``Index`` (:issue:`9068`)

The ``.str`` accessor is now available for both ``Series`` and ``Index``.

.. ipython:: python

idx = Index([' jack', 'jill ', ' jesse ', 'frank'])
idx.str.strip()

One special case for the ``.str`` accessor on ``Index`` is that if a string method returns boolean values, the ``.str`` accessor
will return a boolean ``np.ndarray`` instead of a boolean ``Index`` (:issue:`8875`). This enables the following expression
to work naturally:

.. ipython:: python

idx = Index(['a1', 'a2', 'b1', 'b2'])
s = Series(range(4), index=idx)
s
idx.str.startswith('a')
s[s.index.str.startswith('a')]

.. _whatsnew_0161.api:

Expand Down
19 changes: 19 additions & 0 deletions pandas/core/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import pandas.tslib as tslib
import pandas.lib as lib
from pandas.util.decorators import Appender, cache_readonly
from pandas.core.strings import StringMethods


_shared_docs = dict()
Expand Down Expand Up @@ -497,6 +498,24 @@ def searchsorted(self, key, side='left'):
#### needs tests/doc-string
return self.values.searchsorted(key, side=side)

# string methods
def _make_str_accessor(self):
    """
    Build the ``StringMethods`` object backing the ``.str`` accessor.

    Returns
    -------
    StringMethods
        Accessor wrapping ``self``.

    Raises
    ------
    AttributeError
        If ``self`` is a Series whose dtype is not object, or an Index
        whose ``inferred_type`` is not ``'string'``.
    """
    # imported here rather than at module level, as in the original code
    from pandas.core.series import Series
    from pandas.core.index import Index

    # this really should exclude all series with any non-string values,
    # but that isn't practical for performance reasons until we have a
    # str dtype (GH 9343)
    non_object_series = (isinstance(self, Series) and
                         not com.is_object_dtype(self.dtype))
    if non_object_series:
        raise AttributeError("Can only use .str accessor with string "
                             "values, which use np.object_ dtype in "
                             "pandas")

    non_string_index = (isinstance(self, Index) and
                        self.inferred_type != 'string')
    if non_string_index:
        raise AttributeError("Can only use .str accessor with string "
                             "values (i.e. inferred_type is 'string')")

    return StringMethods(self)

str = AccessorProperty(StringMethods, _make_str_accessor)

_shared_docs['drop_duplicates'] = (
"""Return %(klass)s with duplicate values removed
Expand Down
16 changes: 0 additions & 16 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@
from pandas.core import generic, base
from pandas.core.internals import SingleBlockManager
from pandas.core.categorical import Categorical, CategoricalAccessor
from pandas.core.strings import StringMethods
from pandas.tseries.common import (maybe_to_datetimelike,
CombinedDatetimelikeProperties)
from pandas.tseries.index import DatetimeIndex
Expand Down Expand Up @@ -2494,21 +2493,6 @@ def to_period(self, freq=None, copy=True):
return self._constructor(new_values,
index=new_index).__finalize__(self)

#------------------------------------------------------------------------------
# string methods

def _make_str_accessor(self):
    """
    Build the ``StringMethods`` object backing ``Series.str``.

    Raises
    ------
    AttributeError
        If the dtype of ``self`` is not object.
    """
    # this really should exclude all series with any non-string values,
    # but that isn't practical for performance reasons until we have a
    # str dtype (GH 9343)
    if com.is_object_dtype(self.dtype):
        return StringMethods(self)

    raise AttributeError("Can only use .str accessor with string "
                         "values, which use np.object_ dtype in "
                         "pandas")

str = base.AccessorProperty(StringMethods, _make_str_accessor)

#------------------------------------------------------------------------------
# Datetimelike delegation methods

Expand Down
23 changes: 17 additions & 6 deletions pandas/core/strings.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import numpy as np

from pandas.compat import zip
from pandas.core.common import isnull, _values_from_object
from pandas.core.common import isnull, _values_from_object, is_bool_dtype
import pandas.compat as compat
from pandas.util.decorators import Appender
import re
Expand Down Expand Up @@ -632,9 +632,9 @@ def str_split(arr, pat=None, n=None, return_type='series'):
pat : string, default None
String or regular expression to split on. If None, splits on whitespace
n : int, default None (all)
return_type : {'series', 'frame'}, default 'series
return_type : {'series', 'frame'}, default 'series'
If frame, returns a DataFrame (elements are strings)
If series, returns an Series (elements are lists of strings).
If series, returns a Series (elements are lists of strings).
Notes
-----
Expand All @@ -646,9 +646,13 @@ def str_split(arr, pat=None, n=None, return_type='series'):
"""
from pandas.core.series import Series
from pandas.core.frame import DataFrame
from pandas.core.index import Index

if return_type not in ('series', 'frame'):
raise ValueError("return_type must be {'series', 'frame'}")
if return_type == 'frame' and isinstance(arr, Index):
raise ValueError("return_type='frame' is not supported for string "
"methods on Index")
if pat is None:
if n is None or n == 0:
n = -1
Expand Down Expand Up @@ -926,9 +930,9 @@ def do_copy(target):
class StringMethods(object):

"""
Vectorized string functions for Series. NAs stay NA unless handled
otherwise by a particular method. Patterned after Python's string methods,
with some inspiration from R's stringr package.
Vectorized string functions for Series and Index. NAs stay NA unless
handled otherwise by a particular method. Patterned after Python's string
methods, with some inspiration from R's stringr package.
Examples
--------
Expand Down Expand Up @@ -957,11 +961,18 @@ def __iter__(self):
def _wrap_result(self, result):
from pandas.core.series import Series
from pandas.core.frame import DataFrame
from pandas.core.index import Index

if not hasattr(result, 'ndim'):
return result
elif result.ndim == 1:
name = getattr(result, 'name', None)
if isinstance(self.series, Index):
# if result is a boolean np.array, return the np.array
# instead of wrapping it into a boolean Index (GH 8875)
if is_bool_dtype(result):
return result
return Index(result, name=name or self.series.name)
return Series(result, index=self.series.index,
name=name or self.series.name)
else:
Expand Down
31 changes: 31 additions & 0 deletions pandas/tests/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -1174,6 +1174,37 @@ def test_join_self(self):
for kind in kinds:
joined = res.join(res, how=kind)
self.assertIs(res, joined)
def test_str_attribute(self):
    # GH9068
    # each stripping method on the accessor must match the builtin str
    # method applied element by element
    padded = Index([' jack', 'jill ', ' jesse ', 'frank'])
    for name in ('strip', 'rstrip', 'lstrip'):
        builtin_method = getattr(str, name)
        accessor_method = getattr(Index.str, name)
        expected = Index([builtin_method(value) for value in padded.values])
        tm.assert_index_equal(accessor_method(padded.str), expected)

    # create a few instances that are not able to use .str accessor
    non_string_indexes = [
        Index(range(5)),
        tm.makeDateIndex(10),
        MultiIndex.from_tuples([('foo', '1'), ('bar', '3')]),
        PeriodIndex(start='2000', end='2010', freq='A'),
    ]
    for bad_idx in non_string_indexes:
        with self.assertRaisesRegexp(AttributeError,
                                     'only use .str accessor'):
            bad_idx.str.repeat(2)

    # split gives an Index of lists; return_type='frame' is rejected
    words = Index(['a b c', 'd e', 'f'])
    tokens = Index([['a', 'b', 'c'], ['d', 'e'], ['f']])
    tm.assert_index_equal(words.str.split(), tokens)
    with self.assertRaisesRegexp(ValueError, 'not supported'):
        words.str.split(return_type='frame')

    # test boolean case, should return np.array instead of boolean Index
    labels = Index(['a1', 'a2', 'b1', 'b2'])
    mask = np.array([True, True, False, False])
    self.assert_array_equal(labels.str.startswith('a'), mask)
    self.assertIsInstance(labels.str.startswith('a'), np.ndarray)

    # the boolean array can be used directly to select from a Series
    ser = Series(range(4), index=labels)
    selected = Series(range(2), index=['a1', 'a2'])
    tm.assert_series_equal(ser[ser.index.str.startswith('a')], selected)

def test_indexing_doesnt_change_class(self):
idx = Index([1, 2, 3, 'a', 'b', 'c'])
Expand Down
13 changes: 13 additions & 0 deletions pandas/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -4855,6 +4855,19 @@ def test_to_csv_path_is_none(self):
csv_str = s.to_csv(path=None)
self.assertIsInstance(csv_str, str)

def test_str_attribute(self):
    # GH9068
    # each stripping method on the accessor must match the builtin str
    # method applied element by element
    padded = Series([' jack', 'jill ', ' jesse ', 'frank'])
    for name in ('strip', 'rstrip', 'lstrip'):
        builtin_method = getattr(str, name)
        accessor_method = getattr(Series.str, name)
        expected = Series([builtin_method(value) for value in padded.values])
        assert_series_equal(accessor_method(padded.str), expected)

    # str accessor only valid with string values
    numeric = Series(range(5))
    with self.assertRaisesRegexp(AttributeError, 'only use .str accessor'):
        numeric.str.repeat(2)

def test_clip(self):
val = self.ts.median()

Expand Down

0 comments on commit 4b68a72

Please sign in to comment.