Skip to content

Commit

Permalink
ENH: add StringMethods (.str accessor) to Index, fixes #9068
Browse files Browse the repository at this point in the history
  • Loading branch information
mortada committed Apr 8, 2015
1 parent 9e4e447 commit ed77c72
Show file tree
Hide file tree
Showing 7 changed files with 113 additions and 28 deletions.
11 changes: 9 additions & 2 deletions doc/source/text.rst
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,10 @@ Working with Text Data

.. _text.string_methods:

Series is equipped with a set of string processing methods
Series and Index are equipped with a set of string processing methods
that make it easy to operate on each element of the array. Perhaps most
importantly, these methods exclude missing/NA values automatically. These are
accessed via the Series's ``str`` attribute and generally have names matching
accessed via the ``str`` attribute and generally have names matching
the equivalent (scalar) built-in string methods:

.. ipython:: python
Expand All @@ -30,6 +30,13 @@ the equivalent (scalar) built-in string methods:
s.str.upper()
s.str.len()
.. ipython:: python
idx = Index([' jack', 'jill ', ' jesse ', 'frank'])
idx.str.strip()
idx.str.lstrip()
idx.str.rstrip()
Splitting and Replacing Strings
-------------------------------

Expand Down
20 changes: 18 additions & 2 deletions doc/source/whatsnew/v0.16.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,19 +18,35 @@ Enhancements
~~~~~~~~~~~~

- Added ``StringMethods.capitalize()`` and ``swapcase`` which behave the same as the standard ``str`` methods (:issue:`9766`)
- Added ``StringMethods`` (.str accessor) to ``Index`` (:issue:`9068`)

- ``DataFrame.mask()`` and ``Series.mask()`` now support same keywords as ``where`` (:issue:`8801`)
The ``.str`` accessor is now available for both ``Series`` and ``Index``.

.. ipython:: python

idx = Index([' jack', 'jill ', ' jesse ', 'frank'])
idx.str.strip()

One special case for the ``.str`` accessor on ``Index`` is that if a string method returns ``bool``, the ``.str`` accessor
will return a ``np.ndarray`` instead of a boolean ``Index`` (:issue:`8875`). This enables the following expression
to work naturally:

.. ipython:: python

idx = Index(['a1', 'a2', 'b1', 'b2'])
s = Series(range(4), index=idx)
s
idx.str.startswith('a')
s[s.index.str.startswith('a')]

- ``DataFrame.mask()`` and ``Series.mask()`` now support same keywords as ``where`` (:issue:`8801`)
- ``drop`` function can now accept ``errors`` keyword to suppress ValueError raised when any of the labels does not exist in the target data. (:issue:`6736`)

.. ipython:: python

df = DataFrame(np.random.randn(3, 3), columns=['A', 'B', 'C'])
df.drop(['A', 'X'], axis=1, errors='ignore')


.. _whatsnew_0161.api:

API changes
Expand Down
19 changes: 19 additions & 0 deletions pandas/core/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import pandas.tslib as tslib
import pandas.lib as lib
from pandas.util.decorators import Appender, cache_readonly
from pandas.core.strings import StringMethods


_shared_docs = dict()
Expand Down Expand Up @@ -497,6 +498,24 @@ def searchsorted(self, key, side='left'):
#### needs tests/doc-string
return self.values.searchsorted(key, side=side)

# string methods
def _make_str_accessor(self):
    """
    Build the ``StringMethods`` accessor for this object.

    Returns
    -------
    StringMethods
        Accessor wrapping ``self``.

    Raises
    ------
    AttributeError
        If ``self`` is a Series whose dtype is not object, or an Index
        whose ``inferred_type`` is not ``'string'``.
    """
    # imported locally, as in the original implementation
    from pandas.core.series import Series
    from pandas.core.index import Index

    # A Series is only screened by dtype: rejecting every Series that holds
    # any non-string value is not practical for performance reasons until
    # there is a dedicated str dtype (GH 9343).
    non_object_series = (isinstance(self, Series) and
                         not com.is_object_dtype(self.dtype))
    if non_object_series:
        raise AttributeError("Can only use .str accessor with string "
                             "values, which use np.object_ dtype in "
                             "pandas")

    # An Index is screened by the type inferred from its values.
    non_string_index = (isinstance(self, Index) and
                        self.inferred_type != 'string')
    if non_string_index:
        raise AttributeError("Can only use .str accessor with string "
                             "values (i.e. inferred_type is 'string')")

    return StringMethods(self)

# Expose the accessor under the ``str`` attribute; _make_str_accessor runs
# the string-type validation before constructing the StringMethods object.
str = AccessorProperty(StringMethods, _make_str_accessor)

_shared_docs['drop_duplicates'] = (
"""Return %(klass)s with duplicate values removed
Expand Down
16 changes: 0 additions & 16 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@
from pandas.core import generic, base
from pandas.core.internals import SingleBlockManager
from pandas.core.categorical import Categorical, CategoricalAccessor
from pandas.core.strings import StringMethods
from pandas.tseries.common import (maybe_to_datetimelike,
CombinedDatetimelikeProperties)
from pandas.tseries.index import DatetimeIndex
Expand Down Expand Up @@ -2494,21 +2493,6 @@ def to_period(self, freq=None, copy=True):
return self._constructor(new_values,
index=new_index).__finalize__(self)

#------------------------------------------------------------------------------
# string methods

def _make_str_accessor(self):
    """
    Build the ``StringMethods`` accessor for this Series.

    Raises
    ------
    AttributeError
        If the dtype of ``self`` is not object.
    """
    if com.is_object_dtype(self.dtype):
        return StringMethods(self)

    # Only the dtype is checked: excluding every series that holds any
    # non-string value is not practical for performance reasons until
    # there is a dedicated str dtype (GH 9343).
    raise AttributeError("Can only use .str accessor with string "
                         "values, which use np.object_ dtype in "
                         "pandas")

# Expose the accessor under the ``str`` attribute; _make_str_accessor runs
# the object-dtype check before constructing the StringMethods object.
str = base.AccessorProperty(StringMethods, _make_str_accessor)

#------------------------------------------------------------------------------
# Datetimelike delegation methods

Expand Down
28 changes: 20 additions & 8 deletions pandas/core/strings.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import numpy as np

from pandas.compat import zip
from pandas.core.common import isnull, _values_from_object
from pandas.core.common import isnull, _values_from_object, is_bool_dtype
import pandas.compat as compat
from pandas.util.decorators import Appender
import re
Expand Down Expand Up @@ -632,9 +632,10 @@ def str_split(arr, pat=None, n=None, return_type='series'):
pat : string, default None
String or regular expression to split on. If None, splits on whitespace
n : int, default None (all)
return_type : {'series', 'frame'}, default 'series
return_type : {'series', 'index', 'frame'}, default 'series'
If frame, returns a DataFrame (elements are strings)
If series, returns an Series (elements are lists of strings).
If series or index, returns the same type as the original object
(elements are lists of strings).
Notes
-----
Expand All @@ -646,9 +647,13 @@ def str_split(arr, pat=None, n=None, return_type='series'):
"""
from pandas.core.series import Series
from pandas.core.frame import DataFrame
from pandas.core.index import Index

if return_type not in ('series', 'frame'):
raise ValueError("return_type must be {'series', 'frame'}")
if return_type not in ('series', 'index', 'frame'):
raise ValueError("return_type must be {'series', 'index', 'frame'}")
if return_type == 'frame' and isinstance(arr, Index):
raise ValueError("return_type='frame' is not supported for string "
"methods on Index")
if pat is None:
if n is None or n == 0:
n = -1
Expand Down Expand Up @@ -928,9 +933,9 @@ def do_copy(target):
class StringMethods(object):

"""
Vectorized string functions for Series. NAs stay NA unless handled
otherwise by a particular method. Patterned after Python's string methods,
with some inspiration from R's stringr package.
Vectorized string functions for Series and Index. NAs stay NA unless
handled otherwise by a particular method. Patterned after Python's string
methods, with some inspiration from R's stringr package.
Examples
--------
Expand Down Expand Up @@ -959,11 +964,18 @@ def __iter__(self):
def _wrap_result(self, result):
from pandas.core.series import Series
from pandas.core.frame import DataFrame
from pandas.core.index import Index

if not hasattr(result, 'ndim'):
return result
elif result.ndim == 1:
name = getattr(result, 'name', None)
if isinstance(self.series, Index):
# if result is a boolean np.array, return the np.array
# instead of wrapping it into a boolean Index (GH 8875)
if is_bool_dtype(result):
return result
return Index(result, name=name or self.series.name)
return Series(result, index=self.series.index,
name=name or self.series.name)
else:
Expand Down
34 changes: 34 additions & 0 deletions pandas/tests/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -1197,6 +1197,40 @@ def test_join_self(self):
for kind in kinds:
joined = res.join(res, how=kind)
self.assertIs(res, joined)
def test_str_attribute(self):
    """Check the ``.str`` accessor on Index objects (GH9068)."""
    # the stripping methods must agree element-wise with the builtin
    # ``str`` methods of the same name
    names = Index([' jack', 'jill ', ' jesse ', 'frank'])
    for meth_name in ('strip', 'rstrip', 'lstrip'):
        expected = Index([getattr(str, meth_name)(value)
                          for value in names.values])
        accessor_method = getattr(Index.str, meth_name)
        tm.assert_index_equal(accessor_method(names.str), expected)

    # indexes that do not hold strings must refuse the accessor
    non_string_indexes = [
        Index(range(5)),
        tm.makeDateIndex(10),
        MultiIndex.from_tuples([('foo', '1'), ('bar', '3')]),
        PeriodIndex(start='2000', end='2010', freq='A'),
    ]
    for bad_index in non_string_indexes:
        with self.assertRaisesRegexp(AttributeError,
                                     'only use .str accessor'):
            bad_index.str.repeat(2)

    # split: the default, 'series' and 'index' (an alias for 'series')
    # all give an Index of lists; 'frame' is rejected
    phrases = Index(['a b c', 'd e', 'f'])
    tokens = Index([['a', 'b', 'c'], ['d', 'e'], ['f']])
    tm.assert_index_equal(phrases.str.split(), tokens)
    for alias in ('series', 'index'):
        tm.assert_index_equal(phrases.str.split(return_type=alias), tokens)
    with self.assertRaisesRegexp(ValueError, 'not supported'):
        phrases.str.split(return_type='frame')

    # boolean results come back as a plain ndarray rather than a boolean
    # Index, so they can be used directly as a mask
    labels = Index(['a1', 'a2', 'b1', 'b2'])
    expected_mask = np.array([True, True, False, False])
    self.assert_array_equal(labels.str.startswith('a'), expected_mask)
    self.assertIsInstance(labels.str.startswith('a'), np.ndarray)

    ser = Series(range(4), index=labels)
    expected_subset = Series(range(2), index=['a1', 'a2'])
    subset = ser[ser.index.str.startswith('a')]
    tm.assert_series_equal(subset, expected_subset)

def test_indexing_doesnt_change_class(self):
idx = Index([1, 2, 3, 'a', 'b', 'c'])
Expand Down
13 changes: 13 additions & 0 deletions pandas/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -4933,6 +4933,19 @@ def test_to_csv_path_is_none(self):
csv_str = s.to_csv(path=None)
self.assertIsInstance(csv_str, str)

def test_str_attribute(self):
    """Check the ``.str`` accessor on Series objects (GH9068)."""
    # the stripping methods must agree element-wise with the builtin
    # ``str`` methods of the same name
    names = Series([' jack', 'jill ', ' jesse ', 'frank'])
    for meth_name in ('strip', 'rstrip', 'lstrip'):
        builtin_method = getattr(str, meth_name)
        expected = Series([builtin_method(value) for value in names.values])
        accessor_method = getattr(Series.str, meth_name)
        assert_series_equal(accessor_method(names.str), expected)

    # str accessor only valid with string values
    numbers = Series(range(5))
    with self.assertRaisesRegexp(AttributeError, 'only use .str accessor'):
        numbers.str.repeat(2)

def test_clip(self):
val = self.ts.median()

Expand Down

0 comments on commit ed77c72

Please sign in to comment.