Skip to content

Commit

Permalink
[FEAT] Add str.upper() function (#1942)
Browse files Browse the repository at this point in the history
* Adding the `upper` function to match
https://ibis-project.org/reference/expression-strings#ibis.expr.types.strings.StringValue.upper
* Added tests showing example usage
* Refactor tests for str.lower to be a single parameterized test

Closes #1920
  • Loading branch information
nsalerni authored Feb 22, 2024
1 parent 9c66a5e commit 5b2fe98
Show file tree
Hide file tree
Showing 13 changed files with 163 additions and 26 deletions.
2 changes: 2 additions & 0 deletions daft/daft.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -910,6 +910,7 @@ class PyExpr:
def utf8_split(self, pattern: PyExpr) -> PyExpr: ...
def utf8_length(self) -> PyExpr: ...
def utf8_lower(self) -> PyExpr: ...
def utf8_upper(self) -> PyExpr: ...
def image_decode(self) -> PyExpr: ...
def image_encode(self, image_format: ImageFormat) -> PyExpr: ...
def image_resize(self, w: int, h: int) -> PyExpr: ...
Expand Down Expand Up @@ -986,6 +987,7 @@ class PySeries:
def utf8_split(self, pattern: PySeries) -> PySeries: ...
def utf8_length(self) -> PySeries: ...
def utf8_lower(self) -> PySeries: ...
def utf8_upper(self) -> PySeries: ...
def is_nan(self) -> PySeries: ...
def dt_date(self) -> PySeries: ...
def dt_day(self) -> PySeries: ...
Expand Down
11 changes: 11 additions & 0 deletions daft/expressions/expressions.py
Original file line number Diff line number Diff line change
Expand Up @@ -743,6 +743,17 @@ def lower(self) -> Expression:
"""
return Expression._from_pyexpr(self._expr.utf8_lower())

def upper(self) -> Expression:
    """Convert UTF-8 string to all uppercase

    Example:
        >>> col("x").str.upper()

    Returns:
        Expression: a String expression which is `self` uppercased
    """
    return Expression._from_pyexpr(self._expr.utf8_upper())


class ExpressionListNamespace(ExpressionNamespace):
def join(self, delimiter: str | Expression) -> Expression:
Expand Down
4 changes: 4 additions & 0 deletions daft/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -594,6 +594,10 @@ def lower(self) -> Series:
assert self._series is not None
return Series._from_pyseries(self._series.utf8_lower())

def upper(self) -> Series:
    """Return a new Series produced by the underlying ``utf8_upper`` kernel."""
    assert self._series is not None
    uppercased = self._series.utf8_upper()
    return Series._from_pyseries(uppercased)


class SeriesDateNamespace(SeriesNamespace):
def date(self) -> Series:
Expand Down
1 change: 1 addition & 0 deletions docs/source/api_docs/expressions.rst
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ The following methods are available under the ``expr.str`` attribute.
Expression.str.length
Expression.str.split
Expression.str.lower
Expression.str.upper

.. _api-expressions-temporal:

Expand Down
15 changes: 14 additions & 1 deletion src/daft-core/src/array/ops/utf8.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ use crate::{
datatypes::{BooleanArray, Field, UInt64Array, Utf8Array},
DataType, Series,
};
use arrow2::{self};
use arrow2;

use common_error::{DaftError, DaftResult};

Expand Down Expand Up @@ -159,6 +159,19 @@ impl Utf8Array {
Ok(Utf8Array::from((self.name(), Box::new(arrow_result))))
}

/// Returns a new array in which every non-null string is uppercased.
///
/// Null entries stay null, and the result carries this array's name.
pub fn upper(&self) -> DaftResult<Utf8Array> {
    let source = self.as_arrow();
    let uppercased: arrow2::array::Utf8Array<i64> = source
        .iter()
        .map(|maybe_str| maybe_str.map(|s| s.to_uppercase()))
        .collect();
    // Re-apply the source validity bitmap to the freshly built array.
    let uppercased = uppercased.with_validity(source.validity().cloned());
    Ok(Utf8Array::from((self.name(), Box::new(uppercased))))
}

fn binary_broadcasted_compare<ScalarKernel>(
&self,
other: &Self,
Expand Down
4 changes: 4 additions & 0 deletions src/daft-core/src/python/series.rs
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,10 @@ impl PySeries {
Ok(self.series.utf8_lower()?.into())
}

/// Uppercases the wrapped series via `Series::utf8_upper` and re-wraps the result.
pub fn utf8_upper(&self) -> PyResult<Self> {
    let uppercased = self.series.utf8_upper()?;
    Ok(uppercased.into())
}

pub fn is_nan(&self) -> PyResult<Self> {
Ok(self.series.is_nan()?.into())
}
Expand Down
10 changes: 10 additions & 0 deletions src/daft-core/src/series/ops/utf8.rs
Original file line number Diff line number Diff line change
Expand Up @@ -60,4 +60,14 @@ impl Series {
))),
}
}

/// Uppercases a `Utf8` series.
///
/// A `Null`-typed series is returned as a clone of itself; any other data
/// type yields a `DaftError::TypeError`.
pub fn utf8_upper(&self) -> DaftResult<Series> {
    match self.data_type() {
        DataType::Utf8 => {
            let uppercased = self.utf8()?.upper()?;
            Ok(uppercased.into_series())
        }
        DataType::Null => Ok(self.clone()),
        dt => {
            let message = format!("Upper not implemented for type {dt}");
            Err(DaftError::TypeError(message))
        }
    }
}
}
11 changes: 11 additions & 0 deletions src/daft-dsl/src/functions/utf8/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ mod length;
mod lower;
mod split;
mod startswith;
mod upper;

use contains::ContainsEvaluator;
use endswith::EndswithEvaluator;
Expand All @@ -12,6 +13,7 @@ use lower::LowerEvaluator;
use serde::{Deserialize, Serialize};
use split::SplitEvaluator;
use startswith::StartswithEvaluator;
use upper::UpperEvaluator;

use crate::Expr;

Expand All @@ -25,6 +27,7 @@ pub enum Utf8Expr {
Split,
Length,
Lower,
Upper,
}

impl Utf8Expr {
Expand All @@ -38,6 +41,7 @@ impl Utf8Expr {
Split => &SplitEvaluator {},
Length => &LengthEvaluator {},
Lower => &LowerEvaluator {},
Upper => &UpperEvaluator {},
}
}
}
Expand Down Expand Up @@ -83,3 +87,10 @@ pub fn lower(data: &Expr) -> Expr {
inputs: vec![data.clone()],
}
}

/// Builds a function expression that applies `Utf8Expr::Upper` to `data`.
pub fn upper(data: &Expr) -> Expr {
    let func = super::FunctionExpr::Utf8(Utf8Expr::Upper);
    let inputs = vec![data.clone()];
    Expr::Function { func, inputs }
}
46 changes: 46 additions & 0 deletions src/daft-dsl/src/functions/utf8/upper.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
use daft_core::{
datatypes::{DataType, Field},
schema::Schema,
series::Series,
};

use crate::Expr;
use common_error::{DaftError, DaftResult};

use super::super::FunctionEvaluator;

/// Stateless evaluator for the UTF-8 `upper` function.
pub(super) struct UpperEvaluator {}

impl FunctionEvaluator for UpperEvaluator {
    /// Name of the function this evaluator implements.
    fn fn_name(&self) -> &'static str {
        "upper"
    }

    /// Resolves the output field for `upper`.
    ///
    /// Requires exactly one input whose resolved dtype is `Utf8`; the output
    /// keeps the input field's name and is `Utf8` as well.
    ///
    /// # Errors
    /// * `DaftError::SchemaMismatch` when the number of inputs is not one.
    /// * `DaftError::TypeError` when the input is not `Utf8`.
    /// * Any error produced while resolving the input expression's field.
    fn to_field(&self, inputs: &[Expr], schema: &Schema, _: &Expr) -> DaftResult<Field> {
        let data = match inputs {
            [data] => data,
            _ => {
                return Err(DaftError::SchemaMismatch(format!(
                    "Expected 1 input args, got {}",
                    inputs.len()
                )))
            }
        };

        let data_field = data.to_field(schema)?;
        if matches!(data_field.dtype, DataType::Utf8) {
            Ok(Field::new(data_field.name, DataType::Utf8))
        } else {
            Err(DaftError::TypeError(format!(
                "Expects input to upper to be utf8, but received {data_field}",
            )))
        }
    }

    /// Evaluates `upper` on the single input series.
    ///
    /// # Errors
    /// `DaftError::ValueError` when the number of inputs is not one, plus any
    /// error returned by `Series::utf8_upper`.
    fn evaluate(&self, inputs: &[Series], _: &Expr) -> DaftResult<Series> {
        if let [data] = inputs {
            data.utf8_upper()
        } else {
            Err(DaftError::ValueError(format!(
                "Expected 1 input args, got {}",
                inputs.len()
            )))
        }
    }
}
5 changes: 5 additions & 0 deletions src/daft-dsl/src/python.rs
Original file line number Diff line number Diff line change
Expand Up @@ -338,6 +338,11 @@ impl PyExpr {
Ok(lower(&self.expr).into())
}

/// Wraps this expression in the UTF-8 `upper` function expression.
pub fn utf8_upper(&self) -> PyResult<Self> {
    use crate::functions::utf8::upper;
    let expr = upper(&self.expr);
    Ok(expr.into())
}

pub fn image_decode(&self) -> PyResult<Self> {
use crate::functions::image::decode;
Ok(decode(&self.expr).into())
Expand Down
10 changes: 10 additions & 0 deletions tests/expressions/typing/test_str.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,3 +53,13 @@ def test_str_lower():
run_kernel=s.str.lower,
resolvable=True,
)


def test_str_upper():
    """Check that type resolution of ``str.upper`` agrees with its runtime kernel."""
    s = Series.from_arrow(pa.array(["Foo", "BarBaz", "quux"]), name="arg")
    assert_typing_resolve_vs_runtime_behavior(
        data=[s],
        expr=col(s.name()).str.upper(),
        # The kernel must be the one the expression under test dispatches to;
        # using `s.str.lower` here would leave `upper` untested at runtime.
        run_kernel=s.str.upper,
        resolvable=True,
    )
60 changes: 35 additions & 25 deletions tests/series/test_utf8_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,31 +224,41 @@ def test_series_utf8_length_all_null() -> None:
assert result.to_pylist() == [None, None, None]


def test_series_utf8_lower() -> None:
s = Series.from_arrow(pa.array(["Foo", "BarBaz", "QUUX"]))
result = s.str.lower()
assert result.to_pylist() == ["foo", "barbaz", "quux"]


def test_series_utf8_lower_with_nulls() -> None:
s = Series.from_arrow(pa.array(["Foo", None, "BarBaz", "QUUX"]))
result = s.str.lower()
assert result.to_pylist() == ["foo", None, "barbaz", "quux"]


def test_series_utf8_lower_empty() -> None:
s = Series.from_arrow(pa.array([], type=pa.string()))
result = s.str.lower()
assert result.to_pylist() == []


def test_series_utf8_lower_all_null() -> None:
s = Series.from_arrow(pa.array([None, None, None]))
@pytest.mark.parametrize(
["data", "expected"],
[
(["Foo", "BarBaz", "QUUX"], ["foo", "barbaz", "quux"]),
# With at least one null
(["Foo", None, "BarBaz", "QUUX"], ["foo", None, "barbaz", "quux"]),
# With all nulls
([None] * 4, [None] * 4),
# With at least one numeric string
(["Foo", "BarBaz", "QUUX", "2"], ["foo", "barbaz", "quux", "2"]),
# With all numeric strings
(["1", "2", "3"], ["1", "2", "3"]),
],
)
def test_series_utf8_lower(data, expected) -> None:
s = Series.from_arrow(pa.array(data))
result = s.str.lower()
assert result.to_pylist() == [None, None, None]
assert result.to_pylist() == expected


def test_series_utf8_lower_all_numeric_strs() -> None:
s = Series.from_arrow(pa.array(["1", "2", "3"]))
result = s.str.lower()
assert result.to_pylist() == ["1", "2", "3"]
@pytest.mark.parametrize(
    ["data", "expected"],
    [
        # Mixed-case and lowercase inputs
        (["Foo", "BarBaz", "quux"], ["FOO", "BARBAZ", "QUUX"]),
        # With at least one null
        (["Foo", None, "BarBaz", "quux"], ["FOO", None, "BARBAZ", "QUUX"]),
        # With all nulls
        ([None] * 4, [None] * 4),
        # With at least one numeric string
        (["Foo", "BarBaz", "quux", "2"], ["FOO", "BARBAZ", "QUUX", "2"]),
        # With all numeric strings
        (["1", "2", "3"], ["1", "2", "3"]),
    ],
)
def test_series_utf8_upper(data, expected) -> None:
    """``Series.str.upper`` should yield ``expected`` for each parametrized ``data``."""
    series = Series.from_arrow(pa.array(data))
    assert series.str.upper().to_pylist() == expected
10 changes: 10 additions & 0 deletions tests/table/utf8/test_upper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
from __future__ import annotations

from daft.expressions import col
from daft.table import MicroPartition


def test_utf8_upper():
    """Evaluating ``str.upper`` on a table column uppercases strings and keeps nulls."""
    source = {"col": ["Foo", None, "BarBaz", "quux", "1"]}
    expected = {"col": ["FOO", None, "BARBAZ", "QUUX", "1"]}
    table = MicroPartition.from_pydict(source)
    evaluated = table.eval_expression_list([col("col").str.upper()])
    assert evaluated.to_pydict() == expected

0 comments on commit 5b2fe98

Please sign in to comment.