Skip to content

Commit

Permalink
Use graphemes in ansi_nchar()
Browse files Browse the repository at this point in the history
Also add `utf8_nchar()`.

Towards #317.
  • Loading branch information
gaborcsardi committed Jul 20, 2021
1 parent cfe4fee commit 5e92e5e
Show file tree
Hide file tree
Showing 8 changed files with 141 additions and 27 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -195,4 +195,5 @@ export(symbol)
export(test_that_cli)
export(ticking)
export(tree)
export(utf8_nchar)
useDynLib(cli, .registration=TRUE)
25 changes: 10 additions & 15 deletions R/ansiex.R
Original file line number Diff line number Diff line change
Expand Up @@ -83,14 +83,13 @@ ansi_strip <- function(string, sgr = TRUE, csi = TRUE) {

#' Count number of characters in an ANSI colored string
#'
#' This is a color-aware counterpart of [base::nchar()],
#' which does not do well, since it also counts the ANSI control
#' characters.
#' This is a color-aware counterpart of [utf8_nchar()]. By default it
#' counts Unicode grapheme clusters, instead of code points.
#'
#' @param x Character vector, potentially ANSO styled, or a vector to be
#' coarced to character.
#' @param type Whether to count characters, bytes, or calculate the
#' display width of the string. Passed to [base::nchar()].
#' @param x Character vector, potentially ANSI styled, or a vector to be
#' coerced to character. It is converted to UTF-8.
#' @param type Whether to count graphemes (characters), code points,
#' bytes, or calculate the display width of the string.
#' @return Numeric vector, the length of the strings in the character
#' vector.
#'
Expand All @@ -108,15 +107,11 @@ ansi_strip <- function(string, sgr = TRUE, csi = TRUE) {
#' ansi_nchar(str)
#' nchar(ansi_strip(str))

ansi_nchar <- function(x, type = c("chars", "bytes", "width")) {
type <- match.arg(type)
x <- enc2utf8(x)
ansi_nchar <- function(x,
type = c("chars", "bytes", "width", "graphemes",
"codepoints")) {
x <- ansi_strip(x)
if (type == "width") {
utf8_display_width(x)
} else {
base::nchar(x, allowNA = FALSE, keepNA = TRUE)
}
utf8_nchar(x, type)
}

#' Substring(s) of an ANSI colored string
Expand Down
54 changes: 51 additions & 3 deletions R/utf8.R
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,57 @@ is_utf8_output <- function() {
}
}

# @export
#' Count the number of characters in a character vector
#'
#' By default it counts Unicode grapheme clusters, instead of code points.
#'
#' @param x Character vector, it is converted to UTF-8.
#' @param type Whether to count graphemes (characters), code points,
#' bytes, or calculate the display width of the string.
#' @return Numeric vector, the length of the strings in the character
#' vector.
#'
#' @export
#' @examples
#' # Grapheme example, emoji with combining characters. This is a single
#' # grapheme, consisting of five Unicode code points:
#' # * `\U0001f477` is the construction worker emoji
#' # * `\U0001f3fb` is an emoji modifier that changes the skin color
#' # * `\u200d` is the zero width joiner
#' # * `\u2640` is the female sign
#' # * `\ufe0f` is variation selector 16, requesting an emoji style glyph
#' emo <- "\U0001f477\U0001f3fb\u200d\u2640\ufe0f"
#' cat(emo)
#'
#' utf8_nchar(emo, "chars") # = graphemes
#' utf8_nchar(emo, "bytes")
#' utf8_nchar(emo, "width")
#' utf8_nchar(emo, "codepoints")
#'
#' # For comparison, the output for width depends on the R version used:
#' nchar(emo, "chars")
#' nchar(emo, "bytes")
#' nchar(emo, "width")

utf8_nchar <- function(x, type = c("chars", "bytes", "width", "graphemes",
"codepoints")) {

type <- match.arg(type)
if (type == "chars") type <- "graphemes"

utf8_display_width <- function(x) {
x <- enc2utf8(x)
.Call(clic_utf8_display_width, x)

if (type == "width") {
.Call(clic_utf8_display_width, x)

} else if (type == "graphemes") {
.Call(clic_utf8_nchar_graphemes, x)

} else if (type == "codepoints") {
base::nchar(x, allowNA = FALSE, keepNA = TRUE, type = "chars")

} else { # bytes
base::nchar(x, allowNA = FALSE, keepNA = TRUE, type = "bytes")
}

}
15 changes: 7 additions & 8 deletions man/ansi_nchar.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

42 changes: 42 additions & 0 deletions man/utf8_nchar.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions src/cli.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ SEXP clic_ansi_html(SEXP x, SEXP keep_csi);
SEXP clic_ansi_has_any(SEXP x, SEXP sgr, SEXP csi);
SEXP clic_ansi_strip(SEXP x, SEXP sgr, SEXP csi);

SEXP clic_utf8_nchar_graphemes(SEXP x);
SEXP clic_utf8_display_width(SEXP x);

typedef volatile int vint;
Expand Down
3 changes: 2 additions & 1 deletion src/init.c
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,8 @@ static const R_CallMethodDef callMethods[] = {
{ "clic_ansi_has_any", (DL_FUNC) clic_ansi_has_any, 3 },
{ "clic_ansi_strip", (DL_FUNC) clic_ansi_strip, 3 },

{ "clic_utf8_display_width", (DL_FUNC) clic_utf8_display_width, 1 },
{ "clic_utf8_display_width", (DL_FUNC) clic_utf8_display_width, 1 },
{ "clic_utf8_nchar_graphemes", (DL_FUNC) clic_utf8_nchar_graphemes, 1 },

{ "clic_dataptr", (DL_FUNC) clic_dataptr, 1 },
{ "clic_start_thread", (DL_FUNC) clic_start_thread, 3 },
Expand Down
27 changes: 27 additions & 0 deletions src/utf8.c
Original file line number Diff line number Diff line change
Expand Up @@ -318,3 +318,30 @@ SEXP clic_utf8_display_width(SEXP x) {
UNPROTECT(1);
return res;
}

/* Count grapheme clusters in every element of a character vector.
 *
 * x: character vector (STRSXP); each element is read through CHAR() and
 *    handed to the grapheme scanner as raw bytes. The R-level caller is
 *    expected to have converted it to UTF-8 already -- TODO confirm, no
 *    re-encoding happens here.
 *
 * Returns a freshly allocated integer vector of the same length as `x`.
 * NA_STRING elements map to NA_INTEGER; every other element maps to the
 * number of clic_utf8_graphscan_next() steps taken before the iterator's
 * `nxt_prop` becomes -1.
 */
SEXP clic_utf8_nchar_graphemes(SEXP x) {
  R_xlen_t n = XLENGTH(x);
  R_xlen_t idx;
  SEXP counts = PROTECT(allocVector(INTSXP, n));
  int *out = INTEGER(counts);

  for (idx = 0; idx < n; idx++) {
    struct grapheme_iterator scan;
    const uint8_t *bytes;
    int graphemes = 0;
    SEXP elt = STRING_ELT(x, idx);

    /* Missing strings stay missing in the result. */
    if (elt == NA_STRING) {
      out[idx] = NA_INTEGER;
      continue;
    }

    bytes = (const uint8_t*) CHAR(elt);
    clic_utf8_graphscan_make(&scan, bytes, /* width= */ 1);

    /* One scanner step per grapheme; nxt_prop == -1 presumably marks the
     * end of the input (scanner internals are not visible here). */
    for (; scan.nxt_prop != -1; graphemes++) {
      clic_utf8_graphscan_next(&scan, NULL, NULL);
    }

    out[idx] = graphemes;
  }

  UNPROTECT(1);
  return counts;
}

0 comments on commit 5e92e5e

Please sign in to comment.