From 5853eb12136fd23081dd396ae8e1107a28481b0d Mon Sep 17 00:00:00 2001 From: maechler Date: Fri, 23 Jun 2023 07:40:54 +0000 Subject: [PATCH] fix substr(x, n,L) for UTF-8 x and L > nchar(x) git-svn-id: https://svn.r-project.org/R/trunk@84598 00db46b3-68df-0310-9c12-caf00c1e9a41 --- doc/NEWS.Rd | 10 ++++++++++ src/main/character.c | 8 +++----- tests/reg-tests-1e.R | 35 +++++++++++++++++++++++++++++++++++ 3 files changed, 48 insertions(+), 5 deletions(-) diff --git a/doc/NEWS.Rd b/doc/NEWS.Rd index 134a2e84c00..7c3a78518b8 100644 --- a/doc/NEWS.Rd +++ b/doc/NEWS.Rd @@ -155,6 +155,16 @@ } } +\section{\Rlogo CHANGES IN R 4.3.1 patched}{ + \subsection{BUG FIXES}{ + \itemize{ + \item \code{substr(x, n, L) <- cc} now works (more) correctly for + multibyte UTF-8 strings \code{x} when \code{L > nchar(x)}, thanks to + a report and patch by \sQuote{Architect 95}. + } + } +} + \section{\Rlogo CHANGES IN R 4.3.1}{ \subsection{C-LEVEL FACILITIES}{ \itemize{ diff --git a/src/main/character.c b/src/main/character.c index 42c930a3fea..7aec2fa5fd7 100644 --- a/src/main/character.c +++ b/src/main/character.c @@ -598,10 +598,9 @@ substrset(char *buf, const char *const str, cetype_t ienc, int sa, int so, error(_("invalid multibyte string, %s"), msg); } for (i = 1; i < sa; i++) buf += utf8clen(*buf); - for (i = sa; i <= so && in < strlen(str); i++) { + for (i = sa; i <= so && buf[out] && str[in]; i++) { in += utf8clen(str[in]); out += utf8clen(buf[out]); - if (!str[in]) break; } if (in != out) memmove(buf+in, buf+out, strlen(buf+out)+1); memcpy(buf, str, in); @@ -619,10 +618,9 @@ substrset(char *buf, const char *const str, cetype_t ienc, int sa, int so, /* now work out how many bytes to replace by how many */ mbstate_t mb_st_out; mbs_init(&mb_st_out); - for (i = sa; i <= so && in < strlen(str); i++) { - in += (int) Mbrtowc(NULL, str+in, R_MB_CUR_MAX, &mb_st_in); + for (i = sa; i <= so && buf[out] && str[in]; i++) { + in += (int) Mbrtowc(NULL, str+in, R_MB_CUR_MAX, &mb_st_in); out += (int) Mbrtowc(NULL, buf+out, R_MB_CUR_MAX, &mb_st_out); - if (!str[in]) break; } if (in != out) memmove(buf+in, buf+out, strlen(buf+out)+1); memcpy(buf, str, in); diff --git a/tests/reg-tests-1e.R b/tests/reg-tests-1e.R index 6a91c3a7274..67f935ad22b 100644 --- a/tests/reg-tests-1e.R +++ b/tests/reg-tests-1e.R @@ -669,6 +669,41 @@ stopifnot(identical(contrib.url(character()), character())) ## R < 4.4.0 returned "/src/contrib" or similar +## `substr<-` overrun in case of UTF-8 --- private bug report by 'Architect 95' +s0 <- "123456"; nchar(s0) # 6 +substr(s0, 6, 7) <- "cc" +s0 ; nchar(s0) # {"12345c", 6}: all fine: no overrun, silent truncation +(s1 <- intToUtf8(c(23383, 97, 97, 97, 97, 97))); nchar(s1) # "字aaaaa" , 6 +substr(s1, 6, 7) <- "cc" +# Now s1 should be "字aaaac", but actually did overrunn nchar(s1); +s1; nchar(s1) ## was "字aaaacc", nchar = 7 +(s2 <- intToUtf8(c(23383, 98, 98))); nchar(s2) # "字bb" 3 +substr(s2, 4, 5) <- "dd" # should silently truncate as with s0: +## --> s2 should be "字bb", but was "字bbdddd\x97" (4.1.3) or "字bbdd字" (4.3.1) +s2; nchar(s2) ## was either 6 or "Error ... : invalid multibyte string, element 1" +#------------- +## Example where a partial UTF-8 character is included in the second string +## 3) all fine +(s3 <- intToUtf8(c(23383, 97, 97, 97, 97, 97))); nchar(s3) # "字aaaaa" 6 +substr(s3, 6, 6) <- print(intToUtf8(23383)) # "字" +s3 ; nchar(s3) # everything as expected: ("字aaaa字", 6) +## 4) not good +(s4 <- intToUtf8(c(23383, 98, 98, 98, 98))); nchar(s4) # "字bbbb" 5 +substr(s4, 5, 7) <- "ddd" +# Now s4 should be "字bbbd", but was "字bbbddd\x97", (\x97 = last byte of "字" in UTF-8) +s4; nchar(s4)## gave "字bbbddd\x97" and "Error ...: invalid multibyte string, element 1" +stopifnot(exprs = { + identical(s0, "12345c") # always ok + identical(utf8ToInt(s1), c(23383L, rep(97L, 4), 99L)) ; nchar(s1) == 6 + identical(utf8ToInt(s2), c(23383L, 98L, 98L)) ; nchar(s2) == 3 + identical(utf8ToInt(s3), c(23383L, 97L, 97L, 97L, 97L, 23383L)) ; nchar(s3) == 6 + identical(utf8ToInt(s4), c(23383L, 98L, 98L, 98L, 100L)) ; nchar(s4) == 5 + Encoding(c(s1,s2,s3,s4)) == rep("UTF-8", 4) +}) +## did partly overrun to invalid strings, nchar(.) giving error in R <= 4.3.1 + + + ## keep at end rbind(last = proc.time() - .pt, total = proc.time())