From 5853eb12136fd23081dd396ae8e1107a28481b0d Mon Sep 17 00:00:00 2001
From: maechler <maechler@00db46b3-68df-0310-9c12-caf00c1e9a41>
Date: Fri, 23 Jun 2023 07:40:54 +0000
Subject: [PATCH] fix  substr(x, n,L) for UTF-8 x  and  L > nchar(x)

git-svn-id: https://svn.r-project.org/R/trunk@84598 00db46b3-68df-0310-9c12-caf00c1e9a41
---
 doc/NEWS.Rd          | 10 ++++++++++
 src/main/character.c |  8 +++-----
 tests/reg-tests-1e.R | 35 +++++++++++++++++++++++++++++++++++
 3 files changed, 48 insertions(+), 5 deletions(-)

diff --git a/doc/NEWS.Rd b/doc/NEWS.Rd
index 134a2e84c00..7c3a78518b8 100644
--- a/doc/NEWS.Rd
+++ b/doc/NEWS.Rd
@@ -155,6 +155,16 @@
   }
 }
 
+\section{\Rlogo CHANGES IN R 4.3.1 patched}{
+  \subsection{BUG FIXES}{
+    \itemize{
+      \item \code{substr(x, n, L) <- cc} now works (more) correctly for
+      multibyte UTF-8 strings \code{x} when \code{L > nchar(x)}, thanks to
+      a report and patch by \sQuote{Architect 95}.
+    }
+  }
+}
+
 \section{\Rlogo CHANGES IN R 4.3.1}{
   \subsection{C-LEVEL FACILITIES}{
     \itemize{
diff --git a/src/main/character.c b/src/main/character.c
index 42c930a3fea..7aec2fa5fd7 100644
--- a/src/main/character.c
+++ b/src/main/character.c
@@ -598,10 +598,9 @@ substrset(char *buf, const char *const str, cetype_t ienc, int sa, int so,
 	    error(_("invalid multibyte string, %s"), msg);
 	}
 	for (i = 1; i < sa; i++) buf += utf8clen(*buf);
-	for (i = sa; i <= so && in < strlen(str); i++) {
+	for (i = sa; i <= so && buf[out] && str[in]; i++) {
 	    in +=  utf8clen(str[in]);
 	    out += utf8clen(buf[out]);
-	    if (!str[in]) break;
 	}
 	if (in != out) memmove(buf+in, buf+out, strlen(buf+out)+1);
 	memcpy(buf, str, in);
@@ -619,10 +618,9 @@ substrset(char *buf, const char *const str, cetype_t ienc, int sa, int so,
 	    /* now work out how many bytes to replace by how many */
 	    mbstate_t mb_st_out;
 	    mbs_init(&mb_st_out);
-	    for (i = sa; i <= so && in < strlen(str); i++) {
-		in += (int) Mbrtowc(NULL, str+in, R_MB_CUR_MAX, &mb_st_in);
+	    for (i = sa; i <= so && buf[out] && str[in]; i++) {
+		in  += (int) Mbrtowc(NULL, str+in,  R_MB_CUR_MAX, &mb_st_in);
 		out += (int) Mbrtowc(NULL, buf+out, R_MB_CUR_MAX, &mb_st_out);
-		if (!str[in]) break;
 	    }
 	    if (in != out) memmove(buf+in, buf+out, strlen(buf+out)+1);
 	    memcpy(buf, str, in);
diff --git a/tests/reg-tests-1e.R b/tests/reg-tests-1e.R
index 6a91c3a7274..67f935ad22b 100644
--- a/tests/reg-tests-1e.R
+++ b/tests/reg-tests-1e.R
@@ -669,6 +669,41 @@ stopifnot(identical(contrib.url(character()), character()))
 ## R < 4.4.0 returned "/src/contrib" or similar
 
 
+## `substr<-` overrun in case of UTF-8 --- private bug report by 'Architect 95'
+s0 <- "123456"; nchar(s0) #  6
+substr(s0, 6, 7) <- "cc"
+s0 ; nchar(s0) # {"12345c", 6}: all fine: no overrun, silent truncation
+(s1 <- intToUtf8(c(23383, 97, 97, 97, 97, 97))); nchar(s1)  # "字aaaaa" , 6
+substr(s1, 6, 7) <- "cc"
+# Now s1 should be "字aaaac", but  actually did overrunn nchar(s1);
+s1; nchar(s1) ## was "字aaaacc", nchar  = 7
+(s2 <- intToUtf8(c(23383, 98, 98))); nchar(s2)  # "字bb" 3
+substr(s2, 4, 5) <- "dd" # should silently truncate as with s0:
+## --> s2 should be "字bb", but was "字bbdddd\x97" (4.1.3) or "字bbdd字" (4.3.1)
+s2; nchar(s2) ## was either 6 or  "Error ... : invalid multibyte string, element 1"
+#-------------
+## Example where a partial UTF-8 character is included in the second string
+## 3) all fine
+(s3 <- intToUtf8(c(23383, 97, 97, 97, 97, 97))); nchar(s3)  # "字aaaaa" 6
+substr(s3, 6, 6) <- print(intToUtf8(23383))  # "字"
+s3 ; nchar(s3) # everything as expected:  ("字aaaa字", 6)
+## 4) not good
+(s4 <- intToUtf8(c(23383, 98, 98, 98, 98))); nchar(s4) # "字bbbb" 5
+substr(s4, 5, 7) <- "ddd"
+# Now s4 should be "字bbbd", but was "字bbbddd\x97", (\x97 = last byte of "字" in UTF-8)
+s4; nchar(s4)## gave "字bbbddd\x97" and "Error ...: invalid multibyte string, element 1"
+stopifnot(exprs = {
+    identical(s0, "12345c") # always ok
+    identical(utf8ToInt(s1), c(23383L, rep(97L, 4), 99L))           ; nchar(s1) == 6
+    identical(utf8ToInt(s2), c(23383L, 98L, 98L))                   ; nchar(s2) == 3
+    identical(utf8ToInt(s3), c(23383L, 97L, 97L, 97L, 97L, 23383L)) ; nchar(s3) == 6
+    identical(utf8ToInt(s4), c(23383L, 98L, 98L, 98L, 100L))        ; nchar(s4) == 5
+    Encoding(c(s1,s2,s3,s4)) == rep("UTF-8", 4)
+})
+## did partly overrun to invalid strings, nchar(.) giving error in R <= 4.3.1
+
+
+
 ## keep at end
 rbind(last =  proc.time() - .pt,
       total = proc.time())