Skip to content

Commit

Permalink
fix substr(x, n,L) for UTF-8 x and L > nchar(x)
Browse files Browse the repository at this point in the history
git-svn-id: https://svn.r-project.org/R/trunk@84598 00db46b3-68df-0310-9c12-caf00c1e9a41
  • Loading branch information
maechler committed Jun 23, 2023
1 parent b9efbb9 commit 5853eb1
Show file tree
Hide file tree
Showing 3 changed files with 48 additions and 5 deletions.
10 changes: 10 additions & 0 deletions doc/NEWS.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,16 @@
}
}
\section{\Rlogo CHANGES IN R 4.3.1 patched}{
\subsection{BUG FIXES}{
\itemize{
\item \code{substr(x, n, L) <- cc} now works (more) correctly for
multibyte UTF-8 strings \code{x} when \code{L > nchar(x)}, thanks to
a report and patch by \sQuote{Architect 95}.
}
}
}
\section{\Rlogo CHANGES IN R 4.3.1}{
\subsection{C-LEVEL FACILITIES}{
\itemize{
Expand Down
8 changes: 3 additions & 5 deletions src/main/character.c
Original file line number Diff line number Diff line change
Expand Up @@ -598,10 +598,9 @@ substrset(char *buf, const char *const str, cetype_t ienc, int sa, int so,
error(_("invalid multibyte string, %s"), msg);
}
for (i = 1; i < sa; i++) buf += utf8clen(*buf);
for (i = sa; i <= so && in < strlen(str); i++) {
for (i = sa; i <= so && buf[out] && str[in]; i++) {
in += utf8clen(str[in]);
out += utf8clen(buf[out]);
if (!str[in]) break;
}
if (in != out) memmove(buf+in, buf+out, strlen(buf+out)+1);
memcpy(buf, str, in);
Expand All @@ -619,10 +618,9 @@ substrset(char *buf, const char *const str, cetype_t ienc, int sa, int so,
/* now work out how many bytes to replace by how many */
mbstate_t mb_st_out;
mbs_init(&mb_st_out);
for (i = sa; i <= so && in < strlen(str); i++) {
in += (int) Mbrtowc(NULL, str+in, R_MB_CUR_MAX, &mb_st_in);
for (i = sa; i <= so && buf[out] && str[in]; i++) {
in += (int) Mbrtowc(NULL, str+in, R_MB_CUR_MAX, &mb_st_in);
out += (int) Mbrtowc(NULL, buf+out, R_MB_CUR_MAX, &mb_st_out);
if (!str[in]) break;
}
if (in != out) memmove(buf+in, buf+out, strlen(buf+out)+1);
memcpy(buf, str, in);
Expand Down
35 changes: 35 additions & 0 deletions tests/reg-tests-1e.R
Original file line number Diff line number Diff line change
Expand Up @@ -669,6 +669,41 @@ stopifnot(identical(contrib.url(character()), character()))
## R < 4.4.0 returned "/src/contrib" or similar


## `substr<-` overrun in case of UTF-8 --- private bug report by 'Architect 95'
s0 <- "123456"; nchar(s0) # 6
substr(s0, 6, 7) <- "cc"
s0 ; nchar(s0) # {"12345c", 6}: all fine: no overrun, silent truncation
(s1 <- intToUtf8(c(23383, 97, 97, 97, 97, 97))); nchar(s1) # "字aaaaa" , 6
substr(s1, 6, 7) <- "cc"
# Now s1 should be "字aaaac", but actually did overrunn nchar(s1);
s1; nchar(s1) ## was "字aaaacc", nchar = 7
(s2 <- intToUtf8(c(23383, 98, 98))); nchar(s2) # "字bb" 3
substr(s2, 4, 5) <- "dd" # should silently truncate as with s0:
## --> s2 should be "字bb", but was "字bbdddd\x97" (4.1.3) or "字bbdd字" (4.3.1)
s2; nchar(s2) ## was either 6 or "Error ... : invalid multibyte string, element 1"
#-------------
## Example where a partial UTF-8 character is included in the second string
## 3) all fine
(s3 <- intToUtf8(c(23383, 97, 97, 97, 97, 97))); nchar(s3) # "字aaaaa" 6
substr(s3, 6, 6) <- print(intToUtf8(23383)) # "字"
s3 ; nchar(s3) # everything as expected: ("字aaaa字", 6)
## 4) not good
(s4 <- intToUtf8(c(23383, 98, 98, 98, 98))); nchar(s4) # "字bbbb" 5
substr(s4, 5, 7) <- "ddd"
# Now s4 should be "字bbbd", but was "字bbbddd\x97", (\x97 = last byte of "字" in UTF-8)
s4; nchar(s4)## gave "字bbbddd\x97" and "Error ...: invalid multibyte string, element 1"
stopifnot(exprs = {
identical(s0, "12345c") # always ok
identical(utf8ToInt(s1), c(23383L, rep(97L, 4), 99L)) ; nchar(s1) == 6
identical(utf8ToInt(s2), c(23383L, 98L, 98L)) ; nchar(s2) == 3
identical(utf8ToInt(s3), c(23383L, 97L, 97L, 97L, 97L, 23383L)) ; nchar(s3) == 6
identical(utf8ToInt(s4), c(23383L, 98L, 98L, 98L, 100L)) ; nchar(s4) == 5
Encoding(c(s1,s2,s3,s4)) == rep("UTF-8", 4)
})
## did partly overrun to invalid strings, nchar(.) giving error in R <= 4.3.1



## keep at end
rbind(last = proc.time() - .pt,
total = proc.time())

0 comments on commit 5853eb1

Please sign in to comment.