Skip to content

Commit

Permalink
read(io, Char): fix read with too many leading ones (#50552)
Browse files Browse the repository at this point in the history
Fixes #50532. The `read(io, Char)` method didn't correctly handle the
case where the lead byte starts with too many leading ones; this fix
makes it handle that case correctly, which makes `read(io, Char)` match
`collect(s)` in its interpretation of what a character is in all invalid
cases. Also fix and test `read(::File, Char)` which has the same bug.

(cherry picked from commit ffe1a07)
  • Loading branch information
StefanKarpinski authored and KristofferC committed Jul 17, 2023
1 parent 703622d commit d5c9b50
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 5 deletions.
5 changes: 3 additions & 2 deletions base/filesystem.jl
Original file line number Diff line number Diff line change
Expand Up @@ -200,11 +200,12 @@ end

function read(f::File, ::Type{Char})
b0 = read(f, UInt8)
l = 8 * (4 - leading_ones(b0))
l = 0x08 * (0x04 - UInt8(leading_ones(b0)))
c = UInt32(b0) << 24
if l < 24
if l ≤ 0x10
s = 16
while s ≥ l && !eof(f)
# this works around lack of peek(::File)
p = position(f)
b = read(f, UInt8)
if b & 0xc0 != 0x80
Expand Down
4 changes: 2 additions & 2 deletions base/io.jl
Original file line number Diff line number Diff line change
Expand Up @@ -800,9 +800,9 @@ end

function read(io::IO, ::Type{Char})
b0 = read(io, UInt8)::UInt8
l = 8(4-leading_ones(b0))
l = 0x08 * (0x04 - UInt8(leading_ones(b0)))
c = UInt32(b0) << 24
if l < 24
if l ≤ 0x10
s = 16
while s ≥ l && !eof(io)::Bool
peek(io) & 0xc0 == 0x80 || break
Expand Down
30 changes: 29 additions & 1 deletion test/char.jl
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
# This file is a part of Julia. License is MIT: https://julialang.org/license

@testset "basic properties" begin

@test typemax(Char) == reinterpret(Char, typemax(UInt32))
@test typemin(Char) == Char(0)
@test typemax(Char) == reinterpret(Char, 0xffffffff)
Expand Down Expand Up @@ -214,6 +213,35 @@ end
end
end

# issue #50532
@testset "invalid read(io, Char)" begin
    # Byte values covering every possible count of leading one bits (0 through 8),
    # so each kind of lead byte, continuation byte and invalid byte is exercised.
    B = UInt8[
        0x3f, 0x4d, 0x52, 0x63, 0x81, 0x83, 0x89, 0xb6,
        0xc0, 0xc8, 0xd3, 0xe3, 0xea, 0xeb, 0xf0, 0xf2,
        0xf4, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
    ]
    f = tempname()
    try
        # Every ordered pair of bytes from B, followed by 0 to 3 random bytes from B.
        for b1 in B, b2 in B, t = 0:3
            bytes = [b1, b2]
            append!(bytes, rand(B, t))
            s = String(bytes)
            write(f, s)
            @test s == read(f, String)
            chars = collect(s)
            # Reading Chars one at a time must split the bytes exactly as string
            # iteration does, for an in-memory IOBuffer, the stream returned by
            # `open`, and a `Base.Filesystem` file handle. Streams are opened
            # lazily, one at a time, so none is left open if an earlier one throws.
            openers = (
                () -> IOBuffer(s),
                () -> open(f),
                () -> Base.Filesystem.open(f, 0),
            )
            for opener in openers
                io = opener()
                try
                    chars′ = Char[]
                    while !eof(io)
                        push!(chars′, read(io, Char))
                    end
                    @test chars == chars′
                finally
                    # Close the handle even when read(io, Char) throws.
                    close(io)
                end
            end
        end
    finally
        # Remove the temp file even on error; `force` tolerates the case
        # where the file was never created.
        rm(f; force=true)
    end
end

@testset "overlong codes" begin
function test_overlong(c::Char, n::Integer, rep::String)
if isvalid(c)
Expand Down

0 comments on commit d5c9b50

Please sign in to comment.