Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix isequal_normalized for combining-char reordering #52447

Merged
merged 9 commits into from
Dec 19, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 70 additions & 11 deletions stdlib/Unicode/src/Unicode.jl
Original file line number Diff line number Diff line change
Expand Up @@ -208,12 +208,19 @@ end

using Base.Unicode: utf8proc_error, UTF8PROC_DECOMPOSE, UTF8PROC_CASEFOLD, UTF8PROC_STRIPMARK

function _decompose_char!(codepoint::Union{Integer,Char}, dest::Vector{UInt32}, options::Integer)
ret = @ccall utf8proc_decompose_char(codepoint::UInt32, dest::Ptr{UInt32}, length(dest)::Int, options::Cint, C_NULL::Ptr{Cint})::Int
# Decompose a single codepoint into `dest`, writing after the first `offset` entries.
#
# Calls utf8proc's `utf8proc_decompose_char` with the slice of `dest` that starts at
# index `offset + 1` and has room for `length(dest) - offset` codepoints, using the
# given utf8proc `options` flags. Returns the (non-negative) count reported by utf8proc;
# a negative return code is turned into an exception via `utf8proc_error`.
# NOTE(review): callers compare the returned count against the buffer length and
# resize/retry, which suggests the count can exceed the space available — confirm
# against the utf8proc documentation.
function _decompose_char!(codepoint::Union{Integer,Char}, dest::Vector{UInt32}, offset::Integer, options::Integer)
    # keep `dest` rooted while a raw pointer into it is handed to C
    nwritten = GC.@preserve dest begin
        ccall(:utf8proc_decompose_char, Int,
              (UInt32, Ptr{UInt32}, Int, Cint, Ptr{Cint}),
              codepoint, pointer(dest, 1+offset), length(dest)-offset, options, C_NULL)
    end
    if nwritten < 0
        utf8proc_error(nwritten)
    end
    return nwritten
end

# Canonical combining class lookup.
#
# utf8proc offers no accessor for the combining class, so it is read directly out of
# the property record returned by `utf8proc_get_property`: viewing that record as a
# sequence of 16-bit values, the combining class is the second one. (It would be good
# to have higher-level accessor functions in utf8proc. Alternatively, we could mirror
# the whole utf8proc_property_t struct in Julia, but that is annoying because of the
# bitfields.)
#
# U+0300 (COMBINING GRAVE ACCENT, class 230) is the lowest codepoint with a nonzero
# combining class, so the fast path that skips the utf8proc call must only cover
# codepoints strictly below 0x0300. Values above 0x10ffff also return zero.
combining_class(uc::Integer) =
    0x000300 ≤ uc ≤ 0x10ffff ? unsafe_load(ccall(:utf8proc_get_property, Ptr{UInt16}, (UInt32,), uc), 2) : 0x0000
# Malformed chars cannot be converted to a codepoint; report them as class zero.
combining_class(c::AbstractChar) = ismalformed(c) ? 0x0000 : combining_class(UInt32(c))

"""
isequal_normalized(s1::AbstractString, s2::AbstractString; casefold=false, stripmark=false, chartransform=identity)

Expand All @@ -225,6 +232,9 @@ As with [`Unicode.normalize`](@ref), you can also pass an arbitrary
function via the `chartransform` keyword (mapping `Integer` codepoints to codepoints)
to perform custom normalizations, such as [`Unicode.julia_chartransform`](@ref).

!!! compat "Julia 1.8"
The `isequal_normalized` function was added in Julia 1.8.

# Examples

For example, the string `"noël"` can be constructed in two canonically equivalent ways
Expand All @@ -251,29 +261,78 @@ julia> isequal_normalized(s1, "NOËL", casefold=true)
true
```
"""
function isequal_normalized(s1::AbstractString, s2::AbstractString; casefold::Bool=false, stripmark::Bool=false, chartransform=identity)
function decompose_next_char!(c, state, d, options, s)
n = _decompose_char!(c, d, options)
if n > length(d) # may be possible in future Unicode versions?
n = _decompose_char!(c, resize!(d, n), options)
function isequal_normalized(s1::AbstractString, s2::AbstractString; casefold::Bool=false, stripmark::Bool=false, chartransform=identity)
    # one scratch codepoint buffer per string, each starting with room for 4 entries
    buf1 = Vector{UInt32}(undef, 4)
    buf2 = Vector{UInt32}(undef, 4)
    return _isequal_normalized!(s1, s2, buf1, buf2, chartransform; casefold, stripmark)
end

# like isequal_normalized, but takes pre-allocated codepoint buffers as arguments, and chartransform is a positional argument
function _isequal_normalized!(s1::AbstractString, s2::AbstractString,
d1::Vector{UInt32}, d2::Vector{UInt32}, chartransform::F=identity;
casefold::Bool=false, stripmark::Bool=false) where {F}
function decompose_next_chars!(state, d, options, s)
local n
offset = 0
@inbounds while true
# read a char and decompose it to d
c = chartransform(UInt32(state[1]))
state = iterate(s, state[2])
if c < 0x80 # fast path for common ASCII case
n = 1 + offset
n > length(d) && resize!(d, 2n)
d[n] = casefold ? (0x41 ≤ c ≤ 0x5A ? c+0x20 : c) : c
break # ASCII characters are all zero combining class
else
while true
n = _decompose_char!(c, d, offset, options) + offset
if n > length(d)
resize!(d, 2n)
continue
end
break
end
end

# decomposed chars must be sorted in ascending order of combining class,
# which means we need to keep fetching chars until we get to non-combining
(iszero(combining_class(d[n])) || isnothing(state)) && break # non-combining
offset = n
end
return 1, n, iterate(s, state)

# sort by combining class
if n < 32 # almost always true
for j1 = 2:n # insertion sort
cc = combining_class(d[j1])
iszero(cc) && continue # don't re-order non-combiners
for j2 = j1:-1:2
combining_class(d[j2-1]) ≤ cc && break
d[j2-1], d[j2] = d[j2], d[j2-1]
end
end
else # avoid n^2 complexity in crazy large-n case
j = 1
@views while j < n
j₀ = j + something(findnext(iszero ∘ combining_class, d[j+1:n], 1), n+1-j)
sort!(d[j:j₀-1], by=combining_class)
j = j₀
end
end

# split return statement to help type inference:
return state === nothing ? (1, n, nothing) : (1, n, state)
end
options = UTF8PROC_DECOMPOSE
casefold && (options |= UTF8PROC_CASEFOLD)
stripmark && (options |= UTF8PROC_STRIPMARK)
i1,i2 = iterate(s1),iterate(s2)
d1,d2 = Vector{UInt32}(undef, 4), Vector{UInt32}(undef, 4) # codepoint buffers
n1 = n2 = 0 # lengths of codepoint buffers
j1 = j2 = 1 # indices in d1, d2
while true
if j1 > n1
i1 === nothing && return i2 === nothing && j2 > n2
j1, n1, i1 = decompose_next_char!(chartransform(UInt32(i1[1])), i1[2], d1, options, s1)
j1, n1, i1 = decompose_next_chars!(i1, d1, options, s1)
end
if j2 > n2
i2 === nothing && return false
j2, n2, i2 = decompose_next_char!(chartransform(UInt32(i2[1])), i2[2], d2, options, s2)
j2, n2, i2 = decompose_next_chars!(i2, d2, options, s2)
end
d1[j1] == d2[j2] || return false
j1 += 1; j2 += 1
Expand Down
67 changes: 67 additions & 0 deletions stdlib/Unicode/test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
using Test
using Unicode
using Unicode: normalize, isassigned, julia_chartransform
import Random

Random.seed!(12345)

@testset "string normalization" begin
# normalize (Unicode normalization etc.):
Expand Down Expand Up @@ -455,6 +458,9 @@ end
@test !Base.Unicode.isvalid(Char, overlong_char)
end

# Reference implementation used to cross-check `isequal_normalized`: normalize both
# strings in full (forwarding the same keyword options to `normalize`) and compare the
# results. Obvious, but suboptimal.
function isequal_normalized_naive(s1, s2; kws...)
    lhs = normalize(s1; kws...)
    rhs = normalize(s2; kws...)
    return lhs == rhs
end

@testset "Unicode equivalence" begin
@test isequal_normalized("no\u00EBl", "noe\u0308l")
@test !isequal_normalized("no\u00EBl", "noe\u0308l ")
Expand All @@ -466,4 +472,65 @@ end
@test isequal_normalized("no\u00EBl", "noel", stripmark=true)
@test isequal_normalized("no\u00EBl", "NOEL", stripmark=true, casefold=true)
@test isequal_normalized("\u00B5\u0302m", "\u03BC\u0302m", chartransform=julia_chartransform)

# issue #52408
# Checks isequal_normalized on strings containing runs of combining characters:
# fixed examples first, then randomly generated strings that are compared against
# their `normalize`d forms and against the naive normalize-then-compare helper.
@testset "Sorting combining characters" begin
for str in ("\u5bc\u5b0", "j\u5ae\u5bf\u5b2\u5b4") # julia#52408 examples
@test isequal_normalized(str, normalize(str))
end

# first codepoint in every possible Unicode combining class
let cc_chars = UInt32[0x00000334, 0x00016ff0, 0x0000093c, 0x00003099, 0x0000094d, 0x000005b0, 0x000005b1, 0x000005b2, 0x000005b3, 0x000005b4, 0x000005b5, 0x000005b6, 0x000005b7, 0x000005b8, 0x000005b9, 0x000005bb, 0x000005bc, 0x000005bd, 0x000005bf, 0x000005c1, 0x000005c2, 0x0000fb1e, 0x0000064b, 0x0000064c, 0x0000064d, 0x00000618, 0x00000619, 0x0000061a, 0x00000651, 0x00000652, 0x00000670, 0x00000711, 0x00000c55, 0x00000c56, 0x00000e38, 0x00000e48, 0x00000eb8, 0x00000ec8, 0x00000f71, 0x00000f72, 0x00000f74, 0x00000321, 0x00001dce, 0x0000031b, 0x00001dfa, 0x00000316, 0x0000059a, 0x0000302e, 0x0001d16d, 0x000005ae, 0x00000301, 0x00000315, 0x0000035c, 0x0000035d, 0x00000345],
vowels = ['a', 'e', 'i', 'o', 'u', 'å', 'é', 'î', 'ö', 'ü'], Vowels = [vowels; uppercase.(vowels)]
# `Vowels` holds the lowercase vowels above followed by their uppercase forms.

# randcc(n, n_cc): concatenate n groups, each made of 1 to 5 random entries of
# `Vowels` followed by 0 to n_cc random entries of `cc_chars` (repeats allowed).
function randcc(n, n_cc) # random string with lots of combining chars
buf = IOBuffer()
for _ = 1:n
print.(buf, rand(Vowels, rand(1:5)))
print.(buf, Char.(rand(cc_chars, rand(0:n_cc))))
end
return String(take!(buf))
end
# 100 random strings: each must compare equal to itself and to its normalized form,
# and comparisons against the case-folded normalization must give the same answer
# as isequal_normalized_naive, both without and with casefold=true.
for _ = 1:100
s = randcc(10,10)
ns = normalize(s)
cs = normalize(s, casefold=true)
@test isequal_normalized(s, s)
# print the offending string before the @test below records the failure
if !isequal_normalized(s, ns)
@show s
end
@test isequal_normalized(s, ns)
@test isequal_normalized(cs, ns) == isequal_normalized_naive(cs, ns)
@test isequal_normalized(cs, ns, casefold=true) ==
isequal_normalized_naive(cs, ns, casefold=true)
end
# groups with up to 1000 combining characters each
for _ = 1:3
s = randcc(5,1000) # exercise sort!-based fallback
@test isequal_normalized(s, normalize(s))
end
# randcc2(n, n_cc): build two strings from the same n groups. Each group writes the
# same random vowels to both buffers, then the same subsequence of `cc_chars` in two
# independently shuffled orders. `p` is the probability argument handed to
# `Random.randsubseq`.
function randcc2(n, n_cc) # 2 strings with equivalent reordered combiners
buf1 = IOBuffer()
buf2 = IOBuffer()
p = n_cc / length(cc_chars)
for _ = 1:n
a = join(rand(Vowels, rand(1:5)))
print(buf1, a)
print(buf2, a)

# chars from distinct combining classes
# are canonically equivalent when re-ordered
c = Random.randsubseq(cc_chars, p)
# shuffle! permutes `c` in place; Char.(...) copies each ordering before the next shuffle
print.(buf1, Char.(Random.shuffle!(c)))
print.(buf2, Char.(Random.shuffle!(c)))
end
return String(take!(buf1)), String(take!(buf2))
end
# 100 random pairs that differ only in the order of their combining characters
for _ = 1:100
s1, s2 = randcc2(10,10)
@test isequal_normalized(s1, s2)
end
end

# combining characters in the same class are inequivalent if re-ordered:
@test !isequal_normalized("x\u0334\u0335", "x\u0335\u0334")
end
end