add AbstractChar supertype of Char (#26286)

JuliaLang · Mar 7, 2018 · b1b0149 · b1b0149
1 parent 732743f
commit b1b0149
Show file tree

Hide file tree

Showing 45 changed files with 722 additions and 564 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -458,6 +458,9 @@ Library improvements
   * The function `thisind(s::AbstractString, i::Integer)` returns the largest valid index
     less or equal than `i` in the string `s` or `0` if no such index exists ([#24414]).
 
+  * `Char` is now a subtype of `AbstractChar`, and most of the functions that
+    take character arguments now accept any `AbstractChar` ([#26286]).
+
   * `Irrational` is now a subtype of `AbstractIrrational` ([#24245]).
 
   * Introduced the `empty` function, the functional pair to `empty!` which returns a new,

diff --git a/base/arrayshow.jl b/base/arrayshow.jl
@@ -39,7 +39,7 @@ methods. By default returns a string of the same width as original with a
 centered cdot, used in printing of structural zeros of structured matrices.
 Accept keyword args `c` for alternate single character marker.
 """
-function replace_with_centered_mark(s::AbstractString;c::Char = '⋅')
+function replace_with_centered_mark(s::AbstractString;c::AbstractChar = '⋅')
     N = length(s)
     return join(setindex!([" " for i=1:N],string(c),ceil(Int,N/2)))
 end

diff --git a/base/boot.jl b/base/boot.jl
@@ -143,7 +143,7 @@ export
     Signed, Int, Int8, Int16, Int32, Int64, Int128,
     Unsigned, UInt, UInt8, UInt16, UInt32, UInt64, UInt128,
     # string types
-    Char, AbstractString, String, IO,
+    AbstractChar, Char, AbstractString, String, IO,
     # errors
     ErrorException, BoundsError, DivideError, DomainError, Exception,
     InterruptException, InexactError, OutOfMemoryError, ReadOnlyMemoryError,
@@ -177,7 +177,8 @@ primitive type Float32 <: AbstractFloat 32 end
 primitive type Float64 <: AbstractFloat 64 end
 
 #primitive type Bool <: Integer 8 end
-primitive type Char 32 end
+abstract type AbstractChar end
+primitive type Char <: AbstractChar 32 end
 
 primitive type Int8    <: Signed   8 end
 #primitive type UInt8   <: Unsigned 8 end
@@ -460,7 +461,7 @@ function write(io::IO, x::String)
 end
 
 show(io::IO, @nospecialize x) = ccall(:jl_static_show, Cvoid, (Ptr{Cvoid}, Any), io_pointer(io), x)
-print(io::IO, x::Char) = ccall(:jl_uv_putc, Cvoid, (Ptr{Cvoid}, Char), io_pointer(io), x)
+print(io::IO, x::AbstractChar) = ccall(:jl_uv_putc, Cvoid, (Ptr{Cvoid}, Char), io_pointer(io), x)
 print(io::IO, x::String) = (write(io, x); nothing)
 print(io::IO, @nospecialize x) = show(io, x)
 print(io::IO, @nospecialize(x), @nospecialize a...) = (print(io, x); print(io, a...))
@@ -701,10 +702,6 @@ UInt32(x::BuiltinInts)  = toUInt32(x)::UInt32
 UInt64(x::BuiltinInts)  = toUInt64(x)::UInt64
 UInt128(x::BuiltinInts) = toUInt128(x)::UInt128
 
-Char(x::Number) = Char(UInt32(x))
-Char(x::Char) = x
-(::Type{T})(x::Char) where {T<:Number} = T(UInt32(x))
-
 (::Type{T})(x::T) where {T<:Number} = x
 
 Int(x::Ptr)  = bitcast(Int, x)

diff --git a/base/char.jl b/base/char.jl
@@ -1,13 +1,76 @@
 # This file is a part of Julia. License is MIT: https://julialang.org/license
 
-struct InvalidCharError <: Exception
-    char::Char
+"""
+The `AbstractChar` type is the supertype of all character implementations
+in Julia.   A character represents a Unicode code point, and can be converted
+to an integer via the [`codepoint`](@ref) function in order to obtain the
+numerical value of the code point, or constructed from the same integer.
+These numerical values determine how characters are compared with `<` and `==`,
+for example.  New `T <: AbstractChar` types should define a `codepoint(::T)`
+method and a `T(::UInt32)` constructor, at minimum.
+
+A given `AbstractChar` subtype may be capable of representing only a subset
+of Unicode, in which case conversion from an unsupported `UInt32` value
+may throw an error.  Conversely, the built-in [`Char`](@ref) type represents
+a *superset* of Unicode (in order to losslessly encode invalid byte streams),
+so conversion of a non-Unicode `Char` value *to* `UInt32` throws an error.
+The [`isvalid`](@ref) function can be used to check which codepoints are
+representable in a given `AbstractChar` type.
+
+Internally, an `AbstractChar` type may use a variety of encodings.  Conversion
+via `codepoint(char)` will not reveal this encoding because it always returns the
+Unicode value of the character.  `print(io, c)` for any `c::AbstractChar`
+writes `c` in an encoding determined by `io` (UTF-8 for all built-in [`IO`](@ref)
+types), converting to `Char` first if necessary.
+
+`write(io, c)`, in contrast, may emit an encoding depending on
+`typeof(c)`, and `read(io, typeof(c))` should read the same encoding as `write`.
+New `AbstractChar` types must provide their own implementations of
+`write` and `read`.
+"""
+AbstractChar
+
+"""
+    Char(c::Union{Number,AbstractChar})
+
+`Char` is a 32-bit [`AbstractChar`](@ref) type that is the default representation
+of characters in Julia.  `Char` is the type used for character literals like `'x'`
+and it is also the element type of [`String`](@ref).
+
+In order to losslessly represent arbitrary byte streams stored in a `String`,
+a `Char` value may store information that cannot be converted to a Unicode
+codepoint — converting such a `Char` to `UInt32` will throw an error.
+The [`isvalid(c::Char)`](@ref) function can be used to query whether `c`
+represents a valid Unicode character.
+"""
+Char
+
+(::Type{T})(x::Number) where {T<:AbstractChar} = T(UInt32(x))
+(::Type{AbstractChar})(x::Number) = Char(x)
+(::Type{T})(x::AbstractChar) where {T<:Union{Number,AbstractChar}} = T(codepoint(x))
+(::Type{T})(x::T) where {T<:AbstractChar} = x
+
+codepoint(c::Char) = UInt32(c)
+
+"""
+    codepoint(c::AbstractChar)
+
+Return the Unicode codepoint (an unsigned integer) corresponding
+to the character `c` (or throw an exception if `c` does not represent
+a valid character).   For `Char`, this is a `UInt32` value, but
+`AbstractChar` types that represent only a subset of Unicode may
+return a different-sized integer (e.g. `UInt8`).
+"""
+codepoint # the `Char` method is defined above as `UInt32(c)`
+
+struct InvalidCharError{T<:AbstractChar} <: Exception
+    char::T
 end
-struct CodePointError <: Exception
-    code::Integer
+struct CodePointError{T<:Integer} <: Exception
+    code::T
 end
-@noinline invalid_char(c::Char) = throw(InvalidCharError(c))
-@noinline code_point_err(u::UInt32) = throw(CodePointError(u))
+@noinline invalid_char(c::AbstractChar) = throw(InvalidCharError(c))
+@noinline code_point_err(u::Integer) = throw(CodePointError(u))
 
 function ismalformed(c::Char)
     u = reinterpret(UInt32, c)
@@ -24,6 +87,27 @@ function isoverlong(c::Char)
     is_overlong_enc(u)
 end
 
+# fallback: other AbstractChar types, by default, are assumed
+#           not to support malformed or overlong encodings.
+
+"""
+    ismalformed(c::AbstractChar)
+
+Return `true` if `c` represents malformed (non-Unicode) data according to the
+encoding used by `c`.  Defaults to `false` for non-`Char` types.  See also
+[`show_invalid`](@ref).
+"""
+ismalformed(c::AbstractChar) = false
+
+"""
+    isoverlong(c::AbstractChar)
+
+Return `true` if `c` represents an overlong UTF-8 sequence.  Defaults
+to `false` for non-`Char` types.  See also [`decode_overlong`](@ref)
+and [`show_invalid`](@ref).
+"""
+isoverlong(c::AbstractChar) = false
+
 function UInt32(c::Char)
     # TODO: use optimized inline LLVM
     u = reinterpret(UInt32, c)
@@ -49,6 +133,15 @@ function decode_overlong(c::Char)
     (u & 0x007f0000 >> 4) | (u & 0x7f000000 >> 6)
 end
 
+"""
+    decode_overlong(c::AbstractChar)
+
+When [`isoverlong(c)`](@ref) is `true`, `decode_overlong(c)` returns
+the Unicode codepoint value of `c`.   `AbstractChar` implementations
+that support overlong encodings should implement `Base.decode_overlong`.
+"""
+decode_overlong
+
 function Char(u::UInt32)
     u < 0x80 && return reinterpret(Char, u << 24)
     u < 0x00200000 || code_point_err(u)::Union{}
@@ -69,50 +162,85 @@ function Char(b::Union{Int8,UInt8})
     0 ≤ b ≤ 0x7f ? reinterpret(Char, (b % UInt32) << 24) : Char(UInt32(b))
 end
 
-convert(::Type{Char}, x::Number) = Char(x)
-convert(::Type{T}, x::Char) where {T<:Number} = T(x)
+convert(::Type{AbstractChar}, x::Number) = Char(x) # default to Char
+convert(::Type{T}, x::Number) where {T<:AbstractChar} = T(x)
+convert(::Type{T}, x::AbstractChar) where {T<:Number} = T(x)
+convert(::Type{T}, c::AbstractChar) where {T<:AbstractChar} = T(c)
+convert(::Type{T}, c::T) where {T<:AbstractChar} = c
 
-rem(x::Char, ::Type{T}) where {T<:Number} = rem(UInt32(x), T)
+rem(x::AbstractChar, ::Type{T}) where {T<:Number} = rem(codepoint(x), T)
 
 typemax(::Type{Char}) = reinterpret(Char, typemax(UInt32))
 typemin(::Type{Char}) = reinterpret(Char, typemin(UInt32))
 
-size(c::Char) = ()
-size(c::Char,d) = convert(Int, d) < 1 ? throw(BoundsError()) : 1
-ndims(c::Char) = 0
-ndims(::Type{Char}) = 0
-length(c::Char) = 1
-firstindex(c::Char) = 1
-lastindex(c::Char) = 1
-getindex(c::Char) = c
-getindex(c::Char, i::Integer) = i == 1 ? c : throw(BoundsError())
-getindex(c::Char, I::Integer...) = all(x -> x == 1, I) ? c : throw(BoundsError())
-first(c::Char) = c
-last(c::Char) = c
-eltype(::Type{Char}) = Char
-
-start(c::Char) = false
-next(c::Char, state) = (c, true)
-done(c::Char, state) = state
-isempty(c::Char) = false
-in(x::Char, y::Char) = x == y
+size(c::AbstractChar) = ()
+size(c::AbstractChar,d) = convert(Int, d) < 1 ? throw(BoundsError()) : 1
+ndims(c::AbstractChar) = 0
+ndims(::Type{<:AbstractChar}) = 0
+length(c::AbstractChar) = 1
+firstindex(c::AbstractChar) = 1
+lastindex(c::AbstractChar) = 1
+getindex(c::AbstractChar) = c
+getindex(c::AbstractChar, i::Integer) = i == 1 ? c : throw(BoundsError())
+getindex(c::AbstractChar, I::Integer...) = all(x -> x == 1, I) ? c : throw(BoundsError())
+first(c::AbstractChar) = c
+last(c::AbstractChar) = c
+eltype(::Type{T}) where {T<:AbstractChar} = T
+
+start(c::AbstractChar) = false
+next(c::AbstractChar, state) = (c, true)
+done(c::AbstractChar, state) = state
+isempty(c::AbstractChar) = false
+in(x::AbstractChar, y::AbstractChar) = x == y
 
 ==(x::Char, y::Char) = reinterpret(UInt32, x) == reinterpret(UInt32, y)
 isless(x::Char, y::Char) = reinterpret(UInt32, x) < reinterpret(UInt32, y)
 hash(x::Char, h::UInt) =
     hash_uint64(((reinterpret(UInt32, x) + UInt64(0xd4d64234)) << 32) ⊻ UInt64(h))
-widen(::Type{Char}) = Char
 
--(x::Char, y::Char) = Int(x) - Int(y)
--(x::Char, y::Integer) = Char(Int32(x) - Int32(y))
-+(x::Char, y::Integer) = Char(Int32(x) + Int32(y))
-+(x::Integer, y::Char) = y + x
+# fallbacks:
+isless(x::AbstractChar, y::AbstractChar) = isless(Char(x), Char(y))
+==(x::AbstractChar, y::AbstractChar) = Char(x) == Char(y)
+hash(x::AbstractChar, h::UInt) = hash(Char(x), h)
+widen(::Type{T}) where {T<:AbstractChar} = T
 
+-(x::AbstractChar, y::AbstractChar) = Int(x) - Int(y)
+-(x::T, y::Integer) where {T<:AbstractChar} = T(Int32(x) - Int32(y))
++(x::T, y::Integer) where {T<:AbstractChar} = T(Int32(x) + Int32(y))
++(x::Integer, y::AbstractChar) = y + x
+
+# `print` should output UTF-8 by default for all AbstractChar types.
+# (Packages may implement other IO subtypes to specify different encodings.)
+# In contrast, `write(io, c)` outputs `c` in an encoding determined by `typeof(c)`.
 print(io::IO, c::Char) = (write(io, c); nothing)
+print(io::IO, c::AbstractChar) = print(io, Char(c)) # fallback: convert to output UTF-8
 
 const hex_chars = UInt8['0':'9';'a':'z']
 
-function show(io::IO, c::Char)
+function show_invalid(io::IO, c::Char)
+    write(io, 0x27)
+    u = reinterpret(UInt32, c)
+    while true
+        a = hex_chars[((u >> 28) & 0xf) + 1]
+        b = hex_chars[((u >> 24) & 0xf) + 1]
+        write(io, 0x5c, UInt8('x'), a, b)
+        (u <<= 8) == 0 && break
+    end
+    write(io, 0x27)
+end
+
+"""
+    show_invalid(io::IO, c::AbstractChar)
+
+Called by `show(io, c)` when [`isoverlong(c)`](@ref) or
+[`ismalformed(c)`](@ref) returns `true`.  Subtypes
+of `AbstractChar` should define `Base.show_invalid` methods
+if they support storing invalid character data.
+"""
+show_invalid
+
+# show c to io, assuming UTF-8 encoded output
+function show(io::IO, c::AbstractChar)
     if c <= '\\'
         b = c == '\0' ? 0x30 :
             c == '\a' ? 0x61 :
@@ -131,19 +259,13 @@ function show(io::IO, c::Char)
         end
     end
     if isoverlong(c) || ismalformed(c)
+        show_invalid(io, c)
+    elseif isprint(c)
         write(io, 0x27)
-        u = reinterpret(UInt32, c)
-        while true
-            a = hex_chars[((u >> 28) & 0xf) + 1]
-            b = hex_chars[((u >> 24) & 0xf) + 1]
-            write(io, 0x5c, 'x', a, b)
-            (u <<= 8) == 0 && break
-        end
+        print(io, c) # use print, not write, to use UTF-8 for any AbstractChar
         write(io, 0x27)
-    elseif isprint(c)
-        write(io, 0x27, c, 0x27)
     else # unprintable, well-formed, non-overlong Unicode
-        u = UInt32(c)
+        u = codepoint(c)
         write(io, 0x27, 0x5c, c <= '\x7f' ? 0x78 : c <= '\uffff' ? 0x75 : 0x55)
         d = max(2, 8 - (leading_zeros(u) >> 2))
         while 0 < d
@@ -154,16 +276,16 @@ function show(io::IO, c::Char)
     return
 end
 
-function show(io::IO, ::MIME"text/plain", c::Char)
+function show(io::IO, ::MIME"text/plain", c::T) where {T<:AbstractChar}
     show(io, c)
     if !ismalformed(c)
         print(io, ": ")
         if isoverlong(c)
             print(io, "[overlong] ")
             u = decode_overlong(c)
-            c = Char(u)
+            c = T(u)
         else
-            u = UInt32(c)
+            u = codepoint(c)
         end
         h = string(u, base = 16, pad = u ≤ 0xffff ? 4 : 6)
         print(io, (isascii(c) ? "ASCII/" : ""), "Unicode U+", h)

diff --git a/base/compiler/validation.jl b/base/compiler/validation.jl
@@ -210,7 +210,7 @@ is_valid_lvalue(x) = isa(x, Slot) || isa(x, SSAValue) || isa(x, GlobalRef)
 function is_valid_argument(x)
     if isa(x, Slot) || isa(x, SSAValue) || isa(x, GlobalRef) || isa(x, QuoteNode) ||
         (isa(x,Expr) && (x.head in (:static_parameter, :boundscheck, :copyast))) ||
-        isa(x, Number) || isa(x, AbstractString) || isa(x, Char) || isa(x, Tuple) ||
+        isa(x, Number) || isa(x, AbstractString) || isa(x, AbstractChar) || isa(x, Tuple) ||
         isa(x, Type) || isa(x, Core.Box) || isa(x, Module) || x === nothing
         return true
     end

diff --git a/base/exports.jl b/base/exports.jl
@@ -563,6 +563,7 @@ export
     bytes2hex,
     chomp,
     chop,
+    codepoint,
     codeunit,
     codeunits,
     digits,

diff --git a/base/filesystem.jl b/base/filesystem.jl
@@ -170,6 +170,7 @@ function read(f::File, ::Type{Char})
     end
     return reinterpret(Char, c)
 end
+read(f::File, ::Type{T}) where {T<:AbstractChar} = T(read(f, Char)) # fallback
 
 function unsafe_read(f::File, p::Ptr{UInt8}, nel::UInt)
     check_open(f)