Skip to content

Commit

Permalink
add AbstractChar supertype of Char (#26286)
Browse files Browse the repository at this point in the history
  • Loading branch information
stevengj authored and StefanKarpinski committed Mar 7, 2018
1 parent 732743f commit b1b0149
Show file tree
Hide file tree
Showing 45 changed files with 722 additions and 564 deletions.
3 changes: 3 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -458,6 +458,9 @@ Library improvements
* The function `thisind(s::AbstractString, i::Integer)` returns the largest valid index
less than or equal to `i` in the string `s` or `0` if no such index exists ([#24414]).

* `Char` is now a subtype of `AbstractChar`, and most of the functions that
take character arguments now accept any `AbstractChar` ([#26286]).

* `Irrational` is now a subtype of `AbstractIrrational` ([#24245]).

* Introduced the `empty` function, the functional pair to `empty!` which returns a new,
Expand Down
2 changes: 1 addition & 1 deletion base/arrayshow.jl
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ methods. By default returns a string of the same width as original with a
centered cdot, used in printing of structural zeros of structured matrices.
Accept keyword args `c` for alternate single character marker.
"""
function replace_with_centered_mark(s::AbstractString;c::Char = '⋅')
function replace_with_centered_mark(s::AbstractString;c::AbstractChar = '⋅')
N = length(s)
return join(setindex!([" " for i=1:N],string(c),ceil(Int,N/2)))
end
Expand Down
11 changes: 4 additions & 7 deletions base/boot.jl
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ export
Signed, Int, Int8, Int16, Int32, Int64, Int128,
Unsigned, UInt, UInt8, UInt16, UInt32, UInt64, UInt128,
# string types
Char, AbstractString, String, IO,
AbstractChar, Char, AbstractString, String, IO,
# errors
ErrorException, BoundsError, DivideError, DomainError, Exception,
InterruptException, InexactError, OutOfMemoryError, ReadOnlyMemoryError,
Expand Down Expand Up @@ -177,7 +177,8 @@ primitive type Float32 <: AbstractFloat 32 end
primitive type Float64 <: AbstractFloat 64 end

#primitive type Bool <: Integer 8 end
primitive type Char 32 end
abstract type AbstractChar end
primitive type Char <: AbstractChar 32 end

primitive type Int8 <: Signed 8 end
#primitive type UInt8 <: Unsigned 8 end
Expand Down Expand Up @@ -460,7 +461,7 @@ function write(io::IO, x::String)
end

show(io::IO, @nospecialize x) = ccall(:jl_static_show, Cvoid, (Ptr{Cvoid}, Any), io_pointer(io), x)
print(io::IO, x::Char) = ccall(:jl_uv_putc, Cvoid, (Ptr{Cvoid}, Char), io_pointer(io), x)
print(io::IO, x::AbstractChar) = ccall(:jl_uv_putc, Cvoid, (Ptr{Cvoid}, Char), io_pointer(io), x)
print(io::IO, x::String) = (write(io, x); nothing)
print(io::IO, @nospecialize x) = show(io, x)
print(io::IO, @nospecialize(x), @nospecialize a...) = (print(io, x); print(io, a...))
Expand Down Expand Up @@ -701,10 +702,6 @@ UInt32(x::BuiltinInts) = toUInt32(x)::UInt32
UInt64(x::BuiltinInts) = toUInt64(x)::UInt64
UInt128(x::BuiltinInts) = toUInt128(x)::UInt128

Char(x::Number) = Char(UInt32(x))
Char(x::Char) = x
(::Type{T})(x::Char) where {T<:Number} = T(UInt32(x))

(::Type{T})(x::T) where {T<:Number} = x

Int(x::Ptr) = bitcast(Int, x)
Expand Down
216 changes: 169 additions & 47 deletions base/char.jl
Original file line number Diff line number Diff line change
@@ -1,13 +1,76 @@
# This file is a part of Julia. License is MIT: https://julialang.org/license

struct InvalidCharError <: Exception
char::Char
"""
The `AbstractChar` type is the supertype of all character implementations
in Julia. A character represents a Unicode code point, and can be converted
to an integer via the [`codepoint`](@ref) function in order to obtain the
numerical value of the code point, or constructed from the same integer.
These numerical values determine how characters are compared with `<` and `==`,
for example. New `T <: AbstractChar` types should define a `codepoint(::T)`
method and a `T(::UInt32)` constructor, at minimum.
A given `AbstractChar` subtype may be capable of representing only a subset
of Unicode, in which case conversion from an unsupported `UInt32` value
may throw an error. Conversely, the built-in [`Char`](@ref) type represents
a *superset* of Unicode (in order to losslessly encode invalid byte streams),
in which case conversion of a non-Unicode value *to* `UInt32` throws an error.
The [`isvalid`](@ref) function can be used to check which codepoints are
representable in a given `AbstractChar` type.
Internally, an `AbstractChar` type may use a variety of encodings. Conversion
via `codepoint(char)` will not reveal this encoding because it always returns the
Unicode value of the character. `print(io, c)` of any `c::AbstractChar`
produces an encoding determined by `io` (UTF-8 for all built-in [`IO`](@ref)
types), via conversion to `Char` if necessary.
`write(io, c)`, in contrast, may emit an encoding depending on
`typeof(c)`, and `read(io, typeof(c))` should read the same encoding as `write`.
New `AbstractChar` types must provide their own implementations of
`write` and `read`.
"""
AbstractChar

"""
Char(c::Union{Number,AbstractChar})
`Char` is a 32-bit [`AbstractChar`](@ref) type that is the default representation
of characters in Julia. `Char` is the type used for character literals like `'x'`
and it is also the element type of [`String`](@ref).
In order to losslessly represent arbitrary byte streams stored in a `String`,
a `Char` value may store information that cannot be converted to a Unicode
codepoint — converting such a `Char` to `UInt32` will throw an error.
The [`isvalid(c::Char)`](@ref) function can be used to query whether `c`
represents a valid Unicode character.
"""
Char

(::Type{T})(x::Number) where {T<:AbstractChar} = T(UInt32(x))
(::Type{AbstractChar})(x::Number) = Char(x)
(::Type{T})(x::AbstractChar) where {T<:Union{Number,AbstractChar}} = T(codepoint(x))
(::Type{T})(x::T) where {T<:AbstractChar} = x

codepoint(c::Char) = UInt32(c)

"""
codepoint(c::AbstractChar)
Return the Unicode codepoint (an unsigned integer) corresponding
to the character `c` (or throw an exception if `c` does not represent
a valid character). For `Char`, this is a `UInt32` value, but
`AbstractChar` types that represent only a subset of Unicode may
return a different-sized integer (e.g. `UInt8`).
"""
codepoint # defined for Char above

struct InvalidCharError{T<:AbstractChar} <: Exception
char::T
end
struct CodePointError <: Exception
code::Integer
struct CodePointError{T<:Integer} <: Exception
code::T
end
@noinline invalid_char(c::Char) = throw(InvalidCharError(c))
@noinline code_point_err(u::UInt32) = throw(CodePointError(u))
@noinline invalid_char(c::AbstractChar) = throw(InvalidCharError(c))
@noinline code_point_err(u::Integer) = throw(CodePointError(u))

function ismalformed(c::Char)
u = reinterpret(UInt32, c)
Expand All @@ -24,6 +87,27 @@ function isoverlong(c::Char)
is_overlong_enc(u)
end

# fallback: other AbstractChar types, by default, are assumed
# not to support malformed or overlong encodings.

"""
ismalformed(c::AbstractChar)
Return `true` if `c` represents malformed (non-Unicode) data according to the
encoding used by `c`. Defaults to `false` for non-`Char` types. See also
[`show_invalid`](@ref).
"""
ismalformed(c::AbstractChar) = false

"""
isoverlong(c::AbstractChar)
Return `true` if `c` represents an overlong UTF-8 sequence. Defaults
to `false` for non-`Char` types. See also [`decode_overlong`](@ref)
and [`show_invalid`](@ref).
"""
isoverlong(c::AbstractChar) = false

function UInt32(c::Char)
# TODO: use optimized inline LLVM
u = reinterpret(UInt32, c)
Expand All @@ -49,6 +133,15 @@ function decode_overlong(c::Char)
(u & 0x007f0000 >> 4) | (u & 0x7f000000 >> 6)
end

"""
decode_overlong(c::AbstractChar)
When [`isoverlong(c)`](@ref) is `true`, `decode_overlong(c)` returns
the Unicode codepoint value of `c`. `AbstractChar` implementations
that support overlong encodings should implement `Base.decode_overlong`.
"""
decode_overlong

function Char(u::UInt32)
u < 0x80 && return reinterpret(Char, u << 24)
u < 0x00200000 || code_point_err(u)::Union{}
Expand All @@ -69,50 +162,85 @@ function Char(b::Union{Int8,UInt8})
0 ≤ b ≤ 0x7f ? reinterpret(Char, (b % UInt32) << 24) : Char(UInt32(b))
end

convert(::Type{Char}, x::Number) = Char(x)
convert(::Type{T}, x::Char) where {T<:Number} = T(x)
convert(::Type{AbstractChar}, x::Number) = Char(x) # default to Char
convert(::Type{T}, x::Number) where {T<:AbstractChar} = T(x)
convert(::Type{T}, x::AbstractChar) where {T<:Number} = T(x)
convert(::Type{T}, c::AbstractChar) where {T<:AbstractChar} = T(c)
convert(::Type{T}, c::T) where {T<:AbstractChar} = c

rem(x::Char, ::Type{T}) where {T<:Number} = rem(UInt32(x), T)
rem(x::AbstractChar, ::Type{T}) where {T<:Number} = rem(codepoint(x), T)

typemax(::Type{Char}) = reinterpret(Char, typemax(UInt32))
typemin(::Type{Char}) = reinterpret(Char, typemin(UInt32))

size(c::Char) = ()
size(c::Char,d) = convert(Int, d) < 1 ? throw(BoundsError()) : 1
ndims(c::Char) = 0
ndims(::Type{Char}) = 0
length(c::Char) = 1
firstindex(c::Char) = 1
lastindex(c::Char) = 1
getindex(c::Char) = c
getindex(c::Char, i::Integer) = i == 1 ? c : throw(BoundsError())
getindex(c::Char, I::Integer...) = all(x -> x == 1, I) ? c : throw(BoundsError())
first(c::Char) = c
last(c::Char) = c
eltype(::Type{Char}) = Char

start(c::Char) = false
next(c::Char, state) = (c, true)
done(c::Char, state) = state
isempty(c::Char) = false
in(x::Char, y::Char) = x == y
size(c::AbstractChar) = ()
size(c::AbstractChar,d) = convert(Int, d) < 1 ? throw(BoundsError()) : 1
ndims(c::AbstractChar) = 0
ndims(::Type{<:AbstractChar}) = 0
length(c::AbstractChar) = 1
firstindex(c::AbstractChar) = 1
lastindex(c::AbstractChar) = 1
getindex(c::AbstractChar) = c
getindex(c::AbstractChar, i::Integer) = i == 1 ? c : throw(BoundsError())
getindex(c::AbstractChar, I::Integer...) = all(x -> x == 1, I) ? c : throw(BoundsError())
first(c::AbstractChar) = c
last(c::AbstractChar) = c
eltype(::Type{T}) where {T<:AbstractChar} = T

start(c::AbstractChar) = false
next(c::AbstractChar, state) = (c, true)
done(c::AbstractChar, state) = state
isempty(c::AbstractChar) = false
in(x::AbstractChar, y::AbstractChar) = x == y

==(x::Char, y::Char) = reinterpret(UInt32, x) == reinterpret(UInt32, y)
isless(x::Char, y::Char) = reinterpret(UInt32, x) < reinterpret(UInt32, y)
hash(x::Char, h::UInt) =
hash_uint64(((reinterpret(UInt32, x) + UInt64(0xd4d64234)) << 32) ⊻ UInt64(h))
widen(::Type{Char}) = Char

-(x::Char, y::Char) = Int(x) - Int(y)
-(x::Char, y::Integer) = Char(Int32(x) - Int32(y))
+(x::Char, y::Integer) = Char(Int32(x) + Int32(y))
+(x::Integer, y::Char) = y + x
# fallbacks:
isless(x::AbstractChar, y::AbstractChar) = isless(Char(x), Char(y))
==(x::AbstractChar, y::AbstractChar) = Char(x) == Char(y)
hash(x::AbstractChar, h::UInt) = hash(Char(x), h)
widen(::Type{T}) where {T<:AbstractChar} = T

-(x::AbstractChar, y::AbstractChar) = Int(x) - Int(y)
-(x::T, y::Integer) where {T<:AbstractChar} = T(Int32(x) - Int32(y))
+(x::T, y::Integer) where {T<:AbstractChar} = T(Int32(x) + Int32(y))
+(x::Integer, y::AbstractChar) = y + x

# `print` should output UTF-8 by default for all AbstractChar types.
# (Packages may implement other IO subtypes to specify different encodings.)
# In contrast, `write(io, c)` outputs a `c` in an encoding determined by typeof(c).
print(io::IO, c::Char) = (write(io, c); nothing)
print(io::IO, c::AbstractChar) = print(io, Char(c)) # fallback: convert to output UTF-8

const hex_chars = UInt8['0':'9';'a':'z']

function show(io::IO, c::Char)
function show_invalid(io::IO, c::Char)
write(io, 0x27)
u = reinterpret(UInt32, c)
while true
a = hex_chars[((u >> 28) & 0xf) + 1]
b = hex_chars[((u >> 24) & 0xf) + 1]
write(io, 0x5c, UInt8('x'), a, b)
(u <<= 8) == 0 && break
end
write(io, 0x27)
end

"""
show_invalid(io::IO, c::AbstractChar)
Called by `show(io, c)` when [`isoverlong(c)`](@ref) or
[`ismalformed(c)`](@ref) return `true`. Subtypes
of `AbstractChar` should define `Base.show_invalid` methods
if they support storing invalid character data.
"""
show_invalid

# show c to io, assuming UTF-8 encoded output
function show(io::IO, c::AbstractChar)
if c <= '\\'
b = c == '\0' ? 0x30 :
c == '\a' ? 0x61 :
Expand All @@ -131,19 +259,13 @@ function show(io::IO, c::Char)
end
end
if isoverlong(c) || ismalformed(c)
show_invalid(io, c)
elseif isprint(c)
write(io, 0x27)
u = reinterpret(UInt32, c)
while true
a = hex_chars[((u >> 28) & 0xf) + 1]
b = hex_chars[((u >> 24) & 0xf) + 1]
write(io, 0x5c, 'x', a, b)
(u <<= 8) == 0 && break
end
print(io, c) # use print, not write, to use UTF-8 for any AbstractChar
write(io, 0x27)
elseif isprint(c)
write(io, 0x27, c, 0x27)
else # unprintable, well-formed, non-overlong Unicode
u = UInt32(c)
u = codepoint(c)
write(io, 0x27, 0x5c, c <= '\x7f' ? 0x78 : c <= '\uffff' ? 0x75 : 0x55)
d = max(2, 8 - (leading_zeros(u) >> 2))
while 0 < d
Expand All @@ -154,16 +276,16 @@ function show(io::IO, c::Char)
return
end

function show(io::IO, ::MIME"text/plain", c::Char)
function show(io::IO, ::MIME"text/plain", c::T) where {T<:AbstractChar}
show(io, c)
if !ismalformed(c)
print(io, ": ")
if isoverlong(c)
print(io, "[overlong] ")
u = decode_overlong(c)
c = Char(u)
c = T(u)
else
u = UInt32(c)
u = codepoint(c)
end
h = string(u, base = 16, pad = u ≤ 0xffff ? 4 : 6)
print(io, (isascii(c) ? "ASCII/" : ""), "Unicode U+", h)
Expand Down
2 changes: 1 addition & 1 deletion base/compiler/validation.jl
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ is_valid_lvalue(x) = isa(x, Slot) || isa(x, SSAValue) || isa(x, GlobalRef)
function is_valid_argument(x)
if isa(x, Slot) || isa(x, SSAValue) || isa(x, GlobalRef) || isa(x, QuoteNode) ||
(isa(x,Expr) && (x.head in (:static_parameter, :boundscheck, :copyast))) ||
isa(x, Number) || isa(x, AbstractString) || isa(x, Char) || isa(x, Tuple) ||
isa(x, Number) || isa(x, AbstractString) || isa(x, AbstractChar) || isa(x, Tuple) ||
isa(x, Type) || isa(x, Core.Box) || isa(x, Module) || x === nothing
return true
end
Expand Down
1 change: 1 addition & 0 deletions base/exports.jl
Original file line number Diff line number Diff line change
Expand Up @@ -563,6 +563,7 @@ export
bytes2hex,
chomp,
chop,
codepoint,
codeunit,
codeunits,
digits,
Expand Down
1 change: 1 addition & 0 deletions base/filesystem.jl
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,7 @@ function read(f::File, ::Type{Char})
end
return reinterpret(Char, c)
end
read(f::File, ::Type{T}) where {T<:AbstractChar} = T(read(f, Char)) # fallback

function unsafe_read(f::File, p::Ptr{UInt8}, nel::UInt)
check_open(f)
Expand Down
Loading

0 comments on commit b1b0149

Please sign in to comment.