Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cleanup and fix operator predicates #272

Merged
merged 5 commits into from
May 11, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 47 additions & 29 deletions src/kinds.jl
Original file line number Diff line number Diff line change
Expand Up @@ -934,7 +934,25 @@ primitive type Kind 16 end
# the K_str macro to self-name these kinds with their literal representation,
# rather than needing to invent a new name for each.

let kind_int_type = :UInt16,
let kind_int_type = :UInt16
# Preprocess _kind_names to conflate category markers with the first/last
# in the category.
kindstr_to_int = Dict{String,UInt16}()
i = 1
while i <= length(_kind_names)
kn = _kind_names[i]
kind_int = i-1
if startswith(kn, "BEGIN_")
deleteat!(_kind_names, i)
elseif startswith(kn, "END_")
kind_int = i-2
deleteat!(_kind_names, i)
else
i += 1
end
push!(kindstr_to_int, kn=>kind_int)
end

max_kind_int = length(_kind_names)-1

@eval begin
Expand All @@ -945,9 +963,9 @@ let kind_int_type = :UInt16,
return Base.bitcast(Kind, convert($kind_int_type, x))
end

Base.convert(::Type{String}, k::Kind) = _kind_names[1 + Base.bitcast($kind_int_type, k)]
Base.convert(::Type{String}, k::Kind) = _kind_names[1 + reinterpret($kind_int_type, k)]

let kindstr_to_int = Dict(s=>i-1 for (i,s) in enumerate(_kind_names))
let kindstr_to_int=$kindstr_to_int
function Base.convert(::Type{Kind}, s::AbstractString)
i = get(kindstr_to_int, s) do
error("unknown Kind name $(repr(s))")
Expand Down Expand Up @@ -1078,12 +1096,12 @@ const _token_error_descriptions = Dict{Kind, String}(

#-------------------------------------------------------------------------------
# Predicates
# Category predicates on Kind.
#
# The BEGIN_*/END_* marker kinds are conflated with the first/last kind of
# each category during preprocessing (see the _kind_names preprocessing in
# this file), so all range checks are inclusive (`<=`), not strict.
is_contextual_keyword(k::Kind) = K"BEGIN_CONTEXTUAL_KEYWORDS" <= k <= K"END_CONTEXTUAL_KEYWORDS"
# Error kinds include two operator-shaped errors which live outside the
# BEGIN_ERRORS..END_ERRORS range.
is_error(k::Kind) = K"BEGIN_ERRORS" <= k <= K"END_ERRORS" || k == K"ErrorInvalidOperator" || k == K"Error**"
is_keyword(k::Kind) = K"BEGIN_KEYWORDS" <= k <= K"END_KEYWORDS"
is_block_continuation_keyword(k::Kind) = K"BEGIN_BLOCK_CONTINUATION_KEYWORDS" <= k <= K"END_BLOCK_CONTINUATION_KEYWORDS"
is_literal(k::Kind) = K"BEGIN_LITERAL" <= k <= K"END_LITERAL"
is_operator(k::Kind) = K"BEGIN_OPS" <= k <= K"END_OPS"
# Word-like operators have no BEGIN/END category of their own; enumerate them.
is_word_operator(k::Kind) = (k == K"in" || k == K"isa" || k == K"where")

# Forward the Kind predicate to anything supporting kind() (tokens, nodes, ...).
is_contextual_keyword(k) = is_contextual_keyword(kind(k))
Expand All @@ -1097,28 +1115,28 @@ is_word_operator(k) = is_word_operator(kind(k))
# Predicates for operator precedence
# FIXME: Review how precedence depends on dottedness, eg
# https://github.com/JuliaLang/julia/pull/36725
# Predicates for operator precedence. Each tests membership of kind(x) in the
# inclusive range delimited by the BEGIN_*/END_* markers, which are conflated
# with the first/last kind of each precedence category.
is_prec_assignment(x) = K"BEGIN_ASSIGNMENTS" <= kind(x) <= K"END_ASSIGNMENTS"
is_prec_pair(x) = K"BEGIN_PAIRARROW" <= kind(x) <= K"END_PAIRARROW"
is_prec_conditional(x) = K"BEGIN_CONDITIONAL" <= kind(x) <= K"END_CONDITIONAL"
is_prec_arrow(x) = K"BEGIN_ARROW" <= kind(x) <= K"END_ARROW"
is_prec_lazy_or(x) = K"BEGIN_LAZYOR" <= kind(x) <= K"END_LAZYOR"
is_prec_lazy_and(x) = K"BEGIN_LAZYAND" <= kind(x) <= K"END_LAZYAND"
is_prec_comparison(x) = K"BEGIN_COMPARISON" <= kind(x) <= K"END_COMPARISON"
is_prec_pipe(x) = K"BEGIN_PIPE" <= kind(x) <= K"END_PIPE"
is_prec_colon(x) = K"BEGIN_COLON" <= kind(x) <= K"END_COLON"
is_prec_plus(x) = K"BEGIN_PLUS" <= kind(x) <= K"END_PLUS"
is_prec_bitshift(x) = K"BEGIN_BITSHIFTS" <= kind(x) <= K"END_BITSHIFTS"
is_prec_times(x) = K"BEGIN_TIMES" <= kind(x) <= K"END_TIMES"
is_prec_rational(x) = K"BEGIN_RATIONAL" <= kind(x) <= K"END_RATIONAL"
is_prec_power(x) = K"BEGIN_POWER" <= kind(x) <= K"END_POWER"
is_prec_decl(x) = K"BEGIN_DECL" <= kind(x) <= K"END_DECL"
is_prec_where(x) = K"BEGIN_WHERE" <= kind(x) <= K"END_WHERE"
is_prec_dot(x) = K"BEGIN_DOT" <= kind(x) <= K"END_DOT"
is_prec_unicode_ops(x) = K"BEGIN_UNICODE_OPS" <= kind(x) <= K"END_UNICODE_OPS"
# Single-kind predicates for the two pipe operators.
is_prec_pipe_lt(x) = kind(x) == K"<|"
is_prec_pipe_gt(x) = kind(x) == K"|>"
is_syntax_kind(x) = K"BEGIN_SYNTAX_KINDS" <= kind(x) <= K"END_SYNTAX_KINDS"
is_macro_name(x) = K"BEGIN_MACRO_NAMES" <= kind(x) <= K"END_MACRO_NAMES"

function is_number(x)
kind(x) in (K"Integer", K"BinInt", K"HexInt", K"OctInt", K"Float", K"Float32")
Expand Down
10 changes: 5 additions & 5 deletions src/literal_parsing.jl
Original file line number Diff line number Diff line change
Expand Up @@ -329,11 +329,11 @@ end

# static wrapper around user callback function
# Map a handful of confusable codepoints to their canonical Julia form.
# Used as the custom mapping callback in utf8proc-based identifier
# normalization; returns the (possibly replaced) codepoint.
function utf8proc_custom_func(codepoint::UInt32, ::Ptr{Cvoid})::UInt32
    (codepoint == 0x025B ? 0x03B5 : # 'ɛ' => 'ε'
     codepoint == 0x00B5 ? 0x03BC : # 'µ' => 'μ'
     codepoint == 0x00B7 ? 0x22C5 : # '·' => '⋅'
     codepoint == 0x0387 ? 0x22C5 : # '·' => '⋅'
     codepoint == 0x2212 ? 0x002D : # '−' => '-'
     codepoint)
end

Expand Down
187 changes: 175 additions & 12 deletions src/tokenize.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,174 @@ module Tokenize

export tokenize, untokenize, Tokens

using ..JuliaSyntax: Kind, @K_str
using ..JuliaSyntax: JuliaSyntax, Kind, @K_str

import ..JuliaSyntax: kind,
is_literal, is_error, is_contextual_keyword, is_word_operator

include("tokenize_utils.jl")
#-------------------------------------------------------------------------------
# Character-based predicates for tokenization
import Base.Unicode

# Sentinel character returned by readchar() at end of input. typemax(Char)
# is not a valid Unicode scalar, so it can never collide with real input.
const EOF_CHAR = typemax(Char)

# True if `c` may appear inside an identifier (not necessarily start one).
function is_identifier_char(c::Char)
    if c == EOF_CHAR || !Base.isvalid(c)
        return false
    end
    return Base.is_id_char(c)
end

# True if `c` may start an identifier.
function is_identifier_start_char(c::Char)
    if c == EOF_CHAR || !Base.isvalid(c)
        return false
    end
    return Base.is_id_start_char(c)
end

# Chars that we will never allow to be part of a valid non-operator identifier
# Chars that we will never allow to be part of a valid non-operator identifier.
function is_never_id_char(ch::Char)
    # Invalid encodings can never be identifier characters.
    Base.isvalid(ch) || return true
    cat = Unicode.category_code(ch)
    c = UInt32(ch)
    # Spaces and control characters (Zs..Cs category range).
    if Unicode.UTF8PROC_CATEGORY_ZS <= cat <= Unicode.UTF8PROC_CATEGORY_CS
        return true
    end
    # ASCII and Latin1 non-connector punctuation (Pd..Po category range).
    if c < 0xff && Unicode.UTF8PROC_CATEGORY_PD <= cat <= Unicode.UTF8PROC_CATEGORY_PO
        return true
    end
    return c == UInt32('`') ||
        # mathematical brackets
        (0x27e6 <= c <= 0x27ef) ||
        # angle, corner, and lenticular brackets
        (0x3008 <= c <= 0x3011) ||
        # tortoise shell, square, and more lenticular brackets
        (0x3014 <= c <= 0x301b) ||
        # fullwidth parens
        c == 0xff08 || c == 0xff09 ||
        # fullwidth square brackets
        c == 0xff3b || c == 0xff3d
end

# Read one Char from `io`, returning the EOF_CHAR sentinel at end of stream.
function readchar(io::IO)
    return eof(io) ? EOF_CHAR : read(io, Char)
end

# Some unicode operators are normalized by the tokenizer into their equivalent
# kinds rather than getting kinds of their own. See also normalize_identifier().
const _ops_with_unicode_aliases = [
# \minus '−' is normalized into K"-"
'−' => K"-"
# Lookalikes which are normalized into K"⋅"
# https://github.com/JuliaLang/julia/pull/25157
'\u00b7' => K"⋅" # '·' Middle Dot
'\u0387' => K"⋅" # '·' Greek Ano Teleia
]

# All symbolic operator kinds: the full BEGIN_OPS..END_OPS range minus
# operator-shaped error kinds, word-like operators, and dot-related kinds.
function _nondot_symbolic_operator_kinds()
    excluded = [
        K"ErrorInvalidOperator"
        K"Error**"
        K"..."
        K"."
        K"where"
        K"isa"
        K"in"
        K".'"
    ]
    lo = reinterpret(UInt16, K"BEGIN_OPS")
    hi = reinterpret(UInt16, K"END_OPS")
    return setdiff(reinterpret.(Kind, lo:hi), excluded)
end

# Build an expression which tests whether the integer variable named `varname`
# equals one of the code points of the characters in `firstchars`. Consecutive
# code points are collapsed into range comparisons (`lo <= v <= hi`) to keep
# the generated code small. Returns `false` (the literal) for an empty set.
function _char_in_set_expr(varname, firstchars)
    codes = sort!(UInt32.(unique(firstchars)))
    # Empty set: membership is trivially false (previously this threw).
    isempty(codes) && return false
    terms = Expr[]
    i = 1
    while i <= length(codes)
        # Extend j over the maximal run of consecutive code points.
        j = i
        while j < length(codes) && codes[j+1] == codes[j]+1
            j += 1
        end
        if i == j
            push!(terms, :($varname == $(codes[i])))
        else
            push!(terms, :($(codes[i]) <= $varname <= $(codes[j])))
        end
        i = j+1
    end
    # Combine all terms with short-circuiting `||`.
    foldr((t1,t2)->:($t1 || $t2), terms)
end

# True if `c` can start an operator token. The membership test is generated
# once at load time from the first characters of all symbolic operator names
# plus the unicode alias characters.
@eval function is_operator_start_char(c)
if c == EOF_CHAR || !Base.isvalid(c)
return false
end
u = UInt32(c)
# Splice in the precomputed range/equality test over the code point `u`.
return $(_char_in_set_expr(:u,
append!(first.(string.(_nondot_symbolic_operator_kinds())),
first.(_ops_with_unicode_aliases))))
end

# Checks whether a Char is an operator which can be prefixed with a dot `.`
# Checks whether a Char is an operator which can be prefixed with a dot `.`.
# `?`, `$`, `:` and `'` start operators but can never be dotted.
function is_dottable_operator_start_char(c)
    if c == '?' || c == '$' || c == ':' || c == '\''
        return false
    end
    return is_operator_start_char(c)
end

# True if `c` may act as a suffix character modifying an operator (eg `+₁`).
# Combining marks are always allowed; an explicit list of superscripts,
# subscripts and primes is spliced in at load time.
@eval function isopsuffix(c::Char)
c == EOF_CHAR && return false
Base.isvalid(c) || return false
u = UInt32(c)
if (u < 0xa1 || u > 0x10ffff)
return false
end
cat = Base.Unicode.category_code(u)
# Combining marks (categories Mn/Mc/Me) always count as suffix characters.
if (cat == Base.Unicode.UTF8PROC_CATEGORY_MN ||
cat == Base.Unicode.UTF8PROC_CATEGORY_MC ||
cat == Base.Unicode.UTF8PROC_CATEGORY_ME)
return true
end
# Additional allowed cases
return $(_char_in_set_expr(:u,
collect("²³¹ʰʲʳʷʸˡˢˣᴬᴮᴰᴱᴳᴴᴵᴶᴷᴸᴹᴺᴼᴾᴿᵀᵁᵂᵃᵇᵈᵉᵍᵏᵐᵒᵖᵗᵘᵛᵝᵞᵟᵠᵡᵢᵣᵤᵥᵦᵧᵨᵩᵪᶜᶠᶥᶦᶫᶰᶸᶻᶿ′″‴‵‶‷⁗⁰ⁱ⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎ₐₑₒₓₕₖₗₘₙₚₛₜⱼⱽꜛꜜꜝ")))
end

# True if operator kind `k` accepts suffix characters (see isopsuffix()).
function optakessuffix(k)
    # Only operators can take suffixes at all.
    K"BEGIN_OPS" <= k <= K"END_OPS" || return false
    # Operators which never take suffixes.
    no_suffix = k == K"..." ||
        (K"BEGIN_ASSIGNMENTS" <= k <= K"END_ASSIGNMENTS") ||
        k == K"?" ||
        k == K"<:" ||
        k == K">:" ||
        k == K"&&" ||
        k == K"||" ||
        k == K"in" ||
        k == K"isa" ||
        k == K"≔" ||
        k == K"⩴" ||
        k == K":" ||
        k == K".." ||
        k == K"$" ||
        k == K"::" ||
        k == K"where" ||
        k == K"." ||
        k == K"!" ||
        k == K".'" ||
        k == K"->" ||
        (K"¬" <= k <= K"∜")
    return !no_suffix
end

# Map from single-character non-ASCII operator chars to their Kind,
# including the unicode alias characters. Built once at load time.
const _unicode_ops = let
ks = _nondot_symbolic_operator_kinds()
ss = string.(ks)

# Only single-char, non-ascii operator names can be looked up by character.
ops = Dict{Char, Kind}([first(s)=>k for (k,s) in zip(ks,ss)
if length(s) == 1 && !isascii(s[1])])
for ck in _ops_with_unicode_aliases
push!(ops, ck)
end
ops
end

#-------------------------------------------------------------------------------
# Tokens
Expand Down Expand Up @@ -370,7 +532,7 @@ function _next_token(l::Lexer, c)
return lex_identifier(l, c)
elseif isdigit(c)
return lex_digit(l, K"Integer")
elseif (k = get(UNICODE_OPS, c, K"error")) != K"error"
elseif (k = get(_unicode_ops, c, K"error")) != K"error"
return emit(l, k)
else
emit_error(l, K"ErrorUnknownCharacter")
Expand Down Expand Up @@ -416,6 +578,7 @@ function lex_string_chunk(l)
!(pc == EOF_CHAR || is_operator_start_char(pc) || is_never_id_char(pc))
# Only allow certain characters after interpolated vars
# https://github.com/JuliaLang/julia/pull/25234
readchar(l)
return emit_error(l, K"ErrorInvalidInterpolationTerminator")
end
if pc == EOF_CHAR
Expand Down Expand Up @@ -771,7 +934,7 @@ function lex_digit(l::Lexer, kind)
# If we enter the function with kind == K"Float" then a '.' has been parsed.
readchar(l)
return emit_error(l, K"ErrorInvalidNumericConstant")
elseif is_operator_start_char(ppc) && ppc !== ':'
elseif is_dottable_operator_start_char(ppc)
readchar(l)
return emit_error(l, K"ErrorAmbiguousNumericConstant") # `1.+`
end
Expand All @@ -787,14 +950,14 @@ function lex_digit(l::Lexer, kind)
accept(l, "+-−")
if accept_batch(l, isdigit)
pc,ppc = dpeekchar(l)
if pc === '.' && !dotop2(ppc)
if pc === '.' && !is_dottable_operator_start_char(ppc)
readchar(l)
return emit_error(l, K"ErrorInvalidNumericConstant") # `1.e1.`
end
else
return emit_error(l, K"ErrorInvalidNumericConstant") # `1.e`
end
elseif pc == '.' && ppc != '.' && !is_operator_start_char(ppc)
elseif pc == '.' && ppc != '.' && !is_dottable_operator_start_char(ppc)
readchar(l)
return emit_error(l, K"ErrorInvalidNumericConstant") # `1.1.`
elseif !had_fraction_digs && (is_identifier_start_char(pc) ||
Expand All @@ -808,7 +971,7 @@ function lex_digit(l::Lexer, kind)
accept(l, "+-−")
if accept_batch(l, isdigit)
pc,ppc = dpeekchar(l)
if pc === '.' && !dotop2(ppc)
if pc === '.' && !is_dottable_operator_start_char(ppc)
accept(l, '.')
return emit_error(l, K"ErrorInvalidNumericConstant") # `1e1.`
end
Expand Down Expand Up @@ -948,7 +1111,7 @@ function lex_dot(l::Lexer)
if accept(l, '.')
return emit(l, K"...")
else
if dotop2(peekchar(l))
if is_dottable_operator_start_char(peekchar(l))
readchar(l)
return emit_error(l, K"ErrorInvalidOperator")
else
Expand All @@ -959,10 +1122,7 @@ function lex_dot(l::Lexer)
return lex_digit(l, K"Float")
else
pc, dpc = dpeekchar(l)
if dotop1(pc)
l.dotop = true
return _next_token(l, readchar(l))
elseif pc =='+'
if pc == '+'
l.dotop = true
readchar(l)
return lex_plus(l)
Expand Down Expand Up @@ -1040,6 +1200,9 @@ function lex_dot(l::Lexer)
l.dotop = true
readchar(l)
return lex_equal(l)
elseif is_dottable_operator_start_char(pc)
l.dotop = true
return _next_token(l, readchar(l))
end
return emit(l, K".")
end
Expand Down
Loading