Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cleanup and fix operator predicates #272

Merged
merged 5 commits into from
May 11, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 47 additions & 29 deletions src/kinds.jl
Original file line number Diff line number Diff line change
Expand Up @@ -934,7 +934,25 @@ primitive type Kind 16 end
# the K_str macro to self-name these kinds with their literal representation,
# rather than needing to invent a new name for each.

let kind_int_type = :UInt16,
let kind_int_type = :UInt16
# Preprocess _kind_names to conflate category markers with the first/last
# in the category.
kindstr_to_int = Dict{String,UInt16}()
i = 1
while i <= length(_kind_names)
kn = _kind_names[i]
kind_int = i-1
if startswith(kn, "BEGIN_")
deleteat!(_kind_names, i)
elseif startswith(kn, "END_")
kind_int = i-2
deleteat!(_kind_names, i)
else
i += 1
end
push!(kindstr_to_int, kn=>kind_int)
end

max_kind_int = length(_kind_names)-1

@eval begin
Expand All @@ -945,9 +963,9 @@ let kind_int_type = :UInt16,
return Base.bitcast(Kind, convert($kind_int_type, x))
end

Base.convert(::Type{String}, k::Kind) = _kind_names[1 + Base.bitcast($kind_int_type, k)]
Base.convert(::Type{String}, k::Kind) = _kind_names[1 + reinterpret($kind_int_type, k)]

let kindstr_to_int = Dict(s=>i-1 for (i,s) in enumerate(_kind_names))
let kindstr_to_int=$kindstr_to_int
function Base.convert(::Type{Kind}, s::AbstractString)
i = get(kindstr_to_int, s) do
error("unknown Kind name $(repr(s))")
Expand Down Expand Up @@ -1078,12 +1096,12 @@ const _token_error_descriptions = Dict{Kind, String}(

#-------------------------------------------------------------------------------
# Predicates
# Category predicates on Kind.
#
# The BEGIN_*/END_* marker kinds are conflated with the first/last kind of
# each category during preprocessing (see the _kind_names preprocessing in
# this file), so all range checks are inclusive (`<=`), not strict.
is_contextual_keyword(k::Kind) = K"BEGIN_CONTEXTUAL_KEYWORDS" <= k <= K"END_CONTEXTUAL_KEYWORDS"
# Error kinds include two operator-shaped errors which live outside the
# BEGIN_ERRORS..END_ERRORS range.
is_error(k::Kind) = K"BEGIN_ERRORS" <= k <= K"END_ERRORS" || k == K"ErrorInvalidOperator" || k == K"Error**"
is_keyword(k::Kind) = K"BEGIN_KEYWORDS" <= k <= K"END_KEYWORDS"
is_block_continuation_keyword(k::Kind) = K"BEGIN_BLOCK_CONTINUATION_KEYWORDS" <= k <= K"END_BLOCK_CONTINUATION_KEYWORDS"
is_literal(k::Kind) = K"BEGIN_LITERAL" <= k <= K"END_LITERAL"
is_operator(k::Kind) = K"BEGIN_OPS" <= k <= K"END_OPS"
# Word-like operators have no BEGIN/END category of their own; enumerate them.
is_word_operator(k::Kind) = (k == K"in" || k == K"isa" || k == K"where")

# Forward the Kind predicate to anything supporting kind() (tokens, nodes, ...).
is_contextual_keyword(k) = is_contextual_keyword(kind(k))
Expand All @@ -1097,28 +1115,28 @@ is_word_operator(k) = is_word_operator(kind(k))
# Predicates for operator precedence
# FIXME: Review how precedence depends on dottedness, eg
# https://github.com/JuliaLang/julia/pull/36725
# Predicates for operator precedence. Each tests membership of kind(x) in the
# inclusive range delimited by the BEGIN_*/END_* markers, which are conflated
# with the first/last kind of each precedence category.
is_prec_assignment(x) = K"BEGIN_ASSIGNMENTS" <= kind(x) <= K"END_ASSIGNMENTS"
is_prec_pair(x) = K"BEGIN_PAIRARROW" <= kind(x) <= K"END_PAIRARROW"
is_prec_conditional(x) = K"BEGIN_CONDITIONAL" <= kind(x) <= K"END_CONDITIONAL"
is_prec_arrow(x) = K"BEGIN_ARROW" <= kind(x) <= K"END_ARROW"
is_prec_lazy_or(x) = K"BEGIN_LAZYOR" <= kind(x) <= K"END_LAZYOR"
is_prec_lazy_and(x) = K"BEGIN_LAZYAND" <= kind(x) <= K"END_LAZYAND"
is_prec_comparison(x) = K"BEGIN_COMPARISON" <= kind(x) <= K"END_COMPARISON"
is_prec_pipe(x) = K"BEGIN_PIPE" <= kind(x) <= K"END_PIPE"
is_prec_colon(x) = K"BEGIN_COLON" <= kind(x) <= K"END_COLON"
is_prec_plus(x) = K"BEGIN_PLUS" <= kind(x) <= K"END_PLUS"
is_prec_bitshift(x) = K"BEGIN_BITSHIFTS" <= kind(x) <= K"END_BITSHIFTS"
is_prec_times(x) = K"BEGIN_TIMES" <= kind(x) <= K"END_TIMES"
is_prec_rational(x) = K"BEGIN_RATIONAL" <= kind(x) <= K"END_RATIONAL"
is_prec_power(x) = K"BEGIN_POWER" <= kind(x) <= K"END_POWER"
is_prec_decl(x) = K"BEGIN_DECL" <= kind(x) <= K"END_DECL"
is_prec_where(x) = K"BEGIN_WHERE" <= kind(x) <= K"END_WHERE"
is_prec_dot(x) = K"BEGIN_DOT" <= kind(x) <= K"END_DOT"
is_prec_unicode_ops(x) = K"BEGIN_UNICODE_OPS" <= kind(x) <= K"END_UNICODE_OPS"
# Single-kind predicates for the two pipe operators.
is_prec_pipe_lt(x) = kind(x) == K"<|"
is_prec_pipe_gt(x) = kind(x) == K"|>"
is_syntax_kind(x) = K"BEGIN_SYNTAX_KINDS" <= kind(x) <= K"END_SYNTAX_KINDS"
is_macro_name(x) = K"BEGIN_MACRO_NAMES" <= kind(x) <= K"END_MACRO_NAMES"

function is_number(x)
kind(x) in (K"Integer", K"BinInt", K"HexInt", K"OctInt", K"Float", K"Float32")
Expand Down
10 changes: 5 additions & 5 deletions src/literal_parsing.jl
Original file line number Diff line number Diff line change
Expand Up @@ -329,11 +329,11 @@ end

# static wrapper around user callback function
# Map a handful of confusable codepoints to their canonical Julia form.
# Used as the custom mapping callback in utf8proc-based identifier
# normalization; returns the (possibly replaced) codepoint.
function utf8proc_custom_func(codepoint::UInt32, ::Ptr{Cvoid})::UInt32
    (codepoint == 0x025B ? 0x03B5 : # 'ɛ' => 'ε'
     codepoint == 0x00B5 ? 0x03BC : # 'µ' => 'μ'
     codepoint == 0x00B7 ? 0x22C5 : # '·' => '⋅'
     codepoint == 0x0387 ? 0x22C5 : # '·' => '⋅'
     codepoint == 0x2212 ? 0x002D : # '−' => '-'
     codepoint)
end

Expand Down
187 changes: 175 additions & 12 deletions src/tokenize.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,174 @@ module Tokenize

export tokenize, untokenize, Tokens

using ..JuliaSyntax: Kind, @K_str
using ..JuliaSyntax: JuliaSyntax, Kind, @K_str

import ..JuliaSyntax: kind,
is_literal, is_error, is_contextual_keyword, is_word_operator

include("tokenize_utils.jl")
#-------------------------------------------------------------------------------
# Character-based predicates for tokenization
import Base.Unicode

# Sentinel character returned by readchar() at end of input. typemax(Char)
# is not a valid Unicode scalar, so it can never collide with real input.
const EOF_CHAR = typemax(Char)

# True if `c` may appear inside an identifier (not necessarily start one).
function is_identifier_char(c::Char)
    if c == EOF_CHAR || !Base.isvalid(c)
        return false
    end
    return Base.is_id_char(c)
end

# True if `c` may start an identifier.
function is_identifier_start_char(c::Char)
    if c == EOF_CHAR || !Base.isvalid(c)
        return false
    end
    return Base.is_id_start_char(c)
end

# Chars that we will never allow to be part of a valid non-operator identifier
# Chars that we will never allow to be part of a valid non-operator identifier.
function is_never_id_char(ch::Char)
    # Invalid encodings can never be identifier characters.
    Base.isvalid(ch) || return true
    cat = Unicode.category_code(ch)
    c = UInt32(ch)
    # Spaces and control characters (Zs..Cs category range).
    if Unicode.UTF8PROC_CATEGORY_ZS <= cat <= Unicode.UTF8PROC_CATEGORY_CS
        return true
    end
    # ASCII and Latin1 non-connector punctuation (Pd..Po category range).
    if c < 0xff && Unicode.UTF8PROC_CATEGORY_PD <= cat <= Unicode.UTF8PROC_CATEGORY_PO
        return true
    end
    return c == UInt32('`') ||
        # mathematical brackets
        (0x27e6 <= c <= 0x27ef) ||
        # angle, corner, and lenticular brackets
        (0x3008 <= c <= 0x3011) ||
        # tortoise shell, square, and more lenticular brackets
        (0x3014 <= c <= 0x301b) ||
        # fullwidth parens
        c == 0xff08 || c == 0xff09 ||
        # fullwidth square brackets
        c == 0xff3b || c == 0xff3d
end

# Read one Char from `io`, returning the EOF_CHAR sentinel at end of stream.
function readchar(io::IO)
    return eof(io) ? EOF_CHAR : read(io, Char)
end

# Some unicode operators are normalized by the tokenizer into their equivalent
# kinds rather than getting kinds of their own. See also normalize_identifier().
const _ops_with_unicode_aliases = [
# \minus '−' is normalized into K"-"
'−' => K"-"
# Lookalikes which are normalized into K"⋅"
# https://github.com/JuliaLang/julia/pull/25157
'\u00b7' => K"⋅" # '·' Middle Dot
'\u0387' => K"⋅" # '·' Greek Ano Teleia
]

# All symbolic operator kinds: the full BEGIN_OPS..END_OPS range minus
# operator-shaped error kinds, word-like operators, and dot-related kinds.
function _nondot_symbolic_operator_kinds()
    excluded = [
        K"ErrorInvalidOperator"
        K"Error**"
        K"..."
        K"."
        K"where"
        K"isa"
        K"in"
        K".'"
    ]
    lo = reinterpret(UInt16, K"BEGIN_OPS")
    hi = reinterpret(UInt16, K"END_OPS")
    return setdiff(reinterpret.(Kind, lo:hi), excluded)
end

# Build an expression which tests whether the integer variable named `varname`
# equals one of the code points of the characters in `firstchars`. Consecutive
# code points are collapsed into range comparisons (`lo <= v <= hi`) to keep
# the generated code small. Returns `false` (the literal) for an empty set.
function _char_in_set_expr(varname, firstchars)
    codes = sort!(UInt32.(unique(firstchars)))
    # Empty set: membership is trivially false (previously this threw).
    isempty(codes) && return false
    terms = Expr[]
    i = 1
    while i <= length(codes)
        # Extend j over the maximal run of consecutive code points.
        j = i
        while j < length(codes) && codes[j+1] == codes[j]+1
            j += 1
        end
        if i == j
            push!(terms, :($varname == $(codes[i])))
        else
            push!(terms, :($(codes[i]) <= $varname <= $(codes[j])))
        end
        i = j+1
    end
    # Combine all terms with short-circuiting `||`.
    foldr((t1,t2)->:($t1 || $t2), terms)
end

# True if `c` can start an operator token. The membership test is generated
# once at load time from the first characters of all symbolic operator names
# plus the unicode alias characters.
@eval function is_operator_start_char(c)
if c == EOF_CHAR || !Base.isvalid(c)
return false
end
u = UInt32(c)
# Splice in the precomputed range/equality test over the code point `u`.
return $(_char_in_set_expr(:u,
append!(first.(string.(_nondot_symbolic_operator_kinds())),
first.(_ops_with_unicode_aliases))))
end

# Checks whether a Char is an operator which can be prefixed with a dot `.`
# Checks whether a Char is an operator which can be prefixed with a dot `.`.
# `?`, `$`, `:` and `'` start operators but can never be dotted.
function is_dottable_operator_start_char(c)
    if c == '?' || c == '$' || c == ':' || c == '\''
        return false
    end
    return is_operator_start_char(c)
end

# True if `c` may act as a suffix character modifying an operator (eg `+₁`).
# Combining marks are always allowed; an explicit list of superscripts,
# subscripts and primes is spliced in at load time.
@eval function isopsuffix(c::Char)
c == EOF_CHAR && return false
Base.isvalid(c) || return false
u = UInt32(c)
if (u < 0xa1 || u > 0x10ffff)
return false
end
cat = Base.Unicode.category_code(u)
# Combining marks (categories Mn/Mc/Me) always count as suffix characters.
if (cat == Base.Unicode.UTF8PROC_CATEGORY_MN ||
cat == Base.Unicode.UTF8PROC_CATEGORY_MC ||
cat == Base.Unicode.UTF8PROC_CATEGORY_ME)
return true
end
# Additional allowed cases
return $(_char_in_set_expr(:u,
collect("²³¹ʰʲʳʷʸˡˢˣᴬᴮᴰᴱᴳᴴᴵᴶᴷᴸᴹᴺᴼᴾᴿᵀᵁᵂᵃᵇᵈᵉᵍᵏᵐᵒᵖᵗᵘᵛᵝᵞᵟᵠᵡᵢᵣᵤᵥᵦᵧᵨᵩᵪᶜᶠᶥᶦᶫᶰᶸᶻᶿ′″‴‵‶‷⁗⁰ⁱ⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎ₐₑₒₓₕₖₗₘₙₚₛₜⱼⱽꜛꜜꜝ")))
end

# True if operator kind `k` accepts suffix characters (see isopsuffix()).
function optakessuffix(k)
    # Only operators can take suffixes at all.
    K"BEGIN_OPS" <= k <= K"END_OPS" || return false
    # Operators which never take suffixes.
    no_suffix = k == K"..." ||
        (K"BEGIN_ASSIGNMENTS" <= k <= K"END_ASSIGNMENTS") ||
        k == K"?" ||
        k == K"<:" ||
        k == K">:" ||
        k == K"&&" ||
        k == K"||" ||
        k == K"in" ||
        k == K"isa" ||
        k == K"≔" ||
        k == K"⩴" ||
        k == K":" ||
        k == K".." ||
        k == K"$" ||
        k == K"::" ||
        k == K"where" ||
        k == K"." ||
        k == K"!" ||
        k == K".'" ||
        k == K"->" ||
        (K"¬" <= k <= K"∜")
    return !no_suffix
end

# Map from single-character non-ASCII operator chars to their Kind,
# including the unicode alias characters. Built once at load time.
const _unicode_ops = let
ks = _nondot_symbolic_operator_kinds()
ss = string.(ks)

# Only single-char, non-ascii operator names can be looked up by character.
ops = Dict{Char, Kind}([first(s)=>k for (k,s) in zip(ks,ss)
if length(s) == 1 && !isascii(s[1])])
for ck in _ops_with_unicode_aliases
push!(ops, ck)
end
ops
end

#-------------------------------------------------------------------------------
# Tokens
Expand Down Expand Up @@ -370,7 +532,7 @@ function _next_token(l::Lexer, c)
return lex_identifier(l, c)
elseif isdigit(c)
return lex_digit(l, K"Integer")
elseif (k = get(UNICODE_OPS, c, K"error")) != K"error"
elseif (k = get(_unicode_ops, c, K"error")) != K"error"
return emit(l, k)
else
emit_error(l, K"ErrorUnknownCharacter")
Expand Down Expand Up @@ -416,6 +578,7 @@ function lex_string_chunk(l)
!(pc == EOF_CHAR || is_operator_start_char(pc) || is_never_id_char(pc))
# Only allow certain characters after interpolated vars
# https://github.com/JuliaLang/julia/pull/25234
readchar(l)
return emit_error(l, K"ErrorInvalidInterpolationTerminator")
end
if pc == EOF_CHAR
Expand Down Expand Up @@ -771,7 +934,7 @@ function lex_digit(l::Lexer, kind)
# If we enter the function with kind == K"Float" then a '.' has been parsed.
readchar(l)
return emit_error(l, K"ErrorInvalidNumericConstant")
elseif is_operator_start_char(ppc) && ppc !== ':'
elseif is_dottable_operator_start_char(ppc)
readchar(l)
return emit_error(l, K"ErrorAmbiguousNumericConstant") # `1.+`
end
Expand All @@ -787,14 +950,14 @@ function lex_digit(l::Lexer, kind)
accept(l, "+-−")
if accept_batch(l, isdigit)
pc,ppc = dpeekchar(l)
if pc === '.' && !dotop2(ppc)
if pc === '.' && !is_dottable_operator_start_char(ppc)
readchar(l)
return emit_error(l, K"ErrorInvalidNumericConstant") # `1.e1.`
end
else
return emit_error(l, K"ErrorInvalidNumericConstant") # `1.e`
end
elseif pc == '.' && ppc != '.' && !is_operator_start_char(ppc)
elseif pc == '.' && ppc != '.' && !is_dottable_operator_start_char(ppc)
readchar(l)
return emit_error(l, K"ErrorInvalidNumericConstant") # `1.1.`
elseif !had_fraction_digs && (is_identifier_start_char(pc) ||
Expand All @@ -808,7 +971,7 @@ function lex_digit(l::Lexer, kind)
accept(l, "+-−")
if accept_batch(l, isdigit)
pc,ppc = dpeekchar(l)
if pc === '.' && !dotop2(ppc)
if pc === '.' && !is_dottable_operator_start_char(ppc)
accept(l, '.')
return emit_error(l, K"ErrorInvalidNumericConstant") # `1e1.`
end
Expand Down Expand Up @@ -948,7 +1111,7 @@ function lex_dot(l::Lexer)
if accept(l, '.')
return emit(l, K"...")
else
if dotop2(peekchar(l))
if is_dottable_operator_start_char(peekchar(l))
readchar(l)
return emit_error(l, K"ErrorInvalidOperator")
else
Expand All @@ -959,10 +1122,7 @@ function lex_dot(l::Lexer)
return lex_digit(l, K"Float")
else
pc, dpc = dpeekchar(l)
if dotop1(pc)
l.dotop = true
return _next_token(l, readchar(l))
elseif pc =='+'
if pc == '+'
l.dotop = true
readchar(l)
return lex_plus(l)
Expand Down Expand Up @@ -1040,6 +1200,9 @@ function lex_dot(l::Lexer)
l.dotop = true
readchar(l)
return lex_equal(l)
elseif is_dottable_operator_start_char(pc)
l.dotop = true
return _next_token(l, readchar(l))
end
return emit(l, K".")
end
Expand Down
Loading