Skip to content

Commit

Permalink
ensure the path regexes will accept all valid paths (#48686)
Browse files Browse the repository at this point in the history
Previously, we might try to interpret the arbitrary bytes in a path as
UTF-8, and `.` would not match \n, causing the regex match to fail or be
incomplete in some cases. But such bytes are valid in a path, so we want
PCRE2 to treat them as transparent bytes. Accordingly, change r""a to
specify all flags needed to interpret the values simply as ASCII.

Note, this would be breaking if someone was previously trying to match a
Unicode character by `\u` while also disabling UCP matching of \w and
\s, but that seems an oddly specific combination to need.

    julia> match(r"\u03b1"a, "α")
    ERROR: PCRE compilation error: character code point value in \u.... sequence is too large at offset 6

(This would have previously worked.) Note, however, that explicitly
starting the regex with (*UTF) or using a literal α in the regex would
continue to work as before.

Note that `s` (DOTALL) is a more efficient matcher (if the pattern
contains `.`), as is `a`, so it is often preferable to set both when in
doubt: http://man.he.net/man3/pcre2perform

Refs: #48648
  • Loading branch information
vtjnash authored Feb 17, 2023
1 parent cbbfc68 commit 892cd4f
Show file tree
Hide file tree
Showing 11 changed files with 91 additions and 52 deletions.
10 changes: 5 additions & 5 deletions base/binaryplatforms.jl
Original file line number Diff line number Diff line change
Expand Up @@ -741,10 +741,10 @@ function Base.parse(::Type{Platform}, triplet::String; validate_strict::Bool = f
end
os_version = nothing
if os == "macos"
os_version = extract_os_version("macos", r".*darwin([\d\.]+)")
os_version = extract_os_version("macos", r".*darwin([\d\.]+)"sa)
end
if os == "freebsd"
os_version = extract_os_version("freebsd", r".*freebsd([\d.]+)")
os_version = extract_os_version("freebsd", r".*freebsd([\d.]+)"sa)
end
tags["os_version"] = os_version

Expand Down Expand Up @@ -798,13 +798,13 @@ function parse_dl_name_version(path::String, os::String)
local dlregex
if os == "windows"
# On Windows, libraries look like `libnettle-6.dll`
dlregex = r"^(.*?)(?:-((?:[\.\d]+)*))?\.dll$"
dlregex = r"^(.*?)(?:-((?:[\.\d]+)*))?\.dll$"sa
elseif os == "macos"
# On OSX, libraries look like `libnettle.6.3.dylib`
dlregex = r"^(.*?)((?:\.[\d]+)*)\.dylib$"
dlregex = r"^(.*?)((?:\.[\d]+)*)\.dylib$"sa
else
# On Linux and FreeBSD, libraries look like `libnettle.so.6.3.0`
dlregex = r"^(.*?)\.so((?:\.[\d]+)*)$"
dlregex = r"^(.*?)\.so((?:\.[\d]+)*)$"sa
end

m = match(dlregex, basename(path))
Expand Down
2 changes: 1 addition & 1 deletion base/compiler/ssair/show.jl
Original file line number Diff line number Diff line change
Expand Up @@ -796,7 +796,7 @@ function inline_linfo_printer(code::IRCode)
end
end

_strip_color(s::String) = replace(s, r"\e\[\d+m" => "")
_strip_color(s::String) = replace(s, r"\e\[\d+m"a => "")

function statementidx_lineinfo_printer(f, code::IRCode)
printer = f(code.linetable)
Expand Down
2 changes: 1 addition & 1 deletion base/deprecated.jl
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ arguments of type `Any`.
To restrict deprecation to a specific signature, annotate the
arguments of `old`. For example,
```jldoctest; filter = r"@ .*"
```jldoctest; filter = r"@ .*"a
julia> new(x::Int) = x;
julia> new(x::Float64) = 2x;
Expand Down
2 changes: 1 addition & 1 deletion base/libc.jl
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,7 @@ function strptime(fmt::AbstractString, timestr::AbstractString)
@static if Sys.isapple()
# if we didn't explicitly parse the weekday or year day, use mktime
# to fill them in automatically.
if !occursin(r"([^%]|^)%(a|A|j|w|Ow)", fmt)
if !occursin(r"([^%]|^)%(a|A|j|w|Ow)"a, fmt)
ccall(:mktime, Int, (Ref{TmStruct},), tm)
end
end
Expand Down
4 changes: 2 additions & 2 deletions base/methodshow.jl
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ function strip_gensym(sym)
if sym === :var"#self#" || sym === :var"#unused#"
return empty_sym
end
return Symbol(replace(String(sym), r"^(.*)#(.*#)?\d+$" => s"\1"))
return Symbol(replace(String(sym), r"^(.*)#(.*#)?\d+$"sa => s"\1"))
end

function argtype_decl(env, n, @nospecialize(sig::DataType), i::Int, nargs, isva::Bool) # -> (argname, argtype)
Expand Down Expand Up @@ -364,7 +364,7 @@ function url(m::Method)
(m.file === :null || m.file === :string) && return ""
file = string(m.file)
line = m.line
line <= 0 || occursin(r"In\[[0-9]+\]", file) && return ""
line <= 0 || occursin(r"In\[[0-9]+\]"a, file) && return ""
Sys.iswindows() && (file = replace(file, '\\' => '/'))
libgit2_id = PkgId(UUID((0x76f85450_5226_5b5a,0x8eaa_529ad045b433)), "LibGit2")
if inbase(M)
Expand Down
20 changes: 10 additions & 10 deletions base/path.jl
Original file line number Diff line number Diff line change
Expand Up @@ -20,22 +20,22 @@ export

if Sys.isunix()
const path_separator = "/"
const path_separator_re = r"/+"
const path_directory_re = r"(?:^|/)\.{0,2}$"
const path_dir_splitter = r"^(.*?)(/+)([^/]*)$"
const path_ext_splitter = r"^((?:.*/)?(?:\.|[^/\.])[^/]*?)(\.[^/\.]*|)$"
const path_separator_re = r"/+"sa
const path_directory_re = r"(?:^|/)\.{0,2}$"sa
const path_dir_splitter = r"^(.*?)(/+)([^/]*)$"sa
const path_ext_splitter = r"^((?:.*/)?(?:\.|[^/\.])[^/]*?)(\.[^/\.]*|)$"sa

splitdrive(path::String) = ("",path)
elseif Sys.iswindows()
const path_separator = "\\"
const path_separator_re = r"[/\\]+"
const path_absolute_re = r"^(?:[A-Za-z]+:)?[/\\]"
const path_directory_re = r"(?:^|[/\\])\.{0,2}$"
const path_dir_splitter = r"^(.*?)([/\\]+)([^/\\]*)$"
const path_ext_splitter = r"^((?:.*[/\\])?(?:\.|[^/\\\.])[^/\\]*?)(\.[^/\\\.]*|)$"
const path_separator_re = r"[/\\]+"sa
const path_absolute_re = r"^(?:[A-Za-z]+:)?[/\\]"sa
const path_directory_re = r"(?:^|[/\\])\.{0,2}$"sa
const path_dir_splitter = r"^(.*?)([/\\]+)([^/\\]*)$"sa
const path_ext_splitter = r"^((?:.*[/\\])?(?:\.|[^/\\\.])[^/\\]*?)(\.[^/\\\.]*|)$"sa

function splitdrive(path::String)
m = match(r"^([^\\]+:|\\\\[^\\]+\\[^\\]+|\\\\\?\\UNC\\[^\\]+\\[^\\]+|\\\\\?\\[^\\]+:|)(.*)$"s, path)::AbstractMatch
m = match(r"^([^\\]+:|\\\\[^\\]+\\[^\\]+|\\\\\?\\UNC\\[^\\]+\\[^\\]+|\\\\\?\\[^\\]+:|)(.*)$"sa, path)::AbstractMatch
String(something(m.captures[1])), String(something(m.captures[2]))
end
else
Expand Down
66 changes: 46 additions & 20 deletions base/regex.jl
Original file line number Diff line number Diff line change
Expand Up @@ -46,19 +46,24 @@ mutable struct Regex <: AbstractPattern
end

function Regex(pattern::AbstractString, flags::AbstractString)
options = DEFAULT_COMPILER_OPTS
compile_options = DEFAULT_COMPILER_OPTS
match_options = DEFAULT_MATCH_OPTS
for f in flags
if f == 'a'
options &= ~PCRE.UCP
# instruct pcre2 to treat the strings as simple bytes (aka "ASCII"), not char encodings
compile_options &= ~PCRE.UCP # user can re-enable with (*UCP)
compile_options &= ~PCRE.UTF # user can re-enable with (*UTF)
compile_options &= ~PCRE.MATCH_INVALID_UTF # this would force on UTF
match_options &= ~PCRE.NO_UTF_CHECK # if the user did force on UTF, we should check it for safety
else
options |= f=='i' ? PCRE.CASELESS :
f=='m' ? PCRE.MULTILINE :
f=='s' ? PCRE.DOTALL :
f=='x' ? PCRE.EXTENDED :
throw(ArgumentError("unknown regex flag: $f"))
compile_options |= f=='i' ? PCRE.CASELESS :
f=='m' ? PCRE.MULTILINE :
f=='s' ? PCRE.DOTALL :
f=='x' ? PCRE.EXTENDED :
throw(ArgumentError("unknown regex flag: $f"))
end
end
Regex(pattern, options, DEFAULT_MATCH_OPTS)
Regex(pattern, compile_options, match_options)
end
Regex(pattern::AbstractString) = Regex(pattern, DEFAULT_COMPILER_OPTS, DEFAULT_MATCH_OPTS)

Expand Down Expand Up @@ -96,9 +101,15 @@ listed after the ending quote, to change its behaviour:
- `s` allows the `.` modifier to match newlines.
- `x` enables "comment mode": whitespace is ignored except when escaped with `\\`, and `#`
is treated as starting a comment.
- `a` disables `UCP` mode (enables ASCII mode). By default `\\B`, `\\b`, `\\D`, `\\d`, `\\S`,
`\\s`, `\\W`, `\\w`, etc. match based on Unicode character properties. With this option,
these sequences only match ASCII characters.
- `a` enables ASCII mode (disables `UTF` and `UCP` modes). By default `\\B`, `\\b`, `\\D`,
`\\d`, `\\S`, `\\s`, `\\W`, `\\w`, etc. match based on Unicode character properties. With
this option, these sequences only match ASCII characters. This includes `\\u` also, which
will emit the specified character value directly as a single byte, and not attempt to
encode it into UTF-8. Importantly, this option allows matching against invalid UTF-8
strings, by treating both matcher and target as simple bytes (as if they were ISO/IEC
8859-1 / Latin-1 bytes) instead of as character encodings. In this case, this option is
often combined with `s`. This option can be further refined by starting the pattern with
`(*UCP)` or `(*UTF)`.
See [`Regex`](@ref) if interpolation is needed.
Expand All @@ -112,23 +123,38 @@ This regex has the first three flags enabled.
macro r_str(pattern, flags...) Regex(pattern, flags...) end

function show(io::IO, re::Regex)
imsxa = PCRE.CASELESS|PCRE.MULTILINE|PCRE.DOTALL|PCRE.EXTENDED|PCRE.UCP
imsx = PCRE.CASELESS|PCRE.MULTILINE|PCRE.DOTALL|PCRE.EXTENDED
ac = PCRE.UTF|PCRE.MATCH_INVALID_UTF|PCRE.UCP
am = PCRE.NO_UTF_CHECK
opts = re.compile_options
if (opts & ~imsxa) == (DEFAULT_COMPILER_OPTS & ~imsxa)
mopts = re.match_options
default = ((opts & ~imsx) | ac) == DEFAULT_COMPILER_OPTS
if default
if (opts & ac) == ac
default = mopts == DEFAULT_MATCH_OPTS
elseif (opts & ac) == 0
default = mopts == (DEFAULT_MATCH_OPTS & ~am)
else
default = false
end
end
if default
print(io, "r\"")
escape_raw_string(io, re.pattern)
print(io, "\"")
if (opts & PCRE.CASELESS ) != 0; print(io, 'i'); end
if (opts & PCRE.MULTILINE) != 0; print(io, 'm'); end
if (opts & PCRE.DOTALL ) != 0; print(io, 's'); end
if (opts & PCRE.EXTENDED ) != 0; print(io, 'x'); end
if (opts & PCRE.UCP ) == 0; print(io, 'a'); end
if (opts & PCRE.CASELESS ) != 0; print(io, "i"); end
if (opts & PCRE.MULTILINE) != 0; print(io, "m"); end
if (opts & PCRE.DOTALL ) != 0; print(io, "s"); end
if (opts & PCRE.EXTENDED ) != 0; print(io, "x"); end
if (opts & ac ) == 0; print(io, "a"); end
else
print(io, "Regex(")
show(io, re.pattern)
print(io, ',')
print(io, ", ")
show(io, opts)
print(io, ')')
print(io, ", ")
show(io, mopts)
print(io, ")")
end
end

Expand Down
4 changes: 2 additions & 2 deletions base/set.jl
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ See also: [`AbstractSet`](@ref), [`BitSet`](@ref), [`Dict`](@ref),
[`push!`](@ref), [`empty!`](@ref), [`union!`](@ref), [`in`](@ref), [`isequal`](@ref)
# Examples
```jldoctest filter = r"^\\S.+"
```jldoctest; filter = r"^ '.'"ma
julia> s = Set("aaBca")
Set{Char} with 3 elements:
'a'
Expand All @@ -23,9 +23,9 @@ Set{Char} with 3 elements:
julia> push!(s, 'b')
Set{Char} with 4 elements:
'a'
'c'
'b'
'B'
'c'
julia> s = Set([NaN, 0.0, 1.0, 2.0]);
Expand Down
10 changes: 5 additions & 5 deletions base/shell.jl
Original file line number Diff line number Diff line change
Expand Up @@ -292,9 +292,9 @@ function shell_escape_csh(io::IO, args::AbstractString...)
first = false
i = 1
while true
for (r,e) = (r"^[A-Za-z0-9/\._-]+\z" => "",
r"^[^']*\z" => "'", r"^[^\$\`\"]*\z" => "\"",
r"^[^']+" => "'", r"^[^\$\`\"]+" => "\"")
for (r,e) = (r"^[A-Za-z0-9/\._-]+\z"sa => "",
r"^[^']*\z"sa => "'", r"^[^\$\`\"]*\z"sa => "\"",
r"^[^']+"sa => "'", r"^[^\$\`\"]+"sa => "\"")
if ((m = match(r, SubString(arg, i))) !== nothing)
write(io, e)
write(io, replace(m.match, '\n' => "\\\n"))
Expand Down Expand Up @@ -391,7 +391,7 @@ julia> Base.shell_escape_wincmd("a^\\"^o\\"^u\\"")
"""
function shell_escape_wincmd(io::IO, s::AbstractString)
# https://stackoverflow.com/a/4095133/1990689
occursin(r"[\r\n\0]", s) &&
occursin(r"[\r\n\0]"sa, s) &&
throw(ArgumentError("control character unsupported by CMD.EXE"))
i = 1
len = ncodeunits(s)
Expand Down Expand Up @@ -446,7 +446,7 @@ function escape_microsoft_c_args(io::IO, args::AbstractString...)
else
write(io, ' ') # separator
end
if isempty(arg) || occursin(r"[ \t\"]", arg)
if isempty(arg) || occursin(r"[ \t\"]"sa, arg)
# Julia raw strings happen to use the same escaping convention
# as the argv[] parser in Microsoft's C runtime library.
write(io, '"')
Expand Down
3 changes: 3 additions & 0 deletions test/path.jl
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,9 @@
@test string(splitdrive(S(homedir()))...) == homedir()
@test splitdrive("a\nb") == ("", "a\nb")

@test splitdir("a/\xfe/\n/b/c.ext") == ("a/\xfe/\n/b", "c.ext")
@test splitext("a/\xfe/\n/b/c.ext") == ("a/\xfe/\n/b/c", ".ext")

if Sys.iswindows()
@test splitdrive(S("\\\\servername\\hello.world\\filename.ext")) ==
("\\\\servername\\hello.world","\\filename.ext")
Expand Down
20 changes: 15 additions & 5 deletions test/regex.jl
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,11 @@
@test repr(r"\\\"") == raw"r\"\\\\\\\"\""
@test repr(s"\\\"\\") == raw"s\"\\\\\\\"\\\\\""

@test repr(r""a) == "r\"\"a"
@test repr(r""imsxa) == "r\"\"imsxa"
@test repr(Regex("", Base.DEFAULT_COMPILER_OPTS, UInt32(0))) == """Regex("", $(repr(Base.DEFAULT_COMPILER_OPTS)), $(repr(UInt32(0))))"""
@test repr(Regex("", UInt32(0), Base.DEFAULT_MATCH_OPTS)) == """Regex("", $(repr(UInt32(0))), $(repr(Base.DEFAULT_MATCH_OPTS)))"""

# findall
@test findall(r"\w+", "foo bar") == [1:3, 5:7]
@test findall(r"\w+", "foo bar", overlap=true) == [1:3, 2:3, 3:3, 5:7, 6:7, 7:7]
Expand Down Expand Up @@ -122,18 +127,24 @@

# Backcapture reference in substitution string
@test replace("abcde", r"(..)(?P<byname>d)" => s"\g<byname>xy\\\1") == "adxy\\bce"
@test_throws ErrorException replace("a", r"(?P<x>)" => s"\g<y>")
@test_throws(ErrorException("Bad replacement string: Group y not found in regex r\"(?P<x>)\""),
replace("a", r"(?P<x>)" => s"\g<y>"))
# test replace with invalid substitution group pattern
@test_throws ErrorException replace("s", r"(?<g1>.)" => s"\gg1>")
@test_throws(ErrorException("Bad replacement string: \\gg1>"),
replace("s", r"(?<g1>.)" => s"\gg1>"))
# test replace with 2-digit substitution group
@test replace(("0" ^ 9) * "1", Regex(("(0)" ^ 9) * "(1)") => s"10th group: \10") == "10th group: 1"

# Proper unicode handling
@test match(r"∀∀", "∀x∀∀∀").match == "∀∀"

# 'a' flag to disable UCP
# 'a' flag to disable UCP and UTF
@test match(r"\w+", "Düsseldorf").match == "Düsseldorf"
@test match(r"\w+"a, "Düsseldorf").match == "D"
@test match(r".+"a, "Düsseldorf").match == "Düsseldorf"
@test match(r".+"a, "\xefsseldorf").match == "\xefsseldorf"
@test_throws(ErrorException("PCRE.exec error: $(Base.PCRE.err_message(Base.PCRE.ERROR_UTF8_ERR6))"),
match(r"(*UTF).+"a, "\xefsseldorf"))

# Regex behaves like a scalar in broadcasting
@test occursin.(r"Hello", ["Hello", "World"]) == [true, false]
Expand Down Expand Up @@ -211,8 +222,7 @@
end

# Test that PCRE throws the correct kind of error
# TODO: Uncomment this once the corresponding change has propagated to CI
#@test_throws ErrorException Base.PCRE.info(C_NULL, Base.PCRE.INFO_NAMECOUNT, UInt32)
@test_throws ErrorException("PCRE error: NULL regex object") Base.PCRE.info(C_NULL, Base.PCRE.INFO_NAMECOUNT, UInt32)

# test that we can get the error message of negative error codes
@test Base.PCRE.err_message(Base.PCRE.ERROR_NOMEMORY) isa String
Expand Down

0 comments on commit 892cd4f

Please sign in to comment.