diff --git a/base/binaryplatforms.jl b/base/binaryplatforms.jl index eb4bcfd8c76fc..fb9feba41c636 100644 --- a/base/binaryplatforms.jl +++ b/base/binaryplatforms.jl @@ -741,10 +741,10 @@ function Base.parse(::Type{Platform}, triplet::String; validate_strict::Bool = f end os_version = nothing if os == "macos" - os_version = extract_os_version("macos", r".*darwin([\d\.]+)") + os_version = extract_os_version("macos", r".*darwin([\d\.]+)"sa) end if os == "freebsd" - os_version = extract_os_version("freebsd", r".*freebsd([\d.]+)") + os_version = extract_os_version("freebsd", r".*freebsd([\d.]+)"sa) end tags["os_version"] = os_version @@ -798,13 +798,13 @@ function parse_dl_name_version(path::String, os::String) local dlregex if os == "windows" # On Windows, libraries look like `libnettle-6.dll` - dlregex = r"^(.*?)(?:-((?:[\.\d]+)*))?\.dll$" + dlregex = r"^(.*?)(?:-((?:[\.\d]+)*))?\.dll$"sa elseif os == "macos" # On OSX, libraries look like `libnettle.6.3.dylib` - dlregex = r"^(.*?)((?:\.[\d]+)*)\.dylib$" + dlregex = r"^(.*?)((?:\.[\d]+)*)\.dylib$"sa else # On Linux and FreeBSD, libraries look like `libnettle.so.6.3.0` - dlregex = r"^(.*?)\.so((?:\.[\d]+)*)$" + dlregex = r"^(.*?)\.so((?:\.[\d]+)*)$"sa end m = match(dlregex, basename(path)) diff --git a/base/compiler/ssair/show.jl b/base/compiler/ssair/show.jl index f4d240f423e89..0d17746c6d928 100644 --- a/base/compiler/ssair/show.jl +++ b/base/compiler/ssair/show.jl @@ -796,7 +796,7 @@ function inline_linfo_printer(code::IRCode) end end -_strip_color(s::String) = replace(s, r"\e\[\d+m" => "") +_strip_color(s::String) = replace(s, r"\e\[\d+m"a => "") function statementidx_lineinfo_printer(f, code::IRCode) printer = f(code.linetable) diff --git a/base/deprecated.jl b/base/deprecated.jl index 6d1e4283c814d..1b661716cc2d9 100644 --- a/base/deprecated.jl +++ b/base/deprecated.jl @@ -48,7 +48,7 @@ arguments of type `Any`. To restrict deprecation to a specific signature, annotate the arguments of `old`. 
For example, -```jldoctest; filter = r"@ .*" +```jldoctest; filter = r"@ .*"a julia> new(x::Int) = x; julia> new(x::Float64) = 2x; diff --git a/base/libc.jl b/base/libc.jl index 0a542ecbd1a82..5b508e00bf3e0 100644 --- a/base/libc.jl +++ b/base/libc.jl @@ -225,7 +225,7 @@ function strptime(fmt::AbstractString, timestr::AbstractString) @static if Sys.isapple() # if we didn't explicitly parse the weekday or year day, use mktime # to fill them in automatically. - if !occursin(r"([^%]|^)%(a|A|j|w|Ow)", fmt) + if !occursin(r"([^%]|^)%(a|A|j|w|Ow)"a, fmt) ccall(:mktime, Int, (Ref{TmStruct},), tm) end end diff --git a/base/methodshow.jl b/base/methodshow.jl index d3a40db665d1c..a45b89c6ccf63 100644 --- a/base/methodshow.jl +++ b/base/methodshow.jl @@ -7,7 +7,7 @@ function strip_gensym(sym) if sym === :var"#self#" || sym === :var"#unused#" return empty_sym end - return Symbol(replace(String(sym), r"^(.*)#(.*#)?\d+$" => s"\1")) + return Symbol(replace(String(sym), r"^(.*)#(.*#)?\d+$"sa => s"\1")) end function argtype_decl(env, n, @nospecialize(sig::DataType), i::Int, nargs, isva::Bool) # -> (argname, argtype) @@ -364,7 +364,7 @@ function url(m::Method) (m.file === :null || m.file === :string) && return "" file = string(m.file) line = m.line - line <= 0 || occursin(r"In\[[0-9]+\]", file) && return "" + line <= 0 || occursin(r"In\[[0-9]+\]"a, file) && return "" Sys.iswindows() && (file = replace(file, '\\' => '/')) libgit2_id = PkgId(UUID((0x76f85450_5226_5b5a,0x8eaa_529ad045b433)), "LibGit2") if inbase(M) diff --git a/base/path.jl b/base/path.jl index 1fac47432cda3..c439a2800acce 100644 --- a/base/path.jl +++ b/base/path.jl @@ -20,22 +20,22 @@ export if Sys.isunix() const path_separator = "/" - const path_separator_re = r"/+" - const path_directory_re = r"(?:^|/)\.{0,2}$" - const path_dir_splitter = r"^(.*?)(/+)([^/]*)$" - const path_ext_splitter = r"^((?:.*/)?(?:\.|[^/\.])[^/]*?)(\.[^/\.]*|)$" + const path_separator_re = r"/+"sa + const path_directory_re = 
r"(?:^|/)\.{0,2}$"sa + const path_dir_splitter = r"^(.*?)(/+)([^/]*)$"sa + const path_ext_splitter = r"^((?:.*/)?(?:\.|[^/\.])[^/]*?)(\.[^/\.]*|)$"sa splitdrive(path::String) = ("",path) elseif Sys.iswindows() const path_separator = "\\" - const path_separator_re = r"[/\\]+" - const path_absolute_re = r"^(?:[A-Za-z]+:)?[/\\]" - const path_directory_re = r"(?:^|[/\\])\.{0,2}$" - const path_dir_splitter = r"^(.*?)([/\\]+)([^/\\]*)$" - const path_ext_splitter = r"^((?:.*[/\\])?(?:\.|[^/\\\.])[^/\\]*?)(\.[^/\\\.]*|)$" + const path_separator_re = r"[/\\]+"sa + const path_absolute_re = r"^(?:[A-Za-z]+:)?[/\\]"sa + const path_directory_re = r"(?:^|[/\\])\.{0,2}$"sa + const path_dir_splitter = r"^(.*?)([/\\]+)([^/\\]*)$"sa + const path_ext_splitter = r"^((?:.*[/\\])?(?:\.|[^/\\\.])[^/\\]*?)(\.[^/\\\.]*|)$"sa function splitdrive(path::String) - m = match(r"^([^\\]+:|\\\\[^\\]+\\[^\\]+|\\\\\?\\UNC\\[^\\]+\\[^\\]+|\\\\\?\\[^\\]+:|)(.*)$"s, path)::AbstractMatch + m = match(r"^([^\\]+:|\\\\[^\\]+\\[^\\]+|\\\\\?\\UNC\\[^\\]+\\[^\\]+|\\\\\?\\[^\\]+:|)(.*)$"sa, path)::AbstractMatch String(something(m.captures[1])), String(something(m.captures[2])) end else diff --git a/base/regex.jl b/base/regex.jl index d1ef3c9d13d48..400784e1b27d7 100644 --- a/base/regex.jl +++ b/base/regex.jl @@ -46,19 +46,24 @@ mutable struct Regex <: AbstractPattern end function Regex(pattern::AbstractString, flags::AbstractString) - options = DEFAULT_COMPILER_OPTS + compile_options = DEFAULT_COMPILER_OPTS + match_options = DEFAULT_MATCH_OPTS for f in flags if f == 'a' - options &= ~PCRE.UCP + # instruct pcre2 to treat the strings as simple bytes (aka "ASCII"), not char encodings + compile_options &= ~PCRE.UCP # user can re-enable with (*UCP) + compile_options &= ~PCRE.UTF # user can re-enable with (*UTF) + compile_options &= ~PCRE.MATCH_INVALID_UTF # this would force on UTF + match_options &= ~PCRE.NO_UTF_CHECK # if the user did force on UTF, we should check it for safety else - options |= f=='i' ? 
PCRE.CASELESS : - f=='m' ? PCRE.MULTILINE : - f=='s' ? PCRE.DOTALL : - f=='x' ? PCRE.EXTENDED : - throw(ArgumentError("unknown regex flag: $f")) + compile_options |= f=='i' ? PCRE.CASELESS : + f=='m' ? PCRE.MULTILINE : + f=='s' ? PCRE.DOTALL : + f=='x' ? PCRE.EXTENDED : + throw(ArgumentError("unknown regex flag: $f")) end end - Regex(pattern, options, DEFAULT_MATCH_OPTS) + Regex(pattern, compile_options, match_options) end Regex(pattern::AbstractString) = Regex(pattern, DEFAULT_COMPILER_OPTS, DEFAULT_MATCH_OPTS) @@ -96,9 +101,15 @@ listed after the ending quote, to change its behaviour: - `s` allows the `.` modifier to match newlines. - `x` enables "comment mode": whitespace is enabled except when escaped with `\\`, and `#` is treated as starting a comment. -- `a` disables `UCP` mode (enables ASCII mode). By default `\\B`, `\\b`, `\\D`, `\\d`, `\\S`, - `\\s`, `\\W`, `\\w`, etc. match based on Unicode character properties. With this option, - these sequences only match ASCII characters. +- `a` enables ASCII mode (disables `UTF` and `UCP` modes). By default `\\B`, `\\b`, `\\D`, + `\\d`, `\\S`, `\\s`, `\\W`, `\\w`, etc. match based on Unicode character properties. With + this option, these sequences only match ASCII characters. This includes `\\u` also, which + will emit the specified character value directly as a single byte, and not attempt to + encode it into UTF-8. Importantly, this option allows matching against invalid UTF-8 + strings, by treating both matcher and target as simple bytes (as if they were ISO/IEC + 8859-1 / Latin-1 bytes) instead of as character encodings. In this case, this option is + often combined with `s`. This option can be further refined by starting the pattern with + (*UCP) or (*UTF). See [`Regex`](@ref) if interpolation is needed. @@ -112,23 +123,38 @@ This regex has the first three flags enabled. macro r_str(pattern, flags...) Regex(pattern, flags...) 
end function show(io::IO, re::Regex) - imsxa = PCRE.CASELESS|PCRE.MULTILINE|PCRE.DOTALL|PCRE.EXTENDED|PCRE.UCP + imsx = PCRE.CASELESS|PCRE.MULTILINE|PCRE.DOTALL|PCRE.EXTENDED + ac = PCRE.UTF|PCRE.MATCH_INVALID_UTF|PCRE.UCP + am = PCRE.NO_UTF_CHECK opts = re.compile_options - if (opts & ~imsxa) == (DEFAULT_COMPILER_OPTS & ~imsxa) + mopts = re.match_options + default = ((opts & ~imsx) | ac) == DEFAULT_COMPILER_OPTS + if default + if (opts & ac) == ac + default = mopts == DEFAULT_MATCH_OPTS + elseif (opts & ac) == 0 + default = mopts == (DEFAULT_MATCH_OPTS & ~am) + else + default = false + end + end + if default print(io, "r\"") escape_raw_string(io, re.pattern) print(io, "\"") - if (opts & PCRE.CASELESS ) != 0; print(io, 'i'); end - if (opts & PCRE.MULTILINE) != 0; print(io, 'm'); end - if (opts & PCRE.DOTALL ) != 0; print(io, 's'); end - if (opts & PCRE.EXTENDED ) != 0; print(io, 'x'); end - if (opts & PCRE.UCP ) == 0; print(io, 'a'); end + if (opts & PCRE.CASELESS ) != 0; print(io, "i"); end + if (opts & PCRE.MULTILINE) != 0; print(io, "m"); end + if (opts & PCRE.DOTALL ) != 0; print(io, "s"); end + if (opts & PCRE.EXTENDED ) != 0; print(io, "x"); end + if (opts & ac ) == 0; print(io, "a"); end else print(io, "Regex(") show(io, re.pattern) - print(io, ',') + print(io, ", ") show(io, opts) - print(io, ')') + print(io, ", ") + show(io, mopts) + print(io, ")") end end diff --git a/base/set.jl b/base/set.jl index 5be7eaf004352..a91bf328bd911 100644 --- a/base/set.jl +++ b/base/set.jl @@ -13,7 +13,7 @@ See also: [`AbstractSet`](@ref), [`BitSet`](@ref), [`Dict`](@ref), [`push!`](@ref), [`empty!`](@ref), [`union!`](@ref), [`in`](@ref), [`isequal`](@ref) # Examples -```jldoctest filter = r"^\\S.+" +```jldoctest; filter = r"^ '.'"ma julia> s = Set("aaBca") Set{Char} with 3 elements: 'a' @@ -23,9 +23,9 @@ Set{Char} with 3 elements: julia> push!(s, 'b') Set{Char} with 4 elements: 'a' - 'c' 'b' 'B' + 'c' julia> s = Set([NaN, 0.0, 1.0, 2.0]); diff --git a/base/shell.jl 
b/base/shell.jl index f443a1f9c094a..7c973ab289c7f 100644 --- a/base/shell.jl +++ b/base/shell.jl @@ -292,9 +292,9 @@ function shell_escape_csh(io::IO, args::AbstractString...) first = false i = 1 while true - for (r,e) = (r"^[A-Za-z0-9/\._-]+\z" => "", - r"^[^']*\z" => "'", r"^[^\$\`\"]*\z" => "\"", - r"^[^']+" => "'", r"^[^\$\`\"]+" => "\"") + for (r,e) = (r"^[A-Za-z0-9/\._-]+\z"sa => "", + r"^[^']*\z"sa => "'", r"^[^\$\`\"]*\z"sa => "\"", + r"^[^']+"sa => "'", r"^[^\$\`\"]+"sa => "\"") if ((m = match(r, SubString(arg, i))) !== nothing) write(io, e) write(io, replace(m.match, '\n' => "\\\n")) @@ -391,7 +391,7 @@ julia> Base.shell_escape_wincmd("a^\\"^o\\"^u\\"") """ function shell_escape_wincmd(io::IO, s::AbstractString) # https://stackoverflow.com/a/4095133/1990689 - occursin(r"[\r\n\0]", s) && + occursin(r"[\r\n\0]"sa, s) && throw(ArgumentError("control character unsupported by CMD.EXE")) i = 1 len = ncodeunits(s) @@ -446,7 +446,7 @@ function escape_microsoft_c_args(io::IO, args::AbstractString...) else write(io, ' ') # separator end - if isempty(arg) || occursin(r"[ \t\"]", arg) + if isempty(arg) || occursin(r"[ \t\"]"sa, arg) # Julia raw strings happen to use the same escaping convention # as the argv[] parser in Microsoft's C runtime library. write(io, '"') diff --git a/test/path.jl b/test/path.jl index 4a4caa6b0b115..2f4f2d0983a58 100644 --- a/test/path.jl +++ b/test/path.jl @@ -171,6 +171,9 @@ @test string(splitdrive(S(homedir()))...) 
== homedir() @test splitdrive("a\nb") == ("", "a\nb") + @test splitdir("a/\xfe/\n/b/c.ext") == ("a/\xfe/\n/b", "c.ext") + @test splitext("a/\xfe/\n/b/c.ext") == ("a/\xfe/\n/b/c", ".ext") + if Sys.iswindows() @test splitdrive(S("\\\\servername\\hello.world\\filename.ext")) == ("\\\\servername\\hello.world","\\filename.ext") diff --git a/test/regex.jl b/test/regex.jl index 70f620cad7141..e5f1428527512 100644 --- a/test/regex.jl +++ b/test/regex.jl @@ -59,6 +59,11 @@ @test repr(r"\\\"") == raw"r\"\\\\\\\"\"" @test repr(s"\\\"\\") == raw"s\"\\\\\\\"\\\\\"" + @test repr(r""a) == "r\"\"a" + @test repr(r""imsxa) == "r\"\"imsxa" + @test repr(Regex("", Base.DEFAULT_COMPILER_OPTS, UInt32(0))) == """Regex("", $(repr(Base.DEFAULT_COMPILER_OPTS)), $(repr(UInt32(0))))""" + @test repr(Regex("", UInt32(0), Base.DEFAULT_MATCH_OPTS)) == """Regex("", $(repr(UInt32(0))), $(repr(Base.DEFAULT_MATCH_OPTS)))""" + # findall @test findall(r"\w+", "foo bar") == [1:3, 5:7] @test findall(r"\w+", "foo bar", overlap=true) == [1:3, 2:3, 3:3, 5:7, 6:7, 7:7] @@ -122,18 +127,24 @@ # Backcapture reference in substitution string @test replace("abcde", r"(..)(?P<x>d)" => s"\g<x>xy\\\1") == "adxy\\bce" - @test_throws ErrorException replace("a", r"(?P<x>)" => s"\g<y>") + @test_throws(ErrorException("Bad replacement string: Group y not found in regex r\"(?P<x>)\""), + replace("a", r"(?P<x>)" => s"\g<y>")) # test replace with invalid substitution group pattern - @test_throws ErrorException replace("s", r"(?<g1>.)" => s"\gg1>") + @test_throws(ErrorException("Bad replacement string: \\gg1>"), + replace("s", r"(?<g1>.)" => s"\gg1>")) # test replace with 2-digit substitution group @test replace(("0" ^ 9) * "1", Regex(("(0)" ^ 9) * "(1)") => s"10th group: \10") == "10th group: 1" # Proper unicode handling @test match(r"∀∀", "∀x∀∀∀").match == "∀∀" # 'a' flag to disable UCP and UTF @test match(r"\w+", "Düsseldorf").match == "Düsseldorf" @test match(r"\w+"a, "Düsseldorf").match == "D" + @test match(r".+"a, 
"Düsseldorf").match == "Düsseldorf" + @test match(r".+"a, "Dü\xefsseldorf").match == "Dü\xefsseldorf" + @test_throws(ErrorException("PCRE.exec error: $(Base.PCRE.err_message(Base.PCRE.ERROR_UTF8_ERR6))"), + match(r"(*UTF).+"a, "Dü\xefsseldorf")) # Regex behaves like a scalar in broadcasting @test occursin.(r"Hello", ["Hello", "World"]) == [true, false] @@ -211,8 +222,7 @@ end # Test that PCRE throws the correct kind of error - # TODO: Uncomment this once the corresponding change has propagated to CI - #@test_throws ErrorException Base.PCRE.info(C_NULL, Base.PCRE.INFO_NAMECOUNT, UInt32) + @test_throws ErrorException("PCRE error: NULL regex object") Base.PCRE.info(C_NULL, Base.PCRE.INFO_NAMECOUNT, UInt32) # test that we can get the error message of negative error codes @test Base.PCRE.err_message(Base.PCRE.ERROR_NOMEMORY) isa String