From 84e9989bee4ca9dce57ebe7b2a6d4e074c55b3b3 Mon Sep 17 00:00:00 2001 From: Jerry Ling Date: Thu, 26 May 2022 23:11:15 -0500 Subject: [PATCH] expose `findall` for `Vector{UInt8}` (#45307) --- base/regex.jl | 58 ---------------------------------- base/strings/search.jl | 71 ++++++++++++++++++++++++++++++++++++++++++ test/regex.jl | 4 +++ 3 files changed, 75 insertions(+), 58 deletions(-) diff --git a/base/regex.jl b/base/regex.jl index 7a69ecbf7cdbd..27e0391f8a6c8 100644 --- a/base/regex.jl +++ b/base/regex.jl @@ -7,16 +7,6 @@ include("pcre.jl") const DEFAULT_COMPILER_OPTS = PCRE.UTF | PCRE.MATCH_INVALID_UTF | PCRE.ALT_BSUX | PCRE.UCP const DEFAULT_MATCH_OPTS = PCRE.NO_UTF_CHECK -""" -An abstract type representing any sort of pattern matching expression -(typically a regular expression). `AbstractPattern` objects can be used to -match strings with [`match`](@ref). - -!!! compat "Julia 1.6" - This type is available in Julia 1.6 and later. -""" -abstract type AbstractPattern end - """ Regex(pattern[, flags]) @@ -438,54 +428,6 @@ findnext(r::Regex, s::AbstractString, idx::Integer) = throw(ArgumentError( findfirst(r::Regex, s::AbstractString) = findnext(r,s,firstindex(s)) -""" - findall( - pattern::Union{AbstractString,AbstractPattern}, - string::AbstractString; - overlap::Bool = false, - ) - -Return a `Vector{UnitRange{Int}}` of all the matches for `pattern` in `string`. -Each element of the returned vector is a range of indices where the -matching sequence is found, like the return value of [`findnext`](@ref). - -If `overlap=true`, the matching sequences are allowed to overlap indices in the -original string, otherwise they must be from disjoint character ranges. - -# Examples -```jldoctest -julia> findall("a", "apple") -1-element Vector{UnitRange{Int64}}: - 1:1 - -julia> findall("nana", "banana") -1-element Vector{UnitRange{Int64}}: - 3:6 - -julia> findall("a", "banana") -3-element Vector{UnitRange{Int64}}: - 2:2 - 4:4 - 6:6 -``` - -!!! 
compat "Julia 1.3" - This method requires at least Julia 1.3. -""" -function findall(t::Union{AbstractString,AbstractPattern}, s::AbstractString; overlap::Bool=false) - found = UnitRange{Int}[] - i, e = firstindex(s), lastindex(s) - while true - r = findnext(t, s, i) - isnothing(r) && break - push!(found, r) - j = overlap || isempty(r) ? first(r) : last(r) - j > e && break - @inbounds i = nextind(s, j) - end - return found -end - """ findall(c::AbstractChar, s::AbstractString) diff --git a/base/strings/search.jl b/base/strings/search.jl index 6423c01a162bc..eade1fbe74158 100644 --- a/base/strings/search.jl +++ b/base/strings/search.jl @@ -1,5 +1,15 @@ # This file is a part of Julia. License is MIT: https://julialang.org/license +""" +An abstract type representing any sort of pattern matching expression +(typically a regular expression). `AbstractPattern` objects can be used to +match strings with [`match`](@ref). + +!!! compat "Julia 1.6" + This type is available in Julia 1.6 and later. +""" +abstract type AbstractPattern end + nothing_sentinel(i) = i == 0 ? nothing : i function findnext(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:AbstractChar}, @@ -406,6 +416,67 @@ true """ findlast(ch::AbstractChar, string::AbstractString) = findlast(==(ch), string) +""" + findall( + pattern::Union{AbstractString,AbstractPattern}, + string::AbstractString; + overlap::Bool = false, + ) + findall( + pattern::Vector{UInt8}, + A::Vector{UInt8}; + overlap::Bool = false, + ) + +Return a `Vector{UnitRange{Int}}` of all the matches for `pattern` in `string`. +Each element of the returned vector is a range of indices where the +matching sequence is found, like the return value of [`findnext`](@ref). + +If `overlap=true`, the matching sequences are allowed to overlap indices in the +original string, otherwise they must be from disjoint character ranges. 
+ +# Examples +```jldoctest +julia> findall("a", "apple") +1-element Vector{UnitRange{Int64}}: + 1:1 + +julia> findall("nana", "banana") +1-element Vector{UnitRange{Int64}}: + 3:6 + +julia> findall("a", "banana") +3-element Vector{UnitRange{Int64}}: + 2:2 + 4:4 + 6:6 + +julia> findall(UInt8[1,2], UInt8[1,2,3,1,2]) +2-element Vector{UnitRange{Int64}}: + 1:2 + 4:5 +``` + +!!! compat "Julia 1.3" + This method requires at least Julia 1.3. +""" + +function findall(t::Union{AbstractString, AbstractPattern, AbstractVector{<:Union{Int8,UInt8}}}, + s::Union{AbstractString, AbstractPattern, AbstractVector{<:Union{Int8,UInt8}}}, + ; overlap::Bool=false) + found = UnitRange{Int}[] + i, e = firstindex(s), lastindex(s) + while true + r = findnext(t, s, i) + isnothing(r) && break + push!(found, r) + j = overlap || isempty(r) ? first(r) : last(r) + j > e && break + @inbounds i = nextind(s, j) + end + return found +end + # AbstractString implementation of the generic findprev interface function findprev(testf::Function, s::AbstractString, i::Integer) i = Int(i) diff --git a/test/regex.jl b/test/regex.jl index 0202dc4758e2f..1cc377d9cfdbf 100644 --- a/test/regex.jl +++ b/test/regex.jl @@ -69,6 +69,10 @@ @test findall('→', "OH⁻ + H₃CBr → HOH₃CBr⁻ → HOCH₃ + Br⁻") == [17, 35] @test findall('a', "") == Int[] @test findall('c', "batman") == Int[] + @test findall([0x52, 0x62], [0x40, 0x52, 0x62, 0x63]) == [2:3] + @test findall([0x52, 0x62], [0x40, 0x52, 0x62, 0x63, 0x52, 0x62]) == [2:3, 5:6] + @test findall([0x01, 0x01], [0x01, 0x01, 0x01, 0x01]) == [1:2, 3:4] + @test findall([0x01, 0x01], [0x01, 0x01, 0x01, 0x01]; overlap=true) == [1:2, 2:3, 3:4] # count @test count(r"\w+", "foo bar") == 2