Skip to content

Commit

Permalink
expose findall for Vector{UInt8} (#45307)
Browse files Browse the repository at this point in the history
  • Loading branch information
Moelf authored May 27, 2022
1 parent 762561c commit 84e9989
Show file tree
Hide file tree
Showing 3 changed files with 75 additions and 58 deletions.
58 changes: 0 additions & 58 deletions base/regex.jl
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,6 @@ include("pcre.jl")
# Option bitmasks assembled by OR-ing together flags defined in the `PCRE` module.
# NOTE(review): presumably these are the defaults applied when a `Regex` is
# compiled and matched, respectively — confirm against the `Regex` constructor.
const DEFAULT_COMPILER_OPTS = PCRE.UTF | PCRE.MATCH_INVALID_UTF | PCRE.ALT_BSUX | PCRE.UCP
const DEFAULT_MATCH_OPTS = PCRE.NO_UTF_CHECK

"""
    AbstractPattern

An abstract type representing any sort of pattern matching expression
(typically a regular expression). `AbstractPattern` objects can be used to
match strings with [`match`](@ref).

!!! compat "Julia 1.6"
    This type is available in Julia 1.6 and later.
"""
abstract type AbstractPattern end

"""
Regex(pattern[, flags])
Expand Down Expand Up @@ -438,54 +428,6 @@ findnext(r::Regex, s::AbstractString, idx::Integer) = throw(ArgumentError(
# Find the first match of `r` in `s` by starting the `findnext` search at the
# first index of `s`.
function findfirst(r::Regex, s::AbstractString)
    start = firstindex(s)
    return findnext(r, s, start)
end


"""
    findall(
        pattern::Union{AbstractString,AbstractPattern},
        string::AbstractString;
        overlap::Bool = false,
    )

Collect every match of `pattern` in `string` and return them as a
`Vector{UnitRange{Int}}`. Each entry is an index range of the same form that
[`findnext`](@ref) returns for a single match.

With `overlap=true` successive matches may share indices of `string`; with the
default `overlap=false` the search resumes past the end of the previous match,
so the returned ranges cover disjoint character ranges.

# Examples
```jldoctest
julia> findall("a", "apple")
1-element Vector{UnitRange{Int64}}:
 1:1

julia> findall("nana", "banana")
1-element Vector{UnitRange{Int64}}:
 3:6

julia> findall("a", "banana")
3-element Vector{UnitRange{Int64}}:
 2:2
 4:4
 6:6
```

!!! compat "Julia 1.3"
    This method requires at least Julia 1.3.
"""
function findall(t::Union{AbstractString,AbstractPattern}, s::AbstractString; overlap::Bool=false)
    matches = UnitRange{Int}[]
    stop = lastindex(s)
    pos = firstindex(s)
    hit = findnext(t, s, pos)
    while hit !== nothing
        push!(matches, hit)
        # Resume from the start of the match when overlaps are allowed (or the
        # match is empty, which has no usable last index); otherwise from its end.
        anchor = (overlap || isempty(hit)) ? first(hit) : last(hit)
        anchor > stop && break
        pos = @inbounds nextind(s, anchor)
        hit = findnext(t, s, pos)
    end
    return matches
end

"""
findall(c::AbstractChar, s::AbstractString)
Expand Down
71 changes: 71 additions & 0 deletions base/strings/search.jl
Original file line number Diff line number Diff line change
@@ -1,5 +1,15 @@
# This file is a part of Julia. License is MIT: https://julialang.org/license

"""
    AbstractPattern

An abstract type representing any sort of pattern matching expression
(typically a regular expression). `AbstractPattern` objects can be used to
match strings with [`match`](@ref).

!!! compat "Julia 1.6"
    This type is available in Julia 1.6 and later.
"""
abstract type AbstractPattern end

# Map the sentinel value `0` to `nothing`; every other value is returned unchanged.
function nothing_sentinel(i)
    i == 0 && return nothing
    return i
end

function findnext(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:AbstractChar},
Expand Down Expand Up @@ -406,6 +416,67 @@ true
"""
function findlast(ch::AbstractChar, string::AbstractString)
    # Delegate to the predicate-based method, testing each character for
    # equality with `ch`.
    return findlast(==(ch), string)
end

# NOTE(review): the compat note below was written for the string/pattern method;
# the byte-vector method is newer — add a matching compat admonition once the
# release that first ships it is confirmed.
"""
    findall(
        pattern::Union{AbstractString,AbstractPattern},
        string::AbstractString;
        overlap::Bool = false,
    )
    findall(
        pattern::AbstractVector{<:Union{Int8,UInt8}},
        A::AbstractVector{<:Union{Int8,UInt8}};
        overlap::Bool = false,
    )

Return a `Vector{UnitRange{Int}}` of all the matches for `pattern` in `string`
(or of all occurrences of the byte sequence `pattern` in the byte vector `A`).
Each element of the returned vector is a range of indices where the
matching sequence is found, like the return value of [`findnext`](@ref).

If `overlap=true`, the matching sequences are allowed to overlap indices in the
original string or vector, otherwise they must be from disjoint index ranges.

# Examples
```jldoctest
julia> findall("a", "apple")
1-element Vector{UnitRange{Int64}}:
 1:1

julia> findall("nana", "banana")
1-element Vector{UnitRange{Int64}}:
 3:6

julia> findall("a", "banana")
3-element Vector{UnitRange{Int64}}:
 2:2
 4:4
 6:6

julia> findall(UInt8[1,2], UInt8[1,2,3,1,2])
2-element Vector{UnitRange{Int64}}:
 1:2
 4:5
```

!!! compat "Julia 1.3"
    The string and pattern method requires at least Julia 1.3.
"""
function findall(
    t::Union{AbstractString, AbstractPattern, AbstractVector{<:Union{Int8,UInt8}}},
    s::Union{AbstractString, AbstractPattern, AbstractVector{<:Union{Int8,UInt8}}};
    overlap::Bool=false
)
    found = UnitRange{Int}[]
    i, e = firstindex(s), lastindex(s)
    while true
        r = findnext(t, s, i)
        isnothing(r) && break
        push!(found, r)
        # Resume from the start of the match when overlaps are allowed (or the
        # match is empty, so it has no usable last index); otherwise from its end.
        j = (overlap || isempty(r)) ? first(r) : last(r)
        # Nothing left to search once the resume point is past the last index.
        j > e && break
        @inbounds i = nextind(s, j)
    end
    return found
end

# AbstractString implementation of the generic findprev interface
function findprev(testf::Function, s::AbstractString, i::Integer)
i = Int(i)
Expand Down
4 changes: 4 additions & 0 deletions test/regex.jl
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,10 @@
# Character search returns the byte indices of each occurrence; the multi-byte
# characters in this string put the two arrows at indices 17 and 35.
@test findall('→', "OH⁻ + H₃CBr →  HOH₃CBr⁻ → HOCH₃ + Br⁻") == [17, 35]
@test findall('a', "") == Int[]
@test findall('c', "batman") == Int[]
# Byte-vector needles in byte-vector haystacks, without and with overlap.
@test findall([0x52, 0x62], [0x40, 0x52, 0x62, 0x63]) == [2:3]
@test findall([0x52, 0x62], [0x40, 0x52, 0x62, 0x63, 0x52, 0x62]) == [2:3, 5:6]
@test findall([0x01, 0x01], [0x01, 0x01, 0x01, 0x01]) == [1:2, 3:4]
@test findall([0x01, 0x01], [0x01, 0x01, 0x01, 0x01]; overlap=true) == [1:2, 2:3, 3:4]

# count
@test count(r"\w+", "foo bar") == 2
Expand Down

0 comments on commit 84e9989

Please sign in to comment.