Skip to content

Commit

Permalink
Refactor eachregion to be O(n log n) not O(n^2)
Browse files Browse the repository at this point in the history
Since we removed the ordering restriction on annotations to improve the
semantics of annotation modification, each `annotations(str)` call
became `O(n)`. That is fine for a one-off call, but calling it in a loop, as
`eachregion` does, makes the whole thing `O(n m)`. That's pretty underwhelming.

We can improve this to `O(n log n)` by pre-sorting the list of
annotations and working with the sorted list instead. A bit more complexity is
needed to do this while preserving the semantics, but it can be worth it
for long strings. With a 100,000 char string with 20,000 annotations,
print time goes from ~0.4s to ~0.015s on my machine.
  • Loading branch information
tecosaur committed Aug 2, 2024
1 parent f7af623 commit b74d396
Showing 1 changed file with 49 additions and 18 deletions.
67 changes: 49 additions & 18 deletions src/regioniterator.jl
Original file line number Diff line number Diff line change
Expand Up @@ -36,37 +36,68 @@ julia> collect(StyledStrings.eachregion(Base.AnnotatedString(
("there", [:face => :italic])
```
"""
# NOTE(review): this listing is a rendered diff of `eachregion` with the +/-
# markers and indentation stripped, so removed and added lines are interleaved
# and the text is not runnable as shown. The "removed"/"added"/"unchanged"
# markers below were reconstructed from the page's "49 additions and 18
# deletions" count and from the `region` -> `pos` parameter rename.
#
# Purpose (added version): split the range `pos` of the annotated string `s`
# into consecutive sub-ranges and collect, for each one, a list of annotations,
# returning them as `RegionIterator(s.string, regions, annots)`.
#
# -- removed --
function eachregion(s::AnnotatedString, region::UnitRange{Int}=firstindex(s):lastindex(s))
isempty(s) || isempty(region) &&
return RegionIterator(s, Vector{UnitRange{Int}}(), Vector{Vector{Pair{Symbol, Any}}}())
# -- added --
function eachregion(s::AnnotatedString, pos::UnitRange{Int}=firstindex(s):lastindex(s))
# NOTE(review): `&&` binds tighter than `||`, so this parses as
# `isempty(s) || (isempty(pos) && return ...)`: the early return fires only
# when `s` is non-empty and `pos` is empty. Confirm that an empty `s` is meant
# to fall through to the code below.
# NOTE(review): this return passes `s` to `RegionIterator`, while every other
# return in this function passes `s.string` -- confirm which one is expected.
isempty(s) || isempty(pos) &&
return RegionIterator(s, UnitRange{Int}[], Vector{Pair{Symbol, Any}}[])
annots_unsorted = annotations(s)
# No annotations at all: the whole of `pos` is a single region with an empty
# annotation list.
isempty(annots_unsorted) && return RegionIterator(s.string, [pos], [Pair{Symbol, Any}[]])
# We know we have to deal with some annotations at this point, so it's worth
# doing some work to make doing so repeatedly more efficient. We want to
# make sure that:
# - the annotations are sorted by their start index,
# - all annotation regions lie within `pos`, and
# - that we can get the original order of annotations
annotreordering = sortperm(annots_unsorted, by=first)
sortedregions = first.(annots_unsorted[annotreordering])
# NOTE(review): this only inspects the end of the region that sorts last by
# `first`; an earlier-starting region could still extend past `last(pos)`.
# Verify that the check is sufficient.
if first(sortedregions[begin]) < first(pos) || last(sortedregions[end]) > last(pos)
# NOTE(review): the clamped list is assigned to `annot_unsorted` (no `s`), but
# the two lines after the comprehension read `annots_unsorted`, so the clamped
# list is never used in the visible code. Looks like a typo -- confirm.
annot_unsorted =
Tuple{UnitRange{Int64}, Pair{Symbol, Any}}[
(max(first(pos), first(region)):min(last(pos), last(region)), annot)
for (region, annot) in s.annotations if !isempty(intersect(pos, region))]
annotreordering = sortperm(annots_unsorted, by=first)
sortedregions = first.(annots_unsorted[annotreordering])
end
sortedannots = last.(annots_unsorted[annotreordering])
# NOTE(review): `annotordering` is not read again in the visible code.
annotordering = sortperm(annotreordering)
# Every distinct first and last index of an annotation region, in ascending order.
changepoints = append!(first.(sortedregions), last.(sortedregions)) |> sort |> unique
isempty(changepoints) &&
return RegionIterator(s.string, UnitRange{Int}[pos], Vector{Pair{Symbol, Any}}[map(last, annotations(s, first(pos)))])
# Now we have a list of all locations that the active annotations change, and
# a sorted list of annotations. We can use this to efficiently collect each
# region of text with a constant set of annotations.
# -- unchanged --
regions = Vector{UnitRange{Int}}()
annots = Vector{Vector{Pair{Symbol, Any}}}()
# -- removed --
changepoints = filter(c -> c in region,
Iterators.flatten((first(region), nextind(s, last(region)))
for region in first.(s.annotations)) |>
unique |> sort)
isempty(changepoints) &&
return RegionIterator(s.string, UnitRange{Int}[region], Vector{Pair{Symbol, Any}}[map(last, annotations(s, first(region)))])
function registerchange!(start, stop)
# -- added --
# This only really needs `start` and `stop`, but for performance it's important to avoid boxing.
function registerchange!(regions, annots, start, stop, #=box avoidance: =# sortedregions, sortedannots, annotreordering)
# Binary searches over the start-sorted regions, keyed on each region's first index.
startann = searchsortedfirst(sortedregions, start:start, by=first)
prestopann = searchsortedlast(sortedregions, (stop-1):(stop-1), by=first)
stopann = searchsortedlast(sortedregions, stop:stop, by=first)
annlist = collect(startann:prestopann)
# Regions in `prestopann+1:stopann` are kept only if they overlap `start:stop`.
for i in prestopann+1:stopann
if !isempty(intersect(sortedregions[i], start:stop))
push!(annlist, i)
end
end
# NOTE(review): the permutation has `length(annlist)` entries but indexes the
# `startann:stopann` view; if the loop above skipped any index, the two may not
# line up -- verify.
anns = view(sortedannots, startann:stopann)[sortperm(view(annotreordering, annlist))]
# -- unchanged --
push!(regions, start:stop)
# -- removed --
push!(annots, map(last, annotations(s, start)))
# -- added --
push!(annots, anns)
# -- unchanged --
end
# Region (if any) between the start of the requested range and the first changepoint.
# -- removed --
if first(region) < first(changepoints)
registerchange!(first(region), prevind(s, first(changepoints)))
# -- added --
if first(pos) < first(changepoints)
registerchange!(regions, annots, first(pos), prevind(s, first(changepoints)), sortedregions, sortedannots, annotreordering)
# -- unchanged --
end
# One region per pair of adjacent changepoints, ending just before the next one.
for (start, stop) in zip(changepoints, changepoints[2:end])
# -- removed --
registerchange!(start, prevind(s, stop))
# -- added --
registerchange!(regions, annots, start, prevind(s, stop), sortedregions, sortedannots, annotreordering)
# -- unchanged --
end
# Final region from the last changepoint to the end of the requested range.
# -- removed --
if last(changepoints) <= last(region)
registerchange!(last(changepoints), last(region))
# -- added --
if last(changepoints) <= last(pos)
registerchange!(regions, annots, last(changepoints), last(pos), sortedregions, sortedannots, annotreordering)
# -- unchanged --
end
RegionIterator(s.string, regions, annots)
end

# NOTE(review): rendered diff with the +/- markers stripped -- the first header
# and the first delegating call (using `region`) are the removed lines, the
# second of each (using `pos`) are the added lines; not runnable as shown.
#
# `eachregion` for a `SubString` of an `AnnotatedString`: an empty substring
# yields a `RegionIterator` with no regions; otherwise the requested range is
# shifted by `s.offset` and the call is forwarded to `eachregion` on the
# parent string `s.string`.
function eachregion(s::SubString{<:AnnotatedString}, region::UnitRange{Int}=firstindex(s):lastindex(s))
function eachregion(s::SubString{<:AnnotatedString}, pos::UnitRange{Int}=firstindex(s):lastindex(s))
if isempty(s)
RegionIterator(s, Vector{UnitRange{Int}}(), Vector{Vector{Pair{Symbol, Any}}}())
else
# Translate substring indices into indices of the parent string.
eachregion(s.string, first(region)+s.offset:last(region)+s.offset)
eachregion(s.string, first(pos)+s.offset:last(pos)+s.offset)
end
end

0 comments on commit b74d396

Please sign in to comment.