From b74d3968aad834d74952e098f57dd2d7d24b7eb1 Mon Sep 17 00:00:00 2001
From: TEC
Date: Fri, 2 Aug 2024 23:10:34 +0800
Subject: [PATCH] Refactor eachregion to be O(n log n) not O(n^2)

Since we removed the ordering restriction on annotations to improve the
semantics of annotation modification, each `annotations(str)` call
became `O(n)` which is fine for a once off, but use it in a loop as
`eachregion` does and now it's `O(n m)`. That's pretty underwhelming.

We can improve this to `O(n log n)` by pre-sorting the list of
annotations, and working with it instead. A bit more complexity is
needed to do this while preserving the semantics, but it can be worth it
for long strings.

With a 100,000 char string with 20,000 annotations, print time goes from
~0.4s to 0.015s on my machine.
---
 src/regioniterator.jl | 70 ++++++++++++++++++++++++++++---------------
 1 file changed, 46 insertions(+), 24 deletions(-)

diff --git a/src/regioniterator.jl b/src/regioniterator.jl
index 99df7f2..41c3224 100644
--- a/src/regioniterator.jl
+++ b/src/regioniterator.jl
@@ -36,37 +36,59 @@ julia> collect(StyledStrings.eachregion(Base.AnnotatedString(
  ("there", [:face => :italic])
 ```
 """
-function eachregion(s::AnnotatedString, region::UnitRange{Int}=firstindex(s):lastindex(s))
-    isempty(s) || isempty(region) &&
-        return RegionIterator(s, Vector{UnitRange{Int}}(), Vector{Vector{Pair{Symbol, Any}}}())
+function eachregion(s::AnnotatedString, pos::UnitRange{Int}=firstindex(s):lastindex(s))
+    # `&&` binds tighter than `||`, so the parentheses are needed for an empty
+    # string to take this early return as well.
+    (isempty(s) || isempty(pos)) &&
+        return RegionIterator(s, UnitRange{Int}[], Vector{Pair{Symbol, Any}}[])
+    allannots = annotations(s)
+    isempty(allannots) &&
+        return RegionIterator(s.string, [pos], [Pair{Symbol, Any}[]])
+    # Rather than re-scanning every annotation at each change point, sweep
+    # over the string once: each annotation overlapping `pos` becomes active
+    # at its (clamped) start, and inactive just past its (clamped) end.
+    # Sorting these events lets a single pass over them produce every region.
+    events = Tuple{Int, Bool, Int}[] # (index, isstart, annotation number)
+    for (n, (region, _)) in enumerate(allannots)
+        clamped = intersect(region, pos)
+        isempty(clamped) && continue
+        push!(events, (first(clamped), true, n))
+        push!(events, (nextind(s, last(clamped)), false, n))
+    end
+    sort!(events, by=first)
     regions = Vector{UnitRange{Int}}()
     annots = Vector{Vector{Pair{Symbol, Any}}}()
-    changepoints = filter(c -> c in region,
-                          Iterators.flatten((first(region), nextind(s, last(region)))
-                                            for region in first.(s.annotations)) |>
-                          unique |> sort)
-    isempty(changepoints) &&
-        return RegionIterator(s.string, UnitRange{Int}[region], Vector{Pair{Symbol, Any}}[map(last, annotations(s, first(region)))])
-    function registerchange!(start, stop)
-        push!(regions, start:stop)
-        push!(annots, map(last, annotations(s, start)))
-    end
-    if first(region) < first(changepoints)
-        registerchange!(first(region), prevind(s, first(changepoints)))
-    end
-    for (start, stop) in zip(changepoints, changepoints[2:end])
-        registerchange!(start, prevind(s, stop))
-    end
-    if last(changepoints) <= last(region)
-        registerchange!(last(changepoints), last(region))
-    end
+    # Annotation numbers active at `cursor`, kept sorted so that each region
+    # lists its annotations in their original order.
+    active = Int[]
+    cursor = first(pos)
+    e = 1
+    while e <= length(events)
+        changepoint = first(events[e])
+        if changepoint > cursor
+            push!(regions, cursor:prevind(s, changepoint))
+            push!(annots, Pair{Symbol, Any}[last(allannots[n]) for n in active])
+            cursor = changepoint
+        end
+        while e <= length(events) && first(events[e]) == changepoint
+            _, isstart, n = events[e]
+            slot = searchsortedfirst(active, n)
+            isstart ? insert!(active, slot, n) : deleteat!(active, slot)
+            e += 1
+        end
+    end
+    # All annotations have ended by now, so any remainder is unannotated.
+    if cursor <= last(pos)
+        push!(regions, cursor:last(pos))
+        push!(annots, Pair{Symbol, Any}[])
+    end
     RegionIterator(s.string, regions, annots)
 end
 
-function eachregion(s::SubString{<:AnnotatedString}, region::UnitRange{Int}=firstindex(s):lastindex(s))
+function eachregion(s::SubString{<:AnnotatedString}, pos::UnitRange{Int}=firstindex(s):lastindex(s))
     if isempty(s)
         RegionIterator(s, Vector{UnitRange{Int}}(), Vector{Vector{Pair{Symbol, Any}}}())
     else
-        eachregion(s.string, first(region)+s.offset:last(region)+s.offset)
+        eachregion(s.string, first(pos)+s.offset:last(pos)+s.offset)
     end
 end