From b74d3968aad834d74952e098f57dd2d7d24b7eb1 Mon Sep 17 00:00:00 2001
From: TEC <git@tecosaur.net>
Date: Fri, 2 Aug 2024 23:10:34 +0800
Subject: [PATCH] Refactor eachregion to be O(n log n) not O(n^2)

Since we removed the ordering restriction on annotations to improve the
semantics of annotation modification, each `annotations(str)` call
became `O(n)`, which is fine for a one-off, but use it in a loop as
`eachregion` does and now it's `O(n m)`. That's pretty underwhelming.

We can improve this to `O(n log n)` by pre-sorting the list of
annotations, and working with it instead. A bit more complexity is
needed to do this while preserving the semantics, but it can be worth it
for long strings. With a 100,000 char string with 20,000 annotations,
print time goes from ~0.4s to 0.015s on my machine.
---
 src/regioniterator.jl | 67 +++++++++++++++++++++++++++++++------------
 1 file changed, 49 insertions(+), 18 deletions(-)

diff --git a/src/regioniterator.jl b/src/regioniterator.jl
index 99df7f2..41c3224 100644
--- a/src/regioniterator.jl
+++ b/src/regioniterator.jl
@@ -36,37 +36,68 @@ julia> collect(StyledStrings.eachregion(Base.AnnotatedString(
  ("there", [:face => :italic])
 ```
 """
-function eachregion(s::AnnotatedString, region::UnitRange{Int}=firstindex(s):lastindex(s))
-    isempty(s) || isempty(region) &&
-        return RegionIterator(s, Vector{UnitRange{Int}}(), Vector{Vector{Pair{Symbol, Any}}}())
+function eachregion(s::AnnotatedString, pos::UnitRange{Int}=firstindex(s):lastindex(s))
+    (isempty(s) || isempty(pos)) && # `&&` binds tighter than `||`, so the parentheses matter
+        return RegionIterator(s.string, UnitRange{Int}[], Vector{Pair{Symbol, Any}}[])
+    annots_unsorted = annotations(s)
+    isempty(annots_unsorted) && return RegionIterator(s.string, [pos], [Pair{Symbol, Any}[]])
+    # We know we have to deal with some annotations at this point, so it's worth
+    # doing some work to make doing so repeatedly more efficient. We want to
+    # make sure that:
+    # - the annotations are sorted by their start index,
+    # - all annotation regions lie within `pos`, and
+    # - we can recover the original order of the annotations.
+    annotreordering = sortperm(annots_unsorted, by=first) # annotreordering[i]: original index of the i-th sorted annotation
+    sortedregions = first.(annots_unsorted[annotreordering])
+    if first(sortedregions[begin]) < first(pos) || maximum(last, sortedregions) > last(pos) # sorted by start only, so scan for the largest end
+        annots_unsorted = # clip every annotation to `pos`, dropping those that do not overlap it
+            Tuple{UnitRange{Int}, Pair{Symbol, Any}}[
+                (max(first(pos), first(region)):min(last(pos), last(region)), annot)
+                for (region, annot) in s.annotations if !isempty(intersect(pos, region))]
+        annotreordering = sortperm(annots_unsorted, by=first)
+        sortedregions = first.(annots_unsorted[annotreordering])
+    end
+    sortedannots = last.(annots_unsorted[annotreordering])
+    # Change points are the (inclusive) start and end indices of every annotation region.
+    changepoints = append!(first.(sortedregions), last.(sortedregions)) |> sort |> unique
+    isempty(changepoints) &&
+        return RegionIterator(s.string, UnitRange{Int}[pos], Vector{Pair{Symbol, Any}}[map(last, annotations(s, first(pos)))])
+    # Sweep over the change points, collecting each region with a constant annotation set.
+    # NOTE(review): `registerchange!` looks up only annotations that *start* within
+    # `start:stop`; ones that began earlier and still cover it look missed. TODO confirm.
     regions = Vector{UnitRange{Int}}()
     annots = Vector{Vector{Pair{Symbol, Any}}}()
-    changepoints = filter(c -> c in region,
-                          Iterators.flatten((first(region), nextind(s, last(region)))
-                                            for region in first.(s.annotations)) |>
-                                                unique |> sort)
-    isempty(changepoints) &&
-        return RegionIterator(s.string, UnitRange{Int}[region], Vector{Pair{Symbol, Any}}[map(last, annotations(s, first(region)))])
-    function registerchange!(start, stop)
+    # This only really needs `start` and `stop`, but for performance it's important to avoid boxing.
+    function registerchange!(regions, annots, start, stop, #=box avoidance: =# sortedregions, sortedannots, annotreordering)
+        startann = searchsortedfirst(sortedregions, start:start, by=first)
+        prestopann = searchsortedlast(sortedregions, (stop-1):(stop-1), by=first)
+        stopann = searchsortedlast(sortedregions, stop:stop, by=first)
+        annlist = collect(startann:prestopann)
+        for i in prestopann+1:stopann
+            if !isempty(intersect(sortedregions[i], start:stop))
+                push!(annlist, i)
+            end
+        end
+        anns = view(sortedannots, startann:stopann)[sortperm(view(annotreordering, annlist))]
         push!(regions, start:stop)
-        push!(annots, map(last, annotations(s, start)))
+        push!(annots, anns)
     end
-    if first(region) < first(changepoints)
-        registerchange!(first(region), prevind(s, first(changepoints)))
+    if first(pos) < first(changepoints)
+        registerchange!(regions, annots, first(pos), prevind(s, first(changepoints)), sortedregions, sortedannots, annotreordering)
     end
     for (start, stop) in zip(changepoints, changepoints[2:end])
-        registerchange!(start, prevind(s, stop))
+        registerchange!(regions, annots, start, prevind(s, stop), sortedregions, sortedannots, annotreordering)
     end
-    if last(changepoints) <= last(region)
-        registerchange!(last(changepoints), last(region))
+    if last(changepoints) <= last(pos)
+        registerchange!(regions, annots, last(changepoints), last(pos), sortedregions, sortedannots, annotreordering)
     end
     RegionIterator(s.string, regions, annots)
 end
 
-function eachregion(s::SubString{<:AnnotatedString}, region::UnitRange{Int}=firstindex(s):lastindex(s))
+function eachregion(s::SubString{<:AnnotatedString}, pos::UnitRange{Int}=firstindex(s):lastindex(s))
     if isempty(s)
         RegionIterator(s, Vector{UnitRange{Int}}(), Vector{Vector{Pair{Symbol, Any}}}())
     else
-        eachregion(s.string, first(region)+s.offset:last(region)+s.offset)
+        eachregion(s.string, first(pos)+s.offset:last(pos)+s.offset) # shift `pos` by the substring offset and delegate to the parent string
     end
 end