From f01dc17d8588c58469d2ae0b8fc0e8db41074d68 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Valim?= Date: Tue, 2 Jul 2024 14:51:49 +0200 Subject: [PATCH] Optimize reversing logic Rule #1: Enum.reverse([1, 2, 3]) ++ [4, 5, 6] is equivalent but slower than Enum.reverse([1, 2, 3], [4, 5, 6]) Rule #2: Enum.reverse([1, 2, 3] ++ [4, 5, 6]) is equivalent but slower than: Enum.reverse([4, 5, 6], Enum.reverse([1, 2, 3])) Rule #3 Enum.reverse(Enum.reverse([1, 2, 3])) is the same as: [1, 2, 3] --- lib/elixir/unicode/security.ex | 48 ++++++++++++++++----------------- lib/elixir/unicode/tokenizer.ex | 1 - 2 files changed, 24 insertions(+), 25 deletions(-) diff --git a/lib/elixir/unicode/security.ex b/lib/elixir/unicode/security.ex index 23118322764..60a61a4ef87 100644 --- a/lib/elixir/unicode/security.ex +++ b/lib/elixir/unicode/security.ex @@ -110,7 +110,7 @@ defmodule String.Tokenizer.Security do |> :unicode.characters_to_nfd_list() end - # unicode 15 adds bidiSkeleton because, w/RTL codepoints, idents that + # Unicode 15 adds bidiSkeleton because, w/RTL codepoints, idents that # aren't confusable LTR *are* confusable in most places human review # occurs (editors/browsers, thanks to bidi algo, UAX9). # @@ -122,11 +122,11 @@ defmodule String.Tokenizer.Security do # chars like _ or 0..9 can mix w/RTL chars). def bidi_skeleton(s) do # UTS39-28 4: - # 'Bidirectional confusability is costlier to check than - # confusability, as [unicode bidi algo] must be applied. - # [...] a fast path can be used: [...] if X has no characters - # w/bidi classes R or AL, bidiSkeleton(X) = skeleton(X) # + # Bidirectional confusability is costlier to check than + # confusability, as [unicode bidi algo] must be applied. + # [...] a fast path can be used: [...] if X has no characters + # w/bidi classes R or AL, bidiSkeleton(X) = skeleton(X) if match?([_, _ | _], s) and any_rtl?(s) do unbidify(s) |> Enum.map(&confusable_prototype/1) else @@ -136,7 +136,7 @@ defmodule String.Tokenizer.Security do import String.Tokenizer, only: [dir: 1] - defp any_rtl?(s), do: Enum.any?(s, &(:rtl == dir(&1))) + defp any_rtl?(s), do: Enum.any?(s, &(:rtl == String.Tokenizer.dir(&1))) defp dir_compare(a, b) do """ @@ -150,7 +150,7 @@ defmodule String.Tokenizer.Security do for codepoint <- s, into: init do hex = :io_lib.format(~c"~4.16.0B", [codepoint]) - " \\u#{hex} #{[codepoint]} #{dir(codepoint)}\n" + " \\u#{hex} #{[codepoint]} #{String.Tokenizer.dir(codepoint)}\n" end end @@ -163,29 +163,29 @@ defmodule String.Tokenizer.Security do # the [...] stack of the [unicode bidi algo]' def unbidify(chars) when is_list(chars) do {neutrals, direction, last_part, acc} = - chars - |> Enum.map(&{&1, dir(&1)}) - |> Enum.reduce({[], :ltr, [], []}, fn + Enum.reduce(chars, {[], :ltr, [], []}, fn head, {neutrals, part_dir, part, acc} -> # https://www.unicode.org/reports/tr9/#W2 - {head, :weak_number}, {neutrals, part_dir, part, acc} -> - {[], part_dir, [head] ++ neutrals ++ part, acc} + case String.Tokenizer.dir(head) do + :weak_number -> + {[], part_dir, [head] ++ neutrals ++ part, acc} - {head, :neutral}, {neutrals, part_dir, part, acc} -> - {[head | neutrals], part_dir, part, acc} + :neutral -> + {[head | neutrals], part_dir, part, acc} - {head, part_dir}, {neutrals, part_dir, part, acc} -> - {[], part_dir, [head | neutrals] ++ part, acc} + ^part_dir -> + {[], part_dir, [head | neutrals] ++ part, acc} - {head, :ltr = head_dir}, {neutrals, :rtl, part, acc} -> - {[], head_dir, [head | neutrals], maybe_reverse(:rtl, part) ++ acc} + :ltr when part_dir == :rtl -> + {[], :ltr, [head | neutrals], Enum.reverse(part, acc)} - {head, :rtl = head_dir}, {neutrals, :ltr, part, acc} -> - {[], head_dir, [head], maybe_reverse(:ltr, neutrals ++ part) ++ acc} + :rtl when part_dir == :ltr -> + {[], :rtl, [head], neutrals ++ part ++ acc} + end end) - Enum.reverse(maybe_reverse(direction, neutrals ++ last_part) ++ acc) + case direction do + :ltr -> Enum.reverse(acc, Enum.reverse(neutrals ++ last_part)) + :rtl -> Enum.reverse(acc, neutrals ++ last_part) + end end - - defp maybe_reverse(:rtl, part), do: Enum.reverse(part) - defp maybe_reverse(:ltr, part), do: part end diff --git a/lib/elixir/unicode/tokenizer.ex b/lib/elixir/unicode/tokenizer.ex index 14bc454d5fe..db34cec796c 100644 --- a/lib/elixir/unicode/tokenizer.ex +++ b/lib/elixir/unicode/tokenizer.ex @@ -350,7 +350,6 @@ defmodule String.Tokenizer do end def dir(i) when is_integer(i), do: :ltr - def dir(_), do: {:error, :codepoint_must_be_integer} # Hard-coded normalizations. Also split by upper, start, continue.