From c1b64c174ec2e8ca2174c51332670e3be30c865f Mon Sep 17 00:00:00 2001 From: Watson Date: Tue, 16 Jul 2024 10:57:50 +0900 Subject: [PATCH] Fix performance issue caused by using repeated `>` characters inside comments (#171) A `>` is treated as a string delimiter. In certain cases, if `>` is used in succession, read and match are repeated, which slows down the process. Therefore, the comment terminator `-->` is used to read ahead to a specific part of the string in advance. --- lib/rexml/parsers/baseparser.rb | 3 ++- test/parse/test_comment.rb | 11 +++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index b117e654..ba205175 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -126,6 +126,7 @@ class BaseParser module Private INSTRUCTION_END = /#{NAME}(\s+.*?)?\?>/um INSTRUCTION_TERM = "?>" + COMMENT_TERM = "-->" TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um @@ -243,7 +244,7 @@ def pull_event return process_instruction(start_position) elsif @source.match("<!--", true) - md = @source.match(/(.*?)-->/um, true) + md = @source.match(/(.*?)-->/um, true, term: Private::COMMENT_TERM) if md.nil? raise REXML::ParseException.new("Unclosed comment", @source) end diff --git a/test/parse/test_comment.rb b/test/parse/test_comment.rb index 46a07409..543d9ad8 100644 --- a/test/parse/test_comment.rb +++ b/test/parse/test_comment.rb @@ -1,8 +1,12 @@ require "test/unit" +require "core_assertions" + require "rexml/document" module REXMLTests class TestParseComment < Test::Unit::TestCase + include Test::Unit::CoreAssertions + def parse(xml) REXML::Document.new(xml) end @@ -117,5 +121,12 @@ def test_after_root assert_equal(" ok comment ", events[:comment]) end + + def test_gt_linear_performance + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new('<!-- ' + ">" * n + ' -->') + end + end end end