Skip to content

Commit

Permalink
Fix a bug that SAX2 parser doesn't expand the predefined entities for…
Browse files Browse the repository at this point in the history
… "characters" (#168)

## Why?

SAX2 parser expand user-defined entity references and character
references but doesn't expand predefined entity references.

## Change
- text_unnormalized.rb
```
require 'rexml/document'
require 'rexml/parsers/sax2parser'
require 'rexml/parsers/pullparser'
require 'rexml/parsers/streamparser'

xml = <<EOS
<root>
  <A>&lt;P&gt;&#13; &lt;I&gt; &lt;B&gt; Text &lt;/B&gt;  &lt;/I&gt;</A>
</root>
EOS

class Listener
  def method_missing(name, *args)
    p [name, *args]
  end
end

puts "REXML(DOM)"
REXML::Document.new(xml).elements.each("/root/A") {|element| puts element.text}

puts ""
puts "REXML(Pull)"
parser = REXML::Parsers::PullParser.new(xml)
while parser.has_next?
    res = parser.pull
    p res
end

puts ""
puts "REXML(Stream)"
parser = REXML::Parsers::StreamParser.new(xml, Listener.new).parse

puts ""
puts "REXML(SAX)"
parser = REXML::Parsers::SAX2Parser.new(xml)
parser.listen(Listener.new)
parser.parse
```

## Before (master)
```
$ ruby text_unnormalized.rb
REXML(DOM)
 <I> <B> Text </B>  </I>

REXML(Pull)
start_element: ["root", {}]
text: ["\n  ", "\n  "]
start_element: ["A", {}]
text: ["&lt;P&gt;&#13; &lt;I&gt; &lt;B&gt; Text &lt;/B&gt;  &lt;/I&gt;", "<P>\r <I> <B> Text </B>  </I>"]
end_element: ["A"]
text: ["\n", "\n"]
end_element: ["root"]
end_document: []

REXML(Stream)
[:tag_start, "root", {}]
[:text, "\n  "]
[:tag_start, "A", {}]
[:text, "<P>\r <I> <B> Text </B>  </I>"]
[:tag_end, "A"]
[:text, "\n"]
[:tag_end, "root"]

REXML(SAX)
[:start_document]
[:start_element, nil, "root", "root", {}]
[:progress, 6]
[:characters, "\n  "]
[:progress, 9]
[:start_element, nil, "A", "A", {}]
[:progress, 12]
[:characters, "&lt;P&gt;\r &lt;I&gt; &lt;B&gt; Text &lt;/B&gt;  &lt;/I&gt;"] #<= This
[:progress, 74]
[:end_element, nil, "A", "A"]
[:progress, 78]
[:characters, "\n"]
[:progress, 79]
[:end_element, nil, "root", "root"]
[:progress, 86]
[:end_document]
```

## After(This PR)
```
$ ruby text_unnormalized.rb
REXML(SAX)
[:start_document]
[:start_element, nil, "root", "root", {}]
[:progress, 6]
[:characters, "\n  "]
[:progress, 9]
[:start_element, nil, "A", "A", {}]
[:progress, 12]
[:characters, "<P>\r <I> <B> Text </B>  </I>"]
[:progress, 74]
[:end_element, nil, "A", "A"]
[:progress, 78]
[:characters, "\n"]
[:progress, 79]
[:end_element, nil, "root", "root"]
[:progress, 86]
[:end_document]
```
  • Loading branch information
naitoh authored Jul 14, 2024
1 parent a5075c1 commit 4ebf21f
Show file tree
Hide file tree
Showing 4 changed files with 31 additions and 21 deletions.
21 changes: 2 additions & 19 deletions lib/rexml/parsers/sax2parser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -157,25 +157,8 @@ def parse
end
end
when :text
#normalized = @parser.normalize( event[1] )
#handle( :characters, normalized )
copy = event[1].clone

esub = proc { |match|
if @entities.has_key?($1)
@entities[$1].gsub(Text::REFERENCE, &esub)
else
match
end
}

copy.gsub!( Text::REFERENCE, &esub )
copy.gsub!( Text::NUMERICENTITY ) {|m|
m=$1
m = "0#{m}" if m[0] == ?x
[Integer(m)].pack('U*')
}
handle( :characters, copy )
unnormalized = @parser.unnormalize( event[1], @entities )
handle( :characters, unnormalized )
when :entitydecl
handle_entitydecl( event )
when :processing_instruction, :comment, :attlistdecl,
Expand Down
4 changes: 2 additions & 2 deletions lib/rexml/parsers/streamparser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,8 @@ def parse
@listener.tag_end( event[1] )
@tag_stack.pop
when :text
normalized = @parser.unnormalize( event[1] )
@listener.text( normalized )
unnormalized = @parser.unnormalize( event[1] )
@listener.text( unnormalized )
when :processing_instruction
@listener.instruction( *event[1,2] )
when :start_doctype
Expand Down
16 changes: 16 additions & 0 deletions test/test_pullparser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,22 @@ def test_character_references
assert_equal("B", events['b'])
end

def test_text_entity_references
source = '<root><a>&lt;P&gt; &lt;I&gt; &lt;B&gt; Text &lt;/B&gt; &lt;/I&gt;</a></root>'
parser = REXML::Parsers::PullParser.new( source )

events = []
while parser.has_next?
event = parser.pull
case event.event_type
when :text
events << event[1]
end
end

assert_equal(["<P> <I> <B> Text </B> </I>"], events)
end

def test_text_content_with_line_breaks
source = "<root><a>A</a><b>B\n</b><c>C\r\n</c></root>"
parser = REXML::Parsers::PullParser.new( source )
Expand Down
11 changes: 11 additions & 0 deletions test/test_sax.rb
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,17 @@ def test_entity_replacement
assert_equal '--1234--', results[1]
end

def test_characters_predefined_entities
source = '<root><a>&lt;P&gt; &lt;I&gt; &lt;B&gt; Text &lt;/B&gt; &lt;/I&gt;</a></root>'

sax = Parsers::SAX2Parser.new( source )
results = []
sax.listen(:characters) {|x| results << x }
sax.parse

assert_equal(["<P> <I> <B> Text </B> </I>"], results)
end

def test_sax2
File.open(fixture_path("documentation.xml")) do |f|
parser = Parsers::SAX2Parser.new( f )
Expand Down

0 comments on commit 4ebf21f

Please sign in to comment.