Skip to content

Commit

Permalink
Scrub invalid characters from source XML
Browse files Browse the repository at this point in the history
Nori::Parser has a new option, :scrub_xml, which defaults to true. When
it's true, the parser cleans invalid or undefined characters from the
source string, using String#scrub if it's available and falling back to
String#encode otherwise.
  • Loading branch information
alethea committed Mar 4, 2016
1 parent 4ef964a commit 37d0191
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 8 deletions.
21 changes: 20 additions & 1 deletion lib/nori.rb
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ def initialize(options = {})
:empty_tag_value => nil,
:advanced_typecasting => true,
:convert_dashes_to_underscores => true,
:scrub_xml => true,
:parser => :nokogiri
}

Expand All @@ -40,7 +41,7 @@ def find(hash, *path)
end

def parse(xml)
cleaned_xml = xml.strip
cleaned_xml = scrub_xml(xml).strip
return {} if cleaned_xml.empty?

parser = load_parser @options[:parser]
Expand Down Expand Up @@ -77,4 +78,22 @@ def find_value(hash, key)
nil
end

# Replaces invalid or undefined byte sequences in +string+ so that it can be
# handed to the XML parser safely.
#
# Does nothing (returns +string+ itself) when the :scrub_xml option is falsy.
# Otherwise uses String#scrub when the object provides it. For objects
# without #scrub (e.g. Rubies older than 2.1) an already-valid string is
# returned as is, and an invalid one is transcoded to a different Unicode
# encoding and back, replacing every invalid/undefined sequence on the way.
#
# @param string [String] the raw XML source
# @return [String] the cleaned XML, in the same encoding as the input
def scrub_xml(string)
  return string unless @options[:scrub_xml]
  return string.scrub if string.respond_to?(:scrub)
  return string if string.valid_encoding?

  source_encoding = string.encoding
  # String#encode is a no-op when source and target encodings are equal, so
  # the round trip must go through an encoding different from the source.
  # The Encoding constants are used directly: Encoding.find(:UTF_8) raises,
  # because the registered names are "UTF-8" / "UTF-16", not "UTF_8".
  intermediate = ([Encoding::UTF_8, Encoding::UTF_16] - [source_encoding]).first
  string.encode(intermediate, undef: :replace, invalid: :replace).encode(source_encoding)
end

end
7 changes: 0 additions & 7 deletions spec/nori/api_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -108,13 +108,6 @@
expect(Nori::Parser::Nokogiri).to receive(:parse).and_return({})
nori.parse("<any>thing</any>")
end

it "strips the XML" do
xml = double("xml")
expect(xml).to receive(:strip).and_return("<any>thing</any>")

expect(nori.parse(xml)).to eq({ "any" => "thing" })
end
end

context "#parse without :advanced_typecasting" do
Expand Down
5 changes: 5 additions & 0 deletions spec/nori/nori_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,11 @@
expect(parse(xml)["tag"].strip).to eq("text inside cdata")
end

# Feeds the parser a string tagged as UTF-8 that contains the raw byte 0xFB
# and expects that byte to come back as U+FFFD in the parsed value instead
# of the parse failing.
it "should scrub bad characters" do
xml = "<tag>a\xfbc</tag>".force_encoding('UTF-8')
expect(parse(xml)["tag"]).to eq("a\uFFFDc")
end

it "should transform a simple tag with attributes" do
xml = "<tag attr1='1' attr2='2'></tag>"
hash = { 'tag' => { '@attr1' => '1', '@attr2' => '2' } }
Expand Down

0 comments on commit 37d0191

Please sign in to comment.