# Replaces invalid byte sequences in +string+ with a replacement character.
#
# Wiring, per the patch that introduces this method: +parse+ runs its input
# through +scrub_xml+ before calling +strip+, and the +:scrub_xml+ option
# (added to the defaults as +true+) turns the scrubbing on or off.
#
# @param string [String, Object] the raw XML. Objects that expose neither
#   +scrub+ nor +valid_encoding?+ are handed back untouched so duck-typed
#   input keeps working.
# @return [String, Object] +string+ itself when scrubbing is disabled, not
#   applicable, or not needed; otherwise a scrubbed copy in the same encoding.
def scrub_xml(string)
  return string unless @options[:scrub_xml]
  return string.scrub if string.respond_to?(:scrub)

  # Nothing to scrub on objects that carry no encoding information.
  return string unless string.respond_to?(:valid_encoding?)
  return string if string.valid_encoding?

  # Fallback for Rubies without String#scrub: round-trip through a Unicode
  # encoding that differs from the string's own one (presumably because
  # transcoding to the same encoding would not touch the invalid bytes).
  enc = string.encoding
  mid_enc = ([Encoding::UTF_8, Encoding::UTF_16BE] - [enc]).first

  # The replacement options are needed on BOTH legs: the first leg inserts
  # U+FFFD, which a non-Unicode +enc+ (e.g. US-ASCII) cannot represent, so the
  # way back must be allowed to substitute as well instead of raising.
  string.
    encode(mid_enc, :invalid => :replace, :undef => :replace).
    encode(enc, :invalid => :replace, :undef => :replace)
end