diff --git a/lib/nori.rb b/lib/nori.rb
index e5e334c..ba55215 100644
--- a/lib/nori.rb
+++ b/lib/nori.rb
@@ -22,6 +22,7 @@ def initialize(options = {})
:empty_tag_value => nil,
:advanced_typecasting => true,
:convert_dashes_to_underscores => true,
+ :scrub_xml => true,
:parser => :nokogiri
}
@@ -40,7 +41,7 @@ def find(hash, *path)
end
def parse(xml)
- cleaned_xml = xml.strip
+ cleaned_xml = scrub_xml(xml).strip
return {} if cleaned_xml.empty?
parser = load_parser @options[:parser]
@@ -77,4 +78,22 @@ def find_value(hash, key)
nil
end
+ # Replaces byte sequences that are invalid in +string+'s encoding so the
+ # parser is not handed malformed input.
+ #
+ # The string is returned as-is when the :scrub_xml option is falsy, and
+ # (on the fallback path) when it is already valid in its encoding.
+ #
+ # @param string [String] raw XML as passed to #parse
+ # @return [String] the scrubbed string, in the same encoding as the input
+ def scrub_xml(string)
+   return string unless @options[:scrub_xml]
+   return string.scrub if string.respond_to?(:scrub)
+   return string if string.valid_encoding?
+
+   # Fallback for Strings without #scrub: round-trip through a Unicode
+   # encoding that differs from the source encoding, replacing bad bytes.
+   enc = string.encoding
+   mid_enc = ([Encoding::UTF_8, Encoding::UTF_16BE] - [enc]).first
+   replaced = string.encode(mid_enc, undef: :replace, invalid: :replace)
+
+   # The replacement character inserted above (U+FFFD) may not exist in
+   # +enc+, so the conversion back must replace as well instead of raising
+   # Encoding::UndefinedConversionError.
+   replaced.encode(enc, undef: :replace, invalid: :replace)
+ end
+
end
diff --git a/spec/nori/api_spec.rb b/spec/nori/api_spec.rb
index ab4c67d..9a4b98c 100644
--- a/spec/nori/api_spec.rb
+++ b/spec/nori/api_spec.rb
@@ -108,13 +108,6 @@
expect(Nori::Parser::Nokogiri).to receive(:parse).and_return({})
nori.parse("thing")
end
-
- it "strips the XML" do
- xml = double("xml")
- expect(xml).to receive(:strip).and_return("thing")
-
- expect(nori.parse(xml)).to eq({ "any" => "thing" })
- end
end
context "#parse without :advanced_typecasting" do
diff --git a/spec/nori/nori_spec.rb b/spec/nori/nori_spec.rb
index 5f49267..52c009f 100644
--- a/spec/nori/nori_spec.rb
+++ b/spec/nori/nori_spec.rb
@@ -28,6 +28,11 @@
expect(parse(xml)["tag"].strip).to eq("text inside cdata")
end
+ it "should scrub bad characters" do
+   # \xfb is not valid UTF-8; the expectation below requires it to come back
+   # as U+FFFD inside the parsed <tag> element.
+   xml = "<tag>a\xfbc</tag>".force_encoding('UTF-8')
+   expect(parse(xml)["tag"]).to eq("a\uFFFDc")
+ end
+
it "should transform a simple tag with attributes" do
xml = ""
hash = { 'tag' => { '@attr1' => '1', '@attr2' => '2' } }