From 3d927df4cb6970f0987ded62712349024d1a930e Mon Sep 17 00:00:00 2001 From: Dmitry Krasnoukhov Date: Mon, 9 Jun 2014 00:14:06 +0300 Subject: [PATCH 1/4] Option to use Ox as a SAX handler (instead of Nokogiri) --- Gemfile | 2 + README.md | 4 +- lib/sax-machine.rb | 14 ++++++- .../sax_abstract_handler.rb} | 38 ++++++++---------- .../handlers/sax_nokogiri_handler.rb | 15 +++++++ lib/sax-machine/handlers/sax_ox_handler.rb | 40 +++++++++++++++++++ lib/sax-machine/sax_document.rb | 20 ++++++++-- sax-machine.gemspec | 2 +- spec/sax-machine/sax_document_spec.rb | 39 +++++++++++++----- spec/spec_helper.rb | 4 ++ 10 files changed, 139 insertions(+), 39 deletions(-) rename lib/sax-machine/{sax_handler.rb => handlers/sax_abstract_handler.rb} (93%) create mode 100644 lib/sax-machine/handlers/sax_nokogiri_handler.rb create mode 100644 lib/sax-machine/handlers/sax_ox_handler.rb diff --git a/Gemfile b/Gemfile index eb93987..41b6361 100644 --- a/Gemfile +++ b/Gemfile @@ -8,3 +8,5 @@ group :development, :test do gem 'simplecov', require: false, platforms: :mri gem 'activerecord', '~> 4.1' end + +gem 'ox', '~> 2.1' if ENV['HANDLER'] == 'ox' diff --git a/README.md b/README.md index da45186..3738200 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ ## Description -A declarative SAX parsing library backed by Nokogiri +A declarative SAX parsing library backed by Nokogiri or Ox ## Usage ```ruby @@ -104,4 +104,4 @@ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. \ No newline at end of file +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/lib/sax-machine.rb b/lib/sax-machine.rb index 9c9c86d..e31a04f 100644 --- a/lib/sax-machine.rb +++ b/lib/sax-machine.rb @@ -1,8 +1,18 @@ require "sax-machine/version" require "sax-machine/sax_document" require "sax-machine/sax_configure" -require "sax-machine/sax_handler" require "sax-machine/sax_config" +require "sax-machine/handlers/sax_abstract_handler" +require "sax-machine/handlers/sax_nokogiri_handler" module SAXMachine -end \ No newline at end of file + @@handler = :nokogiri + + def self.handler + @@handler + end + + def self.handler=(handler) + @@handler = handler + end +end diff --git a/lib/sax-machine/sax_handler.rb b/lib/sax-machine/handlers/sax_abstract_handler.rb similarity index 93% rename from lib/sax-machine/sax_handler.rb rename to lib/sax-machine/handlers/sax_abstract_handler.rb index 8a7c553..b390584 100644 --- a/lib/sax-machine/sax_handler.rb +++ b/lib/sax-machine/handlers/sax_abstract_handler.rb @@ -1,8 +1,7 @@ -require "nokogiri" require "time" module SAXMachine - class SAXHandler < Nokogiri::XML::SAX::Document + module SAXAbstractHandler NO_BUFFER = :no_buffer class StackNode < Struct.new(:object, :config, :buffer) @@ -13,14 +12,14 @@ def initialize(object, config = nil, buffer = NO_BUFFER) end end - def initialize(object, on_error = nil, on_warning = nil) + def _initialize(object, on_error = nil, on_warning = nil) @stack = [ StackNode.new(object) ] @parsed_configs = {} @on_error = on_error @on_warning = on_warning end - def characters(data) + def _value(data) node = stack.last if node.buffer == NO_BUFFER @@ -29,10 +28,8 @@ def characters(data) node.buffer << data end end - alias cdata_block characters - - def start_element(name, attrs = []) + def _start_element(name, attrs = []) name = normalize_name(name) node = stack.last object = node.object @@ -76,7 +73,7 @@ def start_element(name, attrs = []) end end - def end_element(name) + def _end_element(name) name = normalize_name(name) start_tag = stack[-2] @@ -134,30 +131,29 @@ def end_element(name) stack.pop end - private - - def mark_as_parsed(object, element_config) - unless element_config.collection? - @parsed_configs[[object.object_id, element_config.object_id]] = true + def _error(string) + if @on_error + @on_error.call(string) end end - def parsed_config?(object, element_config) - @parsed_configs[[object.object_id, element_config.object_id]] - end - - def warning(string) + def _warning(string) if @on_warning @on_warning.call(string) end end - def error(string) - if @on_error - @on_error.call(string) + private + + def mark_as_parsed(object, element_config) + unless element_config.collection? + @parsed_configs[[object.object_id, element_config.object_id]] = true end end + def parsed_config?(object, element_config) + @parsed_configs[[object.object_id, element_config.object_id]] + end def sax_config_for(object) if object.class.respond_to?(:sax_config) diff --git a/lib/sax-machine/handlers/sax_nokogiri_handler.rb b/lib/sax-machine/handlers/sax_nokogiri_handler.rb new file mode 100644 index 0000000..643e94a --- /dev/null +++ b/lib/sax-machine/handlers/sax_nokogiri_handler.rb @@ -0,0 +1,15 @@ +require "nokogiri" + +module SAXMachine + class SAXNokogiriHandler < Nokogiri::XML::SAX::Document + include SAXAbstractHandler + + alias_method :initialize, :_initialize + alias_method :characters, :_value + alias_method :cdata_block, :_value + alias_method :start_element, :_start_element + alias_method :end_element, :_end_element + alias_method :error, :_error + alias_method :warning, :_warning + end +end diff --git a/lib/sax-machine/handlers/sax_ox_handler.rb b/lib/sax-machine/handlers/sax_ox_handler.rb new file mode 100644 index 0000000..4e43963 --- /dev/null +++ b/lib/sax-machine/handlers/sax_ox_handler.rb @@ -0,0 +1,40 @@ +require "ox" + +module SAXMachine + class SAXOxHandler < Ox::Sax + include SAXAbstractHandler + + alias_method :text, :_value + alias_method :cdata, :_value + alias_method :end_element, :_end_element + + def initialize(object, on_error = nil, on_warning = nil) + _initialize(object, on_error, on_warning) + _reset + end + + def attr(name, str) + @attrs[name] = str + end + + def attrs_done + _start_element(@element || "", @attrs) + _reset + end + + def start_element(name) + @element = name + end + + def error(message, line, column) + _error("#{message} on line #{line} column #{column}") + end + + private + + def _reset + @attrs = {} + @element = nil + end + end +end diff --git a/lib/sax-machine/sax_document.rb b/lib/sax-machine/sax_document.rb index f39163a..0823744 100644 --- a/lib/sax-machine/sax_document.rb +++ b/lib/sax-machine/sax_document.rb @@ -8,11 +8,23 @@ def self.included(base) end def parse(xml_text, on_error = nil, on_warning = nil) - sax_handler = SAXHandler.new(self, on_error, on_warning) - parser = Nokogiri::XML::SAX::Parser.new(sax_handler) - parser.parse(xml_text) do |ctx| - ctx.replace_entities = true + if SAXMachine.handler == :ox + Ox.sax_parse( + SAXOxHandler.new(self, on_error, on_warning), + StringIO.new(xml_text), + { + symbolize: false, + convert_special: true, + } + ) + else + handler = SAXNokogiriHandler.new(self, on_error, on_warning) + parser = Nokogiri::XML::SAX::Parser.new(handler) + parser.parse(xml_text) do |ctx| + ctx.replace_entities = true + end end + self end diff --git a/sax-machine.gemspec b/sax-machine.gemspec index 60b5133..bc951b6 100644 --- a/sax-machine.gemspec +++ b/sax-machine.gemspec @@ -9,7 +9,7 @@ Gem::Specification.new do |s| s.email = %q{paul@pauldix.net} s.homepage = %q{http://github.com/pauldix/sax-machine} - s.summary = %q{Declarative SAX Parsing with Nokogiri} + s.summary = %q{Declarative SAX Parsing with Nokogiri or Ox} s.license = %q{MIT} diff --git a/spec/sax-machine/sax_document_spec.rb b/spec/sax-machine/sax_document_spec.rb index 81caafa..c6ab2af 100644 --- a/spec/sax-machine/sax_document_spec.rb +++ b/spec/sax-machine/sax_document_spec.rb @@ -64,7 +64,7 @@ def title=(val) it "should be available" do @klass.data_class(:date).should == DateTime end - + it "should handle an integer class" do @klass = Class.new do include SAXMachine @@ -73,7 +73,7 @@ def title=(val) document = @klass.parse("5") document.number.should == 5 end - + it "should handle an float class" do @klass = Class.new do include SAXMachine @@ -81,7 +81,7 @@ def title=(val) end document = @klass.parse("5.5") document.number.should == 5.5 - end + end it "should handle an string class" do @klass = Class.new do @@ -91,7 +91,7 @@ def title=(val) document = @klass.parse("5.5") document.number.should == "5.5" end - + it "should handle a time class" do @klass = Class.new do include SAXMachine @@ -100,7 +100,7 @@ def title=(val) document = @klass.parse("") document.time.should == Time.utc(1994, 2, 4, 6, 20, 0, 0) end - + end describe "the required attribute" do it "should be available" do @@ -466,7 +466,7 @@ class Item elements :item, :as => :items, :with => {:type => /Foo/}, :class => Foo end end - + it "should cast into the correct class" do document = @klass.parse("Bar titleFoo title") document.items.size.should == 2 @@ -536,14 +536,14 @@ class Foo end end end - + describe "when dealing with element names containing dashes" do - it 'should automatically convert dashes to underscores' do + it 'should automatically convert dashes to underscores' do class Dashes include SAXMachine element :dashed_element end - + parsed = Dashes.parse('Text') parsed.dashed_element.should eq "Text" end @@ -835,4 +835,25 @@ def title=(blah) @item.authors.last.role.should == 'artist' end end + + describe "with error handling" do + before do + @xml = %[ + + sweet + ] + + class ItemElement5 + include SAXMachine + element :title + end + + @errors = [] + @item = ItemElement5.parse(@xml, ->(x) { @errors << x }) + end + + it 'should have error' do + @errors.uniq.size.should == 1 + end + end end diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb index 4c510b3..d81c22f 100644 --- a/spec/spec_helper.rb +++ b/spec/spec_helper.rb @@ -7,6 +7,10 @@ end require File.expand_path(File.dirname(__FILE__) + '/../lib/sax-machine') +if ENV['HANDLER'] == 'ox' + require 'sax-machine/handlers/sax_ox_handler' + SAXMachine.handler = :ox +end RSpec.configure do |config| config.treat_symbols_as_metadata_keys_with_true_values = true From 56a36ef8bce119a87982cdab8c3b1c2d83420542 Mon Sep 17 00:00:00 2001 From: Dmitry Krasnoukhov Date: Wed, 11 Jun 2014 21:56:50 +0300 Subject: [PATCH 2/4] Ox: improve code a bit --- .../handlers/sax_abstract_handler.rb | 2 +- .../handlers/sax_nokogiri_handler.rb | 4 ++-- lib/sax-machine/handlers/sax_ox_handler.rb | 22 +++++++++---------- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/lib/sax-machine/handlers/sax_abstract_handler.rb b/lib/sax-machine/handlers/sax_abstract_handler.rb index b390584..3ba430f 100644 --- a/lib/sax-machine/handlers/sax_abstract_handler.rb +++ b/lib/sax-machine/handlers/sax_abstract_handler.rb @@ -19,7 +19,7 @@ def _initialize(object, on_error = nil, on_warning = nil) @on_warning = on_warning end - def _value(data) + def _characters(data) node = stack.last if node.buffer == NO_BUFFER diff --git a/lib/sax-machine/handlers/sax_nokogiri_handler.rb b/lib/sax-machine/handlers/sax_nokogiri_handler.rb index 643e94a..fb28329 100644 --- a/lib/sax-machine/handlers/sax_nokogiri_handler.rb +++ b/lib/sax-machine/handlers/sax_nokogiri_handler.rb @@ -5,8 +5,8 @@ class SAXNokogiriHandler < Nokogiri::XML::SAX::Document include SAXAbstractHandler alias_method :initialize, :_initialize - alias_method :characters, :_value - alias_method :cdata_block, :_value + alias_method :characters, :_characters + alias_method :cdata_block, :_characters alias_method :start_element, :_start_element alias_method :end_element, :_end_element alias_method :error, :_error diff --git a/lib/sax-machine/handlers/sax_ox_handler.rb b/lib/sax-machine/handlers/sax_ox_handler.rb index 4e43963..f099eda 100644 --- a/lib/sax-machine/handlers/sax_ox_handler.rb +++ b/lib/sax-machine/handlers/sax_ox_handler.rb @@ -4,13 +4,9 @@ module SAXMachine class SAXOxHandler < Ox::Sax include SAXAbstractHandler - alias_method :text, :_value - alias_method :cdata, :_value - alias_method :end_element, :_end_element - - def initialize(object, on_error = nil, on_warning = nil) - _initialize(object, on_error, on_warning) - _reset + def initialize(*args) + _initialize(*args) + _reset_element end def attr(name, str) @@ -18,8 +14,8 @@ def attr(name, str) end def attrs_done - _start_element(@element || "", @attrs) - _reset + _start_element(@element, @attrs) + _reset_element end def start_element(name) @@ -30,11 +26,15 @@ def error(message, line, column) _error("#{message} on line #{line} column #{column}") end + alias_method :text, :_characters + alias_method :cdata, :_characters + alias_method :end_element, :_end_element + private - def _reset + def _reset_element @attrs = {} - @element = nil + @element = "" end end end From 899ed084fff33ad349d617797d65589a463041c1 Mon Sep 17 00:00:00 2001 From: Dmitry Krasnoukhov Date: Mon, 16 Jun 2014 18:48:47 +0300 Subject: [PATCH 3/4] Ox: add skip option and content spec --- Gemfile | 2 +- lib/sax-machine/sax_document.rb | 1 + spec/benchmarks/benchmark.rb | 4 ++-- spec/fixtures/atom-content.html | 15 +++++++++++++++ spec/{sax-machine => fixtures}/atom.xml | 0 spec/sax-machine/sax_document_spec.rb | 16 ++++++++++------ 6 files changed, 29 insertions(+), 9 deletions(-) create mode 100644 spec/fixtures/atom-content.html rename spec/{sax-machine => fixtures}/atom.xml (100%) diff --git a/Gemfile b/Gemfile index 41b6361..07e2cd7 100644 --- a/Gemfile +++ b/Gemfile @@ -9,4 +9,4 @@ group :development, :test do gem 'activerecord', '~> 4.1' end -gem 'ox', '~> 2.1' if ENV['HANDLER'] == 'ox' +gem 'ox', github: 'ohler55/ox' if ENV['HANDLER'] == 'ox' diff --git a/lib/sax-machine/sax_document.rb b/lib/sax-machine/sax_document.rb index 0823744..13a45c8 100644 --- a/lib/sax-machine/sax_document.rb +++ b/lib/sax-machine/sax_document.rb @@ -15,6 +15,7 @@ def parse(xml_text, on_error = nil, on_warning = nil) { symbolize: false, convert_special: true, + skip: :skip_return, } ) else diff --git a/spec/benchmarks/benchmark.rb b/spec/benchmarks/benchmark.rb index 4391efc..2e1398f 100644 --- a/spec/benchmarks/benchmark.rb +++ b/spec/benchmarks/benchmark.rb @@ -26,7 +26,7 @@ class Atom elements :entry, :as => :entries, :class => AtomEntry end end -feed_text = File.read("spec/sax-machine/atom.xml") +feed_text = File.read("spec/fixtures/atom.xml") benchmark do |t| t.report("feedzirra") do @@ -65,7 +65,7 @@ class Atom # element :title, String # has_many :entry, Entry # end -# feed_text = File.read("spec/sax-machine/atom.xml") +# feed_text = File.read("spec/fixtures/atom.xml") # # benchmark do |t| # t.report("sax-machine") do diff --git a/spec/fixtures/atom-content.html b/spec/fixtures/atom-content.html new file mode 100644 index 0000000..825815a --- /dev/null +++ b/spec/fixtures/atom-content.html @@ -0,0 +1,15 @@ + +

In my previous post about the speed of serializing data, I concluded that Marshal was the quickest way to get things done. So I set about using Marshal to store some data in an ActiveRecord object. Things worked great at first, but on some test data I got this error: marshal data too short. Luckily, Bryan Helmkamp had helpfully pointed out that there were sometimes problems with storing marshaled data in the database. He said it was best to base64 encode the marshal dump before storing.

+ +

I was curious why it was working on some things and not others. It turns out that some types of data being marshaled were causing the error to pop up. Here's the test data I used in my specs:

+
{ :foo => 3, :bar => 2 } # hash with symbols for keys and integer values
[3, 2.1, 4, 8]           # array with integer and float values
+

Everything worked when I switched the array values to all integers so it seems that floats were causing the problem. However, in the interest of keeping everything working regardless of data types, I base64 encoded before going into the database and decoded on the way out.

+ +

I also ran the benchmarks again to determine what impact this would have on speed. Here are the results for 100 iterations on a 10k element array and a 10k element hash with and without base64 encode/decode:

+
                user       system     total       real
array marshal  0.200000   0.010000   0.210000 (  0.214018) (without Base64)
array marshal  0.220000   0.010000   0.230000 (  0.250260)

hash marshal   1.830000   0.040000   1.870000 (  1.892874) (without Base64)
hash marshal   2.040000   0.100000   2.140000 (  2.170405)
+

As you can see the difference in speed is pretty negligible. I assume that the error has to do with AR cleaning the stuff that gets inserted into the database, but I'm not really sure. In the end it's just easier to use Base64.encode64 when serializing data into a text field in ActiveRecord using Marshal.

+ +

I've also read people posting about this error when using the database session store. I can only assume that it's because they were trying to store either way too much data in their session (too much for a regular text field) or they were storing float values or some other data type that would cause this to pop up. Hopefully this helps.

+
+ +
\ No newline at end of file diff --git a/spec/sax-machine/atom.xml b/spec/fixtures/atom.xml similarity index 100% rename from spec/sax-machine/atom.xml rename to spec/fixtures/atom.xml diff --git a/spec/sax-machine/sax_document_spec.rb b/spec/sax-machine/sax_document_spec.rb index c6ab2af..bf61a76 100644 --- a/spec/sax-machine/sax_document_spec.rb +++ b/spec/sax-machine/sax_document_spec.rb @@ -551,7 +551,7 @@ class Dashes describe "full example" do before :each do - @xml = File.read('spec/sax-machine/atom.xml') + @xml = File.read('spec/fixtures/atom.xml') class AtomEntry include SAXMachine element :title @@ -570,17 +570,21 @@ class Atom element :link, :value => :href, :as => :feed_url, :with => {:type => "application/atom+xml"} elements :entry, :as => :entries, :class => AtomEntry end + + @feed = Atom.parse(@xml) end # before it "should parse the url" do - f = Atom.parse(@xml) - f.url.should == "http://www.pauldix.net/" + @feed.url.should == "http://www.pauldix.net/" end it "should parse entry url" do - f = Atom.parse(@xml) - f.entries.first.url.should == "http://www.pauldix.net/2008/09/marshal-data-to.html?param1=1¶m2=2" - f.entries.first.alternate.should == "http://feeds.feedburner.com/~r/PaulDixExplainsNothing/~3/383536354/marshal-data-to.html?param1=1¶m2=2" + @feed.entries.first.url.should == "http://www.pauldix.net/2008/09/marshal-data-to.html?param1=1¶m2=2" + @feed.entries.first.alternate.should == "http://feeds.feedburner.com/~r/PaulDixExplainsNothing/~3/383536354/marshal-data-to.html?param1=1¶m2=2" + end + + it "should parse content" do + @feed.entries.first.content.should == File.read('spec/fixtures/atom-content.html') end end From 9087bb32bcd88bd23852ae9514098ec9154df1dd Mon Sep 17 00:00:00 2001 From: Dmitry Krasnoukhov Date: Thu, 17 Jul 2014 19:36:14 +0300 Subject: [PATCH 4/4] Ox: update ox gem --- Gemfile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Gemfile b/Gemfile index 07e2cd7..95d6d5a 100644 --- a/Gemfile +++ b/Gemfile @@ -7,6 +7,5 @@ group :development, :test do gem 'guard-rspec' gem 'simplecov', require: false, platforms: :mri gem 'activerecord', '~> 4.1' + gem 'ox', '>= 2.1.2' end - -gem 'ox', github: 'ohler55/ox' if ENV['HANDLER'] == 'ox'