diff --git a/Gemfile b/Gemfile index eb93987..95d6d5a 100644 --- a/Gemfile +++ b/Gemfile @@ -7,4 +7,5 @@ group :development, :test do gem 'guard-rspec' gem 'simplecov', require: false, platforms: :mri gem 'activerecord', '~> 4.1' + gem 'ox', '>= 2.1.2' end diff --git a/README.md b/README.md index da45186..3738200 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ ## Description -A declarative SAX parsing library backed by Nokogiri +A declarative SAX parsing library backed by Nokogiri or Ox ## Usage ```ruby @@ -104,4 +104,4 @@ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. \ No newline at end of file +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/lib/sax-machine.rb b/lib/sax-machine.rb index 9c9c86d..e31a04f 100644 --- a/lib/sax-machine.rb +++ b/lib/sax-machine.rb @@ -1,8 +1,18 @@ require "sax-machine/version" require "sax-machine/sax_document" require "sax-machine/sax_configure" -require "sax-machine/sax_handler" require "sax-machine/sax_config" +require "sax-machine/handlers/sax_abstract_handler" +require "sax-machine/handlers/sax_nokogiri_handler" module SAXMachine -end \ No newline at end of file + @@handler = :nokogiri + + def self.handler + @@handler + end + + def self.handler=(handler) + @@handler = handler + end +end diff --git a/lib/sax-machine/sax_handler.rb b/lib/sax-machine/handlers/sax_abstract_handler.rb similarity index 93% rename from lib/sax-machine/sax_handler.rb rename to lib/sax-machine/handlers/sax_abstract_handler.rb index 8a7c553..3ba430f 100644 --- a/lib/sax-machine/sax_handler.rb +++ b/lib/sax-machine/handlers/sax_abstract_handler.rb @@ -1,8 +1,7 @@ -require "nokogiri" require "time" module SAXMachine - class SAXHandler < Nokogiri::XML::SAX::Document + module SAXAbstractHandler NO_BUFFER = :no_buffer class StackNode < Struct.new(:object, :config, :buffer) @@ -13,14 +12,14 @@ def initialize(object, config = nil, buffer = NO_BUFFER) end end - def initialize(object, on_error = nil, on_warning = nil) + def _initialize(object, on_error = nil, on_warning = nil) @stack = [ StackNode.new(object) ] @parsed_configs = {} @on_error = on_error @on_warning = on_warning end - def characters(data) + def _characters(data) node = stack.last if node.buffer == NO_BUFFER @@ -29,10 +28,8 @@ def characters(data) node.buffer << data end end - alias cdata_block characters - - def start_element(name, attrs = []) + def _start_element(name, attrs = []) name = normalize_name(name) node = stack.last object = node.object @@ -76,7 +73,7 @@ def start_element(name, attrs = []) end end - def end_element(name) + def _end_element(name) name = normalize_name(name) start_tag = stack[-2] @@ -134,30 +131,29 @@ def end_element(name) stack.pop end - private - - def mark_as_parsed(object, element_config) - unless element_config.collection? - @parsed_configs[[object.object_id, element_config.object_id]] = true + def _error(string) + if @on_error + @on_error.call(string) end end - def parsed_config?(object, element_config) - @parsed_configs[[object.object_id, element_config.object_id]] - end - - def warning(string) + def _warning(string) if @on_warning @on_warning.call(string) end end - def error(string) - if @on_error - @on_error.call(string) + private + + def mark_as_parsed(object, element_config) + unless element_config.collection? + @parsed_configs[[object.object_id, element_config.object_id]] = true end end + def parsed_config?(object, element_config) + @parsed_configs[[object.object_id, element_config.object_id]] + end def sax_config_for(object) if object.class.respond_to?(:sax_config) diff --git a/lib/sax-machine/handlers/sax_nokogiri_handler.rb b/lib/sax-machine/handlers/sax_nokogiri_handler.rb new file mode 100644 index 0000000..fb28329 --- /dev/null +++ b/lib/sax-machine/handlers/sax_nokogiri_handler.rb @@ -0,0 +1,15 @@ +require "nokogiri" + +module SAXMachine + class SAXNokogiriHandler < Nokogiri::XML::SAX::Document + include SAXAbstractHandler + + alias_method :initialize, :_initialize + alias_method :characters, :_characters + alias_method :cdata_block, :_characters + alias_method :start_element, :_start_element + alias_method :end_element, :_end_element + alias_method :error, :_error + alias_method :warning, :_warning + end +end diff --git a/lib/sax-machine/handlers/sax_ox_handler.rb b/lib/sax-machine/handlers/sax_ox_handler.rb new file mode 100644 index 0000000..f099eda --- /dev/null +++ b/lib/sax-machine/handlers/sax_ox_handler.rb @@ -0,0 +1,40 @@ +require "ox" + +module SAXMachine + class SAXOxHandler < Ox::Sax + include SAXAbstractHandler + + def initialize(*args) + _initialize(*args) + _reset_element + end + + def attr(name, str) + @attrs[name] = str + end + + def attrs_done + _start_element(@element, @attrs) + _reset_element + end + + def start_element(name) + @element = name + end + + def error(message, line, column) + _error("#{message} on line #{line} column #{column}") + end + + alias_method :text, :_characters + alias_method :cdata, :_characters + alias_method :end_element, :_end_element + + private + + def _reset_element + @attrs = {} + @element = "" + end + end +end diff --git a/lib/sax-machine/sax_document.rb b/lib/sax-machine/sax_document.rb index f39163a..13a45c8 100644 --- a/lib/sax-machine/sax_document.rb +++ b/lib/sax-machine/sax_document.rb @@ -8,11 +8,24 @@ def self.included(base) end def parse(xml_text, on_error = nil, on_warning = nil) - sax_handler = SAXHandler.new(self, on_error, on_warning) - parser = Nokogiri::XML::SAX::Parser.new(sax_handler) - parser.parse(xml_text) do |ctx| - ctx.replace_entities = true + if SAXMachine.handler == :ox + Ox.sax_parse( + SAXOxHandler.new(self, on_error, on_warning), + StringIO.new(xml_text), + { + symbolize: false, + convert_special: true, + skip: :skip_return, + } + ) + else + handler = SAXNokogiriHandler.new(self, on_error, on_warning) + parser = Nokogiri::XML::SAX::Parser.new(handler) + parser.parse(xml_text) do |ctx| + ctx.replace_entities = true + end end + self end diff --git a/sax-machine.gemspec b/sax-machine.gemspec index 60b5133..bc951b6 100644 --- a/sax-machine.gemspec +++ b/sax-machine.gemspec @@ -9,7 +9,7 @@ Gem::Specification.new do |s| s.email = %q{paul@pauldix.net} s.homepage = %q{http://github.com/pauldix/sax-machine} - s.summary = %q{Declarative SAX Parsing with Nokogiri} + s.summary = %q{Declarative SAX Parsing with Nokogiri or Ox} s.license = %q{MIT} diff --git a/spec/benchmarks/benchmark.rb b/spec/benchmarks/benchmark.rb index 4391efc..2e1398f 100644 --- a/spec/benchmarks/benchmark.rb +++ b/spec/benchmarks/benchmark.rb @@ -26,7 +26,7 @@ class Atom elements :entry, :as => :entries, :class => AtomEntry end end -feed_text = File.read("spec/sax-machine/atom.xml") +feed_text = File.read("spec/fixtures/atom.xml") benchmark do |t| t.report("feedzirra") do @@ -65,7 +65,7 @@ class Atom # element :title, String # has_many :entry, Entry # end -# feed_text = File.read("spec/sax-machine/atom.xml") +# feed_text = File.read("spec/fixtures/atom.xml") # # benchmark do |t| # t.report("sax-machine") do diff --git a/spec/fixtures/atom-content.html b/spec/fixtures/atom-content.html new file mode 100644 index 0000000..825815a --- /dev/null +++ b/spec/fixtures/atom-content.html @@ -0,0 +1,15 @@ + +
In my previous post about the speed of serializing data, I concluded that Marshal was the quickest way to get things done. So I set about using Marshal to store some data in an ActiveRecord object. Things worked great at first, but on some test data I got this error: marshal data too short. Luckily, Bryan Helmkamp had helpfully pointed out that there were sometimes problems with storing marshaled data in the database. He said it was best to base64 encode the marshal dump before storing.
+ +I was curious why it was working on some things and not others. It turns out that some types of data being marshaled were causing the error to pop up. Here's the test data I used in my specs:
+{ :foo => 3, :bar => 2 } # hash with symbols for keys and integer values+
[3, 2.1, 4, 8] # array with integer and float values
Everything worked when I switched the array values to all integers so it seems that floats were causing the problem. However, in the interest of keeping everything working regardless of data types, I base64 encoded before going into the database and decoded on the way out.
+ +I also ran the benchmarks again to determine what impact this would have on speed. Here are the results for 100 iterations on a 10k element array and a 10k element hash with and without base64 encode/decode:
+user system total real+
array marshal 0.200000 0.010000 0.210000 ( 0.214018) (without Base64)
array marshal 0.220000 0.010000 0.230000 ( 0.250260)
hash marshal 1.830000 0.040000 1.870000 ( 1.892874) (without Base64)
hash marshal 2.040000 0.100000 2.140000 ( 2.170405)
As you can see the difference in speed is pretty negligible. I assume that the error has to do with AR cleaning the stuff that gets inserted into the database, but I'm not really sure. In the end it's just easier to use Base64.encode64 when serializing data into a text field in ActiveRecord using Marshal.
+ +I've also read people posting about this error when using the database session store. I can only assume that it's because they were trying to store either way too much data in their session (too much for a regular text field) or they were storing float values or some other data type that would cause this to pop up. Hopefully this helps.