From 7d7068e4fae90d438c9b7914a983ac17b8885d68 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexander=20K=C3=B6plinger?= Date: Thu, 16 Oct 2014 16:35:23 +0200 Subject: [PATCH] Added an html validation check by looking at Nokogiri errors --- README.md | 6 +++ bin/htmlproof | 2 + lib/html/proofer.rb | 4 +- lib/html/proofer/checks.rb | 3 +- lib/html/proofer/checks/html.rb | 24 +++++++++ .../fixtures/html/div_inside_head.html | 6 +++ .../proofer/fixtures/html/html5_tags.html | 9 ++++ .../proofer/fixtures/html/invalid_tag.html | 3 ++ .../fixtures/html/missing_closing_quotes.html | 5 ++ .../html/opening_and_ending_tag_mismatch.html | 7 +++ .../unescaped_ampersand_in_attribute.html | 4 ++ .../fixtures/html/unmatched_end_tag.html | 5 ++ spec/html/proofer/html_spec.rb | 51 +++++++++++++++++++ 13 files changed, 127 insertions(+), 2 deletions(-) create mode 100644 lib/html/proofer/checks/html.rb create mode 100644 spec/html/proofer/fixtures/html/div_inside_head.html create mode 100644 spec/html/proofer/fixtures/html/html5_tags.html create mode 100644 spec/html/proofer/fixtures/html/invalid_tag.html create mode 100644 spec/html/proofer/fixtures/html/missing_closing_quotes.html create mode 100644 spec/html/proofer/fixtures/html/opening_and_ending_tag_mismatch.html create mode 100644 spec/html/proofer/fixtures/html/unescaped_ampersand_in_attribute.html create mode 100644 spec/html/proofer/fixtures/html/unmatched_end_tag.html create mode 100644 spec/html/proofer/html_spec.rb diff --git a/README.md b/README.md index c6b8cc54..3b4fb7da 100644 --- a/README.md +++ b/README.md @@ -122,6 +122,11 @@ Project | Repository * Whether your internal script references are not broken * Whether external scripts are loading +### HTML + +Nokogiri looks at the markup and [provides errors](http://www.nokogiri.org/tutorials/ensuring_well_formed_markup.html) when parsing your document. +This is an optional feature: set the `validate_html` option to `true` to enable validation errors from Nokogiri. 
+ ## Configuration The `HTML::Proofer` constructor takes an optional hash of additional options: @@ -138,6 +143,7 @@ The `HTML::Proofer` constructor takes an optional hash of additional options: | `href_swap` | A hash containing key-value pairs of `RegExp => String`. It transforms links that match `RegExp` into `String` via `gsub`. | `{}` | | `verbose` | If `true`, outputs extra information as the checking happens. Useful for debugging. | `false` | | `only_4xx` | Only reports errors for links that fall within the 4xx status code range. | `false` | +| `validate_html` | Enables HTML validation errors from Nokogiri. | `false` | ### Configuring Typhoeus diff --git a/bin/htmlproof b/bin/htmlproof index a27be83e..9697f237 100755 --- a/bin/htmlproof +++ b/bin/htmlproof @@ -24,6 +24,7 @@ Mercenary.program(:htmlproof) do |p| p.option 'only-4xx', '--only-4xx', 'Only reports errors for links that fall within the 4x status code range.' p.option 'verbose', '--verbose', 'Enables more verbose logging.' p.option 'directory_index_file', '--directory_index_file', 'Sets the file to look for when a link refers to a directory.' + p.option 'validate_html', '--validate_html', 'Enables HTML validation errors from Nokogiri (default: `false`).' p.action do |args, opts| args = ["."] if args.empty? @@ -45,6 +46,7 @@ Mercenary.program(:htmlproof) do |p| options[:favicon] = opts["favicon"] unless opts["favicon"].nil? options[:verbose] = opts["verbose"] unless opts["verbose"].nil? options[:directory_index_file] = opts["directory_index_file"] unless opts["directory_index_file"].nil? + options[:validate_html] = opts["validate_html"] unless opts["validate_html"].nil? 
+ path = path.delete(' ').split(",") if opts["as-links"] diff --git a/lib/html/proofer.rb b/lib/html/proofer.rb index d722f869..815ea243 100644 --- a/lib/html/proofer.rb +++ b/lib/html/proofer.rb @@ -41,7 +41,8 @@ def initialize(src, opts={}) :disable_external => false, :verbose => false, :only_4xx => false, - :directory_index_file => "index.html" + :directory_index_file => "index.html", + :validate_html => false } @typhoeus_opts = { @@ -191,6 +192,7 @@ def self.create_nokogiri(path) def get_checks checks = HTML::Proofer::Checks::Check.subclasses.map { |c| c.name } checks.delete("Favicons") unless @options[:favicon] + checks.delete("Html") unless @options[:validate_html] checks end diff --git a/lib/html/proofer/checks.rb b/lib/html/proofer/checks.rb index fb6d6b41..9e9f39f7 100644 --- a/lib/html/proofer/checks.rb +++ b/lib/html/proofer/checks.rb @@ -7,7 +7,8 @@ class Checks "checks/images", "checks/links", "checks/scripts", - "checks/favicon" + "checks/favicon", + "checks/html" ].each { |r| require File.join(File.dirname(__FILE__), r) } end end diff --git a/lib/html/proofer/checks/html.rb b/lib/html/proofer/checks/html.rb new file mode 100644 index 00000000..85be90d8 --- /dev/null +++ b/lib/html/proofer/checks/html.rb @@ -0,0 +1,24 @@ +# encoding: utf-8 + +class Html < ::HTML::Proofer::Checks::Check + + # new html5 tags (source: http://www.w3schools.com/html/html5_new_elements.asp) + HTML5_TAGS = %w(article aside bdi details dialog figcaption + figure footer header main mark menuitem meter + nav progress rp rt ruby section summary + time wbr datalist keygen output color date + datetime datetime-local email month number + range search tel time url week canvas + svg audio embed source track video) + + def run + @html.errors.each do |e| + + # Nokogiri (or rather libxml2 under the hood) only recognizes html4 tags, + # so we need to skip errors caused by the new tags in html5 + next if HTML5_TAGS.include? 
e.to_s[/Tag ([\w-]+) invalid/o, 1] + + self.add_issue(e.to_s) + end + end +end diff --git a/spec/html/proofer/fixtures/html/div_inside_head.html b/spec/html/proofer/fixtures/html/div_inside_head.html new file mode 100644 index 00000000..2de8461d --- /dev/null +++ b/spec/html/proofer/fixtures/html/div_inside_head.html @@ -0,0 +1,6 @@ + + +
+
+ + diff --git a/spec/html/proofer/fixtures/html/html5_tags.html b/spec/html/proofer/fixtures/html/html5_tags.html new file mode 100644 index 00000000..0ad4ae71 --- /dev/null +++ b/spec/html/proofer/fixtures/html/html5_tags.html @@ -0,0 +1,9 @@ + + + +
+

Some text

+
+ + diff --git a/spec/html/proofer/fixtures/html/invalid_tag.html b/spec/html/proofer/fixtures/html/invalid_tag.html new file mode 100644 index 00000000..04e97935 --- /dev/null +++ b/spec/html/proofer/fixtures/html/invalid_tag.html @@ -0,0 +1,3 @@ + + + diff --git a/spec/html/proofer/fixtures/html/missing_closing_quotes.html b/spec/html/proofer/fixtures/html/missing_closing_quotes.html new file mode 100644 index 00000000..37683ec4 --- /dev/null +++ b/spec/html/proofer/fixtures/html/missing_closing_quotes.html @@ -0,0 +1,5 @@ + + + + + diff --git a/spec/html/proofer/fixtures/html/unmatched_end_tag.html b/spec/html/proofer/fixtures/html/unmatched_end_tag.html new file mode 100644 index 00000000..dcdb1c5d --- /dev/null +++ b/spec/html/proofer/fixtures/html/unmatched_end_tag.html @@ -0,0 +1,5 @@ + + + + + diff --git a/spec/html/proofer/html_spec.rb b/spec/html/proofer/html_spec.rb new file mode 100644 index 00000000..4a1f56df --- /dev/null +++ b/spec/html/proofer/html_spec.rb @@ -0,0 +1,51 @@ +require "spec_helper" + +describe "Html test" do + it "ignores an invalid tag by default" do + html = "#{FIXTURES_DIR}/html/invalid_tag.html" + output = capture_stderr { HTML::Proofer.new(html).run } + output.should == "" + end + + it "doesn't fail for html5 tags" do + html = "#{FIXTURES_DIR}/html/html5_tags.html" + output = capture_stderr { HTML::Proofer.new(html, {:validate_html => true}).run } + output.should == "" + end + + it "fails for an invalid tag" do + html = "#{FIXTURES_DIR}/html/invalid_tag.html" + output = capture_stderr { HTML::Proofer.new(html, {:validate_html => true}).run } + output.should match /Tag myfancytag invalid/ + end + + it "fails for an unmatched end tag" do + html = "#{FIXTURES_DIR}/html/unmatched_end_tag.html" + output = capture_stderr { HTML::Proofer.new(html, {:validate_html => true}).run } + output.should match /Unexpected end tag : div/ + end + + it "fails for an unescaped ampersand in attribute" do + html = 
"#{FIXTURES_DIR}/html/unescaped_ampersand_in_attribute.html" + output = capture_stderr { HTML::Proofer.new(html, {:validate_html => true}).run } + output.should match /htmlParseEntityRef: expecting ';'/ + end + + it "fails for mismatch between opening and ending tag" do + html = "#{FIXTURES_DIR}/html/opening_and_ending_tag_mismatch.html" + output = capture_stderr { HTML::Proofer.new(html, {:validate_html => true}).run } + output.should match /Opening and ending tag mismatch: p and strong/ + end + + it "fails for div inside head" do + html = "#{FIXTURES_DIR}/html/div_inside_head.html" + output = capture_stderr { HTML::Proofer.new(html, {:validate_html => true}).run } + output.should match /Unexpected end tag : head/ + end + + it "fails for missing closing quotation mark in href" do + html = "#{FIXTURES_DIR}/html/missing_closing_quotes.html" + output = capture_stderr { HTML::Proofer.new(html, {:validate_html => true}).run } + output.should match /Couldn't find end of Start Tag a/ + end +end