Skip to content

Commit

Permalink
Added an html validation check by looking at Nokogiri errors
Browse files Browse the repository at this point in the history
  • Loading branch information
akoeplinger committed Oct 29, 2014
1 parent c30c183 commit e413f18
Show file tree
Hide file tree
Showing 12 changed files with 112 additions and 2 deletions.
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,11 @@ Project | Repository
* Whether your internal script references are not broken
* Whether external scripts are loading

### HTML

Nokogiri looks at the markup and [provides errors](http://www.nokogiri.org/tutorials/ensuring_well_formed_markup.html) when parsing your document.
This is an optional feature; set the `validate_html` option to `true` to enable validation errors from Nokogiri.

## Configuration

The `HTML::Proofer` constructor takes an optional hash of additional options:
Expand All @@ -138,6 +143,7 @@ The `HTML::Proofer` constructor takes an optional hash of additional options:
| `href_swap` | A hash containing key-value pairs of `RegExp => String`. It transforms links that match `RegExp` into `String` via `gsub`. | `{}` |
| `verbose` | If `true`, outputs extra information as the checking happens. Useful for debugging. | `false` |
| `only_4xx` | Only reports errors for links that fall within the 4xx status code range. | `false` |
| `validate_html` | If `true`, reports the HTML validation errors found by Nokogiri. | `false` |

### Configuring Typhoeus

Expand Down
2 changes: 2 additions & 0 deletions bin/htmlproof
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ Mercenary.program(:htmlproof) do |p|
p.option 'only-4xx', '--only-4xx', 'Only reports errors for links that fall within the 4x status code range.'
p.option 'verbose', '--verbose', 'Enables more verbose logging.'
p.option 'directory_index_file', '--directory_index_file', 'Sets the file to look for when a link refers to a directory.'
p.option 'validate_html', '--validate_html', 'Enables HTML validation errors from Nokogiri (default: `false`).'

p.action do |args, opts|
args = ["."] if args.empty?
Expand All @@ -45,6 +46,7 @@ Mercenary.program(:htmlproof) do |p|
options[:favicon] = opts["favicon"] unless opts["favicon"].nil?
options[:verbose] = opts["verbose"] unless opts["verbose"].nil?
options[:directory_index_file] = opts["directory_index_file"] unless opts["directory_index_file"].nil?
options[:validate_html] = opts["validate_html"] unless opts["validate_html"].nil?

path = path.delete(' ').split(",") if opts["as-links"]

Expand Down
4 changes: 3 additions & 1 deletion lib/html/proofer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,8 @@ def initialize(src, opts={})
:disable_external => false,
:verbose => false,
:only_4xx => false,
:directory_index_file => "index.html"
:directory_index_file => "index.html",
:validate_html => false
}

@typhoeus_opts = {
Expand Down Expand Up @@ -191,6 +192,7 @@ def self.create_nokogiri(path)
# Returns the names of the checks to run: every subclass of
# HTML::Proofer::Checks::Check, minus the checks that are opt-in
# ("Favicons" and "Html") when their option is not set.
def get_checks
  disabled = []
  disabled << "Favicons" unless @options[:favicon]
  disabled << "Html" unless @options[:validate_html]

  HTML::Proofer::Checks::Check.subclasses.map(&:name) - disabled
end

Expand Down
3 changes: 2 additions & 1 deletion lib/html/proofer/checks.rb
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@ class Checks
"checks/images",
"checks/links",
"checks/scripts",
"checks/favicon"
"checks/favicon",
"checks/html"
].each { |r| require File.join(File.dirname(__FILE__), r) }
end
end
Expand Down
24 changes: 24 additions & 0 deletions lib/html/proofer/checks/html.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# encoding: utf-8

# Optional check that turns the parse errors exposed by `@html.errors`
# into proofer issues via `add_issue`.
class Html < ::HTML::Proofer::Checks::Check

  # new html5 tags (source: http://www.w3schools.com/html/html5_new_elements.asp)
  # NOTE(review): several entries (color, date, datetime, ..., week) look like
  # HTML5 input *type* values rather than element names -- confirm whether
  # they belong in this list.
  HTML5_TAGS = %w(article aside bdi details dialog figcaption
                  figure footer header main mark menuitem meter
                  nav progress rp rt ruby section summary
                  time wbr datalist keygen output color date
                  datetime datetime-local email month number
                  range search tel url week canvas
                  svg audio embed source track video).freeze

  # Matches a "Tag <name> invalid" error message and captures the tag name.
  INVALID_TAG_PATTERN = /Tag ([\w-]+) invalid/

  # Reports every parse error of the document as an issue, except
  # "invalid tag" errors whose tag name is listed in HTML5_TAGS.
  def run
    @html.errors.each do |error|
      message = error.to_s

      # Nokogiri (or rather libxml2 under the hood) only recognizes html4 tags,
      # so we need to skip errors caused by the new tags in html5
      next if HTML5_TAGS.include?(message[INVALID_TAG_PATTERN, 1])

      add_issue(message)
    end
  end
end
6 changes: 6 additions & 0 deletions spec/html/proofer/fixtures/html/div_inside_head.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
<html>
<head>
<div>
</div>
</head>
</html>
3 changes: 3 additions & 0 deletions spec/html/proofer/fixtures/html/invalid_tag.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
<html>
<myfancytag></myfancytag>
</html>
5 changes: 5 additions & 0 deletions spec/html/proofer/fixtures/html/missing_closing_quotes.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
<html>
<body>
<a href="/test>A test link</a>
</body>
</html>
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
<html>
<body>
<p>The quick <strong>brown fox</p>
jumped over the
<p>lazy</strong> dog.</p>
</body>
</html>
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
<html>
<body data-something="up&down">
</body>
</html>
5 changes: 5 additions & 0 deletions spec/html/proofer/fixtures/html/unmatched_end_tag.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
<html>
<body>
</div>
</body>
</html>
45 changes: 45 additions & 0 deletions spec/html/proofer/html_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
require "spec_helper"

# Exercises the optional Html check against deliberately malformed fixtures.
describe "Html test" do
  # Runs the proofer on a fixture from the html/ fixtures directory with the
  # given options and returns what was captured from stderr.
  def proofer_stderr(fixture_name, opts = {})
    path = "#{FIXTURES_DIR}/html/#{fixture_name}"
    capture_stderr { HTML::Proofer.new(path, opts).run }
  end

  # Same as proofer_stderr, but with HTML validation switched on.
  def validation_stderr(fixture_name)
    proofer_stderr(fixture_name, {:validate_html => true})
  end

  it "ignores an invalid tag by default" do
    proofer_stderr("invalid_tag.html").should == ""
  end

  it "fails for an invalid tag" do
    validation_stderr("invalid_tag.html").should match(/Tag myfancytag invalid/)
  end

  it "fails for an unmatched end tag" do
    validation_stderr("unmatched_end_tag.html").should match(/Unexpected end tag : div/)
  end

  it "fails for an unescaped ampersand in attribute" do
    validation_stderr("unescaped_ampersand_in_attribute.html").should match(/htmlParseEntityRef: expecting ';'/)
  end

  it "fails for mismatch between opening and ending tag" do
    validation_stderr("opening_and_ending_tag_mismatch.html").should match(/Opening and ending tag mismatch: p and strong/)
  end

  it "fails for div inside head" do
    validation_stderr("div_inside_head.html").should match(/Unexpected end tag : head/)
  end

  it "fails for missing closing quotation mark in href" do
    validation_stderr("missing_closing_quotes.html").should match(/Couldn't find end of Start Tag a/)
  end
end

0 comments on commit e413f18

Please sign in to comment.