Skip to content

Commit

Permalink
Merge pull request #370 from gjtorikian/update-semantics
Browse files (browse the repository at this point in the history)
Update Selma signatures
  • Loading branch information
gjtorikian authored Dec 26, 2022
2 parents 9ace3a6 + d804541 commit b207c62
Show file tree
Hide file tree
Showing 8 changed files with 50 additions and 65 deletions.
6 changes: 3 additions & 3 deletions lib/html_pipeline/node_filter/emoji_filter.rb
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,10 @@ def selector
Selma::Selector.new(match_text_within: "*", ignore_text_within: ignored_ancestor_tags)
end

def handle_text(text)
return text unless text.include?(":")
def handle_text_chunk(text)
return unless text.to_s.include?(":")

emoji_image_filter(text)
text.replace(emoji_image_filter(text.to_s), as: :html)
end

# Implementation of validate hook.
Expand Down
3 changes: 2 additions & 1 deletion lib/html_pipeline/node_filter/image_max_width_filter.rb
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,9 @@ def handle_element(element)

def link_image(element)
link_start = %(<a target="_blank" href="#{element["src"]}">)
element.before(link_start, as: :html)
link_end = "</a>"
element.wrap(link_start, link_end, :as_html)
element.after(link_end, as: :html)
end
end
end
Expand Down
11 changes: 6 additions & 5 deletions lib/html_pipeline/node_filter/mention_filter.rb
Original file line number Diff line number Diff line change
Expand Up @@ -72,13 +72,14 @@ def selector
SELECTOR
end

def handle_text(text)
return text unless text.include?("@")
def handle_text_chunk(text)
content = text.to_s
return unless content.include?("@")

html = mention_link_filter(text, base_url: base_url, username_pattern: username_pattern)
return text if html == text
html = mention_link_filter(content, base_url: base_url, username_pattern: username_pattern)
return if html == content

html
text.replace(html, as: :html)
end

# The URL to provide when someone @mentions a "mention" name, such
Expand Down
10 changes: 6 additions & 4 deletions lib/html_pipeline/node_filter/syntax_highlight_filter.rb
Original file line number Diff line number Diff line change
Expand Up @@ -35,11 +35,13 @@ def handle_element(element)
element["class"] = "#{scope} #{scope}-#{@lang}" if include_lang?
end

def handle_text(text)
return text if @lang.nil?
return text if (lexer = lexer_for(@lang)).nil?
def handle_text_chunk(text)
return if @lang.nil?
return if (lexer = lexer_for(@lang)).nil?

highlight_with_timeout_handling(text, lexer)
content = text.to_s

text.replace(highlight_with_timeout_handling(content, lexer), as: :html)
end

def highlight_with_timeout_handling(text, lexer)
Expand Down
8 changes: 3 additions & 5 deletions lib/html_pipeline/node_filter/table_of_contents_filter.rb
Original file line number Diff line number Diff line change
Expand Up @@ -57,15 +57,13 @@ def handle_element(element)
element["id"] = header_id
element["class"] = classes

element.set_inner_content(anchor_html, :as_html)
element.set_inner_content(anchor_html, as: :html)

result[:toc] << { href: header_href }
end

def handle_text(text)
result[:toc].last[:text] = text

text
def handle_text_chunk(text)
result[:toc].last[:text] = text.to_s
end
end
end
Expand Down
7 changes: 4 additions & 3 deletions lib/html_pipeline/node_filter/team_mention_filter.rb
Original file line number Diff line number Diff line change
Expand Up @@ -62,10 +62,11 @@ def selector
SELECTOR
end

def handle_text(text)
return text unless text.include?("@")
def handle_text_chunk(text)
content = text.to_s
return unless content.include?("@")

mention_link_filter(text, base_url: base_url, team_pattern: team_pattern)
text.replace(mention_link_filter(content, base_url: base_url, team_pattern: team_pattern), as: :html)
end

def team_pattern
Expand Down
66 changes: 24 additions & 42 deletions lib/html_pipeline/sanitization_filter.rb
Original file line number Diff line number Diff line change
Expand Up @@ -13,37 +13,35 @@ class HTMLPipeline
#
# This filter does not write additional information to the context.
class SanitizationFilter
LISTS = Set.new(["ul", "ol"].freeze)
LIST_ITEM = "li"

# List of table child elements. These must be contained by a <table> element
# or they are not allowed through. Otherwise they can be used to break out
# of places we're using tables to contain formatted user content (like pull
# request review comments).
TABLE_ITEMS = Set.new(["tr", "td", "th"].freeze)
TABLE = "table"
TABLE_SECTIONS = Set.new(["thead", "tbody", "tfoot"].freeze)

# These schemes are the only ones allowed in <a href> attributes by default.
PROTOCOLS = ["http", "https", "mailto", "xmpp", :relative, "irc", "ircs"].freeze
VALID_PROTOCOLS = Selma::Sanitizer::Config::VALID_PROTOCOLS.dup

# The main sanitization allowlist. Only these elements and attributes are
# allowed through by default.
DEFAULT_CONFIG = {
elements: ["h1", "h2", "h3", "h4", "h5", "h6", "h7", "h8", "br", "b", "i", "strong", "em", "a", "pre", "code", "img", "tt", "div", "ins", "del", "sup", "sub", "p", "ol", "ul", "table", "thead", "tbody", "tfoot", "blockquote", "dl", "dt", "dd", "kbd", "q", "samp", "var", "hr", "ruby", "rt", "rp", "li", "tr", "td", "th", "s", "strike", "summary", "details", "caption", "figure", "figcaption", "abbr", "bdo", "cite", "dfn", "mark", "small", "span", "time", "wbr"].freeze,
remove_contents: ["script"].freeze,
elements: ["h1", "h2", "h3", "h4", "h5", "h6", "br", "b", "i", "strong", "em", "a", "pre", "code",
"img", "tt", "div", "ins", "del", "sup", "sub", "p", "ol", "ul", "table", "thead", "tbody", "tfoot",
"blockquote", "dl", "dt", "dd", "kbd", "q", "samp", "var", "hr", "ruby", "rt", "rp", "li", "tr", "td", "th",
"s", "strike", "summary", "details", "caption", "figure", "figcaption", "abbr", "bdo", "cite",
"dfn", "mark", "small", "span", "time", "wbr",],

attributes: {
"a" => ["href"].freeze,
"img" => ["src", "longdesc"].freeze,
"div" => ["itemscope", "itemtype"].freeze,
"blockquote" => ["cite"].freeze,
"del" => ["cite"].freeze,
"ins" => ["cite"].freeze,
"q" => ["cite"].freeze,
all: ["abbr", "accept", "accept-charset", "accesskey", "action", "align", "alt", "aria-describedby", "aria-hidden", "aria-label", "aria-labelledby", "axis", "border", "cellpadding", "cellspacing", "char", "charoff", "charset", "checked", "clear", "cols", "colspan", "color", "compact", "coords", "datetime", "dir", "disabled", "enctype", "for", "frame", "headers", "height", "hreflang", "hspace", "ismap", "label", "lang", "maxlength", "media", "method", "multiple", "name", "nohref", "noshade", "nowrap", "open", "progress", "prompt", "readonly", "rel", "rev", "role", "rows", "rowspan", "rules", "scope", "selected", "shape", "size", "span", "start", "summary", "tabindex", "target", "title", "type", "usemap", "valign", "value", "vspace", "width", "itemprop"].freeze,
}.freeze,
"a" => ["href"],
"img" => ["src", "longdesc"],
"div" => ["itemscope", "itemtype"],
"blockquote" => ["cite"],
"del" => ["cite"],
"ins" => ["cite"],
"q" => ["cite"],
all: ["abbr", "accept", "accept-charset", "accesskey", "action", "align", "alt", "aria-describedby",
"aria-hidden", "aria-label", "aria-labelledby", "axis", "border", "cellpadding", "cellspacing", "char",
"charoff", "charset", "checked", "clear", "cols", "colspan", "color", "compact", "coords", "datetime", "dir",
"disabled", "enctype", "for", "frame", "headers", "height", "hreflang", "hspace", "id", "ismap", "label", "lang",
"maxlength", "media", "method", "multiple", "name", "nohref", "noshade", "nowrap", "open", "progress",
"prompt", "readonly", "rel", "rev", "role", "rows", "rowspan", "rules", "scope", "selected", "shape",
"size", "span", "start", "summary", "tabindex", "title", "type", "usemap", "valign", "value", "width", "itemprop",],
},
protocols: {
"a" => { "href" => PROTOCOLS }.freeze,
"a" => { "href" => Selma::Sanitizer::Config::VALID_PROTOCOLS }.freeze,
"blockquote" => { "cite" => ["http", "https", :relative].freeze },
"del" => { "cite" => ["http", "https", :relative].freeze },
"ins" => { "cite" => ["http", "https", :relative].freeze },
Expand All @@ -53,23 +51,7 @@ class SanitizationFilter
"longdesc" => ["http", "https", :relative].freeze,
}.freeze,
},
transformers: [
# Top-level <li> elements are removed because they can break out of
# containing markup.
lambda { |env|
name = env[:node_name]
node = env[:node]
node.replace(node.children) if name == LIST_ITEM && node.ancestors.none? { |n| LISTS.include?(n.name) }
},

# Table child elements that are not contained by a <table> are removed.
lambda { |env|
name = env[:node_name]
node = env[:node]
node.replace(node.children) if (TABLE_SECTIONS.include?(name) || TABLE_ITEMS.include?(name)) && node.ancestors.none? { |n| n.name == TABLE }
},
].freeze,
}.freeze
}

class << self
def call(html, config)
Expand Down
4 changes: 2 additions & 2 deletions test/sanitization_filter_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -118,11 +118,11 @@ def test_uses_anchor_schemes_from_allowlist_when_not_separately_specified
end

def test_allowlist_contains_default_anchor_schemes
assert_equal(["http", "https", "mailto", "xmpp", :relative, "irc", "ircs"], SanitizationFilter::DEFAULT_CONFIG[:protocols]["a"]["href"])
assert_equal(["http", "https", "mailto", :relative], SanitizationFilter::DEFAULT_CONFIG[:protocols]["a"]["href"])
end

def test_exports_default_anchor_schemes
assert_equal(["http", "https", "mailto", "xmpp", :relative, "irc", "ircs"], SanitizationFilter::PROTOCOLS)
assert_equal(["http", "https", "mailto", :relative], SanitizationFilter::VALID_PROTOCOLS)
end

def test_script_contents_are_removed
Expand Down

0 comments on commit b207c62

Please sign in to comment.