← Index
NYTProf Performance Profile   « line view »
For /usr/local/libexec/sympa/task_manager-debug.pl
  Run on Tue Jun 1 22:32:51 2021
Reported on Tue Jun 1 22:35:14 2021

Filename/usr/local/lib/perl5/site_perl/HTML/StripScripts.pm
StatementsExecuted 0 statements in 0s
Subroutines
Calls P F Exclusive
Time
Inclusive
Time
Subroutine
0000s0sHTML::StripScripts::::BEGIN@1030HTML::StripScripts::BEGIN@1030
0000s0sHTML::StripScripts::::BEGIN@1032HTML::StripScripts::BEGIN@1032
0000s0sHTML::StripScripts::::BEGIN@1234HTML::StripScripts::BEGIN@1234
0000s0sHTML::StripScripts::::BEGIN@1236HTML::StripScripts::BEGIN@1236
0000s0sHTML::StripScripts::::BEGIN@1263HTML::StripScripts::BEGIN@1263
0000s0sHTML::StripScripts::::BEGIN@1265HTML::StripScripts::BEGIN@1265
0000s0sHTML::StripScripts::::BEGIN@1294HTML::StripScripts::BEGIN@1294
0000s0sHTML::StripScripts::::BEGIN@1296HTML::StripScripts::BEGIN@1296
0000s0sHTML::StripScripts::::BEGIN@1541HTML::StripScripts::BEGIN@1541
0000s0sHTML::StripScripts::::BEGIN@1543HTML::StripScripts::BEGIN@1543
0000s0sHTML::StripScripts::::BEGIN@2HTML::StripScripts::BEGIN@2
0000s0sHTML::StripScripts::::BEGIN@3HTML::StripScripts::BEGIN@3
0000s0sHTML::StripScripts::::BEGIN@5HTML::StripScripts::BEGIN@5
0000s0sHTML::StripScripts::::BEGIN@923HTML::StripScripts::BEGIN@923
0000s0sHTML::StripScripts::::BEGIN@925HTML::StripScripts::BEGIN@925
0000s0sHTML::StripScripts::::__ANON__[:1951]HTML::StripScripts::__ANON__[:1951]
0000s0sHTML::StripScripts::::_hss_accept_input_endHTML::StripScripts::_hss_accept_input_end
0000s0sHTML::StripScripts::::_hss_accept_input_startHTML::StripScripts::_hss_accept_input_start
0000s0sHTML::StripScripts::::_hss_attval_colorHTML::StripScripts::_hss_attval_color
0000s0sHTML::StripScripts::::_hss_attval_hrefHTML::StripScripts::_hss_attval_href
0000s0sHTML::StripScripts::::_hss_attval_novalueHTML::StripScripts::_hss_attval_novalue
0000s0sHTML::StripScripts::::_hss_attval_numberHTML::StripScripts::_hss_attval_number
0000s0sHTML::StripScripts::::_hss_attval_sizeHTML::StripScripts::_hss_attval_size
0000s0sHTML::StripScripts::::_hss_attval_srcHTML::StripScripts::_hss_attval_src
0000s0sHTML::StripScripts::::_hss_attval_styleHTML::StripScripts::_hss_attval_style
0000s0sHTML::StripScripts::::_hss_attval_stylesrcHTML::StripScripts::_hss_attval_stylesrc
0000s0sHTML::StripScripts::::_hss_attval_textHTML::StripScripts::_hss_attval_text
0000s0sHTML::StripScripts::::_hss_attval_wordHTML::StripScripts::_hss_attval_word
0000s0sHTML::StripScripts::::_hss_attval_wordlistHTML::StripScripts::_hss_attval_wordlist
0000s0sHTML::StripScripts::::_hss_attval_wordlistqHTML::StripScripts::_hss_attval_wordlistq
0000s0sHTML::StripScripts::::_hss_close_innermost_tagHTML::StripScripts::_hss_close_innermost_tag
0000s0sHTML::StripScripts::::_hss_contextHTML::StripScripts::_hss_context
0000s0sHTML::StripScripts::::_hss_decode_numericHTML::StripScripts::_hss_decode_numeric
0000s0sHTML::StripScripts::::_hss_get_attr_filterHTML::StripScripts::_hss_get_attr_filter
0000s0sHTML::StripScripts::::_hss_get_to_valid_contextHTML::StripScripts::_hss_get_to_valid_context
0000s0sHTML::StripScripts::::_hss_join_attribsHTML::StripScripts::_hss_join_attribs
0000s0sHTML::StripScripts::::_hss_prepare_ban_listHTML::StripScripts::_hss_prepare_ban_list
0000s0sHTML::StripScripts::::_hss_prepare_rulesHTML::StripScripts::_hss_prepare_rules
0000s0sHTML::StripScripts::::_hss_tag_is_bannedHTML::StripScripts::_hss_tag_is_banned
0000s0sHTML::StripScripts::::_hss_valid_in_contextHTML::StripScripts::_hss_valid_in_context
0000s0sHTML::StripScripts::::_hss_valid_in_current_contextHTML::StripScripts::_hss_valid_in_current_context
0000s0sHTML::StripScripts::::canonical_form_to_attvalHTML::StripScripts::canonical_form_to_attval
0000s0sHTML::StripScripts::::canonical_form_to_textHTML::StripScripts::canonical_form_to_text
0000s0sHTML::StripScripts::::escape_html_metacharsHTML::StripScripts::escape_html_metachars
0000s0sHTML::StripScripts::::filter_textHTML::StripScripts::filter_text
0000s0sHTML::StripScripts::::filtered_documentHTML::StripScripts::filtered_document
0000s0sHTML::StripScripts::::hss_initHTML::StripScripts::hss_init
0000s0sHTML::StripScripts::::init_attrib_whitelistHTML::StripScripts::init_attrib_whitelist
0000s0sHTML::StripScripts::::init_attval_whitelistHTML::StripScripts::init_attval_whitelist
0000s0sHTML::StripScripts::::init_context_whitelistHTML::StripScripts::init_context_whitelist
0000s0sHTML::StripScripts::::init_deinter_whitelistHTML::StripScripts::init_deinter_whitelist
0000s0sHTML::StripScripts::::init_style_whitelistHTML::StripScripts::init_style_whitelist
0000s0sHTML::StripScripts::::input_commentHTML::StripScripts::input_comment
0000s0sHTML::StripScripts::::input_declarationHTML::StripScripts::input_declaration
0000s0sHTML::StripScripts::::input_endHTML::StripScripts::input_end
0000s0sHTML::StripScripts::::input_end_documentHTML::StripScripts::input_end_document
0000s0sHTML::StripScripts::::input_processHTML::StripScripts::input_process
0000s0sHTML::StripScripts::::input_startHTML::StripScripts::input_start
0000s0sHTML::StripScripts::::input_start_documentHTML::StripScripts::input_start_document
0000s0sHTML::StripScripts::::input_textHTML::StripScripts::input_text
0000s0sHTML::StripScripts::::newHTML::StripScripts::new
0000s0sHTML::StripScripts::::outputHTML::StripScripts::output
0000s0sHTML::StripScripts::::output_stack_entryHTML::StripScripts::output_stack_entry
0000s0sHTML::StripScripts::::output_startHTML::StripScripts::output_start
0000s0sHTML::StripScripts::::output_start_documentHTML::StripScripts::output_start_document
0000s0sHTML::StripScripts::::quoted_to_canonical_formHTML::StripScripts::quoted_to_canonical_form
0000s0sHTML::StripScripts::::reject_startHTML::StripScripts::reject_start
0000s0sHTML::StripScripts::::strip_nonprintableHTML::StripScripts::strip_nonprintable
0000s0sHTML::StripScripts::::text_to_canonical_formHTML::StripScripts::text_to_canonical_form
0000s0sHTML::StripScripts::::unquoted_to_canonical_formHTML::StripScripts::unquoted_to_canonical_form
0000s0sHTML::StripScripts::::validate_href_attributeHTML::StripScripts::validate_href_attribute
0000s0sHTML::StripScripts::::validate_mailtoHTML::StripScripts::validate_mailto
Call graph for these subroutines as a Graphviz dot language file.
Line State
ments
Time
on line
Calls Time
in subs
Code
1package HTML::StripScripts;
2use strict;
3use warnings FATAL => 'all';
4
5use vars qw($VERSION);
6$VERSION = '1.06';
7
8=head1 NAME
9
10HTML::StripScripts - Strip scripting constructs out of HTML
11
12=head1 SYNOPSIS
13
14 use HTML::StripScripts;
15
16 my $hss = HTML::StripScripts->new({ Context => 'Inline' });
17
18 $hss->input_start_document;
19
20 $hss->input_start('<i>');
21 $hss->input_text('hello, world!');
22 $hss->input_end('</i>');
23
24 $hss->input_end_document;
25
26 print $hss->filtered_document;
27
28=head1 DESCRIPTION
29
30This module strips scripting constructs out of HTML, leaving as
31much non-scripting markup in place as possible. This allows web
32applications to display HTML originating from an untrusted source
33without introducing XSS (cross site scripting) vulnerabilities.
34
35You will probably use L<HTML::StripScripts::Parser> rather than using
36this module directly.
37
38The process is based on whitelists of tags, attributes and attribute
39values. This approach is the most secure against disguised scripting
40constructs hidden in malicious HTML documents.
41
42As well as removing scripting constructs, this module ensures that
43there is a matching end for each start tag, and that the tags are
44properly nested.
45
46Previously, in order to customise the output, you needed to subclass
47C<HTML::StripScripts> and override methods. Now, most customisation
48can be done through the C<Rules> option provided to C<new()>. (See
49examples/declaration/ and examples/tags/ for cases where subclassing is
50necessary.)
51
52The HTML document must be parsed into start tags, end tags and
53text before it can be filtered by this module. Use either
54L<HTML::StripScripts::Parser> or L<HTML::StripScripts::Regex> instead
55if you want to input an unparsed HTML document.
56
57See examples/direct/ for an example of how to feed tokens directly to
58 HTML::StripScripts.
59
60=head1 CONSTRUCTORS
61
62=over
63
64=item new ( CONFIG )
65
66Creates a new C<HTML::StripScripts> filter object, bound to a
67particular filtering policy. If present, the CONFIG parameter
68must be a hashref. The following keys are recognized (unrecognized
69keys will be silently ignored).
70
71 $s = HTML::Stripscripts->new({
72 Context => 'Document|Flow|Inline|NoTags',
73 BanList => [qw( br img )] | {br => '1', img => '1'},
74 BanAllBut => [qw(p div span)],
75 AllowSrc => 0|1,
76 AllowHref => 0|1,
77 AllowRelURL => 0|1,
78 AllowMailto => 0|1,
79 EscapeFiltered => 0|1,
80 Rules => { See below for details },
81 });
82
83=over
84
85=item C<Context>
86
87A string specifying the context in which the filtered document
88will be used. This influences the set of tags that will be
89allowed.
90
91If present, the C<Context> value must be one of:
92
93=over
94
95=item C<Document>
96
97If C<Context> is C<Document> then the filter will allow a full
98HTML document, including the C<HTML> tag and C<HEAD> and C<BODY>
99sections.
100
101=item C<Flow>
102
103If C<Context> is C<Flow> then most of the cosmetic tags that one
104would expect to find in a document body are allowed, including
105lists and tables but not including forms.
106
107=item C<Inline>
108
109If C<Context> is C<Inline> then only inline tags such as C<B>
110and C<FONT> are allowed.
111
112=item C<NoTags>
113
114If C<Context> is C<NoTags> then no tags are allowed.
115
116=back
117
118The default C<Context> value is C<Flow>.
119
120=item C<BanList>
121
122If present, this option must be an arrayref or a hashref. Any tag that
123would normally be allowed (because it presents no XSS hazard) will be
124blocked if the lowercase name of the tag is in this list.
125
126For example, in a guestbook application where C<HR> tags are used to
127separate posts, you may wish to prevent posts from including C<HR>
128tags, even though C<HR> is not an XSS risk.
129
130=item C<BanAllBut>
131
132If present, this option must be reference to an array holding a list of
133lowercase tag names. This has the effect of adding all but the listed
134tags to the ban list, so that only those tags listed will be allowed.
135
136=item C<AllowSrc>
137
138By default, the filter won't allow constructs that cause the browser to
139fetch things automatically, such as C<SRC> attributes in C<IMG> tags.
140If this option is present and true then those constructs will be
141allowed.
142
143=item C<AllowHref>
144
145By default, the filter won't allow constructs that cause the browser to
146fetch things if the user clicks on something, such as the C<HREF>
147attribute in C<A> tags. Set this option to a true value to allow this
148type of construct.
149
150=item C<AllowRelURL>
151
152By default, the filter won't allow relative URLs such as C<../foo.html>
153in C<SRC> and C<HREF> attribute values. Set this option to a true value
154to allow them. C<AllowHref> and / or C<AllowSrc> also need to be set to true
155for this to have any effect.
156
157=item C<AllowMailto>
158
159By default, C<mailto:> links are not allowed. If C<AllowMailto> is set to
160a true value, then this construct will be allowed. This can be enabled
161separately from AllowHref.
162
163=item C<EscapeFiltered>
164
165By default, any filtered tags are outputted as C<< <!--filtered--> >>. If
166C<EscapeFiltered> is set to a true value, then the filtered tags are converted
167to HTML entities.
168
169For instance:
170
171 <br> --> &lt;br&gt;
172
173=item C<Rules>
174
175The C<Rules> option provides a very flexible way of customising the filter.
176
177The focus is safety-first, so it is applied after all of the previous validation.
178This means that you cannot all malicious data should already have been cleared.
179
180Rules can be specified for tags and for attributes. Any tag or attribute
181not explicitly listed will be handled by the default C<*> rules.
182
183The following is a synopsis of all of the options that you can use to
184configure rules. Below, an example is broken into sections and explained.
185
186 Rules => {
187
188 tag => 0 | 1 | sub { tag_callback }
189 | {
190 attr => 0 | 1 | 'regex' | qr/regex/ | sub { attr_callback},
191 '*' => 0 | 1 | 'regex' | qr/regex/ | sub { attr_callback},
192 required => [qw(attrname attrname)],
193 tag => sub { tag_callback }
194 },
195
196 '*' => 0 | 1 | sub { tag_callback }
197 | {
198 attr => 0 | 1 | 'regex' | qr/regex/ | sub { attr_callback},
199 '*' => 0 | 1 | 'regex' | qr/regex/ | sub { attr_callback},
200 tag => sub { tag_callback }
201 }
202
203 }
204
205EXAMPLE:
206
207 Rules => {
208
209 ##########################
210 ##### EXPLICIT RULES #####
211 ##########################
212
213 ## Allow <br> tags, reject <img> tags
214 br => 1,
215 img => 0,
216
217 ## Send all <div> tags to a sub
218 div => sub { tag_callback },
219
220 ## Allow <blockquote> tags,and allow the 'cite' attribute
221 ## All other attributes are handled by the default C<*>
222 blockquote => {
223 cite => 1,
224 },
225
226 ## Allow <a> tags, and
227 a => {
228
229 ## Allow the 'title' attribute
230 title => 1,
231
232 ## Allow the 'href' attribute if it matches the regex
233 href => '^http://yourdomain.com'
234 OR href => qr{^http://yourdomain.com},
235
236 ## 'style' attributes are handled by a sub
237 style => sub { attr_callback },
238
239 ## All other attributes are rejected
240 '*' => 0,
241
242 ## Additionally, the <a> tag should be handled by this sub
243 tag => sub { tag_callback},
244
245 ## If the <a> tag doesn't have these attributes, filter the tag
246 required => [qw(href title)],
247
248 },
249
250 ##########################
251 ##### DEFAULT RULES #####
252 ##########################
253
254 ## The default '*' rule - accepts all the same options as above.
255 ## If a tag or attribute is not mentioned above, then the default
256 ## rule is applied:
257
258 ## Reject all tags
259 '*' => 0,
260
261 ## Allow all tags and all attributes
262 '*' => 1,
263
264 ## Send all tags to the sub
265 '*' => sub { tag_callback },
266
267 ## Allow all tags, reject all attributes
268 '*' => { '*' => 0 },
269
270 ## Allow all tags, and
271 '*' => {
272
273 ## Allow the 'title' attribute
274 title => 1,
275
276 ## Allow the 'href' attribute if it matches the regex
277 href => '^http://yourdomain.com'
278 OR href => qr{^http://yourdomain.com},
279
280 ## 'style' attributes are handled by a sub
281 style => sub { attr_callback },
282
283 ## All other attributes are rejected
284 '*' => 0,
285
286 ## Additionally, all tags should be handled by this sub
287 tag => sub { tag_callback},
288
289 },
290
291=over
292
293=item Tag Callbacks
294
295 sub tag_callback {
296 my ($filter,$element) = (@_);
297
298 $element = {
299 tag => 'tag',
300 content => 'inner_html',
301 attr => {
302 attr_name => 'attr_value',
303 }
304 };
305 return 0 | 1;
306 }
307
308A tag callback accepts two parameters, the C<$filter> object and the C$element>.
309It should return C<0> to completely ignore the tag and its content (which includes
310any nested HTML tags), or C<1> to accept and output the tag.
311
312The C<$element> is a hash ref containing the keys:
313
314=item C<tag>
315
316This is the tagname in lowercase, eg C<a>, C<br>, C<img>. If you set
317the tag value to an empty string, then the tag will not be outputted, but
318the tag contents will.
319
320=item C<content>
321
322This is the equivalent of DOM's innerHTML. It contains the text content
323and any HTML tags contained within this element. You can change the content
324or set it to an empty string so that it is not outputted.
325
326=item C<attr>
327
328C<attr> contains a hashref containing the attribute names and values
329
330=back
331
332If for instance, you wanted to replace C<< <b> >> tags with C<< <span> >> tags,
333you could do this:
334
335 sub b_callback {
336 my ($filter,$element) = @_;
337 $element->{tag} = 'span';
338 $element->{attr}{style} = 'font-weight:bold';
339 return 1;
340 }
341
342=item Attribute Callbacks
343
344 sub attr_callback {
345 my ( $filter, $tag, $attr_name, $attr_val ) = @_;
346 return undef | '' | 'value';
347 }
348
349Attribute callbacks accept four parameters, the C<$filter> object, the C<$tag>
350name, the C<$attr_name> and the C<$attr_value>.
351
352It should return either C<undef> to reject the attribute, or the value to be
353used. An empty string keeps the attribute, but without a value.
354
355=item C<BanList> vs C<BanAllBut> vs C<Rules>
356
357It is not necessary to use C<BanList> or C<BanAllBut> - everything can be done
358via C<Rules>, however it may be simpler to write:
359
360 BanAllBut => [qw(p div span)]
361
362The logic works as follows:
363
364 * If BanAllBut exists, then ban everything but the tags in the list
365 * Add to the ban list any elements in BanList
366 * Any tags mentioned explicitly in Rules (eg a => 0, br => 1)
367 are added or removed from the BanList
368 * A default rule of { '*' => 0 } would ban all tags except
369 those mentioned in Rules
370 * A default rule of { '*' => 1 } would allow all tags except
371 those disallowed in the ban list, or by explicit rules
372
373=back
374
375=cut
376
377sub new {
378 my ( $pkg, $cfg ) = @_;
379
380 my $self = bless {}, ref $pkg || $pkg;
381 $self->hss_init($cfg);
382 return $self;
383}
384
385=back
386
387=head1 METHODS
388
389This class provides the following methods:
390
391=over
392
393=item hss_init ()
394
395This method is called by new() and does the actual initialisation work
396for the new HTML::StripScripts object.
397
398=cut
399
400sub hss_init {
401 my ( $self, $cfg ) = @_;
402 $cfg ||= {};
403
404 $self->{_hssCfg} = $cfg;
405
406 $self->{_hssContext} = $self->init_context_whitelist;
407 $self->{_hssAttrib} = $self->init_attrib_whitelist;
408 $self->{_hssAttVal} = $self->init_attval_whitelist;
409 $self->{_hssStyle} = $self->init_style_whitelist;
410 $self->{_hssDeInter} = $self->init_deinter_whitelist;
411 $self->{_hssBanList} = $self->_hss_prepare_ban_list($cfg);
412 $self->{_hssRules} = $self->_hss_prepare_rules($cfg);
413}
414
415=item input_start_document ()
416
417This method initializes the filter, and must be called once before
418starting on each HTML document to be filtered.
419
420=cut
421
422sub input_start_document {
423 my ( $self, $context ) = @_;
424
425 $self->{_hssStack} = [ { NAME => '',
426 CTX => $self->{_hssCfg}{Context} || 'Flow',
427 CONTENT => '',
428 }
429 ];
430 $self->{_hssOutput} = '';
431
432 $self->output_start_document;
433}
434
435=item input_start ( TEXT )
436
437Handles a start tag from the input document. TEXT must be the
438full text of the tag, including angle-brackets.
439
440=cut
441
442sub input_start {
443 my ( $self, $text ) = @_;
444
445 $self->_hss_accept_input_start($text) or $self->reject_start($text);
446}
447
448sub _hss_accept_input_start {
449 my ( $self, $text ) = @_;
450
451 return 0 unless $text =~ m|^<([a-zA-Z0-9]+)\b(.*)>$|m;
452 my ( $tag, $attr ) = ( lc $1, $self->strip_nonprintable($2) );
453
454 return 0 if $self->{_hssSkipToEnd};
455 if ( $tag eq 'script' or $tag eq 'style' ) {
456 $self->{_hssSkipToEnd} = $tag;
457 return 0;
458 }
459
460 return 0 if $self->_hss_tag_is_banned($tag);
461
462 my $allowed_attr = $self->{_hssAttrib}{$tag};
463 return 0 unless defined $allowed_attr;
464
465 return 0 unless $self->_hss_get_to_valid_context($tag);
466
467 my $default_filters = $self->{_hssRules}{'*'};
468 my $tag_filters = $self->{_hssRules}{$tag}
469 || $default_filters;
470
471 my %filtered_attr;
472 while ( $attr
473 =~ s#^\s*([\w\-]+)(?:\s*=\s*(?:([^"'>\s]+)|"([^"]*)"|'([^']*)'))?## )
474 {
475 my $key = lc $1;
476 my $val = ( defined $2 ? $self->unquoted_to_canonical_form($2)
477 : defined $3 ? $self->quoted_to_canonical_form($3)
478 : defined $4 ? $self->quoted_to_canonical_form($4)
479 : ''
480 );
481
482 my $value_class = $allowed_attr->{$key};
483 next unless defined $value_class;
484
485 my $attval_handler = $self->{_hssAttVal}{$value_class};
486 next unless defined $attval_handler;
487
488 my $attr_filter;
489 if ($tag_filters) {
490 $attr_filter =
491 $self->_hss_get_attr_filter( $default_filters, $tag_filters,
492 $key );
493
494 # filter == 0
495 next unless $attr_filter;
496 }
497
498 my $filtered_value = &{$attval_handler}( $self, $tag, $key, $val );
499 next unless defined $filtered_value;
500
501 # send value to filter if sub
502 if ( $tag_filters && ref $attr_filter ) {
503 $filtered_value
504 = $attr_filter->( $self, $tag, $key, $filtered_value );
505 next unless defined $filtered_value;
506 }
507
508 $filtered_attr{$key} = $filtered_value;
509
510 }
511
512 # Check required attributes
513 if ( my $required = $tag_filters->{required} ) {
514 foreach my $key (@$required) {
515 return 0
516 unless defined $filtered_attr{$key} && length($filtered_attr{$key});
517 }
518 }
519
520 # Check for callback
521 my $tag_callback = $tag_filters && $tag_filters->{tag}
522 || $default_filters->{tag};
523
524 my $new_context = $self->{_hssContext}{ $self->{_hssStack}[0]{CTX} }{$tag};
525
526 my %stack_entry = ( NAME => $tag,
527 ATTR => \%filtered_attr,
528 CTX => $new_context,
529 CALLBACK => $tag_callback,
530 CONTENT => '',
531 );
532 if ( $new_context eq 'EMPTY' ) {
533 $self->output_stack_entry( \%stack_entry );
534 }
535 else {
536 unshift @{ $self->{_hssStack} }, \%stack_entry;
537
538 }
539
540 return 1;
541}
542
543=item input_end ( TEXT )
544
545Handles an end tag from the input document. TEXT must be the
546full text of the end tag, including angle-brackets.
547
548=cut
549
550sub input_end {
551 my ( $self, $text ) = @_;
552
553 $self->_hss_accept_input_end($text) or $self->reject_end($text);
554}
555
556sub _hss_accept_input_end {
557 my ( $self, $text ) = @_;
558
559 return 0 unless $text =~ m#^</(\w+)>$#;
560 my $tag = lc $1;
561
562 if ( $self->{_hssSkipToEnd} ) {
563 if ( $self->{_hssSkipToEnd} eq $tag ) {
564 delete $self->{_hssSkipToEnd};
565 }
566 return 0;
567 }
568
569 # Ignore a close without an open
570 return 0 unless grep { $_->{NAME} eq $tag } @{ $self->{_hssStack} };
571
572 # Close open tags up to the matching open
573 my @close = ();
574
575 while ( scalar @{ $self->{_hssStack} } ) {
576 my $entry = shift @{ $self->{_hssStack} };
577 push @close, $entry;
578 $self->output_stack_entry($entry);
579 $entry->{CONTENT} = '';
580 last if $entry->{NAME} eq $tag;
581 }
582
583 # Reopen any we closed early if all that were closed are
584 # configured to be auto de-interleaved.
585 unless ( grep { !$self->{_hssDeInter}{ $_->{NAME} } } @close ) {
586 pop @close;
587 unshift @{ $self->{_hssStack} }, @close;
588 }
589
590 return 1;
591}
592
593=item input_text ( TEXT )
594
595Handles some non-tag text from the input document.
596
597=cut
598
599sub input_text {
600 my ( $self, $text ) = @_;
601
602 return if $self->{_hssSkipToEnd};
603
604 $text = $self->strip_nonprintable($text);
605
606 if ( $text =~ /^(\s*)$/ ) {
607 $self->output_text($1);
608 return;
609 }
610
611 unless ( $self->_hss_get_to_valid_context('CDATA') ) {
612 $self->reject_text($text);
613 return;
614 }
615
616 my $filtered = $self->filter_text( $self->text_to_canonical_form($text) );
617 $self->output_text( $self->canonical_form_to_text($filtered) );
618}
619
620=item input_process ( TEXT )
621
622Handles a processing instruction from the input document.
623
624=cut
625
626sub input_process {
627 my ( $self, $text ) = @_;
628
629 $self->reject_process($text);
630}
631
632=item input_comment ( TEXT )
633
634Handles an HTML comment from the input document.
635
636=cut
637
638sub input_comment {
639 my ( $self, $text ) = @_;
640
641 $self->reject_comment($text);
642}
643
644=item input_declaration ( TEXT )
645
646Handles an declaration from the input document.
647
648=cut
649
650sub input_declaration {
651 my ( $self, $text ) = @_;
652
653 $self->reject_declaration($text);
654}
655
656=item input_end_document ()
657
658Call this method to signal the end of the input document.
659
660=cut
661
662sub input_end_document {
663 my ($self) = @_;
664
665 delete $self->{_hssSkipToEnd};
666
667 while ( @{ $self->{_hssStack} } > 1 ) {
668 $self->output_stack_entry( shift @{ $self->{_hssStack} } );
669 }
670
671 $self->output_end_document;
672 my $last_entry = shift @{ $self->{_hssStack} };
673 $self->{_hssOutput} = $last_entry->{CONTENT};
674 delete $self->{_hssStack};
675
676}
677
678=item filtered_document ()
679
680Returns the filtered document as a string.
681
682=cut
683
684sub filtered_document {
685 my ($self) = @_;
686 $self->{_hssOutput};
687}
688
689=back
690
691=cut
692
693=head1 SUBCLASSING
694
695The only reason for subclassing this module now is to add to the
696list of accepted tags, attributes and styles (See
697L</"WHITELIST INITIALIZATION METHODS">). Everything else can be
698achieved with L</"Rules">.
699
700The C<HTML::StripScripts> class is subclassable. Filter objects are plain
701hashes and C<HTML::StripScripts> reserves only hash keys that start with
702C<_hss>. The filter configuration can be set up by invoking the
703hss_init() method, which takes the same arguments as new().
704
705=head1 OUTPUT METHODS
706
707The filter outputs a stream of start tags, end tags, text, comments,
708declarations and processing instructions, via the following C<output_*>
709methods. Subclasses may override these to intercept the filter output.
710
711The default implementations of the C<output_*> methods pass the
712text on to the output() method. The default implementation of the
713output() method appends the text to a string, which can be fetched with
714the filtered_document() method once processing is complete.
715
716If the output() method or the individual C<output_*> methods are
717overridden in a subclass, then filtered_document() will not work in
718that subclass.
719
720=over
721
722=item output_start_document ()
723
724This method gets called once at the start of each HTML document passed
725through the filter. The default implementation does nothing.
726
727=cut
728
729sub output_start_document { }
730
731=item output_end_document ()
732
733This method gets called once at the end of each HTML document passed
734through the filter. The default implementation does nothing.
735
736=cut
737
738*output_end_document = \&output_start_document;
739
740=item output_start ( TEXT )
741
742This method is used to output a filtered start tag.
743
744=cut
745
746sub output_start { $_[0]->output( $_[1] ) }
747
748=item output_end ( TEXT )
749
750This method is used to output a filtered end tag.
751
752=cut
753
754*output_end = \&output_start;
755
756=item output_text ( TEXT )
757
758This method is used to output some filtered non-tag text.
759
760=cut
761
762*output_text = \&output_start;
763
764=item output_declaration ( TEXT )
765
766This method is used to output a filtered declaration.
767
768=cut
769
770*output_declaration = \&output_start;
771
772=item output_comment ( TEXT )
773
774This method is used to output a filtered HTML comment.
775
776=cut
777
778*output_comment = \&output_start;
779
780=item output_process ( TEXT )
781
782This method is used to output a filtered processing instruction.
783
784=cut
785
786*output_process = \&output_start;
787
788=item output ( TEXT )
789
790This method is invoked by all of the default C<output_*> methods. The
791default implementation appends the text to the string that the
792filtered_document() method will return.
793
794=cut
795
796sub output { $_[0]->{_hssStack}[0]{CONTENT} .= $_[1]; }
797
798=item output_stack_entry ( TEXT )
799
800This method is invoked when a tag plus all text and nested HTML content
801within the tag has been processed. It adds the tag plus its content
802to the content for its parent tag.
803
804=cut
805
806sub output_stack_entry {
807 my ( $self, $tag ) = @_;
808
809 my %entry;
810 @entry{qw(tag attr content)} = @{$tag}{qw(NAME ATTR CONTENT)};
811
812 if ( my $tag_callback = $tag->{CALLBACK} ) {
813 $tag_callback->( $self, \%entry )
814 or return;
815 }
816
817 my $tagname = $entry{tag};
818 my $filtered_attrs = $self->_hss_join_attribs( $entry{attr} );
819
820 if ( $tag->{CTX} eq 'EMPTY' ) {
821 $self->output_start("<$tagname$filtered_attrs />")
822 if $entry{tag};
823 return;
824 }
825 if ($tagname) {
826 $self->output_start("<$tagname$filtered_attrs>");
827 }
828
829 if ( defined $entry{content} ) {
830 $self->{_hssStack}[0]{CONTENT} .= $entry{content};
831 }
832
833 if ($tagname) {
834 $self->output_end("</$tagname>");
835 }
836}
837
838=back
839
840=head1 REJECT METHODS
841
842When the filter encounters something in the input document which it
843cannot transform into an acceptable construct, it invokes one of
844the following C<reject_*> methods to put something in the output
845document to take the place of the unacceptable construct.
846
847The TEXT parameter is the full text of the unacceptable construct.
848
849The default implementations of these methods output an HTML comment
850containing the text C<filtered>. If L</"EscapeFiltered">
851is set to true, then the rejected text is HTML escaped instead.
852
853Subclasses may override these methods, but should exercise caution.
854The TEXT parameter is unfiltered input and may contain malicious
855constructs.
856
857=over
858
859=item reject_start ( TEXT )
860
861=item reject_end ( TEXT )
862
863=item reject_text ( TEXT )
864
865=item reject_declaration ( TEXT )
866
867=item reject_comment ( TEXT )
868
869=item reject_process ( TEXT )
870
871=back
872
873=cut
874
875sub reject_start {
876 $_[0]->{_hssCfg}{EscapeFiltered}
877 ? $_[0]->output_text( $_[0]->escape_html_metachars( $_[1] ) )
878 : $_[0]->output_comment('<!--filtered-->');
879}
880*reject_end = \&reject_start;
881*reject_text = \&reject_start;
882*reject_declaration = \&reject_start;
883*reject_comment = \&reject_start;
884*reject_process = \&reject_start;
885
886=head1 WHITELIST INITIALIZATION METHODS
887
888The filter refers to various whitelists to determine which constructs
889are acceptable. To modify these whitelists, subclasses can override
890the following methods.
891
892Each method is called once at object initialization time, and must
893return a reference to a nested data structure. These references are
894installed into the object, and used whenever the filter needs to refer
895to a whitelist.
896
897The default implementations of these methods can be invoked as class
898methods.
899
900See examples/tags/ and examples/declaration/ for examples of how to
901override these methods.
902
903=over
904
905=item init_context_whitelist ()
906
907Returns a reference to the C<Context> whitelist, which determines
908which tags may appear at each point in the document, and which other
909tags may be nested within them.
910
911It is a hash, and the keys are context names, such as C<Flow> and
912C<Inline>.
913
914The values in the hash are hashrefs. The keys in these subhashes are
915lowercase tag names, and the values are context names, specifying the
916context that the tag provides to any other tags nested within it.
917
918The special context C<EMPTY> as a value in a subhash indicates that
919nothing can be nested within that tag.
920
921=cut
922
923use vars qw(%_Context);
924
925BEGIN {
926
927 my %pre_content = ( 'br' => 'EMPTY',
928 'span' => 'Inline',
929 'tt' => 'Inline',
930 'i' => 'Inline',
931 'b' => 'Inline',
932 'u' => 'Inline',
933 's' => 'Inline',
934 'strike' => 'Inline',
935 'em' => 'Inline',
936 'strong' => 'Inline',
937 'dfn' => 'Inline',
938 'code' => 'Inline',
939 'q' => 'Inline',
940 'samp' => 'Inline',
941 'kbd' => 'Inline',
942 'var' => 'Inline',
943 'cite' => 'Inline',
944 'abbr' => 'Inline',
945 'acronym' => 'Inline',
946 'ins' => 'Inline',
947 'del' => 'Inline',
948 'a' => 'Inline',
949 'CDATA' => 'CDATA',
950 );
951
952 my %inline = ( %pre_content,
953 'img' => 'EMPTY',
954 'big' => 'Inline',
955 'small' => 'Inline',
956 'sub' => 'Inline',
957 'sup' => 'Inline',
958 'font' => 'Inline',
959 'nobr' => 'Inline',
960 );
961
962 my %flow = ( %inline,
963 'ins' => 'Flow',
964 'del' => 'Flow',
965 'div' => 'Flow',
966 'p' => 'Inline',
967 'h1' => 'Inline',
968 'h2' => 'Inline',
969 'h3' => 'Inline',
970 'h4' => 'Inline',
971 'h5' => 'Inline',
972 'h6' => 'Inline',
973 'ul' => 'list',
974 'ol' => 'list',
975 'menu' => 'list',
976 'dir' => 'list',
977 'dl' => 'dt_dd',
978 'address' => 'Inline',
979 'hr' => 'EMPTY',
980 'pre' => 'pre.content',
981 'blockquote' => 'Flow',
982 'center' => 'Flow',
983 'table' => 'table',
984 );
985
986 my %table = ( 'caption' => 'Inline',
987 'thead' => 'tr_only',
988 'tfoot' => 'tr_only',
989 'tbody' => 'tr_only',
990 'colgroup' => 'colgroup',
991 'col' => 'EMPTY',
992 'tr' => 'th_td',
993 );
994
995 my %head = ( 'title' => 'NoTags', );
996
997 %_Context = ( 'Document' => { 'html' => 'Html' },
998 'Html' => { 'head' => 'Head', 'body' => 'Flow' },
999 'Head' => \%head,
1000 'Inline' => \%inline,
1001 'Flow' => \%flow,
1002 'NoTags' => { 'CDATA' => 'CDATA' },
1003 'pre.content' => \%pre_content,
1004 'table' => \%table,
1005 'list' => { 'li' => 'Flow' },
1006 'dt_dd' => { 'dt' => 'Inline', 'dd' => 'Flow' },
1007 'tr_only' => { 'tr' => 'th_td' },
1008 'colgroup' => { 'col' => 'EMPTY' },
1009 'th_td' => { 'th' => 'Flow', 'td' => 'Flow' },
1010 );
1011}
1012
1013sub init_context_whitelist { return \%_Context; }
1014
1015=item init_attrib_whitelist ()
1016
1017Returns a reference to the C<Attrib> whitelist, which determines which
1018attributes each tag can have and the values that those attributes can
1019take.
1020
1021It is a hash, and the keys are lowercase tag names.
1022
1023The values in the hash are hashrefs. The keys in these subhashes are
1024lowercase attribute names, and the values are attribute value class names,
1025which are short strings describing the type of values that the
1026attribute can take, such as C<color> or C<number>.
1027
1028=cut
1029
1030use vars qw(%_Attrib);
1031
1032BEGIN {
1033
1034 my %attr = ( 'style' => 'style' );
1035
1036 my %font_attr = ( %attr,
1037 'size' => 'size',
1038 'face' => 'wordlist',
1039 'color' => 'color',
1040 );
1041
1042 my %insdel_attr = ( %attr,
1043 'cite' => 'href',
1044 'datetime' => 'text',
1045 );
1046
1047 my %texta_attr = ( %attr, 'align' => 'word', );
1048
1049 my %cellha_attr = ( 'align' => 'word',
1050 'char' => 'word',
1051 'charoff' => 'size',
1052 );
1053
1054 my %cellva_attr = ( 'valign' => 'word', );
1055
1056 my %cellhv_attr = ( %attr, %cellha_attr, %cellva_attr );
1057
1058 my %col_attr = ( %attr, %cellhv_attr,
1059 'width' => 'size',
1060 'span' => 'number',
1061 );
1062
1063 my %thtd_attr = ( %attr,
1064 'abbr' => 'text',
1065 'axis' => 'text',
1066 'headers' => 'text',
1067 'scope' => 'word',
1068 'rowspan' => 'number',
1069 'colspan' => 'number',
1070 %cellhv_attr,
1071 'nowrap' => 'novalue',
1072 'bgcolor' => 'color',
1073 'width' => 'size',
1074 'height' => 'size',
1075 'bordercolor' => 'color',
1076 'bordercolorlight' => 'color',
1077 'bordercolordark' => 'color',
1078 );
1079
1080 %_Attrib = ( 'br' => { 'clear' => 'word' },
1081 'em' => \%attr,
1082 'strong' => \%attr,
1083 'dfn' => \%attr,
1084 'code' => \%attr,
1085 'samp' => \%attr,
1086 'kbd' => \%attr,
1087 'var' => \%attr,
1088 'cite' => \%attr,
1089 'abbr' => \%attr,
1090 'acronym' => \%attr,
1091 'q' => { %attr, 'cite' => 'href' },
1092 'blockquote' => { %attr, 'cite' => 'href' },
1093 'sub' => \%attr,
1094 'sup' => \%attr,
1095 'tt' => \%attr,
1096 'i' => \%attr,
1097 'b' => \%attr,
1098 'big' => \%attr,
1099 'small' => \%attr,
1100 'u' => \%attr,
1101 's' => \%attr,
1102 'strike' => \%attr,
1103 'font' => \%font_attr,
1104 'table' => {
1105 %attr,
1106 'frame' => 'word',
1107 'rules' => 'word',
1108 %texta_attr,
1109 'bgcolor' => 'color',
1110 'background' => 'src',
1111 'width' => 'size',
1112 'height' => 'size',
1113 'cellspacing' => 'size',
1114 'cellpadding' => 'size',
1115 'border' => 'size',
1116 'bordercolor' => 'color',
1117 'bordercolorlight' => 'color',
1118 'bordercolordark' => 'color',
1119 'summary' => 'text',
1120 },
1121 'caption' => { %attr, 'align' => 'word', },
1122 'colgroup' => \%col_attr,
1123 'col' => \%col_attr,
1124 'thead' => \%cellhv_attr,
1125 'tfoot' => \%cellhv_attr,
1126 'tbody' => \%cellhv_attr,
1127 'tr' => {
1128 %attr,
1129 bgcolor => 'color',
1130 %cellhv_attr,
1131 },
1132 'th' => \%thtd_attr,
1133 'td' => \%thtd_attr,
1134 'ins' => \%insdel_attr,
1135 'del' => \%insdel_attr,
1136 'a' => { %attr, href => 'href', title => 'text' },
1137 'h1' => \%texta_attr,
1138 'h2' => \%texta_attr,
1139 'h3' => \%texta_attr,
1140 'h4' => \%texta_attr,
1141 'h5' => \%texta_attr,
1142 'h6' => \%texta_attr,
1143 'p' => \%texta_attr,
1144 'div' => \%texta_attr,
1145 'span' => \%texta_attr,
1146 'ul' => {
1147 %attr,
1148 'type' => 'word',
1149 'compact' => 'novalue',
1150 },
1151 'ol' => { %attr,
1152 'type' => 'text',
1153 'compact' => 'novalue',
1154 'start' => 'number',
1155 },
1156 'li' => { %attr,
1157 'type' => 'text',
1158 'value' => 'number',
1159 },
1160 'dl' => { %attr, 'compact' => 'novalue' },
1161 'dt' => \%attr,
1162 'dd' => \%attr,
1163 'address' => \%attr,
1164 'hr' => {
1165 %texta_attr,
1166 'width' => 'size',
1167 'size' => 'size',
1168 'noshade' => 'novalue',
1169 },
1170 'pre' => { %attr, 'width' => 'size' },
1171 'center' => \%attr,
1172 'nobr' => {},
1173 'img' => {
1174 'src' => 'src',
1175 'alt' => 'text',
1176 'width' => 'size',
1177 'height' => 'size',
1178 'border' => 'size',
1179 'hspace' => 'size',
1180 'vspace' => 'size',
1181 'align' => 'word',
1182 },
1183 'body' => { 'bgcolor' => 'color',
1184 'background' => 'src',
1185 'link' => 'color',
1186 'vlink' => 'color',
1187 'alink' => 'color',
1188 'text' => 'color',
1189 },
1190 'head' => {},
1191 'title' => {},
1192 'html' => {},
1193 );
1194}
1195
1196sub init_attrib_whitelist { return \%_Attrib; }
1197
1198=item init_attval_whitelist ()
1199
1200Returns a reference to the C<AttVal> whitelist, which is a hash that maps
1201attribute value class names from the C<Attrib> whitelist to coderefs to
1202subs to validate (and optionally transform) a particular attribute value.
1203
1204The filter calls the attribute value validation subs with the
1205following parameters:
1206
1207=over
1208
1209=item C<filter>
1210
1211A reference to the filter object.
1212
1213=item C<tagname>
1214
1215The lowercase name of the tag in which the attribute appears.
1216
1217=item C<attrname>
1218
1219The name of the attribute.
1220
1221=item C<attrval>
1222
1223The attribute value found in the input document, in canonical form
1224(see L</"CANONICAL FORM">).
1225
1226=back
1227
1228The validation sub can return undef to indicate that the attribute
1229should be removed from the tag, or it can return the new value for
1230the attribute, in canonical form.
1231
1232=cut
1233
1234use vars qw(%_AttVal);
1235
1236BEGIN {
1237 %_AttVal = ( 'style' => \&_hss_attval_style,
1238 'size' => \&_hss_attval_size,
1239 'number' => \&_hss_attval_number,
1240 'color' => \&_hss_attval_color,
1241 'text' => \&_hss_attval_text,
1242 'word' => \&_hss_attval_word,
1243 'wordlist' => \&_hss_attval_wordlist,
1244 'wordlistq' => \&_hss_attval_wordlistq,
1245 'href' => \&_hss_attval_href,
1246 'src' => \&_hss_attval_src,
1247 'stylesrc' => \&_hss_attval_stylesrc,
1248 'novalue' => \&_hss_attval_novalue,
1249 );
1250}
1251
1252sub init_attval_whitelist { return \%_AttVal; }
1253
1254=item init_style_whitelist ()
1255
1256Returns a reference to the C<Style> whitelist, which determines which CSS
1257style directives are permitted in C<style> tag attributes. The keys are
1258value names such as C<color> and C<background-color>, and the values are
1259class names to be used as keys into the C<AttVal> whitelist.
1260
1261=cut
1262
1263use vars qw(%_Style);
1264
1265BEGIN {
1266 %_Style = ( 'color' => 'color',
1267 'background-color' => 'color',
1268 'background' => 'stylesrc',
1269 'background-image' => 'stylesrc',
1270 'font-size' => 'size',
1271 'font-family' => 'wordlistq',
1272 'text-align' => 'word',
1273 );
1274}
1275
1276sub init_style_whitelist { return \%_Style; }
1277
1278=item init_deinter_whitelist
1279
1280Returns a reference to the C<DeInter> whitelist, which determines which inline
1281tags the filter should attempt to automatically de-interleave if they are
1282encountered interleaved. For example, the filter will transform:
1283
1284 <b>hello <i>world</b> !</i>
1285
1286Into:
1287
1288 <b>hello <i>world</i></b><i> !</i>
1289
1290because both C<b> and C<i> appear as keys in the C<DeInter> whitelist.
1291
1292=cut
1293
1294use vars qw(%_DeInter);
1295
1296BEGIN {
1297 %_DeInter = map { $_ => 1 } qw(
1298 tt i b big small u s strike font em strong dfn code
1299 q sub sup samp kbd var cite abbr acronym span
1300 );
1301}
1302
1303sub init_deinter_whitelist { return \%_DeInter; }
1304
1305=back
1306
1307=head1 CHARACTER DATA PROCESSING
1308
1309These methods transform attribute values and non-tag text from the
1310input document into canonical form (see L</"CANONICAL FORM">), and
1311transform text in canonical form into a suitable form for the output
1312document.
1313
1314=over
1315
1316=item text_to_canonical_form ( TEXT )
1317
1318This method is used to reduce non-tag text from the input document to
1319canonical form before passing it to the filter_text() method.
1320
1321The default implementation unescapes all entities that map to
1322C<US-ASCII> characters other than ampersand, and replaces any
1323ampersands that don't form part of valid entities with C<&amp;>.
1324
1325=cut
1326
1327sub text_to_canonical_form {
1328 my ( $self, $text ) = @_;
1329
1330 $text =~ s#&gt;#>#g;
1331 $text =~ s#&lt;#<#g;
1332 $text =~ s#&quot;#"#g;
1333 $text =~ s#&apos;#'#g;
1334
1335 $text =~ s! ( [^&]+ | &[a-z0-9]{2,15}; ) |
1336 defined $1 ? $1 :
1337 defined $2 ? $self->_hss_decode_numeric($2) :
1338 defined $3 ? $self->_hss_decode_numeric($3) :
1339 '&amp;'
1340 !igex;
1341
- -
1346 return $text;
1347}
1348
1349=item quoted_to_canonical_form ( VALUE )
1350
1351This method is used to reduce attribute values quoted with doublequotes
1352or singlequotes to canonical form before passing it to the handler subs
1353in the C<AttVal> whitelist.
1354
1355The default behavior is the same as that of C<text_to_canonical_form()>,
1356plus it converts any CR, LF or TAB characters to spaces.
1357
1358=cut
1359
1360sub quoted_to_canonical_form {
1361 my ( $self, $text ) = @_;
1362 $text = $self->text_to_canonical_form($text);
1363 $text =~ tr/\n\r\t/ /s;
1364 return $text;
1365}
1366
1367=item unquoted_to_canonical_form ( VALUE )
1368
1369This method is used to reduce attribute values without quotes to
1370canonical form before passing it to the handler subs in the C<AttVal>
1371whitelist.
1372
1373The default implementation simply replaces all ampersands with C<&amp;>,
1374since that corresponds with the way most browsers treat entities in
1375unquoted values.
1376
1377=cut
1378
1379sub unquoted_to_canonical_form {
1380 my ( $self, $text ) = @_;
1381
1382 $text =~ s#&#&amp;#g;
1383 return $text;
1384}
1385
1386=item canonical_form_to_text ( TEXT )
1387
1388This method is used to convert the text in canonical form returned by
1389the filter_text() method to a form suitable for inclusion in the output
1390document.
1391
1392The default implementation runs anything that doesn't look like a
1393valid entity through the escape_html_metachars() method.
1394
1395=cut
1396
1397sub canonical_form_to_text {
1398 my ( $self, $text ) = @_;
1399 $text =~ s/ (&[#\w]+;) | (.[^&]*)
1400 /gex;
1401
1402
1403 return $text;
1404}
1405
1406=item canonical_form_to_attval ( ATTVAL )
1407
1408This method is used to convert the text in canonical form returned by
1409the C<AttVal> handler subs to a form suitable for inclusion in
1410doublequotes in the output tag.
1411
1412The default implementation converts CR, LF and TAB characters to a single
1413space, and runs anything that doesn't look like a
1414valid entity through the escape_html_metachars() method.
1415
1416=cut
1417
1418sub canonical_form_to_attval {
1419 my ( $self, $text ) = @_;
1420 $text =~ tr/\n\r\t/ /s;
1421 return $self->canonical_form_to_text($text);
1422}
1423
1424=item validate_href_attribute ( TEXT )
1425
1426If the C<AllowHref> filter configuration option is set, then this
1427method is used to validate C<href> type attribute values. TEXT is
1428the attribute value in canonical form. Returns a possibly modified
1429attribute value (in canonical form) or C<undef> to reject the attribute.
1430
1431The default implementation allows only absolute C<http> and C<https>
1432URLs, permits port numbers and query strings, and imposes reasonable
1433length limits.
1434
1435It does not URI escape the query string, and it does not guarantee
1436properly formatted URIs, it just tries to give safe URIs. You can
1437always use an attribute callback (see L<"Attribute Callbacks">)
1438to provide stricter handling.
1439
1440=cut
1441
1442sub validate_href_attribute {
1443 my ( $self, $text ) = @_;
1444
1445 return $1
1446 if $self->{_hssCfg}{AllowRelURL}
1447 and $text =~ /^((?:[\w\-.!~*|;\/?=+\$,%#]|&amp;){0,100})$/;
1448
1449 $text =~ m< ^ ( https? :// [\w\-\.]{1,100} (?:\:\d{1,5})?
1450 (?: / (?:[\w\-.!~*|;/?=+\$,%#]|&amp;){0,100} )?
1451 )
1452 $
1453 >x ? $1 : undef;
1454}
1455
1456=item validate_mailto ( TEXT )
1457
1458If the C<AllowMailto> filter configuration option is set, then this
1459method is used to validate C<href> type attribute values which begin
1460with C<mailto:>. TEXT is the attribute value in canonical form.
1461Returns a possibly modified attribute value (in canonical form) or C<undef>
1462to reject the attribute.
1463
1464This uses a lightweight regex and does not guarantee that email
1465addresses are properly formatted. You can
1466always use an attribute callback (see L<"Attribute Callbacks">)
1467to provide stricter handling.
1468
1469=cut
1470
1471sub validate_mailto {
1472 my ( $self, $text ) = @_;
1473
1474 return $1
1475 if $text =~ m/^(
1476 mailto:[\w\-!#\$%&'*+-\/=?^_`{|}~.]{1,64} # localpart
1477 \@ # @
1478 [\w\-\.]{1,100} # domain
1479 (?: # opt query string
1480 \?
1481 (?:[\w\-.!~*|;\/?=+\$,%#]|&amp;){0,100}
1482 )?
1483 )$/x;
1484 return;
1485}
1486
1487=item validate_src_attribute ( TEXT )
1488
1489If the C<AllowSrc> filter configuration option is set, then this
1490method is used to validate C<src> type attribute values. TEXT is
1491the attribute value in canonical form. Returns a possibly modified
1492attribute value (in canonical form) or C<undef> to reject the attribute.
1493
1494The default implementation behaves as validate_href_attribute().
1495
1496=cut
1497
1498*validate_src_attribute = \&validate_href_attribute;
1499
1500=back
1501
1502=head1 OTHER METHODS TO OVERRIDE
1503
1504As well as the output, reject, init and cdata methods listed above,
1505it might make sense for subclasses to override the following methods:
1506
1507=over
1508
1509=item filter_text ( TEXT )
1510
1511This method will be invoked to filter blocks of non-tag text in the
1512input document. Both input and output are in canonical form, see
1513L</"CANONICAL FORM">.
1514
1515The default implementation does no filtering.
1516
1517=cut
1518
1519sub filter_text {
1520 my ( $self, $text ) = @_;
1521
1522 return $text;
1523}
1524
1525=item escape_html_metachars ( TEXT )
1526
1527This method is used to escape all HTML metacharacters in TEXT.
1528The return value must be a copy of TEXT with metacharacters escaped.
1529
1530The default implementation escapes a minimal set of
1531metacharacters for security against XSS vulnerabilities. The set
1532of characters to escape is a compromise between the need for
1533security and the need to ensure that the filter will work for
1534documents in as many different character sets as possible.
1535
1536Subclasses which make strong assumptions about the document
1537character set will be able to escape much more aggressively.
1538
1539=cut
1540
1541use vars qw(%_Escape_HTML_map);
1542
1543BEGIN {
1544 %_Escape_HTML_map = ( '&' => '&amp;',
1545 '<' => '&lt;',
1546 '>' => '&gt;',
1547 '"' => '&quot;',
1548 "'" => '&#39;',
1549 );
1550}
1551
1552sub escape_html_metachars {
1553 my ( $self, $text ) = @_;
1554
1555 $text =~ s#([&<>"'])# $_Escape_HTML_map{$1} #ge;
1556 return $text;
1557}
1558
1559=item strip_nonprintable ( TEXT )
1560
1561Returns a copy of TEXT with runs of nonprintable characters replaced
1562with spaces or some other harmless string. Avoids replacing anything
1563with the empty string, as that can lead to other security issues.
1564
1565The default implementation strips out only NULL characters, in order to
1566avoid scrambling text for as many different character sets as possible.
1567
1568Subclasses which make some sort of assumption about the character set
1569in use will be able to have a much wider definition of a nonprintable
1570character, and hence a more secure strip_nonprintable() implementation.
1571
1572=cut
1573
1574sub strip_nonprintable {
1575 my ( $self, $text ) = @_;
1576
1577 $text =~ tr#\0# #s;
1578 return $text;
1579}
1580
1581=back
1582
1583=head1 ATTRIBUTE VALUE HANDLER SUBS
1584
1585References to the following subs appear in the C<AttVal> whitelist
1586returned by the init_attval_whitelist() method.
1587
1588=over
1589
1590=item _hss_attval_style( FILTER, TAGNAME, ATTRNAME, ATTRVAL )
1591
1592Attribute value hander for the C<style> attribute.
1593
1594=cut
1595
1596sub _hss_attval_style {
1597 my ( $filter, $tagname, $attrname, $attrval ) = @_;
1598 my @clean = ();
1599
1600 # Split on semicolon, making a reasonable attempt to ignore
1601 # semicolons inside doublequotes or singlequotes.
1602 while ( $attrval =~ s{^((?:[^;'"]|'[^']*'|"[^"]*")+)}{} ) {
1603 my $elt = $1;
1604 $attrval =~ s/^;//;
1605
1606 if ( $elt =~ m|^\s*([\w\-]+)\s*:\s*(.+?)\s*$|s ) {
1607 my ( $key, $val ) = ( lc $1, $2 );
1608
1609 my $value_class = $filter->{_hssStyle}{$key};
1610 next unless defined $value_class;
1611 my $sub = $filter->{_hssAttVal}{$value_class};
1612 next unless defined $sub;
1613
1614 my $cleanval = &{$sub}( $filter, 'style-psuedo-tag', $key, $val );
1615 if ( defined $cleanval ) {
1616 push @clean, "$key:$val";
1617 }
1618 }
1619 }
1620
1621 return join '; ', @clean;
1622}
1623
1624=item _hss_attval_size ( FILTER, TAGNAME, ATTRNAME, ATTRVAL )
1625
1626Attribute value handler for attributes who's values are some sort of
1627size or length.
1628
1629=cut
1630
1631sub _hss_attval_size {
1632 $_[3]
1633 =~ /^\s*([+-]?\d{1,20}(?:\.\d{1,20})?)\s*((?:\%|\*|ex|px|pc|cm|mm|in|pt|em)?)\s*$/i
1634 ? lc "$1$2"
1635 : undef;
1636}
1637
1638=item _hss_attval_number ( FILTER, TAGNAME, ATTRNAME, ATTRVAL )
1639
1640Attribute value handler for attributes who's values are a simple
1641integer.
1642
1643=cut
1644
1645sub _hss_attval_number {
1646 $_[3] =~ /^\s*\+?(\d{1,20})\s*$/ ? $1 : undef;
1647}
1648
1649=item _hss_attval_color ( FILTER, TAGNAME, ATTRNAME, ATTRVAL )
1650
1651Attribute value handler for color attributes.
1652
1653=cut
1654
1655sub _hss_attval_color {
1656 $_[3] =~ /^\s*(\w{2,20}|#[\da-fA-F]{6})\s*$/ ? $1 : undef;
1657}
1658
1659=item _hss_attval_text ( FILTER, TAGNAME, ATTRNAME, ATTRVAL )
1660
1661Attribute value handler for text attributes.
1662
1663=cut
1664
1665sub _hss_attval_text {
1666 length $_[3] <= 200 ? $_[3] : undef;
1667}
1668
1669=item _hss_attval_word ( FILTER, TAGNAME, ATTRNAME, ATTRVAL )
1670
1671Attribute value handler for attributes who's values must consist of
1672a single short word, with minus characters permitted.
1673
1674=cut
1675
1676sub _hss_attval_word {
1677 $_[3] =~ /^\s*([\w\-]{1,30})\s*$/ ? $1 : undef;
1678}
1679
1680=item _hss_attval_wordlist ( FILTER, TAGNAME, ATTRNAME, ATTRVAL )
1681
1682Attribute value handler for attributes who's values must consist of
1683one or more words, separated by spaces and/or commas.
1684
1685=cut
1686
1687sub _hss_attval_wordlist {
1688 $_[3] =~ /^\s*([\w\-\, ]{1,200})\s*$/ ? $1 : undef;
1689}
1690
1691=item _hss_attval_wordlistq ( FILTER, TAGNAME, ATTRNAME, ATTRVAL )
1692
1693Attribute value handler for attributes who's values must consist of
1694one or more words, separated by commas, with optional doublequotes
1695around words and spaces allowed within the doublequotes.
1696
1697=cut
1698
1699sub _hss_attval_wordlistq {
1700 my ( $filter, $tagname, $attrname, $attrval ) = @_;
1701
1702 my @words = grep {/^\s*(?:(?:"[\w\- ]{1,50}")|(?:[\w\-]{1,30}))\s*$/}
1703 split /,/, $attrval;
1704
1705 scalar(@words) ? join( ', ', @words ) : undef;
1706}
1707
1708=item _hss_attval_href ( FILTER, TAGNAME, ATTRNAME, ATTRVAL )
1709
1710Attribute value handler for C<href> type attributes. If the C<AllowHref>
1711or C<AllowMailto> configuration options are set,
1712uses the validate_href_attribute() method to check the attribute value.
1713
1714=cut
1715
1716sub _hss_attval_href {
1717 my ( $filter, $tagname, $attname, $attval ) = @_;
1718
1719 if ( $filter->{_hssCfg}{AllowMailto}
1720 && substr( $attval, 0, 7 ) eq 'mailto:' )
1721 {
1722 return $filter->validate_mailto($attval);
1723 }
1724 elsif ( $filter->{_hssCfg}{AllowHref} ) {
1725 return $filter->validate_href_attribute($attval);
1726 }
1727 return;
1728
1729}
1730
1731=item _hss_attval_src ( FILTER, TAGNAME, ATTRNAME, ATTRVAL )
1732
1733Attribute value handler for C<src> type attributes. If the C<AllowSrc>
1734configuration option is set, uses the validate_src_attribute() method
1735to check the attribute value.
1736
1737=cut
1738
1739sub _hss_attval_src {
1740 my ( $filter, $tagname, $attname, $attval ) = @_;
1741
1742 if ( $filter->{_hssCfg}{AllowSrc} ) {
1743 return $filter->validate_src_attribute($attval);
1744 }
1745 else {
1746 return;
1747 }
1748}
1749
1750=item _hss_attval_stylesrc ( FILTER, TAGNAME, ATTRNAME, ATTRVAL )
1751
1752Attribute value handler for C<src> type style pseudo attributes.
1753
1754=cut
1755
1756sub _hss_attval_stylesrc {
1757 my ( $filter, $tagname, $attname, $attval ) = @_;
1758
1759 if ( $attval =~ m#^\s*url\((.+)\)\s*$# ) {
1760 return _hss_attval_src( $filter, $tagname, $attname, $1 );
1761 }
1762 else {
1763 return;
1764 }
1765}
1766
1767=item _hss_attval_novalue ( FILTER, TAGNAME, ATTRNAME, ATTRVAL )
1768
1769Attribute value handler for attributes that have no value or a value that
1770is ignored. Just returns the attribute name as the value.
1771
1772=cut
1773
1774sub _hss_attval_novalue {
1775 my ( $filter, $tagname, $attname, $attval ) = @_;
1776
1777 return $attname;
1778}
1779
1780=back
1781
1782=head1 CANONICAL FORM
1783
1784Many of the methods described above deal with text from the input
1785document, encoded in what I call C<canonical form>, defined as
1786follows:
1787
1788All characters other than ampersands represent themselves. Literal
1789ampersands are encoded as C<&amp;>. Non C<US-ASCII> characters
1790may appear as literals in whatever character set is in use, or they
1791may appear as named or numeric HTML entities such as C<&aelig;>,
1792C<&#31337;> and C<&#xFF;>. Unknown named entities such as C<&foo;>
1793may appear.
1794
1795The idea is to be able to be able to reduce input text to a minimal
1796form, without making too many assumptions about the character set in
1797use.
1798
1799=head1 PRIVATE METHODS
1800
1801The following methods are internal to this class, and should not be
1802invoked from elsewhere. Subclasses should not use or override
1803these methods.
1804
1805=over
1806
1807=item _hss_prepare_ban_list (CFG)
1808
1809Returns a hash ref representing all the banned tags, based on the values
1810of BanList and BanAllBut
1811
1812=cut
1813
1814#===================================
1815sub _hss_prepare_ban_list {
1816#===================================
1817 my ( $self, $cfg ) = @_;
1818
1819 my $ban_list = $cfg->{BanList} || {};
1820 my $prepared_ban_list
1821 = ref $ban_list eq 'ARRAY'
1822 ? { map { $_ => 1 } @$ban_list }
1823 : $ban_list;
1824
1825 # Rules => {'*' => 0} or {'*' => {tag => '0'}} means BanAllBut other tags
1826 # mentioned in the rules
1827 if ( my $rules = $cfg->{Rules} ) {
1828 if ( exists $rules->{'*'}
1829 && ( ( ref $rules->{'*'} eq 'HASH'
1830 && exists $rules->{'*'}{'tag'}
1831 && !$rules->{'*'}{'tag'}
1832 )
1833 || ( !$rules->{'*'} )
1834 )
1835 )
1836 {
1837 $cfg->{BanAllBut} ||= [];
1838
1839 }
1840 }
1841
1842 if ( $cfg->{BanAllBut} ) {
1843 my %ban = map { $_ => 1 } keys %{ $self->{_hssAttrib} };
1844 foreach my $dontban ( @{ $cfg->{BanAllBut} } ) {
1845 delete $ban{$dontban} unless $prepared_ban_list->{$dontban};
1846 }
1847 $prepared_ban_list = \%ban;
1848 }
1849 return $prepared_ban_list;
1850}
1851
1852=item _hss_prepare_rules (CFG)
1853
1854Returns a hash ref representing the tag and attribute rules (See L</"Rules">).
1855
1856Returns undef if no filters are specified, in which case the
1857attribute filter code has very little performance impact. If any rules are
1858specified, then every tag and attribute is checked.
1859
1860=cut
1861
1862#===================================
1863sub _hss_prepare_rules {
1864#===================================
1865 my ( $self, $cfg ) = @_;
1866
1867 my $rules = $cfg->{Rules};
1868
1869 return
1870 unless $rules
1871 && ref $rules eq 'HASH'
1872 && keys %$rules;
1873
1874 die "'Rules' must be a HASH ref"
1875 unless ref $rules eq 'HASH';
1876
1877 my $banned = $self->{_hssBanList};
1878
1879 my %prepared_rules;
1880 foreach my $tag ( keys %$rules ) {
1881 my $rule = $rules->{$tag};
1882 $tag = lc($tag);
1883
1884 # TAG => 0
1885 if ( !$rule ) {
1886 $banned->{$tag} ||= 1;
1887 next;
1888 }
1889
1890 delete $banned->{$tag};
1891 if ( my $rule_ref = ref $rule ) {
1892
1893 # TAG => CODEREF
1894 $rule = { tag => $rule }
1895 if $rule_ref eq 'CODE';
1896
1897 die "Unknown value for tag '$tag'. Must be a HASH or a CODE ref"
1898 unless ref $rule eq 'HASH';
1899 }
1900 else {
1901
1902 # TAG => 1
1903 next;
1904 }
1905
1906 # TAG => HASHREF
1907 my %prepared_rule;
1908
1909 # Required attributes
1910 if ( my $required = delete $rule->{required} ) {
1911 if ( ref $required eq 'ARRAY' && @$required ) {
1912 $prepared_rule{required} = $required;
1913 }
1914 }
1915
1916 RULE:
1917 while ( my ( $key, $value ) = each %$rule ) {
1918
1919 $key = lc($key);
1920
1921 # Pass through code refs
1922 my $ref_type = ref $value;
1923 if ( $ref_type eq 'CODE' ) {
1924 $prepared_rule{$key} = $value;
1925 next RULE;
1926 }
1927
1928 if ( !$ref_type ) {
1929
1930 # Pass through 1 / 0
1931 if ( $value eq '0' or $value eq '1' ) {
1932 $prepared_rule{$key} = $value;
1933 next RULE;
1934 }
1935
1936 # Any remaining values must be regexes
1937 $value = eval {qr/$value/}
1938 or die "Invalid regex rule for '$tag' => '$key' : $@";
1939 $ref_type = 'Regexp';
1940 }
1941
1942 die "Invalid rule value for '$tag' => '$key' : $ref_type"
1943 unless $ref_type eq 'Regexp';
1944
1945 # Convert regex into anonymous sub
1946 $prepared_rule{$key} = sub {
1947 my ( $rule, $tagname, $attname, $attval ) = @_;
1948 return $attval =~ m/$value/
1949 ? $attval
1950 : undef;
1951 };
1952
1953 }
1954 $prepared_rules{$tag} = \%prepared_rule
1955 if keys %prepared_rule;
1956 }
1957 return
1958 unless keys %prepared_rules;
1959
1960 # Add default setting of {'*' => {'*' => 1}}
1961 # unless it already has a value
1962 unless ( exists $prepared_rules{'*'}{'*'} ) {
1963 $prepared_rules{'*'}{'*'} = 1;
1964 }
1965
1966 # Remove required attribs from default
1967 delete $prepared_rules{'*'}{required};
1968
1969 # Remove 'tag' from default unless is a sub
1970 delete $prepared_rules{'*'}{tag} unless ref $prepared_rules{'*'}{tag};
1971 return \%prepared_rules;
1972}
1973
1974=item _hss_get_attr_filter ( DEFAULT_FILTERS TAG_FILTERS ATTR_NAME)
1975
1976Returns the attribute filter rule to apply to this particular attribute.
1977
1978Checks for:
1979
1980 - a named attribute rule in a named tag
1981 - a default * attribute rule in a named tag
1982 - a named attribute rule in the default * rules
1983 - a default * attribute rule in the default * rules
1984
1985=cut
1986
1987sub _hss_get_attr_filter {
1988 my ( $self, $default_filters, $tag_filters, $key ) = @_;
1989
1990 return $tag_filters->{$key}
1991 if exists $tag_filters->{$key};
1992
1993 return $tag_filters->{'*'}
1994 if exists $tag_filters->{'*'};
1995
1996 return $default_filters->{$key}
1997 if exists $default_filters->{$key};
1998
1999 return $default_filters->{'*'};
2000
2001}
2002
2003=item _hss_join_attribs (FILTERED_ATTRIBS)
2004
2005Accepts a hash ref containing the attribute names as the keys, and the attribute
2006values as the values. Escapes them and returns a string ready for output to
2007HTML
2008
2009=cut
2010
2011sub _hss_join_attribs {
2012 my ( $self, $attrs ) = @_;
2013 my $filtered_attrs = '';
2014 foreach my $key ( sort keys %$attrs ) {
2015 my $escaped = $self->canonical_form_to_attval( $attrs->{$key} );
2016 $filtered_attrs .= qq| $key="$escaped"|;
2017
2018 }
2019 return $filtered_attrs;
2020}
2021
2022=item _hss_decode_numeric ( NUMERIC )
2023
2024Returns the string that should replace the numeric entity NUMERIC
2025in the text_to_canonical_form() method.
2026
2027=cut
2028
2029sub _hss_decode_numeric {
2030 my ( $self, $numeric ) = @_;
2031
2032 my $hex = ( $numeric =~ s/^x//i ? 1 : 0 );
2033
2034 $numeric =~ s/^0+//;
2035 my $number = ( $hex ? hex($numeric) : $numeric );
2036
2037 if ( $number == ord '&' ) {
2038 return '&amp;';
2039 }
2040 elsif ( $number < 127 ) {
2041 return chr $number;
2042 }
2043 else {
2044 return '&#' . ( $hex ? 'x' : '' ) . uc($numeric) . ';';
2045 }
2046}
2047
2048=item _hss_tag_is_banned ( TAGNAME )
2049
2050Returns true if the lower case tag name TAGNAME is on the list of
2051harmless tags that the filter is configured to block, false otherwise.
2052
2053=cut
2054
2055sub _hss_tag_is_banned {
2056 my ( $self, $tag ) = @_;
2057
2058 exists $self->{_hssBanList}{$tag} ? 1 : 0;
2059}
2060
2061=item _hss_get_to_valid_context ( TAG )
2062
2063Tries to get the filter to a context in which the tag TAG is
2064allowed, by introducing extra end tags or start tags if
2065necessary. TAG can be either the lower case name of a tag or
2066the string 'CDATA'.
2067
2068Returns 1 if an allowed context is reached, or 0 if there's no
2069reasonable way to get to an allowed context and the tag should
2070just be rejected.
2071
2072=cut
2073
2074sub _hss_get_to_valid_context {
2075 my ( $self, $tag ) = @_;
2076
2077 # Special case: nested <a> is never valid.
2078 if ( $tag eq 'a' ) {
2079 foreach my $ancestor ( @{ $self->{_hssStack} } ) {
2080 return 0 if $ancestor->{NAME} eq 'a';
2081 }
2082 }
2083
2084 return 1 if $self->_hss_valid_in_current_context($tag);
2085
2086 if ( $self->_hss_context eq 'Document' ) {
2087 $self->input_start('<html>');
2088 return 1 if $self->_hss_valid_in_current_context($tag);
2089 }
2090
2091 if ( $self->_hss_context eq 'Html'
2092 and $self->_hss_valid_in_context( $tag, 'Flow' ) )
2093 {
2094 $self->input_start('<body>');
2095 return 1;
2096 }
2097
2098 return 0
2099 unless grep { $self->_hss_valid_in_context( $tag, $_->{CTX} ) }
2100 @{ $self->{_hssStack} };
2101
2102 until ( $self->_hss_valid_in_current_context($tag) ) {
2103 $self->_hss_close_innermost_tag;
2104 }
2105
2106 return 1;
2107}
2108
2109=item _hss_close_innermost_tag ()
2110
2111Closes the innermost open tag.
2112
2113=cut
2114
2115sub _hss_close_innermost_tag {
2116 my ($self) = @_;
2117 $self->output_stack_entry( shift @{ $self->{_hssStack} } );
2118 die 'tag stack underflow' unless scalar @{ $self->{_hssStack} };
2119}
2120
2121=item _hss_context ()
2122
2123Returns the current named context of the filter.
2124
2125=cut
2126
2127sub _hss_context {
2128 my ($self) = @_;
2129
2130 $self->{_hssStack}[0]{CTX};
2131}
2132
2133=item _hss_valid_in_context ( TAG, CONTEXT )
2134
2135Returns true if the lowercase tag name TAG is valid in context
2136CONTEXT, false otherwise.
2137
2138=cut
2139
2140sub _hss_valid_in_context {
2141 my ( $self, $tag, $context ) = @_;
2142
2143 $self->{_hssContext}{$context}{$tag} ? 1 : 0;
2144}
2145
2146=item _hss_valid_in_current_context ( TAG )
2147
2148Returns true if the lowercase tag name TAG is valid in the filter's
2149current context, false otherwise.
2150
2151=cut
2152
2153sub _hss_valid_in_current_context {
2154 my ( $self, $tag ) = @_;
2155
2156 $self->_hss_valid_in_context( $tag, $self->_hss_context );
2157}
2158
2159=back
2160
2161=head1 BUGS AND LIMITATIONS
2162
2163=over
2164
2165=item Performance
2166
2167This module does a lot of work to ensure that tags are correctly
2168nested and are not left open, causing unnecessary overhead for
2169applications where that doesn't matter.
2170
2171Such applications may benefit from using the more lightweight
2172L<HTML::Scrubber::StripScripts> module instead.
2173
2174=item Strictness
2175
2176URIs and email addresses are cleaned up to be safe, but not
2177necessarily accurate. That would have required adding dependencies.
2178Attribute callbacks can be used to add this functionality if required,
2179or the validation methods can be overridden.
2180
2181By default, filtered HTML may not be valid strict XHTML, for instance empty
2182required attributes may be outputted. However, with L</"Rules">,
2183it should be possible to force the HTML to validate.
2184
2185=item REPORTING BUGS
2186
2187Please report any bugs or feature requests to
2188bug-html-stripscripts@rt.cpan.org, or through the web interface at
2189L<http://rt.cpan.org>.
2190
2191=back
2192
2193=head1 SEE ALSO
2194
2195L<HTML::Parser>, L<HTML::StripScripts::Parser>,
2196L<HTML::StripScripts::Regex>
2197
2198=head1 AUTHOR
2199
2200Original author Nick Cleaton E<lt>nick@cleaton.netE<gt>
2201
2202New code added and module maintained by Clinton Gormley
2203E<lt>clint@traveljury.comE<gt>
2204
2205=head1 COPYRIGHT
2206
2207Copyright (C) 2003 Nick Cleaton. All Rights Reserved.
2208
2209Copyright (C) 2007 Clinton Gormley. All Rights Reserved.
2210
2211=head1 LICENSE
2212
2213This module is free software; you can redistribute it and/or modify it
2214under the same terms as Perl itself.
2215
2216=cut
2217
22181;
2219