← Index
NYTProf Performance Profile   « line view »
For /usr/local/libexec/sympa/task_manager-debug.pl
  Run on Tue Jun 1 22:32:51 2021
Reported on Tue Jun 1 22:35:11 2021

Filename/usr/local/lib/perl5/site_perl/HTML/TreeBuilder.pm
StatementsExecuted 0 statements in 0s
Subroutines
Calls P F Exclusive
Time
Inclusive
Time
Subroutine
0000s0sHTML::TreeBuilder::::BEGIN@16HTML::TreeBuilder::BEGIN@16
0000s0sHTML::TreeBuilder::::BEGIN@5HTML::TreeBuilder::BEGIN@5
0000s0sHTML::TreeBuilder::::BEGIN@59HTML::TreeBuilder::BEGIN@59
0000s0sHTML::TreeBuilder::::BEGIN@6HTML::TreeBuilder::BEGIN@6
0000s0sHTML::TreeBuilder::::BEGIN@60HTML::TreeBuilder::BEGIN@60
0000s0sHTML::TreeBuilder::::BEGIN@62HTML::TreeBuilder::BEGIN@62
0000s0sHTML::TreeBuilder::::BEGIN@63HTML::TreeBuilder::BEGIN@63
0000s0sHTML::TreeBuilder::::BEGIN@7HTML::TreeBuilder::BEGIN@7
0000s0sHTML::TreeBuilder::::BEGIN@8HTML::TreeBuilder::BEGIN@8
0000s0sHTML::TreeBuilder::::_elemHTML::TreeBuilder::_elem
0000s0sHTML::TreeBuilder::::commentHTML::TreeBuilder::comment
0000s0sHTML::TreeBuilder::::declarationHTML::TreeBuilder::declaration
0000s0sHTML::TreeBuilder::::deleteHTML::TreeBuilder::delete
0000s0sHTML::TreeBuilder::::disembowelHTML::TreeBuilder::disembowel
0000s0sHTML::TreeBuilder::::doneHTML::TreeBuilder::done
0000s0sHTML::TreeBuilder::::element_classHTML::TreeBuilder::element_class
0000s0sHTML::TreeBuilder::::elementifyHTML::TreeBuilder::elementify
0000s0sHTML::TreeBuilder::::endHTML::TreeBuilder::end
0000s0sHTML::TreeBuilder::::eofHTML::TreeBuilder::eof
0000s0sHTML::TreeBuilder::::gutsHTML::TreeBuilder::guts
0000s0sHTML::TreeBuilder::::ignore_ignorable_whitespaceHTML::TreeBuilder::ignore_ignorable_whitespace
0000s0sHTML::TreeBuilder::::ignore_textHTML::TreeBuilder::ignore_text
0000s0sHTML::TreeBuilder::::ignore_unknownHTML::TreeBuilder::ignore_unknown
0000s0sHTML::TreeBuilder::::implicit_body_p_tagHTML::TreeBuilder::implicit_body_p_tag
0000s0sHTML::TreeBuilder::::implicit_tagsHTML::TreeBuilder::implicit_tags
0000s0sHTML::TreeBuilder::::newHTML::TreeBuilder::new
0000s0sHTML::TreeBuilder::::new_from_contentHTML::TreeBuilder::new_from_content
0000s0sHTML::TreeBuilder::::new_from_fileHTML::TreeBuilder::new_from_file
0000s0sHTML::TreeBuilder::::new_from_urlHTML::TreeBuilder::new_from_url
0000s0sHTML::TreeBuilder::::no_expand_entitiesHTML::TreeBuilder::no_expand_entities
0000s0sHTML::TreeBuilder::::no_space_compactingHTML::TreeBuilder::no_space_compacting
0000s0sHTML::TreeBuilder::::p_strictHTML::TreeBuilder::p_strict
0000s0sHTML::TreeBuilder::::parse_contentHTML::TreeBuilder::parse_content
0000s0sHTML::TreeBuilder::::processHTML::TreeBuilder::process
0000s0sHTML::TreeBuilder::::startHTML::TreeBuilder::start
0000s0sHTML::TreeBuilder::::store_commentsHTML::TreeBuilder::store_comments
0000s0sHTML::TreeBuilder::::store_declarationsHTML::TreeBuilder::store_declarations
0000s0sHTML::TreeBuilder::::store_pisHTML::TreeBuilder::store_pis
0000s0sHTML::TreeBuilder::::stuntHTML::TreeBuilder::stunt
0000s0sHTML::TreeBuilder::::stuntedHTML::TreeBuilder::stunted
0000s0sHTML::TreeBuilder::::textHTML::TreeBuilder::text
0000s0sHTML::TreeBuilder::::tighten_upHTML::TreeBuilder::tighten_up
0000s0sHTML::TreeBuilder::::warnHTML::TreeBuilder::warn
0000s0sHTML::TreeBuilder::::warningHTML::TreeBuilder::warning
Call graph for these subroutines as a Graphviz dot language file.
Line State
ments
Time
on line
Calls Time
in subs
Code
1package HTML::TreeBuilder;
2
3# ABSTRACT: Parser that builds a HTML syntax tree
4
5use warnings;
6use strict;
7use integer; # vroom vroom!
8use Carp ();
9
10our $VERSION = '5.07'; # VERSION from OurPkgVersion
11
12#---------------------------------------------------------------------------
13# Make a 'DEBUG' constant...
14
15our $DEBUG; # Must be set BEFORE loading this file
16BEGIN {
17
18 # We used to have things like
19 # print $indent, "lalala" if $Debug;
20 # But there were an awful lot of having to evaluate $Debug's value.
21 # If we make that depend on a constant, like so:
22 # sub DEBUG () { 1 } # or whatever value.
23 # ...
24 # print $indent, "lalala" if DEBUG;
25 # Which at compile-time (thru the miracle of constant folding) turns into:
26 # print $indent, "lalala";
27 # or, if DEBUG is a constant with a true value, then that print statement
28 # is simply optimized away, and doesn't appear in the target code at all.
29 # If you don't believe me, run:
30 # perl -MO=Deparse,-uHTML::TreeBuilder -e 'BEGIN { \
31 # $HTML::TreeBuilder::DEBUG = 4} use HTML::TreeBuilder'
32 # and see for yourself (substituting whatever value you want for $DEBUG
33 # there).
34## no critic
35 if ( defined &DEBUG ) {
36
37 # Already been defined! Do nothing.
38 }
39 elsif ( $] < 5.00404 ) {
40
41 # Grudgingly accomodate ancient (pre-constant) versions.
42 eval 'sub DEBUG { $Debug } ';
43 }
44 elsif ( !$DEBUG ) {
45 eval 'sub DEBUG () {0}'; # Make it a constant.
46 }
47 elsif ( $DEBUG =~ m<^\d+$>s ) {
48 eval 'sub DEBUG () { ' . $DEBUG . ' }'; # Make THAT a constant.
49 }
50 else { # WTF?
51 warn "Non-numeric value \"$DEBUG\" in \$HTML::Element::DEBUG";
52 eval 'sub DEBUG () { $DEBUG }'; # I guess.
53 }
54## use critic
55}
56
57#---------------------------------------------------------------------------
58
59use HTML::Entities ();
60use HTML::Tagset 3.02 ();
61
62use HTML::Element ();
63use HTML::Parser 3.46 ();
64our @ISA = qw(HTML::Element HTML::Parser);
65
66# This looks schizoid, I know.
67# It's not that we ARE an element AND a parser.
68# We ARE an element, but one that knows how to handle signals
69# (method calls) from Parser in order to elaborate its subtree.
70
71# Legacy aliases:
72*HTML::TreeBuilder::isKnown = \%HTML::Tagset::isKnown;
73*HTML::TreeBuilder::canTighten = \%HTML::Tagset::canTighten;
74*HTML::TreeBuilder::isHeadElement = \%HTML::Tagset::isHeadElement;
75*HTML::TreeBuilder::isBodyElement = \%HTML::Tagset::isBodyElement;
76*HTML::TreeBuilder::isPhraseMarkup = \%HTML::Tagset::isPhraseMarkup;
77*HTML::TreeBuilder::isHeadOrBodyElement = \%HTML::Tagset::isHeadOrBodyElement;
78*HTML::TreeBuilder::isList = \%HTML::Tagset::isList;
79*HTML::TreeBuilder::isTableElement = \%HTML::Tagset::isTableElement;
80*HTML::TreeBuilder::isFormElement = \%HTML::Tagset::isFormElement;
81*HTML::TreeBuilder::p_closure_barriers = \@HTML::Tagset::p_closure_barriers;
82
83#==========================================================================
84# Two little shortcut constructors:
85
86sub new_from_file { # or from a FH
87 my $class = shift;
88 Carp::croak("new_from_file takes only one argument")
89 unless @_ == 1;
90 Carp::croak("new_from_file is a class method only")
91 if ref $class;
92 my $new = $class->new();
93 defined $new->parse_file( $_[0] )
94 or Carp::croak("unable to parse file: $!");
95 return $new;
96}
97
98sub new_from_content { # from any number of scalars
99 my $class = shift;
100 Carp::croak("new_from_content is a class method only")
101 if ref $class;
102 my $new = $class->new();
103 foreach my $whunk (@_) {
104 if ( ref($whunk) eq 'SCALAR' ) {
105 $new->parse($$whunk);
106 }
107 else {
108 $new->parse($whunk);
109 }
110 last if $new->{'_stunted'}; # might as well check that.
111 }
112 $new->eof();
113 return $new;
114}
115
116sub new_from_url { # should accept anything that LWP does.
117 undef our $lwp_response;
118 my $class = shift;
119 Carp::croak("new_from_url takes only one argument")
120 unless @_ == 1;
121 Carp::croak("new_from_url is a class method only")
122 if ref $class;
123 my $url = shift;
124 my $new = $class->new();
125
126 require LWP::UserAgent;
127 # RECOMMEND PREREQ: LWP::UserAgent 5.815
128 LWP::UserAgent->VERSION( 5.815 ); # HTTP::Headers content_is_html method
129 $lwp_response = LWP::UserAgent->new->get( $url );
130
131 Carp::croak("GET failed on $url: " . $lwp_response->status_line)
132 unless $lwp_response->is_success;
133 Carp::croak("$url returned " . $lwp_response->content_type . " not HTML")
134 unless $lwp_response->content_is_html;
135
136 $new->parse( $lwp_response->decoded_content );
137 $new->eof;
138 undef $lwp_response; # Processed successfully
139 return $new;
140}
141
142# TODO: document more fully?
143sub parse_content { # from any number of scalars
144 my $tree = shift;
145 my $retval;
146 foreach my $whunk (@_) {
147 if ( ref($whunk) eq 'SCALAR' ) {
148 $retval = $tree->parse($$whunk);
149 }
150 else {
151 $retval = $tree->parse($whunk);
152 }
153 last if $tree->{'_stunted'}; # might as well check that.
154 }
155 $tree->eof();
156 return $retval;
157}
158
159#---------------------------------------------------------------------------
160
161sub new { # constructor!
162 my $class = shift;
163 $class = ref($class) || $class;
164
165 # Initialize HTML::Element part
166 my $self = $class->element_class->new('html');
167
168 {
169
170 # A hack for certain strange versions of Parser:
171 my $other_self = HTML::Parser->new();
172 %$self = ( %$self, %$other_self ); # copy fields
173 # Yes, multiple inheritance is messy. Kids, don't try this at home.
174 bless $other_self, "HTML::TreeBuilder::_hideyhole";
175
176 # whack it out of the HTML::Parser class, to avoid the destructor
177 }
178
179 # The root of the tree is special, as it has these funny attributes,
180 # and gets reblessed into this class.
181
182 # Initialize parser settings
183 $self->{'_implicit_tags'} = 1;
184 $self->{'_implicit_body_p_tag'} = 0;
185
186 # If true, trying to insert text, or any of %isPhraseMarkup right
187 # under 'body' will implicate a 'p'. If false, will just go there.
188
189 $self->{'_tighten'} = 1;
190
191 # whether ignorable WS in this tree should be deleted
192
193 $self->{'_implicit'} = 1; # to delete, once we find a real open-"html" tag
194
195 $self->{'_ignore_unknown'} = 1;
196 $self->{'_ignore_text'} = 0;
197 $self->{'_warn'} = 0;
198 $self->{'_no_space_compacting'} = 0;
199 $self->{'_store_comments'} = 0;
200 $self->{'_store_declarations'} = 1;
201 $self->{'_store_pis'} = 0;
202 $self->{'_p_strict'} = 0;
203 $self->{'_no_expand_entities'} = 0;
204
205 # Parse attributes passed in as arguments
206 if (@_) {
207 my %attr = @_;
208 for ( keys %attr ) {
209 $self->{"_$_"} = $attr{$_};
210 }
211 }
212
213 $HTML::Element::encoded_content = $self->{'_no_expand_entities'};
214
215 # rebless to our class
216 bless $self, $class;
217
218 $self->{'_element_count'} = 1;
219
220 # undocumented, informal, and maybe not exactly correct
221
222 $self->{'_head'} = $self->insert_element( 'head', 1 );
223 $self->{'_pos'} = undef; # pull it back up
224 $self->{'_body'} = $self->insert_element( 'body', 1 );
225 $self->{'_pos'} = undef; # pull it back up again
226
227 return $self;
228}
229
230#==========================================================================
231
232sub _elem # universal accessor...
233{
234 my ( $self, $elem, $val ) = @_;
235 my $old = $self->{$elem};
236 $self->{$elem} = $val if defined $val;
237 return $old;
238}
239
240# accessors....
241sub implicit_tags { shift->_elem( '_implicit_tags', @_ ); }
242sub implicit_body_p_tag { shift->_elem( '_implicit_body_p_tag', @_ ); }
243sub p_strict { shift->_elem( '_p_strict', @_ ); }
244sub no_space_compacting { shift->_elem( '_no_space_compacting', @_ ); }
245sub ignore_unknown { shift->_elem( '_ignore_unknown', @_ ); }
246sub ignore_text { shift->_elem( '_ignore_text', @_ ); }
247sub ignore_ignorable_whitespace { shift->_elem( '_tighten', @_ ); }
248sub store_comments { shift->_elem( '_store_comments', @_ ); }
249sub store_declarations { shift->_elem( '_store_declarations', @_ ); }
250sub store_pis { shift->_elem( '_store_pis', @_ ); }
251sub warn { shift->_elem( '_warn', @_ ); }
252
253sub no_expand_entities {
254 shift->_elem( '_no_expand_entities', @_ );
255 $HTML::Element::encoded_content = @_;
256}
257
258#==========================================================================
259
260sub warning {
261 my $self = shift;
262 CORE::warn("HTML::Parse: $_[0]\n") if $self->{'_warn'};
263
264 # should maybe say HTML::TreeBuilder instead
265}
266
267#==========================================================================
268
269{
270
271 # To avoid having to rebuild these lists constantly...
272 my $_Closed_by_structurals = [qw(p h1 h2 h3 h4 h5 h6 pre textarea)];
273 my $indent;
274
275 sub start {
276 return if $_[0]{'_stunted'};
277
278 # Accept a signal from HTML::Parser for start-tags.
279 my ( $self, $tag, $attr ) = @_;
280
281 # Parser passes more, actually:
282 # $self->start($tag, $attr, $attrseq, $origtext)
283 # But we can merrily ignore $attrseq and $origtext.
284
285 if ( $tag eq 'x-html' ) {
286 print "Ignoring open-x-html tag.\n" if DEBUG;
287
288 # inserted by some lame code-generators.
289 return; # bypass tweaking.
290 }
291
292 $tag =~ s{/$}{}s; # So <b/> turns into <b>. Silently forgive.
293
294 unless ( $tag =~ m/^[-_a-zA-Z0-9:%]+$/s ) {
295 DEBUG and print "Start-tag name $tag is no good. Skipping.\n";
296 return;
297
298 # This avoids having Element's new() throw an exception.
299 }
300
301 my $ptag = ( my $pos = $self->{'_pos'} || $self )->{'_tag'};
302 my $already_inserted;
303
304 #my($indent);
305 if (DEBUG) {
306
307 # optimization -- don't figure out indenting unless we're in debug mode
308 my @lineage = $pos->lineage;
309 $indent = ' ' x ( 1 + @lineage );
310 print $indent, "Proposing a new \U$tag\E under ",
311 join( '/', map $_->{'_tag'}, reverse( $pos, @lineage ) )
312 || 'Root',
313 ".\n";
314
315 #} else {
316 # $indent = ' ';
317 }
318
319 #print $indent, "POS: $pos ($ptag)\n" if DEBUG > 2;
320 # $attr = {%$attr};
321
322 foreach my $k ( keys %$attr ) {
323
324 # Make sure some stooge doesn't have "<span _content='pie'>".
325 # That happens every few million Web pages.
326 $attr->{ ' ' . $k } = delete $attr->{$k}
327 if length $k and substr( $k, 0, 1 ) eq '_';
328
329 # Looks bad, but is fine for round-tripping.
330 }
331
332 my $e = $self->element_class->new( $tag, %$attr );
333
334 # Make a new element object.
335 # (Only rarely do we end up just throwing it away later in this call.)
336
337 # Some prep -- custom messiness for those damned tables, and strict P's.
338 if ( $self->{'_implicit_tags'} ) { # wallawallawalla!
339
340 unless ( $HTML::TreeBuilder::isTableElement{$tag} ) {
341 if ( $ptag eq 'table' ) {
342 print $indent,
343 " * Phrasal \U$tag\E right under TABLE makes implicit TR and TD\n"
344 if DEBUG > 1;
345 $self->insert_element( 'tr', 1 );
346 $pos = $self->insert_element( 'td', 1 )
347 ; # yes, needs updating
348 }
349 elsif ( $ptag eq 'tr' ) {
350 print $indent,
351 " * Phrasal \U$tag\E right under TR makes an implicit TD\n"
352 if DEBUG > 1;
353 $pos = $self->insert_element( 'td', 1 )
354 ; # yes, needs updating
355 }
356 $ptag = $pos->{'_tag'}; # yes, needs updating
357 }
358
359 # end of table-implication block.
360
361 # Now maybe do a little dance to enforce P-strictness.
362 # This seems like it should be integrated with the big
363 # "ALL HOPE..." block, further below, but that doesn't
364 # seem feasable.
365 if ( $self->{'_p_strict'}
366 and $HTML::TreeBuilder::isKnown{$tag}
367 and not $HTML::Tagset::is_Possible_Strict_P_Content{$tag} )
368 {
369 my $here = $pos;
370 my $here_tag = $ptag;
371 while (1) {
372 if ( $here_tag eq 'p' ) {
373 print $indent, " * Inserting $tag closes strict P.\n"
374 if DEBUG > 1;
375 $self->end( \q{p} );
376
377 # NB: same as \'q', but less confusing to emacs cperl-mode
378 last;
379 }
380
381 #print("Lasting from $here_tag\n"),
382 last
383 if $HTML::TreeBuilder::isKnown{$here_tag}
384 and
385 not $HTML::Tagset::is_Possible_Strict_P_Content{
386 $here_tag};
387
388 # Don't keep looking up the tree if we see something that can't
389 # be strict-P content.
390
391 $here_tag
392 = ( $here = $here->{'_parent'} || last )->{'_tag'};
393 } # end while
394 $ptag = ( $pos = $self->{'_pos'} || $self )
395 ->{'_tag'}; # better update!
396 }
397
398 # end of strict-p block.
399 }
400
401 # And now, get busy...
402 #----------------------------------------------------------------------
403 if ( !$self->{'_implicit_tags'} ) { # bimskalabim
404 # do nothing
405 print $indent, " * _implicit_tags is off. doing nothing\n"
406 if DEBUG > 1;
407
408 #----------------------------------------------------------------------
409 }
410 elsif ( $HTML::TreeBuilder::isHeadOrBodyElement{$tag} ) {
411 if ( $pos->is_inside('body') ) { # all is well
412 print $indent,
413 " * ambilocal element \U$tag\E is fine under BODY.\n"
414 if DEBUG > 1;
415 }
416 elsif ( $pos->is_inside('head') ) {
417 print $indent,
418 " * ambilocal element \U$tag\E is fine under HEAD.\n"
419 if DEBUG > 1;
420 }
421 else {
422
423 # In neither head nor body! mmmmm... put under head?
424
425 if ( $ptag eq 'html' ) { # expected case
426 # TODO?? : would there ever be a case where _head would be
427 # absent from a tree that would ever be accessed at this
428 # point?
429 die "Where'd my head go?" unless ref $self->{'_head'};
430 if ( $self->{'_head'}{'_implicit'} ) {
431 print $indent,
432 " * ambilocal element \U$tag\E makes an implicit HEAD.\n"
433 if DEBUG > 1;
434
435 # or rather, points us at it.
436 $self->{'_pos'}
437 = $self->{'_head'}; # to insert under...
438 }
439 else {
440 $self->warning(
441 "Ambilocal element <$tag> not under HEAD or BODY!?"
442 );
443
444 # Put it under HEAD by default, I guess
445 $self->{'_pos'}
446 = $self->{'_head'}; # to insert under...
447 }
448
449 }
450 else {
451
452 # Neither under head nor body, nor right under html... pass thru?
453 $self->warning(
454 "Ambilocal element <$tag> neither under head nor body, nor right under html!?"
455 );
456 }
457 }
458
459 #----------------------------------------------------------------------
460 }
461 elsif ( $HTML::TreeBuilder::isBodyElement{$tag} ) {
462
463 # Ensure that we are within <body>
464 if ( $ptag eq 'body' ) {
465
466 # We're good.
467 }
468 elsif (
469 $HTML::TreeBuilder::isBodyElement{$ptag} # glarg
470 and not $HTML::TreeBuilder::isHeadOrBodyElement{$ptag}
471 )
472 {
473
474 # Special case: Save ourselves a call to is_inside further down.
475 # If our $ptag is an isBodyElement element (but not an
476 # isHeadOrBodyElement element), then we must be under body!
477 print $indent, " * Inferring that $ptag is under BODY.\n",
478 if DEBUG > 3;
479
480 # I think this and the test for 'body' trap everything
481 # bodyworthy, except the case where the parent element is
482 # under an unknown element that's a descendant of body.
483 }
484 elsif ( $pos->is_inside('head') ) {
485 print $indent,
486 " * body-element \U$tag\E minimizes HEAD, makes implicit BODY.\n"
487 if DEBUG > 1;
488 $ptag = (
489 $pos = $self->{'_pos'}
490 = $self->{'_body'} # yes, needs updating
491 || die "Where'd my body go?"
492 )->{'_tag'}; # yes, needs updating
493 }
494 elsif ( !$pos->is_inside('body') ) {
495 print $indent,
496 " * body-element \U$tag\E makes implicit BODY.\n"
497 if DEBUG > 1;
498 $ptag = (
499 $pos = $self->{'_pos'}
500 = $self->{'_body'} # yes, needs updating
501 || die "Where'd my body go?"
502 )->{'_tag'}; # yes, needs updating
503 }
504
505 # else we ARE under body, so okay.
506
507 # Handle implicit endings and insert based on <tag> and position
508 # ... ALL HOPE ABANDON ALL YE WHO ENTER HERE ...
509 if ( $tag eq 'p'
510 or $tag eq 'h1'
511 or $tag eq 'h2'
512 or $tag eq 'h3'
513 or $tag eq 'h4'
514 or $tag eq 'h5'
515 or $tag eq 'h6'
516 or $tag eq 'form'
517
518 # Hm, should <form> really be here?!
519 )
520 {
521
522 # Can't have <p>, <h#> or <form> inside these
523 $self->end(
524 $_Closed_by_structurals,
525 @HTML::TreeBuilder::p_closure_barriers
526
527 # used to be just li!
528 );
529
530 }
531 elsif ( $tag eq 'ol' or $tag eq 'ul' or $tag eq 'dl' ) {
532
533 # Can't have lists inside <h#> -- in the unlikely
534 # event anyone tries to put them there!
535 if ( $ptag eq 'h1'
536 or $ptag eq 'h2'
537 or $ptag eq 'h3'
538 or $ptag eq 'h4'
539 or $ptag eq 'h5'
540 or $ptag eq 'h6' )
541 {
542 $self->end( \$ptag );
543 }
544
545 # TODO: Maybe keep closing up the tree until
546 # the ptag isn't any of the above?
547 # But anyone that says <h1><h2><ul>...
548 # deserves what they get anyway.
549
550 }
551 elsif ( $tag eq 'li' ) { # list item
552 # Get under a list tag, one way or another
553 unless (
554 exists $HTML::TreeBuilder::isList{$ptag}
555 or $self->end( \q{*}, keys %HTML::TreeBuilder::isList ) #'
556 )
557 {
558 print $indent,
559 " * inserting implicit UL for lack of containing ",
560 join( '|', keys %HTML::TreeBuilder::isList ), ".\n"
561 if DEBUG > 1;
562 $self->insert_element( 'ul', 1 );
563 }
564
565 }
566 elsif ( $tag eq 'dt' or $tag eq 'dd' ) {
567
568 # Get under a DL, one way or another
569 unless ( $ptag eq 'dl' or $self->end( \q{*}, 'dl' ) ) { #'
570 print $indent,
571 " * inserting implicit DL for lack of containing DL.\n"
572 if DEBUG > 1;
573 $self->insert_element( 'dl', 1 );
574 }
575
576 }
577 elsif ( $HTML::TreeBuilder::isFormElement{$tag} ) {
578 if ($self->{
579 '_ignore_formies_outside_form'} # TODO: document this
580 and not $pos->is_inside('form')
581 )
582 {
583 print $indent,
584 " * ignoring \U$tag\E because not in a FORM.\n"
585 if DEBUG > 1;
586 return; # bypass tweaking.
587 }
588 if ( $tag eq 'option' ) {
589
590 # return unless $ptag eq 'select';
591 $self->end( \q{option} );
592 $ptag = ( $self->{'_pos'} || $self )->{'_tag'};
593 unless ( $ptag eq 'select' or $ptag eq 'optgroup' ) {
594 print $indent,
595 " * \U$tag\E makes an implicit SELECT.\n"
596 if DEBUG > 1;
597 $pos = $self->insert_element( 'select', 1 );
598
599 # but not a very useful select -- has no 'name' attribute!
600 # is $pos's value used after this?
601 }
602 }
603 }
604 elsif ( $HTML::TreeBuilder::isTableElement{$tag} ) {
605 if ( !$pos->is_inside('table') ) {
606 print $indent, " * \U$tag\E makes an implicit TABLE\n"
607 if DEBUG > 1;
608 $self->insert_element( 'table', 1 );
609 }
610
611 if ( $tag eq 'td' or $tag eq 'th' ) {
612
613 # Get under a tr one way or another
614 unless (
615 $ptag eq 'tr' # either under a tr
616 or $self->end( \q{*}, 'tr',
617 'table' ) #or we can get under one
618 )
619 {
620 print $indent,
621 " * \U$tag\E under \U$ptag\E makes an implicit TR\n"
622 if DEBUG > 1;
623 $self->insert_element( 'tr', 1 );
624
625 # presumably pos's value isn't used after this.
626 }
627 }
628 else {
629 $self->end( \$tag, 'table' ); #'
630 }
631
632 # Hmm, I guess this is right. To work it out:
633 # tr closes any open tr (limited at a table)
634 # thead closes any open thead (limited at a table)
635 # tbody closes any open tbody (limited at a table)
636 # tfoot closes any open tfoot (limited at a table)
637 # colgroup closes any open colgroup (limited at a table)
638 # col can try, but will always fail, at the enclosing table,
639 # as col is empty, and therefore never open!
640 # But!
641 # td closes any open td OR th (limited at a table)
642 # th closes any open th OR td (limited at a table)
643 # ...implementable as "close to a tr, or make a tr"
644
645 }
646 elsif ( $HTML::TreeBuilder::isPhraseMarkup{$tag} ) {
647 if ( $ptag eq 'body' and $self->{'_implicit_body_p_tag'} ) {
648 print
649 " * Phrasal \U$tag\E right under BODY makes an implicit P\n"
650 if DEBUG > 1;
651 $pos = $self->insert_element( 'p', 1 );
652
653 # is $pos's value used after this?
654 }
655 }
656
657 # End of implicit endings logic
658
659 # End of "elsif ($HTML::TreeBuilder::isBodyElement{$tag}"
660 #----------------------------------------------------------------------
661
662 }
663 elsif ( $HTML::TreeBuilder::isHeadElement{$tag} ) {
664 if ( $pos->is_inside('body') ) {
665 print $indent, " * head element \U$tag\E found inside BODY!\n"
666 if DEBUG;
667 $self->warning("Header element <$tag> in body"); # [sic]
668 }
669 elsif ( !$pos->is_inside('head') ) {
670 print $indent,
671 " * head element \U$tag\E makes an implicit HEAD.\n"
672 if DEBUG > 1;
673 }
674 else {
675 print $indent,
676 " * head element \U$tag\E goes inside existing HEAD.\n"
677 if DEBUG > 1;
678 }
679 $self->{'_pos'} = $self->{'_head'} || die "Where'd my head go?";
680
681 #----------------------------------------------------------------------
682 }
683 elsif ( $tag eq 'html' ) {
684 if ( delete $self->{'_implicit'} ) { # first time here
685 print $indent, " * good! found the real HTML element!\n"
686 if DEBUG > 1;
687 }
688 else {
689 print $indent, " * Found a second HTML element\n"
690 if DEBUG;
691 $self->warning("Found a nested <html> element");
692 }
693
694 # in either case, migrate attributes to the real element
695 for ( keys %$attr ) {
696 $self->attr( $_, $attr->{$_} );
697 }
698 $self->{'_pos'} = undef;
699 return $self; # bypass tweaking.
700
701 #----------------------------------------------------------------------
702 }
703 elsif ( $tag eq 'head' ) {
704 my $head = $self->{'_head'} || die "Where'd my head go?";
705 if ( delete $head->{'_implicit'} ) { # first time here
706 print $indent, " * good! found the real HEAD element!\n"
707 if DEBUG > 1;
708 }
709 else { # been here before
710 print $indent, " * Found a second HEAD element\n"
711 if DEBUG;
712 $self->warning("Found a second <head> element");
713 }
714
715 # in either case, migrate attributes to the real element
716 for ( keys %$attr ) {
717 $head->attr( $_, $attr->{$_} );
718 }
719 return $self->{'_pos'} = $head; # bypass tweaking.
720
721 #----------------------------------------------------------------------
722 }
723 elsif ( $tag eq 'body' ) {
724 my $body = $self->{'_body'} || die "Where'd my body go?";
725 if ( delete $body->{'_implicit'} ) { # first time here
726 print $indent, " * good! found the real BODY element!\n"
727 if DEBUG > 1;
728 }
729 else { # been here before
730 print $indent, " * Found a second BODY element\n"
731 if DEBUG;
732 $self->warning("Found a second <body> element");
733 }
734
735 # in either case, migrate attributes to the real element
736 for ( keys %$attr ) {
737 $body->attr( $_, $attr->{$_} );
738 }
739 return $self->{'_pos'} = $body; # bypass tweaking.
740
741 #----------------------------------------------------------------------
742 }
743 elsif ( $tag eq 'frameset' ) {
744 if (!( $self->{'_frameset_seen'}++ ) # first frameset seen
745 and !$self->{'_noframes_seen'}
746
747 # otherwise it'll be under the noframes already
748 and !$self->is_inside('body')
749 )
750 {
751
752 # The following is a bit of a hack. We don't use the normal
753 # insert_element because 1) we don't want it as _pos, but instead
754 # right under $self, and 2), more importantly, that we don't want
755 # this inserted at the /end/ of $self's content_list, but instead
756 # in the middle of it, specifically right before the body element.
757 #
758 my $c = $self->{'_content'} || die "Contentless root?";
759 my $body = $self->{'_body'} || die "Where'd my BODY go?";
760 for ( my $i = 0; $i < @$c; ++$i ) {
761 if ( $c->[$i] eq $body ) {
762 splice( @$c, $i, 0, $self->{'_pos'} = $pos = $e );
763 HTML::Element::_weaken($e->{'_parent'} = $self);
764 $already_inserted = 1;
765 print $indent,
766 " * inserting 'frameset' right before BODY.\n"
767 if DEBUG > 1;
768 last;
769 }
770 }
771 die "BODY not found in children of root?"
772 unless $already_inserted;
773 }
774
775 }
776 elsif ( $tag eq 'frame' ) {
777
778 # Okay, fine, pass thru.
779 # Should probably enforce that these should be under a frameset.
780 # But hey. Ditto for enforcing that 'noframes' should be under
781 # a 'frameset', as the DTDs say.
782
783 }
784 elsif ( $tag eq 'noframes' ) {
785
786 # This basically assumes there'll be exactly one 'noframes' element
787 # per document. At least, only the first one gets to have the
788 # body under it. And if there are no noframes elements, then
789 # the body pretty much stays where it is. Is that ever a problem?
790 if ( $self->{'_noframes_seen'}++ ) {
791 print $indent, " * ANOTHER noframes element?\n" if DEBUG;
792 }
793 else {
794 if ( $pos->is_inside('body') ) {
795 print $indent, " * 'noframes' inside 'body'. Odd!\n"
796 if DEBUG;
797
798 # In that odd case, we /can't/ make body a child of 'noframes',
799 # because it's an ancestor of the 'noframes'!
800 }
801 else {
802 $e->push_content( $self->{'_body'}
803 || die "Where'd my body go?" );
804 print $indent, " * Moving body to be under noframes.\n"
805 if DEBUG;
806 }
807 }
808
809 #----------------------------------------------------------------------
810 }
811 else {
812
813 # unknown tag
814 if ( $self->{'_ignore_unknown'} ) {
815 print $indent, " * Ignoring unknown tag \U$tag\E\n" if DEBUG;
816 $self->warning("Skipping unknown tag $tag");
817 return;
818 }
819 else {
820 print $indent, " * Accepting unknown tag \U$tag\E\n"
821 if DEBUG;
822 }
823 }
824
825 #----------------------------------------------------------------------
826 # End of mumbo-jumbo
827
828 print $indent, "(Attaching ", $e->{'_tag'}, " under ",
829 ( $self->{'_pos'} || $self )->{'_tag'}, ")\n"
830
831 # because if _pos isn't defined, it goes under self
832 if DEBUG;
833
834 # The following if-clause is to delete /some/ ignorable whitespace
835 # nodes, as we're making the tree.
836 # This'd be a node we'd catch later anyway, but we might as well
837 # nip it in the bud now.
838 # This doesn't catch /all/ deletable WS-nodes, so we do have to call
839 # the tightener later to catch the rest.
840
841 if ( $self->{'_tighten'} and !$self->{'_ignore_text'} )
842 { # if tightenable
843 my ( $sibs, $par );
844 if (( $sibs = ( $par = $self->{'_pos'} || $self )->{'_content'} )
845 and @$sibs # parent already has content
846 and !
847 ref( $sibs->[-1] ) # and the last one there is a text node
848 and $sibs->[-1] !~ m<[^\n\r\f\t ]>s # and it's all whitespace
849
850 and ( # one of these has to be eligible...
851 $HTML::TreeBuilder::canTighten{$tag}
852 or (( @$sibs == 1 )
853 ? # WS is leftmost -- so parent matters
854 $HTML::TreeBuilder::canTighten{ $par->{'_tag'} }
855 : # WS is after another node -- it matters
856 ( ref $sibs->[-2]
857 and
858 $HTML::TreeBuilder::canTighten{ $sibs->[-2]
859 {'_tag'} }
860 )
861 )
862 )
863
864 and !$par->is_inside( 'pre', 'xmp', 'textarea', 'plaintext' )
865
866 # we're clear
867 )
868 {
869 pop @$sibs;
870 print $indent, "Popping a preceding all-WS node\n" if DEBUG;
871 }
872 }
873
874 $self->insert_element($e) unless $already_inserted;
875
876 if (DEBUG) {
877 if ( $self->{'_pos'} ) {
878 print $indent, "(Current lineage of pos: \U$tag\E under ",
879 join(
880 '/',
881 reverse(
882
883 # $self->{'_pos'}{'_tag'}, # don't list myself!
884 $self->{'_pos'}->lineage_tag_names
885 )
886 ),
887 ".)\n";
888 }
889 else {
890 print $indent, "(Pos points nowhere!?)\n";
891 }
892 }
893
894 unless ( ( $self->{'_pos'} || '' ) eq $e ) {
895
896 # if it's an empty element -- i.e., if it didn't change the _pos
897 &{ $self->{"_tweak_$tag"}
898 || $self->{'_tweak_*'}
899 || return $e }( map $_, $e, $tag, $self )
900 ; # make a list so the user can't clobber
901 }
902
903 return $e;
904 }
905}
906
907#==========================================================================
908
909{
910 my $indent;
911
912 sub end {
913 return if $_[0]{'_stunted'};
914
915 # Either: Acccept an end-tag signal from HTML::Parser
916 # Or: Method for closing currently open elements in some fairly complex
917 # way, as used by other methods in this class.
918 my ( $self, $tag, @stop ) = @_;
919 if ( $tag eq 'x-html' ) {
920 print "Ignoring close-x-html tag.\n" if DEBUG;
921
922 # inserted by some lame code-generators.
923 return;
924 }
925
926 unless ( ref($tag) or $tag =~ m/^[-_a-zA-Z0-9:%]+$/s ) {
927 DEBUG and print "End-tag name $tag is no good. Skipping.\n";
928 return;
929
930 # This avoids having Element's new() throw an exception.
931 }
932
933 # This method accepts two calling formats:
934 # 1) from Parser: $self->end('tag_name', 'origtext')
935 # in which case we shouldn't mistake origtext as a blocker tag
936 # 2) from myself: $self->end(\q{tagname1}, 'blk1', ... )
937 # from myself: $self->end(['tagname1', 'tagname2'], 'blk1', ... )
938
939 # End the specified tag, but don't move above any of the blocker tags.
940 # The tag can also be a reference to an array. Terminate the first
941 # tag found.
942
943 my $ptag = ( my $p = $self->{'_pos'} || $self )->{'_tag'};
944
945 # $p and $ptag are sort-of stratch
946
947 if ( ref($tag) ) {
948
949 # First param is a ref of one sort or another --
950 # THE CALL IS COMING FROM INSIDE THE HOUSE!
951 $tag = $$tag if ref($tag) eq 'SCALAR';
952
953 # otherwise it's an arrayref.
954 }
955 else {
956
957 # the call came from Parser -- just ignore origtext
958 # except in a table ignore unmatched table tags RT #59980
959 @stop = $tag =~ /^t[hdr]\z/ ? 'table' : ();
960 }
961
962 #my($indent);
963 if (DEBUG) {
964
965 # optimization -- don't figure out depth unless we're in debug mode
966 my @lineage_tags = $p->lineage_tag_names;
967 $indent = ' ' x ( 1 + @lineage_tags );
968
969 # now announce ourselves
970 print $indent, "Ending ",
971 ref($tag) ? ( '[', join( ' ', @$tag ), ']' ) : "\U$tag\E",
972 scalar(@stop)
973 ? ( " no higher than [", join( ' ', @stop ), "]" )
974 : (), ".\n";
975
976 print $indent, " (Current lineage: ", join( '/', @lineage_tags ),
977 ".)\n"
978 if DEBUG > 1;
979
980 if ( DEBUG > 3 ) {
981
982 #my(
983 # $package, $filename, $line, $subroutine,
984 # $hasargs, $wantarray, $evaltext, $is_require) = caller;
985 print $indent,
986 " (Called from ", ( caller(1) )[3], ' line ',
987 ( caller(1) )[2],
988 ")\n";
989 }
990
991 #} else {
992 # $indent = ' ';
993 }
994
995 # End of if DEBUG
996
997 # Now actually do it
998 my @to_close;
999 if ( $tag eq '*' ) {
1000
1001 # Special -- close everything up to (but not including) the first
1002 # limiting tag, or return if none found. Somewhat of a special case.
1003 PARENT:
1004 while ( defined $p ) {
1005 $ptag = $p->{'_tag'};
1006 print $indent, " (Looking at $ptag.)\n" if DEBUG > 2;
1007 for (@stop) {
1008 if ( $ptag eq $_ ) {
1009 print $indent,
1010 " (Hit a $_; closing everything up to here.)\n"
1011 if DEBUG > 2;
1012 last PARENT;
1013 }
1014 }
1015 push @to_close, $p;
1016 $p = $p->{'_parent'}; # no match so far? keep moving up
1017 print $indent,
1018 " (Moving on up to ", $p ? $p->{'_tag'} : 'nil', ")\n"
1019 if DEBUG > 1;
1020 }
1021 unless ( defined $p ) { # We never found what we were looking for.
1022 print $indent, " (We never found a limit.)\n" if DEBUG > 1;
1023 return;
1024 }
1025
1026 #print
1027 # $indent,
1028 # " (To close: ", join('/', map $_->tag, @to_close), ".)\n"
1029 # if DEBUG > 4;
1030
1031 # Otherwise update pos and fall thru.
1032 $self->{'_pos'} = $p;
1033 }
1034 elsif ( ref $tag ) {
1035
1036 # Close the first of any of the matching tags, giving up if you hit
1037 # any of the stop-tags.
1038 PARENT:
1039 while ( defined $p ) {
1040 $ptag = $p->{'_tag'};
1041 print $indent, " (Looking at $ptag.)\n" if DEBUG > 2;
1042 for (@$tag) {
1043 if ( $ptag eq $_ ) {
1044 print $indent, " (Closing $_.)\n" if DEBUG > 2;
1045 last PARENT;
1046 }
1047 }
1048 for (@stop) {
1049 if ( $ptag eq $_ ) {
1050 print $indent,
1051 " (Hit a limiting $_ -- bailing out.)\n"
1052 if DEBUG > 1;
1053 return; # so it was all for naught
1054 }
1055 }
1056 push @to_close, $p;
1057 $p = $p->{'_parent'};
1058 }
1059 return unless defined $p; # We went off the top of the tree.
1060 # Otherwise specified element was found; set pos to its parent.
1061 push @to_close, $p;
1062 $self->{'_pos'} = $p->{'_parent'};
1063 }
1064 else {
1065
1066 # Close the first of the specified tag, giving up if you hit
1067 # any of the stop-tags.
1068 while ( defined $p ) {
1069 $ptag = $p->{'_tag'};
1070 print $indent, " (Looking at $ptag.)\n" if DEBUG > 2;
1071 if ( $ptag eq $tag ) {
1072 print $indent, " (Closing $tag.)\n" if DEBUG > 2;
1073 last;
1074 }
1075 for (@stop) {
1076 if ( $ptag eq $_ ) {
1077 print $indent,
1078 " (Hit a limiting $_ -- bailing out.)\n"
1079 if DEBUG > 1;
1080 return; # so it was all for naught
1081 }
1082 }
1083 push @to_close, $p;
1084 $p = $p->{'_parent'};
1085 }
1086 return unless defined $p; # We went off the top of the tree.
1087 # Otherwise specified element was found; set pos to its parent.
1088 push @to_close, $p;
1089 $self->{'_pos'} = $p->{'_parent'};
1090 }
1091
1092 $self->{'_pos'} = undef if $self eq ( $self->{'_pos'} || '' );
1093 print $indent, "(Pos now points to ",
1094 $self->{'_pos'} ? $self->{'_pos'}{'_tag'} : '???', ".)\n"
1095 if DEBUG > 1;
1096
1097 ### EXPENSIVE, because has to check that it's not under a pre
1098 ### or a CDATA-parent. That's one more method call per end()!
1099 ### Might as well just do this at the end of the tree-parse, I guess,
1100 ### at which point we'd be parsing top-down, and just not traversing
1101 ### under pre's or CDATA-parents.
1102 ##
1103 ## Take this opportunity to nix any terminal whitespace nodes.
1104 ## TODO: consider whether this (plus the logic in start(), above)
1105 ## would ever leave any WS nodes in the tree.
1106 ## If not, then there's no reason to have eof() call
1107 ## delete_ignorable_whitespace on the tree, is there?
1108 ##
1109 #if(@to_close and $self->{'_tighten'} and !$self->{'_ignore_text'} and
1110 # ! $to_close[-1]->is_inside('pre', keys %HTML::Tagset::isCDATA_Parent)
1111 #) { # if tightenable
1112 # my($children, $e_tag);
1113 # foreach my $e (reverse @to_close) { # going top-down
1114 # last if 'pre' eq ($e_tag = $e->{'_tag'}) or
1115 # $HTML::Tagset::isCDATA_Parent{$e_tag};
1116 #
1117 # if(
1118 # $children = $e->{'_content'}
1119 # and @$children # has children
1120 # and !ref($children->[-1])
1121 # and $children->[-1] =~ m<^\s+$>s # last node is all-WS
1122 # and
1123 # (
1124 # # has a tightable parent:
1125 # $HTML::TreeBuilder::canTighten{ $e_tag }
1126 # or
1127 # ( # has a tightenable left sibling:
1128 # @$children > 1 and
1129 # ref($children->[-2])
1130 # and $HTML::TreeBuilder::canTighten{ $children->[-2]{'_tag'} }
1131 # )
1132 # )
1133 # ) {
1134 # pop @$children;
1135 # #print $indent, "Popping a terminal WS node from ", $e->{'_tag'},
1136 # # " (", $e->address, ") while exiting.\n" if DEBUG;
1137 # }
1138 # }
1139 #}
1140
1141 foreach my $e (@to_close) {
1142
1143 # Call the applicable callback, if any
1144 $ptag = $e->{'_tag'};
1145 &{ $self->{"_tweak_$ptag"}
1146 || $self->{'_tweak_*'}
1147 || next }( map $_, $e, $ptag, $self );
1148 print $indent, "Back from tweaking.\n" if DEBUG;
1149 last
1150 if $self->{ '_stunted'
1151 }; # in case one of the handlers called stunt
1152 }
1153 return @to_close;
1154 }
1155}
1156
1157#==========================================================================
1158{
1159 my ( $indent, $nugget );
1160
1161 sub text {
1162 return if $_[0]{'_stunted'};
1163
1164 # Accept a "here's a text token" signal from HTML::Parser.
1165 my ( $self, $text, $is_cdata ) = @_;
1166
1167 # the >3.0 versions of Parser may pass a cdata node.
1168 # Thanks to Gisle Aas for pointing this out.
1169
1170 return unless length $text; # I guess that's always right
1171
1172 my $ignore_text = $self->{'_ignore_text'};
1173 my $no_space_compacting = $self->{'_no_space_compacting'};
1174 my $no_expand_entities = $self->{'_no_expand_entities'};
1175 my $pos = $self->{'_pos'} || $self;
1176
1177 HTML::Entities::decode($text)
1178 unless $ignore_text
1179 || $is_cdata
1180 || $HTML::Tagset::isCDATA_Parent{ $pos->{'_tag'} }
1181 || $no_expand_entities;
1182
1183 #my($indent, $nugget);
1184 if (DEBUG) {
1185
1186 # optimization -- don't figure out depth unless we're in debug mode
1187 my @lineage_tags = $pos->lineage_tag_names;
1188 $indent = ' ' x ( 1 + @lineage_tags );
1189
1190 $nugget
1191 = ( length($text) <= 25 )
1192 ? $text
1193 : ( substr( $text, 0, 25 ) . '...' );
1194 $nugget =~ s<([\x00-\x1F])>
1195 <'\\x'.(unpack("H2",$1))>eg;
1196 print $indent, "Proposing a new text node ($nugget) under ",
1197 join( '/', reverse( $pos->{'_tag'}, @lineage_tags ) )
1198 || 'Root',
1199 ".\n";
1200
1201 #} else {
1202 # $indent = ' ';
1203 }
1204
1205 my $ptag;
1206 if ($HTML::Tagset::isCDATA_Parent{ $ptag = $pos->{'_tag'} }
1207
1208 #or $pos->is_inside('pre')
1209 or $pos->is_inside( 'pre', 'textarea' )
1210 )
1211 {
1212 return if $ignore_text;
1213 $pos->push_content($text);
1214 }
1215 else {
1216
1217 # return unless $text =~ /\S/; # This is sometimes wrong
1218
1219 if ( !$self->{'_implicit_tags'} || $text !~ /[^\n\r\f\t ]/ ) {
1220
1221 # don't change anything
1222 }
1223 elsif ( $ptag eq 'head' or $ptag eq 'noframes' ) {
1224 if ( $self->{'_implicit_body_p_tag'} ) {
1225 print $indent,
1226 " * Text node under \U$ptag\E closes \U$ptag\E, implicates BODY and P.\n"
1227 if DEBUG > 1;
1228 $self->end( \$ptag );
1229 $pos = $self->{'_body'}
1230 ? ( $self->{'_pos'}
1231 = $self->{'_body'} ) # expected case
1232 : $self->insert_element( 'body', 1 );
1233 $pos = $self->insert_element( 'p', 1 );
1234 }
1235 else {
1236 print $indent,
1237 " * Text node under \U$ptag\E closes, implicates BODY.\n"
1238 if DEBUG > 1;
1239 $self->end( \$ptag );
1240 $pos = $self->{'_body'}
1241 ? ( $self->{'_pos'}
1242 = $self->{'_body'} ) # expected case
1243 : $self->insert_element( 'body', 1 );
1244 }
1245 }
1246 elsif ( $ptag eq 'html' ) {
1247 if ( $self->{'_implicit_body_p_tag'} ) {
1248 print $indent,
1249 " * Text node under HTML implicates BODY and P.\n"
1250 if DEBUG > 1;
1251 $pos = $self->{'_body'}
1252 ? ( $self->{'_pos'}
1253 = $self->{'_body'} ) # expected case
1254 : $self->insert_element( 'body', 1 );
1255 $pos = $self->insert_element( 'p', 1 );
1256 }
1257 else {
1258 print $indent,
1259 " * Text node under HTML implicates BODY.\n"
1260 if DEBUG > 1;
1261 $pos = $self->{'_body'}
1262 ? ( $self->{'_pos'}
1263 = $self->{'_body'} ) # expected case
1264 : $self->insert_element( 'body', 1 );
1265
1266 #print "POS is $pos, ", $pos->{'_tag'}, "\n";
1267 }
1268 }
1269 elsif ( $ptag eq 'body' ) {
1270 if ( $self->{'_implicit_body_p_tag'} ) {
1271 print $indent, " * Text node under BODY implicates P.\n"
1272 if DEBUG > 1;
1273 $pos = $self->insert_element( 'p', 1 );
1274 }
1275 }
1276 elsif ( $ptag eq 'table' ) {
1277 print $indent,
1278 " * Text node under TABLE implicates TR and TD.\n"
1279 if DEBUG > 1;
1280 $self->insert_element( 'tr', 1 );
1281 $pos = $self->insert_element( 'td', 1 );
1282
1283 # double whammy!
1284 }
1285 elsif ( $ptag eq 'tr' ) {
1286 print $indent, " * Text node under TR implicates TD.\n"
1287 if DEBUG > 1;
1288 $pos = $self->insert_element( 'td', 1 );
1289 }
1290
1291 # elsif (
1292 # # $ptag eq 'li' ||
1293 # # $ptag eq 'dd' ||
1294 # $ptag eq 'form') {
1295 # $pos = $self->insert_element('p', 1);
1296 #}
1297
1298 # Whatever we've done above should have had the side
1299 # effect of updating $self->{'_pos'}
1300
1301 #print "POS is now $pos, ", $pos->{'_tag'}, "\n";
1302
1303 return if $ignore_text;
1304 $text =~ s/[\n\r\f\t ]+/ /g # canonical space
1305 unless $no_space_compacting;
1306
1307 print $indent, " (Attaching text node ($nugget) under ",
1308
1309 # was: $self->{'_pos'} ? $self->{'_pos'}{'_tag'} : $self->{'_tag'},
1310 $pos->{'_tag'}, ").\n"
1311 if DEBUG > 1;
1312
1313 $pos->push_content($text);
1314 }
1315
1316 &{ $self->{'_tweak_~text'} || return }( $text, $pos,
1317 $pos->{'_tag'} . '' );
1318
1319 # Note that this is very exceptional -- it doesn't fall back to
1320 # _tweak_*, and it gives its tweak different arguments.
1321 return;
1322 }
1323}
1324
1325#==========================================================================
1326
1327# TODO: test whether comment(), declaration(), and process(), do the right
1328# thing as far as tightening and whatnot.
1329# Also, currently, doctypes and comments that appear before head or body
1330# show up in the tree in the wrong place. Something should be done about
1331# this. Tricky. Maybe this whole business of pre-making the body and
1332# whatnot is wrong.
1333
1334sub comment {
1335 return if $_[0]{'_stunted'};
1336
1337 # Accept a "here's a comment" signal from HTML::Parser.
1338
1339 my ( $self, $text ) = @_;
1340 my $pos = $self->{'_pos'} || $self;
1341 return
1342 unless $self->{'_store_comments'}
1343 || $HTML::Tagset::isCDATA_Parent{ $pos->{'_tag'} };
1344
1345 if (DEBUG) {
1346 my @lineage_tags = $pos->lineage_tag_names;
1347 my $indent = ' ' x ( 1 + @lineage_tags );
1348
1349 my $nugget
1350 = ( length($text) <= 25 )
1351 ? $text
1352 : ( substr( $text, 0, 25 ) . '...' );
1353 $nugget =~ s<([\x00-\x1F])>
1354 <'\\x'.(unpack("H2",$1))>eg;
1355 print $indent, "Proposing a Comment ($nugget) under ",
1356 join( '/', reverse( $pos->{'_tag'}, @lineage_tags ) ) || 'Root',
1357 ".\n";
1358 }
1359
1360 ( my $e = $self->element_class->new('~comment') )->{'text'} = $text;
1361 $pos->push_content($e);
1362 ++( $self->{'_element_count'} );
1363
1364 &{ $self->{'_tweak_~comment'}
1365 || $self->{'_tweak_*'}
1366 || return $e }( map $_, $e, '~comment', $self );
1367
1368 return $e;
1369}
1370
1371sub declaration {
1372 return if $_[0]{'_stunted'};
1373
1374 # Accept a "here's a markup declaration" signal from HTML::Parser.
1375
1376 my ( $self, $text ) = @_;
1377 my $pos = $self->{'_pos'} || $self;
1378
1379 if (DEBUG) {
1380 my @lineage_tags = $pos->lineage_tag_names;
1381 my $indent = ' ' x ( 1 + @lineage_tags );
1382
1383 my $nugget
1384 = ( length($text) <= 25 )
1385 ? $text
1386 : ( substr( $text, 0, 25 ) . '...' );
1387 $nugget =~ s<([\x00-\x1F])>
1388 <'\\x'.(unpack("H2",$1))>eg;
1389 print $indent, "Proposing a Declaration ($nugget) under ",
1390 join( '/', reverse( $pos->{'_tag'}, @lineage_tags ) ) || 'Root',
1391 ".\n";
1392 }
1393 ( my $e = $self->element_class->new('~declaration') )->{'text'} = $text;
1394
1395 $self->{_decl} = $e;
1396 return $e;
1397}
1398
1399#==========================================================================
1400
1401sub process {
1402 return if $_[0]{'_stunted'};
1403
1404 # Accept a "here's a PI" signal from HTML::Parser.
1405
1406 return unless $_[0]->{'_store_pis'};
1407 my ( $self, $text ) = @_;
1408 my $pos = $self->{'_pos'} || $self;
1409
1410 if (DEBUG) {
1411 my @lineage_tags = $pos->lineage_tag_names;
1412 my $indent = ' ' x ( 1 + @lineage_tags );
1413
1414 my $nugget
1415 = ( length($text) <= 25 )
1416 ? $text
1417 : ( substr( $text, 0, 25 ) . '...' );
1418 $nugget =~ s<([\x00-\x1F])>
1419 <'\\x'.(unpack("H2",$1))>eg;
1420 print $indent, "Proposing a PI ($nugget) under ",
1421 join( '/', reverse( $pos->{'_tag'}, @lineage_tags ) ) || 'Root',
1422 ".\n";
1423 }
1424 ( my $e = $self->element_class->new('~pi') )->{'text'} = $text;
1425 $pos->push_content($e);
1426 ++( $self->{'_element_count'} );
1427
1428 &{ $self->{'_tweak_~pi'} || $self->{'_tweak_*'} || return $e }( map $_,
1429 $e, '~pi', $self );
1430
1431 return $e;
1432}
1433
1434#==========================================================================
1435
1436#When you call $tree->parse_file($filename), and the
1437#tree's ignore_ignorable_whitespace attribute is on (as it is
1438#by default), HTML::TreeBuilder's logic will manage to avoid
1439#creating some, but not all, nodes that represent ignorable
1440#whitespace. However, at the end of its parse, it traverses the
1441#tree and deletes any that it missed. (It does this with an
1442#around-method around HTML::Parser's eof method.)
1443#
1444#However, with $tree->parse($content), the cleanup-traversal step
1445#doesn't happen automatically -- so when you're done parsing all
1446#content for a document (regardless of whether $content is the only
1447#bit, or whether it's just another chunk of content you're parsing into
1448#the tree), call $tree->eof() to signal that you're at the end of the
1449#text you're inputting to the tree. Besides properly cleaning any bits
1450#of ignorable whitespace from the tree, this will also ensure that
1451#HTML::Parser's internal buffer is flushed.
1452
1453sub eof {
1454
1455 # Accept an "end-of-file" signal from HTML::Parser, or thrown by the user.
1456
1457 return if $_[0]->{'_done'}; # we've already been here
1458
1459 return $_[0]->SUPER::eof() if $_[0]->{'_stunted'};
1460
1461 my $x = $_[0];
1462 print "EOF received.\n" if DEBUG;
1463 my (@rv);
1464 if (wantarray) {
1465
1466 # I don't think this makes any difference for this particular
1467 # method, but let's be scrupulous, for once.
1468 @rv = $x->SUPER::eof();
1469 }
1470 else {
1471 $rv[0] = $x->SUPER::eof();
1472 }
1473
1474 $x->end('html') unless $x eq ( $x->{'_pos'} || $x );
1475
1476 # That SHOULD close everything, and will run the appropriate tweaks.
1477 # We /could/ be running under some insane mode such that there's more
1478 # than one HTML element, but really, that's just insane to do anyhow.
1479
1480 unless ( $x->{'_implicit_tags'} ) {
1481
1482 # delete those silly implicit head and body in case we put
1483 # them there in implicit tags mode
1484 foreach my $node ( $x->{'_head'}, $x->{'_body'} ) {
1485 $node->replace_with_content
1486 if defined $node
1487 and ref $node
1488 and $node->{'_implicit'}
1489 and $node->{'_parent'};
1490
1491 # I think they should be empty anyhow, since the only
1492 # logic that'd insert under them can apply only, I think,
1493 # in the case where _implicit_tags is on
1494 }
1495
1496 # this may still leave an implicit 'html' at the top, but there's
1497 # nothing we can do about that, is there?
1498 }
1499
1500 $x->delete_ignorable_whitespace()
1501
1502 # this's why we trap this -- an after-method
1503 if $x->{'_tighten'} and !$x->{'_ignore_text'};
1504 $x->{'_done'} = 1;
1505
1506 return @rv if wantarray;
1507 return $rv[0];
1508}
1509
1510#==========================================================================
1511
1512# TODO: document
1513
1514sub stunt {
1515 my $self = $_[0];
1516 print "Stunting the tree.\n" if DEBUG;
1517 $self->{'_done'} = 1;
1518
1519 if ( $HTML::Parser::VERSION < 3 ) {
1520
1521 #This is a MEAN MEAN HACK. And it works most of the time!
1522 $self->{'_buf'} = '';
1523 my $fh = *HTML::Parser::F{IO};
1524
1525 # the local'd FH used by parse_file loop
1526 if ( defined $fh ) {
1527 print "Closing Parser's filehandle $fh\n" if DEBUG;
1528 close($fh);
1529 }
1530
1531 # But if they called $tree->parse_file($filehandle)
1532 # or $tree->parse_file(*IO), then there will be no *HTML::Parser::F{IO}
1533 # to close. Ahwell. Not a problem for most users these days.
1534
1535 }
1536 else {
1537 $self->SUPER::eof();
1538
1539 # Under 3+ versions, calling eof from inside a parse will abort the
1540 # parse / parse_file
1541 }
1542
1543 # In the off chance that the above didn't work, we'll throw
1544 # this flag to make any future events be no-ops.
1545 $self->stunted(1);
1546 return;
1547}
1548
1549# TODO: document
1550sub stunted { shift->_elem( '_stunted', @_ ); }
1551sub done { shift->_elem( '_done', @_ ); }
1552
1553#==========================================================================
1554
1555sub delete {
1556
1557 # Override Element's delete method.
1558 # This does most, if not all, of what Element's delete does anyway.
1559 # Deletes content, including content in some special attributes.
1560 # But doesn't empty out the hash.
1561
1562 $_[0]->{'_element_count'} = 1; # never hurts to be scrupulously correct
1563
1564 delete @{ $_[0] }{ '_body', '_head', '_pos' };
1565 for (
1566 @{ delete( $_[0]->{'_content'} ) || [] }, # all/any content
1567
1568 # delete @{$_[0]}{'_body', '_head', '_pos'}
1569 # ...and these, in case these elements don't appear in the
1570 # content, which is possible. If they did appear (as they
1571 # usually do), then calling $_->delete on them again is harmless.
1572 # I don't think that's such a hot idea now. Thru creative reattachment,
1573 # those could actually now point to elements in OTHER trees (which we do
1574 # NOT want to delete!).
1575## Reasoned out:
1576 # If these point to elements not in the content list of any element in this
1577 # tree, but not in the content list of any element in any OTHER tree, then
1578 # just deleting these will make their refcounts hit zero.
1579 # If these point to elements in the content lists of elements in THIS tree,
1580 # then we'll get to deleting them when we delete from the top.
1581 # If these point to elements in the content lists of elements in SOME OTHER
1582 # tree, then they're not to be deleted.
1583 )
1584 {
1585 $_->delete
1586 if defined $_ and ref $_ # Make sure it's an object.
1587 and $_ ne $_[0]; # And avoid hitting myself, just in case!
1588 }
1589
1590 $_[0]->detach if $_[0]->{'_parent'} and $_[0]->{'_parent'}{'_content'};
1591
1592 # An 'html' element having a parent is quite unlikely.
1593
1594 return;
1595}
1596
1597sub tighten_up { # legacy
1598 shift->delete_ignorable_whitespace(@_);
1599}
1600
1601sub elementify {
1602
1603 # Rebless this object down into the normal element class.
1604 my $self = $_[0];
1605 my $to_class = $self->element_class;
1606 delete @{$self}{
1607 grep {
1608 ;
1609 length $_ and substr( $_, 0, 1 ) eq '_'
1610
1611 # The private attributes that we'll retain:
1612 and $_ ne '_tag'
1613 and $_ ne '_parent'
1614 and $_ ne '_content'
1615 and $_ ne '_implicit'
1616 and $_ ne '_pos'
1617 and $_ ne '_element_class'
1618 } keys %$self
1619 };
1620 bless $self, $to_class; # Returns the same object we were fed
1621}
1622
1623sub element_class {
1624 return 'HTML::Element' if not ref $_[0];
1625 return $_[0]->{_element_class} || 'HTML::Element';
1626}
1627
1628#--------------------------------------------------------------------------
1629
1630sub guts {
1631 my @out;
1632 my @stack = ( $_[0] );
1633 my $destructive = $_[1];
1634 my $this;
1635 while (@stack) {
1636 $this = shift @stack;
1637 if ( !ref $this ) {
1638 push @out, $this; # yes, it can include text nodes
1639 }
1640 elsif ( !$this->{'_implicit'} ) {
1641 push @out, $this;
1642 delete $this->{'_parent'} if $destructive;
1643 }
1644 else {
1645
1646 # it's an implicit node. Delete it and recurse
1647 delete $this->{'_parent'} if $destructive;
1648 unshift @stack,
1649 @{
1650 ( $destructive
1651 ? delete( $this->{'_content'} )
1652 : $this->{'_content'}
1653 )
1654 || []
1655 };
1656 }
1657 }
1658
1659 # Doesn't call a real $root->delete on the (when implicit) root,
1660 # but I don't think it needs to.
1661
1662 return @out if wantarray; # one simple normal case.
1663 return unless @out;
1664 return $out[0] if @out == 1 and ref( $out[0] );
1665 my $x = HTML::Element->new( 'div', '_implicit' => 1 );
1666 $x->push_content(@out);
1667 return $x;
1668}
1669
1670sub disembowel { $_[0]->guts(1) }
1671
1672#--------------------------------------------------------------------------
16731;
1674
1675__END__