← Index
NYTProf Performance Profile   « line view »
For /usr/local/libexec/sympa/task_manager-debug.pl
  Run on Tue Jun 1 22:32:51 2021
Reported on Tue Jun 1 22:35:03 2021

Filename/usr/local/lib/perl5/5.32/Pod/Simple/BlackBox.pm
StatementsExecuted 0 statements in 0s
Subroutines
Calls P F Exclusive
Time
Inclusive
Time
Subroutine
0000s0sPod::Simple::BlackBox::::BEGIN@1522Pod::Simple::BlackBox::BEGIN@1522
0000s0sPod::Simple::BlackBox::::BEGIN@21Pod::Simple::BlackBox::BEGIN@21
0000s0sPod::Simple::BlackBox::::BEGIN@22Pod::Simple::BlackBox::BEGIN@22
0000s0sPod::Simple::BlackBox::::BEGIN@23Pod::Simple::BlackBox::BEGIN@23
0000s0sPod::Simple::BlackBox::::BEGIN@24Pod::Simple::BlackBox::BEGIN@24
0000s0sPod::Simple::BlackBox::::BEGIN@275Pod::Simple::BlackBox::BEGIN@275
0000s0sPod::Simple::BlackBox::::BEGIN@291Pod::Simple::BlackBox::BEGIN@291
0000s0sPod::Simple::BlackBox::::BEGIN@55Pod::Simple::BlackBox::BEGIN@55
0000s0sPod::Simple::BlackBox::::CORE:matchPod::Simple::BlackBox::CORE:match (opcode)
0000s0sPod::Simple::BlackBox::::CORE:qrPod::Simple::BlackBox::CORE:qr (opcode)
0000s0sPod::Simple::BlackBox::::_closers_for_all_curr_openPod::Simple::BlackBox::_closers_for_all_curr_open
0000s0sPod::Simple::BlackBox::::_dump_curr_openPod::Simple::BlackBox::_dump_curr_open
0000s0sPod::Simple::BlackBox::::_gen_errataPod::Simple::BlackBox::_gen_errata
0000s0sPod::Simple::BlackBox::::_handle_encoding_linePod::Simple::BlackBox::_handle_encoding_line
0000s0sPod::Simple::BlackBox::::_handle_encoding_second_levelPod::Simple::BlackBox::_handle_encoding_second_level
0000s0sPod::Simple::BlackBox::::_ponder_DataPod::Simple::BlackBox::_ponder_Data
0000s0sPod::Simple::BlackBox::::_ponder_PlainPod::Simple::BlackBox::_ponder_Plain
0000s0sPod::Simple::BlackBox::::_ponder_VerbatimPod::Simple::BlackBox::_ponder_Verbatim
0000s0sPod::Simple::BlackBox::::_ponder_backPod::Simple::BlackBox::_ponder_back
0000s0sPod::Simple::BlackBox::::_ponder_beginPod::Simple::BlackBox::_ponder_begin
0000s0sPod::Simple::BlackBox::::_ponder_doc_endPod::Simple::BlackBox::_ponder_doc_end
0000s0sPod::Simple::BlackBox::::_ponder_endPod::Simple::BlackBox::_ponder_end
0000s0sPod::Simple::BlackBox::::_ponder_forPod::Simple::BlackBox::_ponder_for
0000s0sPod::Simple::BlackBox::::_ponder_itemPod::Simple::BlackBox::_ponder_item
0000s0sPod::Simple::BlackBox::::_ponder_overPod::Simple::BlackBox::_ponder_over
0000s0sPod::Simple::BlackBox::::_ponder_paragraph_bufferPod::Simple::BlackBox::_ponder_paragraph_buffer
0000s0sPod::Simple::BlackBox::::_ponder_podPod::Simple::BlackBox::_ponder_pod
0000s0sPod::Simple::BlackBox::::_stringify_lolPod::Simple::BlackBox::_stringify_lol
0000s0sPod::Simple::BlackBox::::_traverse_treelet_bitPod::Simple::BlackBox::_traverse_treelet_bit
0000s0sPod::Simple::BlackBox::::_treelet_from_formatting_codesPod::Simple::BlackBox::_treelet_from_formatting_codes
0000s0sPod::Simple::BlackBox::::_verbatim_formatPod::Simple::BlackBox::_verbatim_format
0000s0sPod::Simple::BlackBox::::my_qrPod::Simple::BlackBox::my_qr
0000s0sPod::Simple::BlackBox::::parse_linePod::Simple::BlackBox::parse_line
0000s0sPod::Simple::BlackBox::::parse_linesPod::Simple::BlackBox::parse_lines
0000s0sPod::Simple::BlackBox::::prettyPod::Simple::BlackBox::pretty
0000s0sPod::Simple::BlackBox::::reinitPod::Simple::BlackBox::reinit
0000s0sPod::Simple::BlackBox::::stringify_lolPod::Simple::BlackBox::stringify_lol
0000s0sPod::Simple::BlackBox::::text_content_of_treeletPod::Simple::BlackBox::text_content_of_treelet
Call graph for these subroutines as a Graphviz dot language file.
Line State
ments
Time
on line
Calls Time
in subs
Code
1package Pod::Simple::BlackBox;
2#
3# "What's in the box?" "Pain."
4#
5###########################################################################
6#
7# This is where all the scary things happen: parsing lines into
8# paragraphs; and then into directives, verbatims, and then also
9# turning formatting sequences into treelets.
10#
11# Are you really sure you want to read this code?
12#
13#-----------------------------------------------------------------------------
14#
15# The basic work of this module Pod::Simple::BlackBox is doing the dirty work
16# of parsing Pod into treelets (generally one per non-verbatim paragraph), and
17# to call the proper callbacks on the treelets.
18#
19# Every node in a treelet is a ['name', {attrhash}, ...children...]
20
21use integer; # vroom!
22use strict;
23use Carp ();
24use vars qw($VERSION );
25$VERSION = '3.40';
26#use constant DEBUG => 7;
27
28sub my_qr ($$) {
29
30 # $1 is a pattern to compile and return. Older perls compile any
31 # syntactically valid property, even if it isn't legal. To cope with
32 # this, return an empty string unless the compiled pattern also
33 # successfully matches $2, which the caller furnishes.
34
35 my ($input_re, $should_match) = @_;
36 # XXX could have a third parameter $shouldnt_match for extra safety
37
38 my $use_utf8 = ($] le 5.006002) ? 'use utf8;' : "";
39
40 my $re = eval "no warnings; $use_utf8 qr/$input_re/";
# spent 0s executing statements in string eval # spent 0s executing statements in string eval # spent 0s executing statements in string eval # spent 0s executing statements in string eval # spent 0s executing statements in string eval # spent 0s executing statements in string eval # spent 0s executing statements in string eval # spent 0s executing statements in string eval # spent 0s executing statements in string eval
41 #print STDERR __LINE__, ": $input_re: $@\n" if $@;
42 return "" if $@;
43
44 my $matches = eval "no warnings; $use_utf8 '$should_match' =~ /$re/";
# spent 0s executing statements in string eval # spent 0s executing statements in string eval # spent 0s executing statements in string eval # spent 0s executing statements in string eval # spent 0s executing statements in string eval # spent 0s executing statements in string eval # spent 0s executing statements in string eval # spent 0s executing statements in string eval # spent 0s executing statements in string eval
45 #print STDERR __LINE__, ": $input_re: $@\n" if $@;
46 return "" if $@;
47
48 #print STDERR __LINE__, ": SUCCESS: $re\n" if $matches;
49 return $re if $matches;
50
51 #print STDERR __LINE__, ": $re: didn't match\n";
52 return "";
53}
54
55BEGIN {
56 require Pod::Simple;
57 *DEBUG = \&Pod::Simple::DEBUG unless defined &DEBUG
58}
59
60# Matches a character iff the character will have a different meaning
61# if we choose CP1252 vs UTF-8 if there is no =encoding line.
62# This is broken for early Perls on non-ASCII platforms.
63my $non_ascii_re = my_qr('[[:^ascii:]]', "\xB6");
64$non_ascii_re = qr/[\x80-\xFF]/ unless $non_ascii_re;
65
66# Use patterns understandable by Perl 5.6, if possible
67my $cs_re = my_qr('\p{IsCs}', "\x{D800}");
68my $cn_re = my_qr('\p{IsCn}', "\x{09E4}"); # <reserved> code point unlikely
69 # to get assigned
70my $rare_blocks_re = my_qr('[\p{InIPAExtensions}\p{InSpacingModifierLetters}]',
71 "\x{250}");
72$rare_blocks_re = my_qr('[\x{0250}-\x{02FF}]', "\x{250}") unless $rare_blocks_re;
73
74my $script_run_re = eval 'no warnings "experimental::script_run";
# spent 0s executing statements in string eval
75 qr/(*script_run: ^ .* $ )/x';
76my $latin_re = my_qr('[\p{IsLatin}\p{IsInherited}\p{IsCommon}]', "\x{100}");
77unless ($latin_re) {
78 # This was machine generated to be the ranges of the union of the above
79 # three properties, with things that were undefined by Unicode 4.1 filling
80 # gaps. That is the version in use when Perl advanced enough to
81 # successfully compile and execute the above pattern.
82 $latin_re = my_qr('[\x00-\x{02E9}\x{02EC}-\x{0374}\x{037E}\x{0385}\x{0387}\x{0485}\x{0486}\x{0589}\x{060C}\x{061B}\x{061F}\x{0640}\x{064B}-\x{0655}\x{0670}\x{06DD}\x{0951}-\x{0954}\x{0964}\x{0965}\x{0E3F}\x{10FB}\x{16EB}-\x{16ED}\x{1735}\x{1736}\x{1802}\x{1803}\x{1805}\x{1D00}-\x{1D25}\x{1D2C}-\x{1D5C}\x{1D62}-\x{1D65}\x{1D6B}-\x{1D77}\x{1D79}-\x{1DBE}\x{1DC0}-\x{1EF9}\x{2000}-\x{2125}\x{2127}-\x{27FF}\x{2900}-\x{2B13}\x{2E00}-\x{2E1D}\x{2FF0}-\x{3004}\x{3006}\x{3008}-\x{3020}\x{302A}-\x{302D}\x{3030}-\x{3037}\x{303C}-\x{303F}\x{3099}-\x{309C}\x{30A0}\x{30FB}\x{30FC}\x{3190}-\x{319F}\x{31C0}-\x{31CF}\x{3220}-\x{325F}\x{327F}-\x{32CF}\x{3358}-\x{33FF}\x{4DC0}-\x{4DFF}\x{A700}-\x{A716}\x{FB00}-\x{FB06}\x{FD3E}\x{FD3F}\x{FE00}-\x{FE6B}\x{FEFF}-\x{FF65}\x{FF70}\x{FF9E}\x{FF9F}\x{FFE0}-\x{FFFD}\x{10100}-\x{1013F}\x{1D000}-\x{1D1DD}\x{1D300}-\x{1D7FF}]', "\x{100}");
83}
84
85my $every_char_is_latin_re = my_qr("^(?:$latin_re)*\\z", "A");
86
87# Latin script code points not in the first release of Unicode
88my $later_latin_re = my_qr('[^\P{IsLatin}\p{IsAge=1.1}]', "\x{1F6}");
89
90# If this perl doesn't have the Deprecated property, there's only one code
91# point in it that we need be concerned with.
92my $deprecated_re = my_qr('\p{IsDeprecated}', "\x{149}");
93$deprecated_re = qr/\x{149}/ unless $deprecated_re;
94
95my $utf8_bom;
96if (($] ge 5.007_003)) {
97 $utf8_bom = "\x{FEFF}";
98 utf8::encode($utf8_bom);
99} else {
100 $utf8_bom = "\xEF\xBB\xBF"; # No EBCDIC BOM detection for early Perls.
101}
102
103# This is used so that the 'content_seen' method doesn't return true on a
104# file that just happens to have a line that matches /^=[a-zA-z]/. Only if
105# there is a valid =foo line will we return that content was seen.
106my $seen_legal_directive = 0;
107
108#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
109
110sub parse_line { shift->parse_lines(@_) } # alias
111
112# - - - Turn back now! Run away! - - -
113
114sub parse_lines { # Usage: $parser->parse_lines(@lines)
115 # an undef means end-of-stream
116 my $self = shift;
117
118 my $code_handler = $self->{'code_handler'};
119 my $cut_handler = $self->{'cut_handler'};
120 my $wl_handler = $self->{'whiteline_handler'};
121 $self->{'line_count'} ||= 0;
122
123 my $scratch;
124
125 DEBUG > 4 and
126 print STDERR "# Parsing starting at line ", $self->{'line_count'}, ".\n";
127
128 DEBUG > 5 and
129 print STDERR "# About to parse lines: ",
130 join(' ', map defined($_) ? "[$_]" : "EOF", @_), "\n";
131
132 my $paras = ($self->{'paras'} ||= []);
133 # paragraph buffer. Because we need to defer processing of =over
134 # directives and verbatim paragraphs. We call _ponder_paragraph_buffer
135 # to process this.
136
137 $self->{'pod_para_count'} ||= 0;
138
139 # An attempt to match the pod portions of a line. This is not fool proof,
140 # but is good enough to serve as part of the heuristic for guessing the pod
141 # encoding if not specified.
142 my $format_codes = join "", '[', grep { / ^ [A-Za-z] $/x }
143 keys %{$self->{accept_codes}};
144 $format_codes .= ']';
145 my $pod_chars_re = qr/ ^ = [A-Za-z]+ | $format_codes < /x;
146
147 my $line;
148 foreach my $source_line (@_) {
149 if( $self->{'source_dead'} ) {
150 DEBUG > 4 and print STDERR "# Source is dead.\n";
151 last;
152 }
153
154 unless( defined $source_line ) {
155 DEBUG > 4 and print STDERR "# Undef-line seen.\n";
156
157 push @$paras, ['~end', {'start_line' => $self->{'line_count'}}];
158 push @$paras, $paras->[-1], $paras->[-1];
159 # So that it definitely fills the buffer.
160 $self->{'source_dead'} = 1;
161 $self->_ponder_paragraph_buffer;
162 next;
163 }
164
165
166 if( $self->{'line_count'}++ ) {
167 ($line = $source_line) =~ tr/\n\r//d;
168 # If we don't have two vars, we'll end up with that there
169 # tr/// modding the (potentially read-only) original source line!
170
171 } else {
172 DEBUG > 2 and print STDERR "First line: [$source_line]\n";
173
174 if( ($line = $source_line) =~ s/^$utf8_bom//s ) {
175 DEBUG and print STDERR "UTF-8 BOM seen. Faking a '=encoding utf8'.\n";
176 $self->_handle_encoding_line( "=encoding utf8" );
177 delete $self->{'_processed_encoding'};
178 $line =~ tr/\n\r//d;
179
180 } elsif( $line =~ s/^\xFE\xFF//s ) {
181 DEBUG and print STDERR "Big-endian UTF-16 BOM seen. Aborting parsing.\n";
182 $self->scream(
183 $self->{'line_count'},
184 "UTF16-BE Byte Encoding Mark found; but Pod::Simple v$Pod::Simple::VERSION doesn't implement UTF16 yet."
185 );
186 splice @_;
187 push @_, undef;
188 next;
189
190 # TODO: implement somehow?
191
192 } elsif( $line =~ s/^\xFF\xFE//s ) {
193 DEBUG and print STDERR "Little-endian UTF-16 BOM seen. Aborting parsing.\n";
194 $self->scream(
195 $self->{'line_count'},
196 "UTF16-LE Byte Encoding Mark found; but Pod::Simple v$Pod::Simple::VERSION doesn't implement UTF16 yet."
197 );
198 splice @_;
199 push @_, undef;
200 next;
201
202 # TODO: implement somehow?
203
204 } else {
205 DEBUG > 2 and print STDERR "First line is BOM-less.\n";
206 ($line = $source_line) =~ tr/\n\r//d;
207 }
208 }
209
210 if(!$self->{'parse_characters'} && !$self->{'encoding'}
211 && ($self->{'in_pod'} || $line =~ /^=/s)
212 && $line =~ /$non_ascii_re/
213 ) {
214
215 my $encoding;
216
217 # No =encoding line, and we are at the first pod line in the input that
218 # contains a non-ascii byte, that is, one whose meaning varies depending
219 # on whether the file is encoded in UTF-8 or CP1252, which are the two
220 # possibilities permitted by the pod spec. (ASCII is assumed if the
221 # file only contains ASCII bytes.) In order to process this line, we
222 # need to figure out what encoding we will use for the file.
223 #
224 # Strictly speaking ISO 8859-1 (Latin 1) refers to the code points
225 # 160-255, but it is used here, as it often colloquially is, to refer to
226 # the complete set of code points 0-255, including ASCII (0-127), the C1
227 # controls (128-159), and strict Latin 1 (160-255).
228 #
229 # CP1252 is effectively a superset of Latin 1, because it differs only
230 # from colloquial 8859-1 in the C1 controls, which are very unlikely to
231 # actually be present in 8859-1 files, so can be used for other purposes
232 # without conflict. CP 1252 uses most of them for graphic characters.
233 #
234 # Note that all ASCII-range bytes represent their corresponding code
235 # points in both CP1252 and UTF-8. In ASCII platform UTF-8, all other
236 # code points require multiple (non-ASCII) bytes to represent. (A
237 # separate paragraph for EBCDIC is below.) The multi-byte
238 # representation is quite structured. If we find an isolated byte that
239 # would require multiple bytes to represent in UTF-8, we know that the
240 # encoding is not UTF-8. If we find a sequence of bytes that violates
241 # the UTF-8 structure, we also can presume the encoding isn't UTF-8, and
242 # hence must be 1252.
243 #
244 # But there are ambiguous cases where we could guess wrong. If so, the
245 # user will end up having to supply an =encoding line. We use all
246 # readily available information to improve our chances of guessing
247 # right. The odds of something not being UTF-8, but still passing a
248 # UTF-8 validity test go down very rapidly with increasing length of the
249 # sequence. Therefore we look at all non-ascii sequences on the line.
250 # If any of the sequences can't be UTF-8, we quit there and choose
251 # CP1252. If all could be UTF-8, we see if any of the code points
252 # represented are unlikely to be in pod. If so, we guess CP1252. If
253 # not, we check if the line is all in the same script; if not guess
254 # CP1252; otherwise UTF-8. For perls that don't have convenient script
255 # run testing, see if there is both Latin and non-Latin. If so, CP1252,
256 # otherwise UTF-8.
257 #
258 # On EBCDIC platforms, the situation is somewhat different. In
259 # UTF-EBCDIC, not only do ASCII-range bytes represent their code points,
260 # but so do the bytes that are for the C1 controls. Recall that these
261 # correspond to the unused portion of 8859-1 that 1252 mostly takes
262 # over. That means that there are fewer code points that are
263 # represented by multi-bytes. But, note that the these controls are
264 # very unlikely to be in pod text. So if we encounter one of them, it
265 # means that it is quite likely CP1252 and not UTF-8. The net result is
266 # the same code below is used for both platforms.
267 #
268 # XXX probably if the line has E<foo> that evaluates to illegal CP1252,
269 # then it is UTF-8. But we haven't processed E<> yet.
270
271 goto set_1252 if $] lt 5.006_000; # No UTF-8 on very early perls
272
273 my $copy;
274
275 no warnings 'utf8';
276
277 if ($] ge 5.007_003) {
278 $copy = $line;
279
280 # On perls that have this function, we can use it to easily see if the
281 # sequence is valid UTF-8 or not; if valid it turns on the UTF-8 flag
282 # needed below for script run detection
283 goto set_1252 if ! utf8::decode($copy);
284 }
285 elsif (ord("A") != 65) { # Early EBCDIC, assume UTF-8. What's a windows
286 # code page doing here anyway?
287 goto set_utf8;
288 }
289 else { # ASCII, no decode(): do it ourselves using the fundamental
290 # characteristics of UTF-8
291 use if $] le 5.006002, 'utf8';
292
293 my $char_ord;
294 my $needed; # How many continuation bytes to gobble up
295
296 # Initialize the translated line with a dummy character that will be
297 # deleted after everything else is done. This dummy makes sure that
298 # $copy will be in UTF-8. Doing it now avoids the bugs in early perls
299 # with upgrading in the middle
300 $copy = chr(0x100);
301
302 # Parse through the line
303 for (my $i = 0; $i < length $line; $i++) {
304 my $byte = substr($line, $i, 1);
305
306 # ASCII bytes are trivially dealt with
307 if ($byte !~ $non_ascii_re) {
308 $copy .= $byte;
309 next;
310 }
311
312 my $b_ord = ord $byte;
313
314 # Now figure out what this code point would be if the input is
315 # actually in UTF-8. If, in the process, we discover that it isn't
316 # well-formed UTF-8, we guess CP1252.
317 #
318 # Start the process. If it is UTF-8, we are at the first, start
319 # byte, of a multi-byte sequence. We look at this byte to figure
320 # out how many continuation bytes are needed, and to initialize the
321 # code point accumulator with the data from this byte.
322 #
323 # Normally the minimum continuation byte is 0x80, but in certain
324 # instances the minimum is a higher number. So the code below
325 # overrides this for those instances.
326 my $min_cont = 0x80;
327
328 if ($b_ord < 0xC2) { # A start byte < C2 is malformed
329 goto set_1252;
330 }
331 elsif ($b_ord <= 0xDF) {
332 $needed = 1;
333 $char_ord = $b_ord & 0x1F;
334 }
335 elsif ($b_ord <= 0xEF) {
336 $min_cont = 0xA0 if $b_ord == 0xE0;
337 $needed = 2;
338 $char_ord = $b_ord & (0x1F >> 1);
339 }
340 elsif ($b_ord <= 0xF4) {
341 $min_cont = 0x90 if $b_ord == 0xF0;
342 $needed = 3;
343 $char_ord = $b_ord & (0x1F >> 2);
344 }
345 else { # F4 is the highest start byte for legal Unicode; higher is
346 # unlikely to be in pod.
347 goto set_1252;
348 }
349
350 # ? not enough continuation bytes available
351 goto set_1252 if $i + $needed >= length $line;
352
353 # Accumulate the ordinal of the character from the remaining
354 # (continuation) bytes.
355 while ($needed-- > 0) {
356 my $cont = substr($line, ++$i, 1);
357 $b_ord = ord $cont;
358 goto set_1252 if $b_ord < $min_cont || $b_ord > 0xBF;
359
360 # In all cases, any next continuation bytes all have the same
361 # minimum legal value
362 $min_cont = 0x80;
363
364 # Accumulate this byte's contribution to the code point
365 $char_ord <<= 6;
366 $char_ord |= ($b_ord & 0x3F);
367 }
368
369 # Here, the sequence that formed this code point was valid UTF-8,
370 # so add the completed character to the output
371 $copy .= chr $char_ord;
372 } # End of loop through line
373
374 # Delete the dummy first character
375 $copy = substr($copy, 1);
376 }
377
378 # Here, $copy is legal UTF-8.
379
380 # If it can't be legal CP1252, no need to look further. (These bytes
381 # aren't valid in CP1252.) This test could have been placed higher in
382 # the code, but it seemed wrong to set the encoding to UTF-8 without
383 # making sure that the very first instance is well-formed. But what if
384 # it isn't legal CP1252 either? We have to choose one or the other, and
385 # It seems safer to favor the single-byte encoding over the multi-byte.
386 goto set_utf8 if ord("A") == 65 && $line =~ /[\x81\x8D\x8F\x90\x9D]/;
387
388 # The C1 controls are not likely to appear in pod
389 goto set_1252 if ord("A") == 65 && $copy =~ /[\x80-\x9F]/;
390
391 # Nor are surrogates nor unassigned, nor deprecated.
392 DEBUG > 8 and print STDERR __LINE__, ": $copy: surrogate\n" if $copy =~ $cs_re;
393 goto set_1252 if $cs_re && $copy =~ $cs_re;
394 DEBUG > 8 and print STDERR __LINE__, ": $copy: unassigned\n" if $cn_re && $copy =~ $cn_re;
395 goto set_1252 if $cn_re && $copy =~ $cn_re;
396 DEBUG > 8 and print STDERR __LINE__, ": $copy: deprecated\n" if $copy =~ $deprecated_re;
397 goto set_1252 if $copy =~ $deprecated_re;
398
399 # Nor are rare code points. But this is hard to determine. khw
400 # believes that IPA characters and the modifier letters are unlikely to
401 # be in pod (and certainly very unlikely to be the in the first line in
402 # the pod containing non-ASCII)
403 DEBUG > 8 and print STDERR __LINE__, ": $copy: rare\n" if $copy =~ $rare_blocks_re;
404 goto set_1252 if $rare_blocks_re && $copy =~ $rare_blocks_re;
405
406 # The first Unicode version included essentially every Latin character
407 # in modern usage. So, a Latin character not in the first release will
408 # unlikely be in pod.
409 DEBUG > 8 and print STDERR __LINE__, ": $copy: later_latin\n" if $later_latin_re && $copy =~ $later_latin_re;
410 goto set_1252 if $later_latin_re && $copy =~ $later_latin_re;
411
412 # On perls that handle script runs, if the UTF-8 interpretation yields
413 # a single script, we guess UTF-8, otherwise just having a mixture of
414 # scripts is suspicious, so guess CP1252. We first strip off, as best
415 # we can, the ASCII characters that look like they are pod directives,
416 # as these would always show as mixed with non-Latin text.
417 $copy =~ s/$pod_chars_re//g;
418
419 if ($script_run_re) {
420 goto set_utf8 if $copy =~ $script_run_re;
421 DEBUG > 8 and print STDERR __LINE__, ": not script run\n";
422 goto set_1252;
423 }
424
425 # Even without script runs, but on recent enough perls and Unicodes, we
426 # can check if there is a mixture of both Latin and non-Latin. Again,
427 # having a mixture of scripts is suspicious, so assume CP1252
428
429 # If it's all non-Latin, there is no CP1252, as that is Latin
430 # characters and punct, etc.
431 DEBUG > 8 and print STDERR __LINE__, ": $copy: not latin\n" if $copy !~ $latin_re;
432 goto set_utf8 if $copy !~ $latin_re;
433
434 DEBUG > 8 and print STDERR __LINE__, ": $copy: all latin\n" if $copy =~ $every_char_is_latin_re;
435 goto set_utf8 if $copy =~ $every_char_is_latin_re;
436
437 DEBUG > 8 and print STDERR __LINE__, ": $copy: mixed\n";
438
439 set_1252:
440 DEBUG > 9 and print STDERR __LINE__, ": $copy: is 1252\n";
441 $encoding = 'CP1252';
442 goto done_set;
443
444 set_utf8:
445 DEBUG > 9 and print STDERR __LINE__, ": $copy: is UTF-8\n";
446 $encoding = 'UTF-8';
447
448 done_set:
449 $self->_handle_encoding_line( "=encoding $encoding" );
450 delete $self->{'_processed_encoding'};
451 $self->{'_transcoder'} && $self->{'_transcoder'}->($line);
452
453 my ($word) = $line =~ /(\S*$non_ascii_re\S*)/;
454
455 $self->whine(
456 $self->{'line_count'},
457 "Non-ASCII character seen before =encoding in '$word'. Assuming $encoding"
458 );
459 }
460
461 DEBUG > 5 and print STDERR "# Parsing line: [$line]\n";
462
463 if(!$self->{'in_pod'}) {
464 if($line =~ m/^=([a-zA-Z][a-zA-Z0-9]*)(?:\s|$)/s) {
465 if($1 eq 'cut') {
466 $self->scream(
467 $self->{'line_count'},
468 "=cut found outside a pod block. Skipping to next block."
469 );
470
471 ## Before there were errata sections in the world, it was
472 ## least-pessimal to abort processing the file. But now we can
473 ## just barrel on thru (but still not start a pod block).
474 #splice @_;
475 #push @_, undef;
476
477 next;
478 } else {
479 $self->{'in_pod'} = $self->{'start_of_pod_block'}
480 = $self->{'last_was_blank'} = 1;
481 # And fall thru to the pod-mode block further down
482 }
483 } else {
484 DEBUG > 5 and print STDERR "# It's a code-line.\n";
485 $code_handler->(map $_, $line, $self->{'line_count'}, $self)
486 if $code_handler;
487 # Note: this may cause code to be processed out of order relative
488 # to pods, but in order relative to cuts.
489
490 # Note also that we haven't yet applied the transcoding to $line
491 # by time we call $code_handler!
492
493 if( $line =~ m/^#\s*line\s+(\d+)\s*(?:\s"([^"]+)")?\s*$/ ) {
494 # That RE is from perlsyn, section "Plain Old Comments (Not!)",
495 #$fname = $2 if defined $2;
496 #DEBUG > 1 and defined $2 and print STDERR "# Setting fname to \"$fname\"\n";
497 DEBUG > 1 and print STDERR "# Setting nextline to $1\n";
498 $self->{'line_count'} = $1 - 1;
499 }
500
501 next;
502 }
503 }
504
505 # . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
506 # Else we're in pod mode:
507
508 # Apply any necessary transcoding:
509 $self->{'_transcoder'} && $self->{'_transcoder'}->($line);
510
511 # HERE WE CATCH =encoding EARLY!
512 if( $line =~ m/^=encoding\s+\S+\s*$/s ) {
513 next if $self->parse_characters; # Ignore this line
514 $line = $self->_handle_encoding_line( $line );
515 }
516
517 if($line =~ m/^=cut/s) {
518 # here ends the pod block, and therefore the previous pod para
519 DEBUG > 1 and print STDERR "Noting =cut at line ${$self}{'line_count'}\n";
520 $self->{'in_pod'} = 0;
521 # ++$self->{'pod_para_count'};
522 $self->_ponder_paragraph_buffer();
523 # by now it's safe to consider the previous paragraph as done.
524 DEBUG > 6 and print STDERR "Processing any cut handler, line ${$self}{'line_count'}\n";
525 $cut_handler->(map $_, $line, $self->{'line_count'}, $self)
526 if $cut_handler;
527
528 # TODO: add to docs: Note: this may cause cuts to be processed out
529 # of order relative to pods, but in order relative to code.
530
531 } elsif($line =~ m/^(\s*)$/s) { # it's a blank line
532 if (defined $1 and $1 =~ /[^\S\r\n]/) { # it's a white line
533 $wl_handler->(map $_, $line, $self->{'line_count'}, $self)
534 if $wl_handler;
535 }
536
537 if(!$self->{'start_of_pod_block'} and @$paras and $paras->[-1][0] eq '~Verbatim') {
538 DEBUG > 1 and print STDERR "Saving blank line at line ${$self}{'line_count'}\n";
539 push @{$paras->[-1]}, $line;
540 } # otherwise it's not interesting
541
542 if(!$self->{'start_of_pod_block'} and !$self->{'last_was_blank'}) {
543 DEBUG > 1 and print STDERR "Noting para ends with blank line at ${$self}{'line_count'}\n";
544 }
545
546 $self->{'last_was_blank'} = 1;
547
548 } elsif($self->{'last_was_blank'}) { # A non-blank line starting a new para...
549
550 if($line =~ m/^(=[a-zA-Z][a-zA-Z0-9]*)(\s+|$)(.*)/s) {
551 # THIS IS THE ONE PLACE WHERE WE CONSTRUCT NEW DIRECTIVE OBJECTS
552 my $new = [$1, {'start_line' => $self->{'line_count'}}, $3];
553 $new->[1]{'~orig_spacer'} = $2 if $2 && $2 ne " ";
554 # Note that in "=head1 foo", the WS is lost.
555 # Example: ['=head1', {'start_line' => 123}, ' foo']
556
557 ++$self->{'pod_para_count'};
558
559 $self->_ponder_paragraph_buffer();
560 # by now it's safe to consider the previous paragraph as done.
561
562 push @$paras, $new; # the new incipient paragraph
563 DEBUG > 1 and print STDERR "Starting new ${$paras}[-1][0] para at line ${$self}{'line_count'}\n";
564
565 } elsif($line =~ m/^\s/s) {
566
567 if(!$self->{'start_of_pod_block'} and @$paras and $paras->[-1][0] eq '~Verbatim') {
568 DEBUG > 1 and print STDERR "Resuming verbatim para at line ${$self}{'line_count'}\n";
569 push @{$paras->[-1]}, $line;
570 } else {
571 ++$self->{'pod_para_count'};
572 $self->_ponder_paragraph_buffer();
573 # by now it's safe to consider the previous paragraph as done.
574 DEBUG > 1 and print STDERR "Starting verbatim para at line ${$self}{'line_count'}\n";
575 push @$paras, ['~Verbatim', {'start_line' => $self->{'line_count'}}, $line];
576 }
577 } else {
578 ++$self->{'pod_para_count'};
579 $self->_ponder_paragraph_buffer();
580 # by now it's safe to consider the previous paragraph as done.
581 push @$paras, ['~Para', {'start_line' => $self->{'line_count'}}, $line];
582 DEBUG > 1 and print STDERR "Starting plain para at line ${$self}{'line_count'}\n";
583 }
584 $self->{'last_was_blank'} = $self->{'start_of_pod_block'} = 0;
585
586 } else {
587 # It's a non-blank line /continuing/ the current para
588 if(@$paras) {
589 DEBUG > 2 and print STDERR "Line ${$self}{'line_count'} continues current paragraph\n";
590 push @{$paras->[-1]}, $line;
591 } else {
592 # Unexpected case!
593 die "Continuing a paragraph but \@\$paras is empty?";
594 }
595 $self->{'last_was_blank'} = $self->{'start_of_pod_block'} = 0;
596 }
597
598 } # ends the big while loop
599
600 DEBUG > 1 and print STDERR (pretty(@$paras), "\n");
601 return $self;
602}
603
604#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
605
606sub _handle_encoding_line {
607 my($self, $line) = @_;
608
609 return if $self->parse_characters;
610
611 # The point of this routine is to set $self->{'_transcoder'} as indicated.
612
613 return $line unless $line =~ m/^=encoding\s+(\S+)\s*$/s;
614 DEBUG > 1 and print STDERR "Found an encoding line \"=encoding $1\"\n";
615
616 my $e = $1;
617 my $orig = $e;
618 push @{ $self->{'encoding_command_reqs'} }, "=encoding $orig";
619
620 my $enc_error;
621
622 # Cf. perldoc Encode and perldoc Encode::Supported
623
624 require Pod::Simple::Transcode;
625
626 if( $self->{'encoding'} ) {
627 my $norm_current = $self->{'encoding'};
628 my $norm_e = $e;
629 foreach my $that ($norm_current, $norm_e) {
630 $that = lc($that);
631 $that =~ s/[-_]//g;
632 }
633 if($norm_current eq $norm_e) {
634 DEBUG > 1 and print STDERR "The '=encoding $orig' line is ",
635 "redundant. ($norm_current eq $norm_e). Ignoring.\n";
636 $enc_error = '';
637 # But that doesn't necessarily mean that the earlier one went okay
638 } else {
639 $enc_error = "Encoding is already set to " . $self->{'encoding'};
640 DEBUG > 1 and print STDERR $enc_error;
641 }
642 } elsif (
643 # OK, let's turn on the encoding
644 do {
645 DEBUG > 1 and print STDERR " Setting encoding to $e\n";
646 $self->{'encoding'} = $e;
647 1;
648 }
649 and $e eq 'HACKRAW'
650 ) {
651 DEBUG and print STDERR " Putting in HACKRAW (no-op) encoding mode.\n";
652
653 } elsif( Pod::Simple::Transcode::->encoding_is_available($e) ) {
654
655 die($enc_error = "WHAT? _transcoder is already set?!")
656 if $self->{'_transcoder'}; # should never happen
657 require Pod::Simple::Transcode;
658 $self->{'_transcoder'} = Pod::Simple::Transcode::->make_transcoder($e);
659 eval {
660 my @x = ('', "abc", "123");
661 $self->{'_transcoder'}->(@x);
662 };
663 $@ && die( $enc_error =
664 "Really unexpected error setting up encoding $e: $@\nAborting"
665 );
666 $self->{'detected_encoding'} = $e;
667
668 } else {
669 my @supported = Pod::Simple::Transcode::->all_encodings;
670
671 # Note unsupported, and complain
672 DEBUG and print STDERR " Encoding [$e] is unsupported.",
673 "\nSupporteds: @supported\n";
674 my $suggestion = '';
675
676 # Look for a near match:
677 my $norm = lc($e);
678 $norm =~ tr[-_][]d;
679 my $n;
680 foreach my $enc (@supported) {
681 $n = lc($enc);
682 $n =~ tr[-_][]d;
683 next unless $n eq $norm;
684 $suggestion = " (Maybe \"$e\" should be \"$enc\"?)";
685 last;
686 }
687 my $encmodver = Pod::Simple::Transcode::->encmodver;
688 $enc_error = join '' =>
689 "This document probably does not appear as it should, because its ",
690 "\"=encoding $e\" line calls for an unsupported encoding.",
691 $suggestion, " [$encmodver\'s supported encodings are: @supported]"
692 ;
693
694 $self->scream( $self->{'line_count'}, $enc_error );
695 }
696 push @{ $self->{'encoding_command_statuses'} }, $enc_error;
697 if (defined($self->{'_processed_encoding'})) {
698 # Double declaration.
699 $self->scream( $self->{'line_count'}, 'Cannot have multiple =encoding directives');
700 }
701 $self->{'_processed_encoding'} = $orig;
702
703 return $line;
704}
705
706# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
707
708sub _handle_encoding_second_level {
709 # By time this is called, the encoding (if well formed) will already
710 # have been acted on.
711 my($self, $para) = @_;
712 my @x = @$para;
713 my $content = join ' ', splice @x, 2;
714 $content =~ s/^\s+//s;
715 $content =~ s/\s+$//s;
716
717 DEBUG > 2 and print STDERR "Ogling encoding directive: =encoding $content\n";
718
719 if (defined($self->{'_processed_encoding'})) {
720 #if($content ne $self->{'_processed_encoding'}) {
721 # Could it happen?
722 #}
723 delete $self->{'_processed_encoding'};
724 # It's already been handled. Check for errors.
725 if(! $self->{'encoding_command_statuses'} ) {
726 DEBUG > 2 and print STDERR " CRAZY ERROR: It wasn't really handled?!\n";
727 } elsif( $self->{'encoding_command_statuses'}[-1] ) {
728 $self->whine( $para->[1]{'start_line'},
729 sprintf "Couldn't do %s: %s",
730 $self->{'encoding_command_reqs' }[-1],
731 $self->{'encoding_command_statuses'}[-1],
732 );
733 } else {
734 DEBUG > 2 and print STDERR " (Yup, it was successfully handled already.)\n";
735 }
736
737 } else {
738 # Otherwise it's a syntax error
739 $self->whine( $para->[1]{'start_line'},
740 "Invalid =encoding syntax: $content"
741 );
742 }
743
744 return;
745}
746
747#~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`
748
749{
750my $m = -321; # magic line number
751
752sub _gen_errata {
753 my $self = $_[0];
754 # Return 0 or more fake-o paragraphs explaining the accumulated
755 # errors on this document.
756
757 return() unless $self->{'errata'} and keys %{$self->{'errata'}};
758
759 my @out;
760
761 foreach my $line (sort {$a <=> $b} keys %{$self->{'errata'}}) {
762 push @out,
763 ['=item', {'start_line' => $m}, "Around line $line:"],
764 map( ['~Para', {'start_line' => $m, '~cooked' => 1},
765 #['~Top', {'start_line' => $m},
766 $_
767 #]
768 ],
769 @{$self->{'errata'}{$line}}
770 )
771 ;
772 }
773
774 # TODO: report of unknown entities? unrenderable characters?
775
776 unshift @out,
777 ['=head1', {'start_line' => $m, 'errata' => 1}, 'POD ERRORS'],
778 ['~Para', {'start_line' => $m, '~cooked' => 1, 'errata' => 1},
779 "Hey! ",
780 ['B', {},
781 'The above document had some coding errors, which are explained below:'
782 ]
783 ],
784 ['=over', {'start_line' => $m, 'errata' => 1}, ''],
785 ;
786
787 push @out,
788 ['=back', {'start_line' => $m, 'errata' => 1}, ''],
789 ;
790
791 DEBUG and print STDERR "\n<<\n", pretty(\@out), "\n>>\n\n";
792
793 return @out;
794}
795
796}
797
798#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
799
800##############################################################################
801##
802## stop reading now stop reading now stop reading now stop reading now stop
803##
804## HERE IT BECOMES REALLY SCARY
805##
806## stop reading now stop reading now stop reading now stop reading now stop
807##
808##############################################################################
809
810sub _ponder_paragraph_buffer {
811
812 # Para-token types as found in the buffer.
813 # ~Verbatim, ~Para, ~end, =head1..4, =for, =begin, =end,
814 # =over, =back, =item
815 # and the null =pod (to be complained about if over one line)
816 #
817 # "~data" paragraphs are something we generate at this level, depending on
818 # a currently open =over region
819
820 # Events fired: Begin and end for:
821 # directivename (like head1 .. head4), item, extend,
822 # for (from =begin...=end, =for),
823 # over-bullet, over-number, over-text, over-block,
824 # item-bullet, item-number, item-text,
825 # Document,
826 # Data, Para, Verbatim
827 # B, C, longdirname (TODO -- wha?), etc. for all directives
828 #
829
830 my $self = $_[0];
831 my $paras;
832 return unless @{$paras = $self->{'paras'}};
833 my $curr_open = ($self->{'curr_open'} ||= []);
834
835 my $scratch;
836
837 DEBUG > 10 and print STDERR "# Paragraph buffer: <<", pretty($paras), ">>\n";
838
839 # We have something in our buffer. So apparently the document has started.
840 unless($self->{'doc_has_started'}) {
841 $self->{'doc_has_started'} = 1;
842
843 my $starting_contentless;
844 $starting_contentless =
845 (
846 !@$curr_open
847 and @$paras and ! grep $_->[0] ne '~end', @$paras
848 # i.e., if the paras is all ~ends
849 )
850 ;
851 DEBUG and print STDERR "# Starting ",
852 $starting_contentless ? 'contentless' : 'contentful',
853 " document\n"
854 ;
855
856 $self->_handle_element_start(
857 ($scratch = 'Document'),
858 {
859 'start_line' => $paras->[0][1]{'start_line'},
860 $starting_contentless ? ( 'contentless' => 1 ) : (),
861 },
862 );
863 }
864
865 my($para, $para_type);
866 while(@$paras) {
867
868 # If a directive, assume it's legal; subtract below if found not to be
869 $seen_legal_directive++ if $paras->[0][0] =~ /^=/;
870
871 last if @$paras == 1
872 and ( $paras->[0][0] eq '=over'
873 or $paras->[0][0] eq '=item'
874 or ($paras->[0][0] eq '~Verbatim' and $self->{'in_pod'}));
875 # Those're the three kinds of paragraphs that require lookahead.
876 # Actually, an "=item Foo" inside an <over type=text> region
877 # and any =item inside an <over type=block> region (rare)
878 # don't require any lookahead, but all others (bullets
879 # and numbers) do.
880 # The verbatim is different from the other two, because those might be
881 # like:
882 #
883 # =item
884 # ...
885 # =cut
886 # ...
887 # =item
888 #
889 # The =cut here finishes the paragraph but doesn't terminate the =over
890 # they should be in. (khw apologizes that he didn't comment at the time
891 # why the 'in_pod' works, and no longer remembers why, and doesn't think
892 # it is currently worth the effort to re-figure it out.)
893
894# TODO: whinge about many kinds of directives in non-resolving =for regions?
895# TODO: many? like what? =head1 etc?
896
897 $para = shift @$paras;
898 $para_type = $para->[0];
899
900 DEBUG > 1 and print STDERR "Pondering a $para_type paragraph, given the stack: (",
901 $self->_dump_curr_open(), ")\n";
902
903 if($para_type eq '=for') {
904 next if $self->_ponder_for($para,$curr_open,$paras);
905
906 } elsif($para_type eq '=begin') {
907 next if $self->_ponder_begin($para,$curr_open,$paras);
908
909 } elsif($para_type eq '=end') {
910 next if $self->_ponder_end($para,$curr_open,$paras);
911
912 } elsif($para_type eq '~end') { # The virtual end-document signal
913 next if $self->_ponder_doc_end($para,$curr_open,$paras);
914 }
915
916
917 # ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
918 #~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
919 if(grep $_->[1]{'~ignore'}, @$curr_open) {
920 DEBUG > 1 and
921 print STDERR "Skipping $para_type paragraph because in ignore mode.\n";
922 next;
923 }
924 #~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
925 # ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
926
927 if($para_type eq '=pod') {
928 $self->_ponder_pod($para,$curr_open,$paras);
929
930 } elsif($para_type eq '=over') {
931 next if $self->_ponder_over($para,$curr_open,$paras);
932
933 } elsif($para_type eq '=back') {
934 next if $self->_ponder_back($para,$curr_open,$paras);
935
936 } else {
937
938 # All non-magical codes!!!
939
940 # Here we start using $para_type for our own twisted purposes, to
941 # mean how it should get treated, not as what the element name
942 # should be.
943
944 DEBUG > 1 and print STDERR "Pondering non-magical $para_type\n";
945
946 my $i;
947
948 # Enforce some =headN discipline
949 if($para_type =~ m/^=head\d$/s
950 and ! $self->{'accept_heads_anywhere'}
951 and @$curr_open
952 and $curr_open->[-1][0] eq '=over'
953 ) {
954 DEBUG > 2 and print STDERR "'=$para_type' inside an '=over'!\n";
955 $self->whine(
956 $para->[1]{'start_line'},
957 "You forgot a '=back' before '$para_type'"
958 );
959 unshift @$paras, ['=back', {}, ''], $para; # close the =over
960 next;
961 }
962
963
964 if($para_type eq '=item') {
965
966 my $over;
967 unless(@$curr_open and
968 $over = (grep { $_->[0] eq '=over' } @$curr_open)[-1]) {
969 $self->whine(
970 $para->[1]{'start_line'},
971 "'=item' outside of any '=over'"
972 );
973 unshift @$paras,
974 ['=over', {'start_line' => $para->[1]{'start_line'}}, ''],
975 $para
976 ;
977 next;
978 }
979
980
981 my $over_type = $over->[1]{'~type'};
982
983 if(!$over_type) {
984 # Shouldn't happen1
985 die "Typeless over in stack, starting at line "
986 . $over->[1]{'start_line'};
987
988 } elsif($over_type eq 'block') {
989 unless($curr_open->[-1][1]{'~bitched_about'}) {
990 $curr_open->[-1][1]{'~bitched_about'} = 1;
991 $self->whine(
992 $curr_open->[-1][1]{'start_line'},
993 "You can't have =items (as at line "
994 . $para->[1]{'start_line'}
995 . ") unless the first thing after the =over is an =item"
996 );
997 }
998 # Just turn it into a paragraph and reconsider it
999 $para->[0] = '~Para';
1000 unshift @$paras, $para;
1001 next;
1002
1003 } elsif($over_type eq 'text') {
1004 my $item_type = $self->_get_item_type($para);
1005 # That kills the content of the item if it's a number or bullet.
1006 DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n";
1007
1008 if($item_type eq 'text') {
1009 # Nothing special needs doing for 'text'
1010 } elsif($item_type eq 'number' or $item_type eq 'bullet') {
1011 $self->whine(
1012 $para->[1]{'start_line'},
1013 "Expected text after =item, not a $item_type"
1014 );
1015 # Undo our clobbering:
1016 push @$para, $para->[1]{'~orig_content'};
1017 delete $para->[1]{'number'};
1018 # Only a PROPER item-number element is allowed
1019 # to have a number attribute.
1020 } else {
1021 die "Unhandled item type $item_type"; # should never happen
1022 }
1023
1024 # =item-text thingies don't need any assimilation, it seems.
1025
1026 } elsif($over_type eq 'number') {
1027 my $item_type = $self->_get_item_type($para);
1028 # That kills the content of the item if it's a number or bullet.
1029 DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n";
1030
1031 my $expected_value = ++ $curr_open->[-1][1]{'~counter'};
1032
1033 if($item_type eq 'bullet') {
1034 # Hm, it's not numeric. Correct for this.
1035 $para->[1]{'number'} = $expected_value;
1036 $self->whine(
1037 $para->[1]{'start_line'},
1038 "Expected '=item $expected_value'"
1039 );
1040 push @$para, $para->[1]{'~orig_content'};
1041 # restore the bullet, blocking the assimilation of next para
1042
1043 } elsif($item_type eq 'text') {
1044 # Hm, it's not numeric. Correct for this.
1045 $para->[1]{'number'} = $expected_value;
1046 $self->whine(
1047 $para->[1]{'start_line'},
1048 "Expected '=item $expected_value'"
1049 );
1050 # Text content will still be there and will block next ~Para
1051
1052 } elsif($item_type ne 'number') {
1053 die "Unknown item type $item_type"; # should never happen
1054
1055 } elsif($expected_value == $para->[1]{'number'}) {
1056 DEBUG > 1 and print STDERR " Numeric item has the expected value of $expected_value\n";
1057
1058 } else {
1059 DEBUG > 1 and print STDERR " Numeric item has ", $para->[1]{'number'},
1060 " instead of the expected value of $expected_value\n";
1061 $self->whine(
1062 $para->[1]{'start_line'},
1063 "You have '=item " . $para->[1]{'number'} .
1064 "' instead of the expected '=item $expected_value'"
1065 );
1066 $para->[1]{'number'} = $expected_value; # correcting!!
1067 }
1068
1069 if(@$para == 2) {
1070 # For the cases where we /didn't/ push to @$para
1071 if($paras->[0][0] eq '~Para') {
1072 DEBUG and print STDERR "Assimilating following ~Para content into $over_type item\n";
1073 push @$para, splice @{shift @$paras},2;
1074 } else {
1075 DEBUG and print STDERR "Can't assimilate following ", $paras->[0][0], "\n";
1076 push @$para, ''; # Just so it's not contentless
1077 }
1078 }
1079
1080
1081 } elsif($over_type eq 'bullet') {
1082 my $item_type = $self->_get_item_type($para);
1083 # That kills the content of the item if it's a number or bullet.
1084 DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n";
1085
1086 if($item_type eq 'bullet') {
1087 # as expected!
1088
1089 if( $para->[1]{'~_freaky_para_hack'} ) {
1090 DEBUG and print STDERR "Accomodating '=item * Foo' tolerance hack.\n";
1091 push @$para, $para->[1]{'~_freaky_para_hack'};
1092 }
1093
1094 } elsif($item_type eq 'number') {
1095 $self->whine(
1096 $para->[1]{'start_line'},
1097 "Expected '=item *'"
1098 );
1099 push @$para, $para->[1]{'~orig_content'};
1100 # and block assimilation of the next paragraph
1101 delete $para->[1]{'number'};
1102 # Only a PROPER item-number element is allowed
1103 # to have a number attribute.
1104 } elsif($item_type eq 'text') {
1105 $self->whine(
1106 $para->[1]{'start_line'},
1107 "Expected '=item *'"
1108 );
1109 # But doesn't need processing. But it'll block assimilation
1110 # of the next para.
1111 } else {
1112 die "Unhandled item type $item_type"; # should never happen
1113 }
1114
1115 if(@$para == 2) {
1116 # For the cases where we /didn't/ push to @$para
1117 if($paras->[0][0] eq '~Para') {
1118 DEBUG and print STDERR "Assimilating following ~Para content into $over_type item\n";
1119 push @$para, splice @{shift @$paras},2;
1120 } else {
1121 DEBUG and print STDERR "Can't assimilate following ", $paras->[0][0], "\n";
1122 push @$para, ''; # Just so it's not contentless
1123 }
1124 }
1125
1126 } else {
1127 die "Unhandled =over type \"$over_type\"?";
1128 # Shouldn't happen!
1129 }
1130
1131 $para_type = 'Plain';
1132 $para->[0] .= '-' . $over_type;
1133 # Whew. Now fall thru and process it.
1134
1135
1136 } elsif($para_type eq '=extend') {
1137 # Well, might as well implement it here.
1138 $self->_ponder_extend($para);
1139 next; # and skip
1140 } elsif($para_type eq '=encoding') {
1141 # Not actually acted on here, but we catch errors here.
1142 $self->_handle_encoding_second_level($para);
1143 next unless $self->keep_encoding_directive;
1144 $para_type = 'Plain';
1145 } elsif($para_type eq '~Verbatim') {
1146 $para->[0] = 'Verbatim';
1147 $para_type = '?Verbatim';
1148 } elsif($para_type eq '~Para') {
1149 $para->[0] = 'Para';
1150 $para_type = '?Plain';
1151 } elsif($para_type eq 'Data') {
1152 $para->[0] = 'Data';
1153 $para_type = '?Data';
1154 } elsif( $para_type =~ s/^=//s
1155 and defined( $para_type = $self->{'accept_directives'}{$para_type} )
1156 ) {
1157 DEBUG > 1 and print STDERR " Pondering known directive ${$para}[0] as $para_type\n";
1158 } else {
1159 # An unknown directive!
1160 $seen_legal_directive--;
1161 DEBUG > 1 and printf STDERR "Unhandled directive %s (Handled: %s)\n",
1162 $para->[0], join(' ', sort keys %{$self->{'accept_directives'}} )
1163 ;
1164 $self->whine(
1165 $para->[1]{'start_line'},
1166 "Unknown directive: $para->[0]"
1167 );
1168
1169 # And maybe treat it as text instead of just letting it go?
1170 next;
1171 }
1172
1173 if($para_type =~ s/^\?//s) {
1174 if(! @$curr_open) { # usual case
1175 DEBUG and print STDERR "Treating $para_type paragraph as such because stack is empty.\n";
1176 } else {
1177 my @fors = grep $_->[0] eq '=for', @$curr_open;
1178 DEBUG > 1 and print STDERR "Containing fors: ",
1179 join(',', map $_->[1]{'target'}, @fors), "\n";
1180
1181 if(! @fors) {
1182 DEBUG and print STDERR "Treating $para_type paragraph as such because stack has no =for's\n";
1183
1184 #} elsif(grep $_->[1]{'~resolve'}, @fors) {
1185 #} elsif(not grep !$_->[1]{'~resolve'}, @fors) {
1186 } elsif( $fors[-1][1]{'~resolve'} ) {
1187 # Look to the immediately containing for
1188
1189 if($para_type eq 'Data') {
1190 DEBUG and print STDERR "Treating Data paragraph as Plain/Verbatim because the containing =for ($fors[-1][1]{'target'}) is a resolver\n";
1191 $para->[0] = 'Para';
1192 $para_type = 'Plain';
1193 } else {
1194 DEBUG and print STDERR "Treating $para_type paragraph as such because the containing =for ($fors[-1][1]{'target'}) is a resolver\n";
1195 }
1196 } else {
1197 DEBUG and print STDERR "Treating $para_type paragraph as Data because the containing =for ($fors[-1][1]{'target'}) is a non-resolver\n";
1198 $para->[0] = $para_type = 'Data';
1199 }
1200 }
1201 }
1202
1203 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1204 if($para_type eq 'Plain') {
1205 $self->_ponder_Plain($para);
1206 } elsif($para_type eq 'Verbatim') {
1207 $self->_ponder_Verbatim($para);
1208 } elsif($para_type eq 'Data') {
1209 $self->_ponder_Data($para);
1210 } else {
1211 die "\$para type is $para_type -- how did that happen?";
1212 # Shouldn't happen.
1213 }
1214
1215 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1216 $para->[0] =~ s/^[~=]//s;
1217
1218 DEBUG and print STDERR "\n", pretty($para), "\n";
1219
1220 # traverse the treelet (which might well be just one string scalar)
1221 $self->{'content_seen'} ||= 1 if $seen_legal_directive
1222 && ! $self->{'~tried_gen_errata'};
1223 $self->_traverse_treelet_bit(@$para);
1224 }
1225 }
1226
1227 return;
1228}
1229
1230###########################################################################
1231# The sub-ponderers...
1232
- -
1235sub _ponder_for {
1236 my ($self,$para,$curr_open,$paras) = @_;
1237
1238 # Fake it out as a begin/end
1239 my $target;
1240
1241 if(grep $_->[1]{'~ignore'}, @$curr_open) {
1242 DEBUG > 1 and print STDERR "Ignoring ignorable =for\n";
1243 return 1;
1244 }
1245
1246 for(my $i = 2; $i < @$para; ++$i) {
1247 if($para->[$i] =~ s/^\s*(\S+)\s*//s) {
1248 $target = $1;
1249 last;
1250 }
1251 }
1252 unless(defined $target) {
1253 $self->whine(
1254 $para->[1]{'start_line'},
1255 "=for without a target?"
1256 );
1257 return 1;
1258 }
1259 DEBUG > 1 and
1260 print STDERR "Faking out a =for $target as a =begin $target / =end $target\n";
1261
1262 $para->[0] = 'Data';
1263
1264 unshift @$paras,
1265 ['=begin',
1266 {'start_line' => $para->[1]{'start_line'}, '~really' => '=for'},
1267 $target,
1268 ],
1269 $para,
1270 ['=end',
1271 {'start_line' => $para->[1]{'start_line'}, '~really' => '=for'},
1272 $target,
1273 ],
1274 ;
1275
1276 return 1;
1277}
1278
1279sub _ponder_begin {
1280 my ($self,$para,$curr_open,$paras) = @_;
1281 my $content = join ' ', splice @$para, 2;
1282 $content =~ s/^\s+//s;
1283 $content =~ s/\s+$//s;
1284 unless(length($content)) {
1285 $self->whine(
1286 $para->[1]{'start_line'},
1287 "=begin without a target?"
1288 );
1289 DEBUG and print STDERR "Ignoring targetless =begin\n";
1290 return 1;
1291 }
1292
1293 my ($target, $title) = $content =~ m/^(\S+)\s*(.*)$/;
1294 $para->[1]{'title'} = $title if ($title);
1295 $para->[1]{'target'} = $target; # without any ':'
1296 $content = $target; # strip off the title
1297
1298 $content =~ s/^:!/!:/s;
1299 my $neg; # whether this is a negation-match
1300 $neg = 1 if $content =~ s/^!//s;
1301 my $to_resolve; # whether to process formatting codes
1302 $to_resolve = 1 if $content =~ s/^://s;
1303
1304 my $dont_ignore; # whether this target matches us
1305
1306 foreach my $target_name (
1307 split(',', $content, -1),
1308 $neg ? () : '*'
1309 ) {
1310 DEBUG > 2 and
1311 print STDERR " Considering whether =begin $content matches $target_name\n";
1312 next unless $self->{'accept_targets'}{$target_name};
1313
1314 DEBUG > 2 and
1315 print STDERR " It DOES match the acceptable target $target_name!\n";
1316 $to_resolve = 1
1317 if $self->{'accept_targets'}{$target_name} eq 'force_resolve';
1318 $dont_ignore = 1;
1319 $para->[1]{'target_matching'} = $target_name;
1320 last; # stop looking at other target names
1321 }
1322
1323 if($neg) {
1324 if( $dont_ignore ) {
1325 $dont_ignore = '';
1326 delete $para->[1]{'target_matching'};
1327 DEBUG > 2 and print STDERR " But the leading ! means that this is a NON-match!\n";
1328 } else {
1329 $dont_ignore = 1;
1330 $para->[1]{'target_matching'} = '!';
1331 DEBUG > 2 and print STDERR " But the leading ! means that this IS a match!\n";
1332 }
1333 }
1334
1335 $para->[0] = '=for'; # Just what we happen to call these, internally
1336 $para->[1]{'~really'} ||= '=begin';
1337 $para->[1]{'~ignore'} = (! $dont_ignore) || 0;
1338 $para->[1]{'~resolve'} = $to_resolve || 0;
1339
1340 DEBUG > 1 and print STDERR " Making note to ", $dont_ignore ? 'not ' : '',
1341 "ignore contents of this region\n";
1342 DEBUG > 1 and $dont_ignore and print STDERR " Making note to treat contents as ",
1343 ($to_resolve ? 'verbatim/plain' : 'data'), " paragraphs\n";
1344 DEBUG > 1 and print STDERR " (Stack now: ", $self->_dump_curr_open(), ")\n";
1345
1346 push @$curr_open, $para;
1347 if(!$dont_ignore or scalar grep $_->[1]{'~ignore'}, @$curr_open) {
1348 DEBUG > 1 and print STDERR "Ignoring ignorable =begin\n";
1349 } else {
1350 $self->{'content_seen'} ||= 1 unless $self->{'~tried_gen_errata'};
1351 $self->_handle_element_start((my $scratch='for'), $para->[1]);
1352 }
1353
1354 return 1;
1355}
1356
1357sub _ponder_end {
1358 my ($self,$para,$curr_open,$paras) = @_;
1359 my $content = join ' ', splice @$para, 2;
1360 $content =~ s/^\s+//s;
1361 $content =~ s/\s+$//s;
1362 DEBUG and print STDERR "Ogling '=end $content' directive\n";
1363
1364 unless(length($content)) {
1365 $self->whine(
1366 $para->[1]{'start_line'},
1367 "'=end' without a target?" . (
1368 ( @$curr_open and $curr_open->[-1][0] eq '=for' )
1369 ? ( " (Should be \"=end " . $curr_open->[-1][1]{'target'} . '")' )
1370 : ''
1371 )
1372 );
1373 DEBUG and print STDERR "Ignoring targetless =end\n";
1374 return 1;
1375 }
1376
1377 unless($content =~ m/^\S+$/) { # i.e., unless it's one word
1378 $self->whine(
1379 $para->[1]{'start_line'},
1380 "'=end $content' is invalid. (Stack: "
1381 . $self->_dump_curr_open() . ')'
1382 );
1383 DEBUG and print STDERR "Ignoring mistargetted =end $content\n";
1384 return 1;
1385 }
1386
1387 unless(@$curr_open and $curr_open->[-1][0] eq '=for') {
1388 $self->whine(
1389 $para->[1]{'start_line'},
1390 "=end $content without matching =begin. (Stack: "
1391 . $self->_dump_curr_open() . ')'
1392 );
1393 DEBUG and print STDERR "Ignoring mistargetted =end $content\n";
1394 return 1;
1395 }
1396
1397 unless($content eq $curr_open->[-1][1]{'target'}) {
1398 $self->whine(
1399 $para->[1]{'start_line'},
1400 "=end $content doesn't match =begin "
1401 . $curr_open->[-1][1]{'target'}
1402 . ". (Stack: "
1403 . $self->_dump_curr_open() . ')'
1404 );
1405 DEBUG and print STDERR "Ignoring mistargetted =end $content at line $para->[1]{'start_line'}\n";
1406 return 1;
1407 }
1408
1409 # Else it's okay to close...
1410 if(grep $_->[1]{'~ignore'}, @$curr_open) {
1411 DEBUG > 1 and print STDERR "Not firing any event for this =end $content because in an ignored region\n";
1412 # And that may be because of this to-be-closed =for region, or some
1413 # other one, but it doesn't matter.
1414 } else {
1415 $curr_open->[-1][1]{'start_line'} = $para->[1]{'start_line'};
1416 # what's that for?
1417
1418 $self->{'content_seen'} ||= 1 unless $self->{'~tried_gen_errata'};
1419 $self->_handle_element_end( my $scratch = 'for', $para->[1]);
1420 }
1421 DEBUG > 1 and print STDERR "Popping $curr_open->[-1][0] $curr_open->[-1][1]{'target'} because of =end $content\n";
1422 pop @$curr_open;
1423
1424 return 1;
1425}
1426
1427sub _ponder_doc_end {
1428 my ($self,$para,$curr_open,$paras) = @_;
1429 if(@$curr_open) { # Deal with things left open
1430 DEBUG and print STDERR "Stack is nonempty at end-document: (",
1431 $self->_dump_curr_open(), ")\n";
1432
1433 DEBUG > 9 and print STDERR "Stack: ", pretty($curr_open), "\n";
1434 unshift @$paras, $self->_closers_for_all_curr_open;
1435 # Make sure there is exactly one ~end in the parastack, at the end:
1436 @$paras = grep $_->[0] ne '~end', @$paras;
1437 push @$paras, $para, $para;
1438 # We need two -- once for the next cycle where we
1439 # generate errata, and then another to be at the end
1440 # when that loop back around to process the errata.
1441 return 1;
1442
1443 } else {
1444 DEBUG and print STDERR "Okay, stack is empty now.\n";
1445 }
1446
1447 # Try generating errata section, if applicable
1448 unless($self->{'~tried_gen_errata'}) {
1449 $self->{'~tried_gen_errata'} = 1;
1450 my @extras = $self->_gen_errata();
1451 if(@extras) {
1452 unshift @$paras, @extras;
1453 DEBUG and print STDERR "Generated errata... relooping...\n";
1454 return 1; # I.e., loop around again to process these fake-o paragraphs
1455 }
1456 }
1457
1458 splice @$paras; # Well, that's that for this paragraph buffer.
1459 DEBUG and print STDERR "Throwing end-document event.\n";
1460
1461 $self->_handle_element_end( my $scratch = 'Document' );
1462 return 1; # Hasta la byebye
1463}
1464
1465sub _ponder_pod {
1466 my ($self,$para,$curr_open,$paras) = @_;
1467 $self->whine(
1468 $para->[1]{'start_line'},
1469 "=pod directives shouldn't be over one line long! Ignoring all "
1470 . (@$para - 2) . " lines of content"
1471 ) if @$para > 3;
1472
1473 # Content ignored unless 'pod_handler' is set
1474 if (my $pod_handler = $self->{'pod_handler'}) {
1475 my ($line_num, $line) = map $_, $para->[1]{'start_line'}, $para->[2];
1476 $line = $line eq '' ? "=pod" : "=pod $line"; # imitate cut_handler output
1477 $pod_handler->($line, $line_num, $self);
1478 }
1479
1480 # The surrounding methods set content_seen, so let us remain consistent.
1481 # I do not know why it was not here before -- should it not be here?
1482 # $self->{'content_seen'} ||= 1 unless $self->{'~tried_gen_errata'};
1483
1484 return;
1485}
1486
1487sub _ponder_over {
1488 my ($self,$para,$curr_open,$paras) = @_;
1489 return 1 unless @$paras;
1490 my $list_type;
1491
1492 if($paras->[0][0] eq '=item') { # most common case
1493 $list_type = $self->_get_initial_item_type($paras->[0]);
1494
1495 } elsif($paras->[0][0] eq '=back') {
1496 # Ignore empty lists by default
1497 if ($self->{'parse_empty_lists'}) {
1498 $list_type = 'empty';
1499 } else {
1500 shift @$paras;
1501 return 1;
1502 }
1503 } elsif($paras->[0][0] eq '~end') {
1504 $self->whine(
1505 $para->[1]{'start_line'},
1506 "=over is the last thing in the document?!"
1507 );
1508 return 1; # But feh, ignore it.
1509 } else {
1510 $list_type = 'block';
1511 }
1512 $para->[1]{'~type'} = $list_type;
1513 push @$curr_open, $para;
1514 # yes, we reuse the paragraph as a stack item
1515
1516 my $content = join ' ', splice @$para, 2;
1517 $para->[1]{'~orig_content'} = $content;
1518 my $overness;
1519 if($content =~ m/^\s*$/s) {
1520 $para->[1]{'indent'} = 4;
1521 } elsif($content =~ m/^\s*((?:\d*\.)?\d+)\s*$/s) {
1522 no integer;
1523 $para->[1]{'indent'} = $1;
1524 if($1 == 0) {
1525 $self->whine(
1526 $para->[1]{'start_line'},
1527 "Can't have a 0 in =over $content"
1528 );
1529 $para->[1]{'indent'} = 4;
1530 }
1531 } else {
1532 $self->whine(
1533 $para->[1]{'start_line'},
1534 "=over should be: '=over' or '=over positive_number'"
1535 );
1536 $para->[1]{'indent'} = 4;
1537 }
1538 DEBUG > 1 and print STDERR "=over found of type $list_type\n";
1539
1540 $self->{'content_seen'} ||= 1 unless $self->{'~tried_gen_errata'};
1541 $self->_handle_element_start((my $scratch = 'over-' . $list_type), $para->[1]);
1542
1543 return;
1544}
1545
1546sub _ponder_back {
1547 my ($self,$para,$curr_open,$paras) = @_;
1548 # TODO: fire off </item-number> or </item-bullet> or </item-text> ??
1549
1550 my $content = join ' ', splice @$para, 2;
1551 if($content =~ m/\S/) {
1552 $self->whine(
1553 $para->[1]{'start_line'},
1554 "=back doesn't take any parameters, but you said =back $content"
1555 );
1556 }
1557
1558 if(@$curr_open and $curr_open->[-1][0] eq '=over') {
1559 DEBUG > 1 and print STDERR "=back happily closes matching =over\n";
1560 # Expected case: we're closing the most recently opened thing
1561 #my $over = pop @$curr_open;
1562 $self->{'content_seen'} ||= 1 unless $self->{'~tried_gen_errata'};
1563 $self->_handle_element_end( my $scratch =
1564 'over-' . ( (pop @$curr_open)->[1]{'~type'} ), $para->[1]
1565 );
1566 } else {
1567 DEBUG > 1 and print STDERR "=back found without a matching =over. Stack: (",
1568 join(', ', map $_->[0], @$curr_open), ").\n";
1569 $self->whine(
1570 $para->[1]{'start_line'},
1571 '=back without =over'
1572 );
1573 return 1; # and ignore it
1574 }
1575}
1576
1577sub _ponder_item {
1578 my ($self,$para,$curr_open,$paras) = @_;
1579 my $over;
1580 unless(@$curr_open and
1581 $over = (grep { $_->[0] eq '=over' } @$curr_open)[-1]) {
1582 $self->whine(
1583 $para->[1]{'start_line'},
1584 "'=item' outside of any '=over'"
1585 );
1586 unshift @$paras,
1587 ['=over', {'start_line' => $para->[1]{'start_line'}}, ''],
1588 $para
1589 ;
1590 return 1;
1591 }
1592
1593
1594 my $over_type = $over->[1]{'~type'};
1595
1596 if(!$over_type) {
1597 # Shouldn't happen1
1598 die "Typeless over in stack, starting at line "
1599 . $over->[1]{'start_line'};
1600
1601 } elsif($over_type eq 'block') {
1602 unless($curr_open->[-1][1]{'~bitched_about'}) {
1603 $curr_open->[-1][1]{'~bitched_about'} = 1;
1604 $self->whine(
1605 $curr_open->[-1][1]{'start_line'},
1606 "You can't have =items (as at line "
1607 . $para->[1]{'start_line'}
1608 . ") unless the first thing after the =over is an =item"
1609 );
1610 }
1611 # Just turn it into a paragraph and reconsider it
1612 $para->[0] = '~Para';
1613 unshift @$paras, $para;
1614 return 1;
1615
1616 } elsif($over_type eq 'text') {
1617 my $item_type = $self->_get_item_type($para);
1618 # That kills the content of the item if it's a number or bullet.
1619 DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n";
1620
1621 if($item_type eq 'text') {
1622 # Nothing special needs doing for 'text'
1623 } elsif($item_type eq 'number' or $item_type eq 'bullet') {
1624 $self->whine(
1625 $para->[1]{'start_line'},
1626 "Expected text after =item, not a $item_type"
1627 );
1628 # Undo our clobbering:
1629 push @$para, $para->[1]{'~orig_content'};
1630 delete $para->[1]{'number'};
1631 # Only a PROPER item-number element is allowed
1632 # to have a number attribute.
1633 } else {
1634 die "Unhandled item type $item_type"; # should never happen
1635 }
1636
1637 # =item-text thingies don't need any assimilation, it seems.
1638
1639 } elsif($over_type eq 'number') {
1640 my $item_type = $self->_get_item_type($para);
1641 # That kills the content of the item if it's a number or bullet.
1642 DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n";
1643
1644 my $expected_value = ++ $curr_open->[-1][1]{'~counter'};
1645
1646 if($item_type eq 'bullet') {
1647 # Hm, it's not numeric. Correct for this.
1648 $para->[1]{'number'} = $expected_value;
1649 $self->whine(
1650 $para->[1]{'start_line'},
1651 "Expected '=item $expected_value'"
1652 );
1653 push @$para, $para->[1]{'~orig_content'};
1654 # restore the bullet, blocking the assimilation of next para
1655
1656 } elsif($item_type eq 'text') {
1657 # Hm, it's not numeric. Correct for this.
1658 $para->[1]{'number'} = $expected_value;
1659 $self->whine(
1660 $para->[1]{'start_line'},
1661 "Expected '=item $expected_value'"
1662 );
1663 # Text content will still be there and will block next ~Para
1664
1665 } elsif($item_type ne 'number') {
1666 die "Unknown item type $item_type"; # should never happen
1667
1668 } elsif($expected_value == $para->[1]{'number'}) {
1669 DEBUG > 1 and print STDERR " Numeric item has the expected value of $expected_value\n";
1670
1671 } else {
1672 DEBUG > 1 and print STDERR " Numeric item has ", $para->[1]{'number'},
1673 " instead of the expected value of $expected_value\n";
1674 $self->whine(
1675 $para->[1]{'start_line'},
1676 "You have '=item " . $para->[1]{'number'} .
1677 "' instead of the expected '=item $expected_value'"
1678 );
1679 $para->[1]{'number'} = $expected_value; # correcting!!
1680 }
1681
1682 if(@$para == 2) {
1683 # For the cases where we /didn't/ push to @$para
1684 if($paras->[0][0] eq '~Para') {
1685 DEBUG and print STDERR "Assimilating following ~Para content into $over_type item\n";
1686 push @$para, splice @{shift @$paras},2;
1687 } else {
1688 DEBUG and print STDERR "Can't assimilate following ", $paras->[0][0], "\n";
1689 push @$para, ''; # Just so it's not contentless
1690 }
1691 }
1692
1693
1694 } elsif($over_type eq 'bullet') {
1695 my $item_type = $self->_get_item_type($para);
1696 # That kills the content of the item if it's a number or bullet.
1697 DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n";
1698
1699 if($item_type eq 'bullet') {
1700 # as expected!
1701
1702 if( $para->[1]{'~_freaky_para_hack'} ) {
1703 DEBUG and print STDERR "Accomodating '=item * Foo' tolerance hack.\n";
1704 push @$para, $para->[1]{'~_freaky_para_hack'};
1705 }
1706
1707 } elsif($item_type eq 'number') {
1708 $self->whine(
1709 $para->[1]{'start_line'},
1710 "Expected '=item *'"
1711 );
1712 push @$para, $para->[1]{'~orig_content'};
1713 # and block assimilation of the next paragraph
1714 delete $para->[1]{'number'};
1715 # Only a PROPER item-number element is allowed
1716 # to have a number attribute.
1717 } elsif($item_type eq 'text') {
1718 $self->whine(
1719 $para->[1]{'start_line'},
1720 "Expected '=item *'"
1721 );
1722 # But doesn't need processing. But it'll block assimilation
1723 # of the next para.
1724 } else {
1725 die "Unhandled item type $item_type"; # should never happen
1726 }
1727
1728 if(@$para == 2) {
1729 # For the cases where we /didn't/ push to @$para
1730 if($paras->[0][0] eq '~Para') {
1731 DEBUG and print STDERR "Assimilating following ~Para content into $over_type item\n";
1732 push @$para, splice @{shift @$paras},2;
1733 } else {
1734 DEBUG and print STDERR "Can't assimilate following ", $paras->[0][0], "\n";
1735 push @$para, ''; # Just so it's not contentless
1736 }
1737 }
1738
1739 } else {
1740 die "Unhandled =over type \"$over_type\"?";
1741 # Shouldn't happen!
1742 }
1743 $para->[0] .= '-' . $over_type;
1744
1745 return;
1746}
1747
1748sub _ponder_Plain {
1749 my ($self,$para) = @_;
1750 DEBUG and print STDERR " giving plain treatment...\n";
1751 unless( @$para == 2 or ( @$para == 3 and $para->[2] eq '' )
1752 or $para->[1]{'~cooked'}
1753 ) {
1754 push @$para,
1755 @{$self->_make_treelet(
1756 join("\n", splice(@$para, 2)),
1757 $para->[1]{'start_line'}
1758 )};
1759 }
1760 # Empty paragraphs don't need a treelet for any reason I can see.
1761 # And precooked paragraphs already have a treelet.
1762 return;
1763}
1764
1765sub _ponder_Verbatim {
1766 my ($self,$para) = @_;
1767 DEBUG and print STDERR " giving verbatim treatment...\n";
1768
1769 $para->[1]{'xml:space'} = 'preserve';
1770
1771 unless ($self->{'_output_is_for_JustPod'}) {
1772 # Fix illegal settings for expand_verbatim_tabs()
1773 # This is because this module doesn't do input error checking, but khw
1774 # doesn't want to add yet another instance of that.
1775 $self->expand_verbatim_tabs(8)
1776 if ! defined $self->expand_verbatim_tabs()
1777 || $self->expand_verbatim_tabs() =~ /\D/;
1778
1779 my $indent = $self->strip_verbatim_indent;
1780 if ($indent && ref $indent eq 'CODE') {
1781 my @shifted = (shift @{$para}, shift @{$para});
1782 $indent = $indent->($para);
1783 unshift @{$para}, @shifted;
1784 }
1785
1786 for(my $i = 2; $i < @$para; $i++) {
1787 foreach my $line ($para->[$i]) { # just for aliasing
1788 # Strip indentation.
1789 $line =~ s/^\Q$indent// if $indent;
1790 next unless $self->expand_verbatim_tabs;
1791
1792 # This is commented out because of github issue #85, and the
1793 # current maintainers don't know why it was there in the first
1794 # place.
1795 #&& !($self->{accept_codes} && $self->{accept_codes}{VerbatimFormatted});
1796 while( $line =~
1797 # Sort of adapted from Text::Tabs.
1798 s/^([^\t]*)(\t+)/$1.(" " x ((length($2)
1799 * $self->expand_verbatim_tabs)
1800 -(length($1)&7)))/e
1801 ) {}
1802
1803 # TODO: whinge about (or otherwise treat) unindented or overlong lines
1804
1805 }
1806 }
1807 }
1808
1809 # Now the VerbatimFormatted hoodoo...
1810 if( $self->{'accept_codes'} and
1811 $self->{'accept_codes'}{'VerbatimFormatted'}
1812 ) {
1813 while(@$para > 3 and $para->[-1] !~ m/\S/) { pop @$para }
1814 # Kill any number of terminal newlines
1815 $self->_verbatim_format($para);
1816 } elsif ($self->{'codes_in_verbatim'}) {
1817 push @$para,
1818 @{$self->_make_treelet(
1819 join("\n", splice(@$para, 2)),
1820 $para->[1]{'start_line'}, $para->[1]{'xml:space'}
1821 )};
1822 $para->[-1] =~ s/\n+$//s; # Kill any number of terminal newlines
1823 } else {
1824 push @$para, join "\n", splice(@$para, 2) if @$para > 3;
1825 $para->[-1] =~ s/\n+$//s; # Kill any number of terminal newlines
1826 }
1827 return;
1828}
1829
1830sub _ponder_Data {
1831 my ($self,$para) = @_;
1832 DEBUG and print STDERR " giving data treatment...\n";
1833 $para->[1]{'xml:space'} = 'preserve';
1834 push @$para, join "\n", splice(@$para, 2) if @$para > 3;
1835 return;
1836}
1837
- -
1841###########################################################################
1842
1843sub _traverse_treelet_bit { # for use only by the routine above
1844 my($self, $name) = splice @_,0,2;
1845
1846 my $scratch;
1847 $self->_handle_element_start(($scratch=$name), shift @_);
1848
1849 while (@_) {
1850 my $x = shift;
1851 if (ref($x)) {
1852 &_traverse_treelet_bit($self, @$x);
1853 } else {
1854 $x .= shift while @_ && !ref($_[0]);
1855 $self->_handle_text($x);
1856 }
1857 }
1858
1859 $self->_handle_element_end($scratch=$name);
1860 return;
1861}
1862
1863#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
1864
1865sub _closers_for_all_curr_open {
1866 my $self = $_[0];
1867 my @closers;
1868 foreach my $still_open (@{ $self->{'curr_open'} || return }) {
1869 my @copy = @$still_open;
1870 $copy[1] = {%{ $copy[1] }};
1871 #$copy[1]{'start_line'} = -1;
1872 if($copy[0] eq '=for') {
1873 $copy[0] = '=end';
1874 } elsif($copy[0] eq '=over') {
1875 $self->whine(
1876 $still_open->[1]{start_line} ,
1877 "=over without closing =back"
1878 );
1879
1880 $copy[0] = '=back';
1881 } else {
1882 die "I don't know how to auto-close an open $copy[0] region";
1883 }
1884
1885 unless( @copy > 2 ) {
1886 push @copy, $copy[1]{'target'};
1887 $copy[-1] = '' unless defined $copy[-1];
1888 # since =over's don't have targets
1889 }
1890
1891 $copy[1]{'fake-closer'} = 1;
1892
1893 DEBUG and print STDERR "Queuing up fake-o event: ", pretty(\@copy), "\n";
1894 unshift @closers, \@copy;
1895 }
1896 return @closers;
1897}
1898
1899#--------------------------------------------------------------------------
1900
1901sub _verbatim_format {
1902 my($it, $p) = @_;
1903
1904 my $formatting;
1905
1906 for(my $i = 2; $i < @$p; $i++) { # work backwards over the lines
1907 DEBUG and print STDERR "_verbatim_format appends a newline to $i: $p->[$i]\n";
1908 $p->[$i] .= "\n";
1909 # Unlike with simple Verbatim blocks, we don't end up just doing
1910 # a join("\n", ...) on the contents, so we have to append a
1911 # newline to every line, and then nix the last one later.
1912 }
1913
1914 if( DEBUG > 4 ) {
1915 print STDERR "<<\n";
1916 for(my $i = $#$p; $i >= 2; $i--) { # work backwards over the lines
1917 print STDERR "_verbatim_format $i: $p->[$i]";
1918 }
1919 print STDERR ">>\n";
1920 }
1921
1922 for(my $i = $#$p; $i > 2; $i--) {
1923 # work backwards over the lines, except the first (#2)
1924
1925 #next unless $p->[$i] =~ m{^#:([ \^\/\%]*)\n?$}s
1926 # and $p->[$i-1] !~ m{^#:[ \^\/\%]*\n?$}s;
1927 # look at a formatty line preceding a nonformatty one
1928 DEBUG > 5 and print STDERR "Scrutinizing line $i: $$p[$i]\n";
1929 if($p->[$i] =~ m{^#:([ \^\/\%]*)\n?$}s) {
1930 DEBUG > 5 and print STDERR " It's a formatty line. ",
1931 "Peeking at previous line ", $i-1, ": $$p[$i-1]: \n";
1932
1933 if( $p->[$i-1] =~ m{^#:[ \^\/\%]*\n?$}s ) {
1934 DEBUG > 5 and print STDERR " Previous line is formatty! Skipping this one.\n";
1935 next;
1936 } else {
1937 DEBUG > 5 and print STDERR " Previous line is non-formatty! Yay!\n";
1938 }
1939 } else {
1940 DEBUG > 5 and print STDERR " It's not a formatty line. Ignoring\n";
1941 next;
1942 }
1943
1944 # A formatty line has to have #: in the first two columns, and uses
1945 # "^" to mean bold, "/" to mean underline, and "%" to mean bold italic.
1946 # Example:
1947 # What do you want? i like pie. [or whatever]
1948 # #:^^^^^^^^^^^^^^^^^ /////////////
1949
1950
1951 DEBUG > 4 and print STDERR "_verbatim_format considers:\n<$p->[$i-1]>\n<$p->[$i]>\n";
1952
1953 $formatting = ' ' . $1;
1954 $formatting =~ s/\s+$//s; # nix trailing whitespace
1955 unless(length $formatting and $p->[$i-1] =~ m/\S/) { # no-op
1956 splice @$p,$i,1; # remove this line
1957 $i--; # don't consider next line
1958 next;
1959 }
1960
1961 if( length($formatting) >= length($p->[$i-1]) ) {
1962 $formatting = substr($formatting, 0, length($p->[$i-1]) - 1) . ' ';
1963 } else {
1964 $formatting .= ' ' x (length($p->[$i-1]) - length($formatting));
1965 }
1966 # Make $formatting and the previous line be exactly the same length,
1967 # with $formatting having a " " as the last character.
1968
1969 DEBUG > 4 and print STDERR "Formatting <$formatting> on <", $p->[$i-1], ">\n";
1970
1971
1972 my @new_line;
1973 while( $formatting =~ m{\G(( +)|(\^+)|(\/+)|(\%+))}g ) {
1974 #print STDERR "Format matches $1\n";
1975
1976 if($2) {
1977 #print STDERR "SKIPPING <$2>\n";
1978 push @new_line,
1979 substr($p->[$i-1], pos($formatting)-length($1), length($1));
1980 } else {
1981 #print STDERR "SNARING $+\n";
1982 push @new_line, [
1983 (
1984 $3 ? 'VerbatimB' :
1985 $4 ? 'VerbatimI' :
1986 $5 ? 'VerbatimBI' : die("Should never get called")
1987 ), {},
1988 substr($p->[$i-1], pos($formatting)-length($1), length($1))
1989 ];
1990 #print STDERR "Formatting <$new_line[-1][-1]> as $new_line[-1][0]\n";
1991 }
1992 }
1993 my @nixed =
1994 splice @$p, $i-1, 2, @new_line; # replace myself and the next line
1995 DEBUG > 10 and print STDERR "Nixed count: ", scalar(@nixed), "\n";
1996
1997 DEBUG > 6 and print STDERR "New version of the above line is these tokens (",
1998 scalar(@new_line), "):",
1999 map( ref($_)?"<@$_> ":"<$_>", @new_line ), "\n";
2000 $i--; # So the next line we scrutinize is the line before the one
2001 # that we just went and formatted
2002 }
2003
2004 $p->[0] = 'VerbatimFormatted';
2005
2006 # Collapse adjacent text nodes, just for kicks.
2007 for( my $i = 2; $i > $#$p; $i++ ) { # work forwards over the tokens except for the last
2008 if( !ref($p->[$i]) and !ref($p->[$i + 1]) ) {
2009 DEBUG > 5 and print STDERR "_verbatim_format merges {$p->[$i]} and {$p->[$i+1]}\n";
2010 $p->[$i] .= splice @$p, $i+1, 1; # merge
2011 --$i; # and back up
2012 }
2013 }
2014
2015 # Now look for the last text token, and remove the terminal newline
2016 for( my $i = $#$p; $i >= 2; $i-- ) {
2017 # work backwards over the tokens, even the first
2018 if( !ref($p->[$i]) ) {
2019 if($p->[$i] =~ s/\n$//s) {
2020 DEBUG > 5 and print STDERR "_verbatim_format killed the terminal newline on #$i: {$p->[$i]}, after {$p->[$i-1]}\n";
2021 } else {
2022 DEBUG > 5 and print STDERR
2023 "No terminal newline on #$i: {$p->[$i]}, after {$p->[$i-1]} !?\n";
2024 }
2025 last; # we only want the next one
2026 }
2027 }
2028
2029 return;
2030}
2031
2032
2033#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
2034
2035
2036sub _treelet_from_formatting_codes {
2037 # Given a paragraph, returns a treelet. Full of scary tokenizing code.
2038 # Like [ '~Top', {'start_line' => $start_line},
2039 # "I like ",
2040 # [ 'B', {}, "pie" ],
2041 # "!"
2042 # ]
2043 # This illustrates the general format of a treelet. It is an array:
2044 # [0] is a scalar indicating its type. In the example above, the
2045 # types are '~Top' and 'B'
2046 # [1] is a hash of various flags about it, possibly empty
2047 # [2] - [N] are an ordered list of the subcomponents of the treelet.
2048 # Scalars are literal text, refs are sub-treelets, to
2049 # arbitrary levels. Stringifying a treelet will recursively
2050 # stringify the sub-treelets, concatentating everything
2051 # together to form the exact text of the treelet.
2052
2053 my($self, $para, $start_line, $preserve_space) = @_;
2054
2055 my $treelet = ['~Top', {'start_line' => $start_line},];
2056
2057 unless ($preserve_space || $self->{'preserve_whitespace'}) {
2058 $para =~ s/\s+/ /g; # collapse and trim all whitespace first.
2059 $para =~ s/ $//;
2060 $para =~ s/^ //;
2061 }
2062
2063 # Only apparent problem the above code is that N<< >> turns into
2064 # N<< >>. But then, word wrapping does that too! So don't do that!
2065
2066
2067 # As a Start-code is encountered, the number of opening bracket '<'
2068 # characters minus 1 is pushed onto @stack (so 0 means a single bracket,
2069 # etc). When closing brackets are found in the text, at least this number
2070 # (plus the 1) will be required to mean the Start-code is terminated. When
2071 # those are found, @stack is popped.
2072 my @stack;
2073
2074 my @lineage = ($treelet);
2075 my $raw = ''; # raw content of L<> fcode before splitting/processing
2076 # XXX 'raw' is not 100% accurate: all surrounding whitespace is condensed
2077 # into just 1 ' '. Is this the regex's doing or 'raw's? Answer is it's
2078 # the 'collapse and trim all whitespace first' lines just above.
2079 my $inL = 0;
2080
2081 DEBUG > 4 and print STDERR "Paragraph:\n$para\n\n";
2082
2083 # Here begins our frightening tokenizer RE. The following regex matches
2084 # text in four main parts:
2085 #
2086 # * Start-codes. The first alternative matches C< or C<<, the latter
2087 # followed by some whitespace. $1 will hold the entire start code
2088 # (including any space following a multiple-angle-bracket delimiter),
2089 # and $2 will hold only the additional brackets past the first in a
2090 # multiple-bracket delimiter. length($2) + 1 will be the number of
2091 # closing brackets we have to find.
2092 #
2093 # * Closing brackets. Match some amount of whitespace followed by
2094 # multiple close brackets. The logic to see if this closes anything
2095 # is down below. Note that in order to parse C<< >> correctly, we
2096 # have to use look-behind (?<=\s\s), since the match of the starting
2097 # code will have consumed the whitespace.
2098 #
2099 # * A single closing bracket, to close a simple code like C<>.
2100 #
2101 # * Something that isn't a start or end code. We have to be careful
2102 # about accepting whitespace, since perlpodspec says that any whitespace
2103 # before a multiple-bracket closing delimiter should be ignored.
2104 #
2105 while($para =~
2106 m/\G
2107 (?:
2108 # Match starting codes, including the whitespace following a
2109 # multiple-delimiter start code. $1 gets the whole start code and
2110 # $2 gets all but one of the <s in the multiple-bracket case.
2111 ([A-Z]<(?:(<+)\s+)?)
2112 |
2113 # Match multiple-bracket end codes. $3 gets the whitespace that
2114 # should be discarded before an end bracket but kept in other cases
2115 # and $4 gets the end brackets themselves. ($3 can be empty if the
2116 # construct is empty, like C<< >>, and all the white-space has been
2117 # gobbled up already, considered to be space after the opening
2118 # bracket. In this case we use look-behind to verify that there are
2119 # at least 2 spaces in a row before the ">".)
2120 (\s+|(?<=\s\s))(>{2,})
2121 |
2122 (\s?>) # $5: simple end-codes
2123 |
2124 ( # $6: stuff containing no start-codes or end-codes
2125 (?:
2126 [^A-Z\s>]
2127 |
2128 (?:
2129 [A-Z](?!<)
2130 )
2131 |
2132 # whitespace is ok, but we don't want to eat the whitespace before
2133 # a multiple-bracket end code.
2134 # NOTE: we may still have problems with e.g. S<< >>
2135 (?:
2136 \s(?!\s*>{2,})
2137 )
2138 )+
2139 )
2140 )
2141 /xgo
2142 ) {
2143 DEBUG > 4 and print STDERR "\nParagraphic tokenstack = (@stack)\n";
2144 if(defined $1) {
2145 my $bracket_count; # How many '<<<' in a row this has. Needed for
2146 # Pod::Simple::JustPod
2147 if(defined $2) {
2148 DEBUG > 3 and print STDERR "Found complex start-text code \"$1\"\n";
2149 $bracket_count = length($2) + 1;
2150 push @stack, $bracket_count; # length of the necessary complex
2151 # end-code string
2152 } else {
2153 DEBUG > 3 and print STDERR "Found simple start-text code \"$1\"\n";
2154 push @stack, 0; # signal that we're looking for simple
2155 $bracket_count = 1;
2156 }
2157 my $code = substr($1,0,1);
2158 if ('L' eq $code) {
2159 if ($inL) {
2160 $raw .= $1;
2161 $self->scream( $start_line,
2162 'Nested L<> are illegal. Pretending inner one is '
2163 . 'X<...> so can continue looking for other errors.');
2164 $code = "X";
2165 }
2166 else {
2167 $raw = ""; # reset raw content accumulator
2168 $inL = @stack;
2169 }
2170 } else {
2171 $raw .= $1 if $inL;
2172 }
2173 push @lineage, [ $code, {}, ]; # new node object
2174
2175 # Tell Pod::Simple::JustPod how many brackets there were, but to save
2176 # space, not in the most usual case of there was just 1. It can be
2177 # inferred by the absence of this element. Similarly, if there is more
2178 # than one bracket, extract the white space between the final bracket
2179 # and the real beginning of the interior. Save that if it isn't just a
2180 # single space
2181 if ($self->{'_output_is_for_JustPod'} && $bracket_count > 1) {
2182 $lineage[-1][1]{'~bracket_count'} = $bracket_count;
2183 my $lspacer = substr($1, 1 + $bracket_count);
2184 $lineage[-1][1]{'~lspacer'} = $lspacer if $lspacer ne " ";
2185 }
2186 push @{ $lineage[-2] }, $lineage[-1];
2187 } elsif(defined $4) {
2188 DEBUG > 3 and print STDERR "Found apparent complex end-text code \"$3$4\"\n";
2189 # This is where it gets messy...
2190 if(! @stack) {
2191 # We saw " >>>>" but needed nothing. This is ALL just stuff then.
2192 DEBUG > 4 and print STDERR " But it's really just stuff.\n";
2193 push @{ $lineage[-1] }, $3, $4;
2194 next;
2195 } elsif(!$stack[-1]) {
2196 # We saw " >>>>" but needed only ">". Back pos up.
2197 DEBUG > 4 and print STDERR " And that's more than we needed to close simple.\n";
2198 push @{ $lineage[-1] }, $3; # That was a for-real space, too.
2199 pos($para) = pos($para) - length($4) + 1;
2200 } elsif($stack[-1] == length($4)) {
2201 # We found " >>>>", and it was exactly what we needed. Commonest case.
2202 DEBUG > 4 and print STDERR " And that's exactly what we needed to close complex.\n";
2203 } elsif($stack[-1] < length($4)) {
2204 # We saw " >>>>" but needed only " >>". Back pos up.
2205 DEBUG > 4 and print STDERR " And that's more than we needed to close complex.\n";
2206 pos($para) = pos($para) - length($4) + $stack[-1];
2207 } else {
2208 # We saw " >>>>" but needed " >>>>>>". So this is all just stuff!
2209 DEBUG > 4 and print STDERR " But it's really just stuff, because we needed more.\n";
2210 push @{ $lineage[-1] }, $3, $4;
2211 next;
2212 }
2213 #print STDERR "\nHOOBOY ", scalar(@{$lineage[-1]}), "!!!\n";
2214
2215 if ($3 ne " " && $self->{'_output_is_for_JustPod'}) {
2216 if ($3 ne "") {
2217 $lineage[-1][1]{'~rspacer'} = $3;
2218 }
2219 elsif ($lineage[-1][1]{'~lspacer'} eq " ") {
2220
2221 # Here we had something like C<< >> which was a false positive
2222 delete $lineage[-1][1]{'~lspacer'};
2223 }
2224 else {
2225 $lineage[-1][1]{'~rspacer'}
2226 = substr($lineage[-1][1]{'~lspacer'}, -1, 1);
2227 chop $lineage[-1][1]{'~lspacer'};
2228 }
2229 }
2230
2231 push @{ $lineage[-1] }, '' if 2 == @{ $lineage[-1] };
2232 # Keep the element from being childless
2233
2234 if ($inL == @stack) {
2235 $lineage[-1][1]{'raw'} = $raw;
2236 $inL = 0;
2237 }
2238
2239 pop @stack;
2240 pop @lineage;
2241
2242 $raw .= $3.$4 if $inL;
2243
2244 } elsif(defined $5) {
2245 DEBUG > 3 and print STDERR "Found apparent simple end-text code \"$5\"\n";
2246
2247 if(@stack and ! $stack[-1]) {
2248 # We're indeed expecting a simple end-code
2249 DEBUG > 4 and print STDERR " It's indeed an end-code.\n";
2250
2251 if(length($5) == 2) { # There was a space there: " >"
2252 push @{ $lineage[-1] }, ' ';
2253 } elsif( 2 == @{ $lineage[-1] } ) { # Closing a childless element
2254 push @{ $lineage[-1] }, ''; # keep it from being really childless
2255 }
2256
2257 if ($inL == @stack) {
2258 $lineage[-1][1]{'raw'} = $raw;
2259 $inL = 0;
2260 }
2261
2262 pop @stack;
2263 pop @lineage;
2264 } else {
2265 DEBUG > 4 and print STDERR " It's just stuff.\n";
2266 push @{ $lineage[-1] }, $5;
2267 }
2268
2269 $raw .= $5 if $inL;
2270
2271 } elsif(defined $6) {
2272 DEBUG > 3 and print STDERR "Found stuff \"$6\"\n";
2273 push @{ $lineage[-1] }, $6;
2274 $raw .= $6 if $inL;
2275 # XXX does not capture multiplace whitespaces -- 'raw' ends up with
2276 # at most 1 leading/trailing whitespace, why not all of it?
2277 # Answer, because we deliberately trimmed it above
2278
2279 } else {
2280 # should never ever ever ever happen
2281 DEBUG and print STDERR "AYYAYAAAAA at line ", __LINE__, "\n";
2282 die "SPORK 512512!";
2283 }
2284 }
2285
2286 if(@stack) { # Uhoh, some sequences weren't closed.
2287 my $x= "...";
2288 while(@stack) {
2289 push @{ $lineage[-1] }, '' if 2 == @{ $lineage[-1] };
2290 # Hmmmmm!
2291
2292 my $code = (pop @lineage)->[0];
2293 my $ender_length = pop @stack;
2294 if($ender_length) {
2295 --$ender_length;
2296 $x = $code . ("<" x $ender_length) . " $x " . (">" x $ender_length);
2297 } else {
2298 $x = $code . "<$x>";
2299 }
2300 }
2301 DEBUG > 1 and print STDERR "Unterminated $x sequence\n";
2302 $self->whine($start_line,
2303 "Unterminated $x sequence",
2304 );
2305 }
2306
2307 return $treelet;
2308}
2309
2310#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
2311
2312sub text_content_of_treelet { # method: $parser->text_content_of_treelet($lol)
2313 return stringify_lol($_[1]);
2314}
2315
2316sub stringify_lol { # function: stringify_lol($lol)
2317 my $string_form = '';
2318 _stringify_lol( $_[0] => \$string_form );
2319 return $string_form;
2320}
2321
2322sub _stringify_lol { # the real recursor
2323 my($lol, $to) = @_;
2324 for(my $i = 2; $i < @$lol; ++$i) {
2325 if( ref($lol->[$i] || '') and UNIVERSAL::isa($lol->[$i], 'ARRAY') ) {
2326 _stringify_lol( $lol->[$i], $to); # recurse!
2327 } else {
2328 $$to .= $lol->[$i];
2329 }
2330 }
2331 return;
2332}
2333
2334#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
2335
2336sub _dump_curr_open { # return a string representation of the stack
2337 my $curr_open = $_[0]{'curr_open'};
2338
2339 return '[empty]' unless @$curr_open;
2340 return join '; ',
2341 map {;
2342 ($_->[0] eq '=for')
2343 ? ( ($_->[1]{'~really'} || '=over')
2344 . ' ' . $_->[1]{'target'})
2345 : $_->[0]
2346 }
2347 @$curr_open
2348 ;
2349}
2350
2351###########################################################################
2352my %pretty_form = (
2353 "\a" => '\a', # ding!
2354 "\b" => '\b', # BS
2355 "\e" => '\e', # ESC
2356 "\f" => '\f', # FF
2357 "\t" => '\t', # tab
2358 "\cm" => '\cm',
2359 "\cj" => '\cj',
2360 "\n" => '\n', # probably overrides one of either \cm or \cj
2361 '"' => '\"',
2362 '\\' => '\\\\',
2363 '$' => '\\$',
2364 '@' => '\\@',
2365 '%' => '\\%',
2366 '#' => '\\#',
2367);
2368
2369sub pretty { # adopted from Class::Classless
2370 # Not the most brilliant routine, but passable.
2371 # Don't give it a cyclic data structure!
2372 my @stuff = @_; # copy
2373 my $x;
2374 my $out =
2375 # join ",\n" .
2376 join ", ",
2377 map {;
2378 if(!defined($_)) {
2379 "undef";
2380 } elsif(ref($_) eq 'ARRAY' or ref($_) eq 'Pod::Simple::LinkSection') {
2381 $x = "[ " . pretty(@$_) . " ]" ;
2382 $x;
2383 } elsif(ref($_) eq 'SCALAR') {
2384 $x = "\\" . pretty($$_) ;
2385 $x;
2386 } elsif(ref($_) eq 'HASH') {
2387 my $hr = $_;
2388 $x = "{" . join(", ",
2389 map(pretty($_) . '=>' . pretty($hr->{$_}),
2390 sort keys %$hr ) ) . "}" ;
2391 $x;
2392 } elsif(!length($_)) { q{''} # empty string
2393 } elsif(
2394 $_ eq '0' # very common case
2395 or(
2396 m/^-?(?:[123456789]\d*|0)(?:\.\d+)?$/s
2397 and $_ ne '-0' # the strange case that RE lets thru
2398 )
2399 ) { $_;
2400 } else {
2401 # Yes, explicitly name every character desired. There are shorcuts one
2402 # could make, but I (Karl Williamson) was afraid that some Perl
2403 # releases would have bugs in some of them. For example [A-Z] works
2404 # even on EBCDIC platforms to match exactly the 26 uppercase English
2405 # letters, but I don't know if it has always worked without bugs. It
2406 # seemed safest just to list the characters.
2407 # s<([^\x20\x21\x23\x27-\x3F\x41-\x5B\x5D-\x7E])>
2408 s<([^ !"#'()*+,\-./0123456789:;\<=\>?ABCDEFGHIJKLMNOPQRSTUVWXYZ\[\]^_`abcdefghijklmnopqrstuvwxyz{|}~])>
2409 <$pretty_form{$1} || '\\x{'.sprintf("%x", ord($1)).'}'>eg;
2410 #<$pretty_form{$1} || '\\x'.(unpack("H2",$1))>eg;
2411 qq{"$_"};
2412 }
2413 } @stuff;
2414 # $out =~ s/\n */ /g if length($out) < 75;
2415 return $out;
2416}
2417
2418#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
2419
2420# A rather unsubtle method of blowing away all the state information
2421# from a parser object so it can be reused. Provided as a utility for
2422# backward compatibility in Pod::Man, etc. but not recommended for
2423# general use.
2424
2425sub reinit {
2426 my $self = shift;
2427 foreach (qw(source_dead source_filename doc_has_started
2428start_of_pod_block content_seen last_was_blank paras curr_open
2429line_count pod_para_count in_pod ~tried_gen_errata all_errata errata errors_seen
2430Title)) {
2431
2432 delete $self->{$_};
2433 }
2434}
2435
2436#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
24371;
2438