← Index
NYTProf Performance Profile   « line view »
For /usr/local/libexec/sympa/task_manager-debug.pl
  Run on Tue Jun 1 22:32:51 2021
Reported on Tue Jun 1 22:35:12 2021

Filename/usr/local/lib/perl5/site_perl/MIME/WordDecoder.pm
StatementsExecuted 0 statements in 0s
Subroutines
Calls P F Exclusive
Time
Inclusive
Time
Subroutine
0000s0sMIME::WordDecoder::::BEGIN@91 MIME::WordDecoder::BEGIN@91
0000s0sMIME::WordDecoder::::BEGIN@92 MIME::WordDecoder::BEGIN@92
0000s0sMIME::WordDecoder::::BEGIN@93 MIME::WordDecoder::BEGIN@93
0000s0sMIME::WordDecoder::::BEGIN@94 MIME::WordDecoder::BEGIN@94
0000s0sMIME::WordDecoder::::BEGIN@95 MIME::WordDecoder::BEGIN@95
0000s0sMIME::WordDecoder::ISO_8859::::BEGIN@407MIME::WordDecoder::ISO_8859::BEGIN@407
0000s0sMIME::WordDecoder::ISO_8859::::BEGIN@408MIME::WordDecoder::ISO_8859::BEGIN@408
0000s0sMIME::WordDecoder::ISO_8859::::collapseMIME::WordDecoder::ISO_8859::collapse
0000s0sMIME::WordDecoder::ISO_8859::::decodeMIME::WordDecoder::ISO_8859::decode
0000s0sMIME::WordDecoder::ISO_8859::::guess_handlerMIME::WordDecoder::ISO_8859::guess_handler
0000s0sMIME::WordDecoder::ISO_8859::::h_keep7bitMIME::WordDecoder::ISO_8859::h_keep7bit
0000s0sMIME::WordDecoder::ISO_8859::::h_utf16MIME::WordDecoder::ISO_8859::h_utf16
0000s0sMIME::WordDecoder::ISO_8859::::h_utf8MIME::WordDecoder::ISO_8859::h_utf8
0000s0sMIME::WordDecoder::ISO_8859::::newMIME::WordDecoder::ISO_8859::new
0000s0sMIME::WordDecoder::ISO_8859::::unknownMIME::WordDecoder::ISO_8859::unknown
0000s0sMIME::WordDecoder::US_ASCII::::BEGIN@573MIME::WordDecoder::US_ASCII::BEGIN@573
0000s0sMIME::WordDecoder::US_ASCII::::BEGIN@574MIME::WordDecoder::US_ASCII::BEGIN@574
0000s0sMIME::WordDecoder::US_ASCII::::decodeMIME::WordDecoder::US_ASCII::decode
0000s0sMIME::WordDecoder::US_ASCII::::newMIME::WordDecoder::US_ASCII::new
0000s0sMIME::WordDecoder::UTF_8::::BEGIN@600 MIME::WordDecoder::UTF_8::BEGIN@600
0000s0sMIME::WordDecoder::UTF_8::::BEGIN@601 MIME::WordDecoder::UTF_8::BEGIN@601
0000s0sMIME::WordDecoder::UTF_8::::BEGIN@602 MIME::WordDecoder::UTF_8::BEGIN@602
0000s0sMIME::WordDecoder::UTF_8::::BEGIN@603 MIME::WordDecoder::UTF_8::BEGIN@603
0000s0sMIME::WordDecoder::UTF_8::::h_convert_to_utf8 MIME::WordDecoder::UTF_8::h_convert_to_utf8
0000s0sMIME::WordDecoder::UTF_8::::new MIME::WordDecoder::UTF_8::new
0000s0sMIME::WordDecoder::::__ANON__[:114] MIME::WordDecoder::__ANON__[:114]
0000s0sMIME::WordDecoder::::__ANON__[:115] MIME::WordDecoder::__ANON__[:115]
0000s0sMIME::WordDecoder::::__ANON__[:116] MIME::WordDecoder::__ANON__[:116]
0000s0sMIME::WordDecoder::::__ANON__[:117] MIME::WordDecoder::__ANON__[:117]
0000s0sMIME::WordDecoder::::decode MIME::WordDecoder::decode
0000s0sMIME::WordDecoder::::default MIME::WordDecoder::default
0000s0sMIME::WordDecoder::::guess_handler MIME::WordDecoder::guess_handler
0000s0sMIME::WordDecoder::::handler MIME::WordDecoder::handler
0000s0sMIME::WordDecoder::::mime_to_perl_string MIME::WordDecoder::mime_to_perl_string
0000s0sMIME::WordDecoder::::new MIME::WordDecoder::new
0000s0sMIME::WordDecoder::::real_handler MIME::WordDecoder::real_handler
0000s0sMIME::WordDecoder::::supported MIME::WordDecoder::supported
0000s0sMIME::WordDecoder::::unmime MIME::WordDecoder::unmime
Call graph for these subroutines as a Graphviz dot language file.
Line State
ments
Time
on line
Calls Time
in subs
Code
1package MIME::WordDecoder;
2
3=head1 NAME
4
5MIME::WordDecoder - decode RFC 2047 encoded words to a local representation
6
7WARNING: Most of this module is deprecated and may disappear. The only
8function you should use for MIME decoding is "mime_to_perl_string".
9
10=head1 SYNOPSIS
11
12See L<MIME::Words> for the basics of encoded words.
13See L<"DESCRIPTION"> for how this class works.
14
15 use MIME::WordDecoder;
16
17
18 ### Get the default word-decoder (used by unmime()):
19 $wd = default MIME::WordDecoder;
20
21 ### Get a word-decoder which maps to ISO-8859-1 (Latin1):
22 $wd = supported MIME::WordDecoder "ISO-8859-1";
23
24
25 ### Decode a MIME string (e.g., into Latin1) via the default decoder:
26 $str = $wd->decode('To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= <keld>');
27
28 ### Decode a string using the default decoder, non-OO style:
29 $str = unmime('To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= <keld>');
30
31 ### Decode a string to an internal Perl string, non-OO style
32 ### The result is likely to have the UTF8 flag ON.
33 $str = mime_to_perl_string('To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= <keld>');
34
35=head1 DESCRIPTION
36
37WARNING: Most of this module is deprecated and may disappear. It
38duplicates (badly) the function of the standard 'Encode' module. The
39only function you should rely on is mime_to_perl_string.
40
41A MIME::WordDecoder consists, fundamentally, of a hash which maps
42a character set name (US-ASCII, ISO-8859-1, etc.) to a subroutine which
43knows how to take bytes in that character set and turn them into
44the target string representation. Ideally, this target representation
45would be Unicode, but we don't want to overspecify the translation
46that takes place: if you want to convert MIME strings directly to Big5,
47that's your own decision.
48
49The subroutine will be invoked with two arguments: DATA (the data in
50the given character set), and CHARSET (the upcased character set name).
51
52For example:
53
54 ### Keep 7-bit characters as-is, convert 8-bit characters to '#':
55 sub keep7bit {
56 local $_ = shift;
57 tr/\x00-\x7F/#/c;
58 $_;
59 }
60
61Here's a decoder which uses that:
62
63 ### Construct a decoder:
64 $wd = MIME::WordDecoder->new({'US-ASCII' => "KEEP", ### sub { $_[0] }
65 'ISO-8859-1' => \&keep7bit,
66 'ISO-8859-2' => \&keep7bit,
67 'Big5' => "WARN",
68 '*' => "DIE"});
69
70 ### Convert some MIME text to a pure ASCII string...
71 $ascii = $wd->decode('To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= <keld>');
72
73 ### ...which will now hold: "To: Keld J#rn Simonsen <keld>"
74
75The UTF-8 built-in decoder decodes everything into Perl's internal
76string format, possibly turning on the internal UTF8 flag. Use it like
77this:
78
79 $wd = supported MIME::WordDecoder 'UTF-8';
80 $perl_string = $wd->decode('To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= <keld>');
81 # perl_string will be a valid UTF-8 string with the "UTF8" flag set.
82
83Generally, you should use the UTF-8 decoder in preference to "unmime".
84
85=head1 PUBLIC INTERFACE
86
87=over
88
89=cut
90
91use strict;
92use Carp qw( carp croak );
93use MIME::Words qw(decode_mimewords);
94use Exporter;
95use vars qw(@ISA @EXPORT);
96
97@ISA = qw(Exporter);
98@EXPORT = qw( unmime mime_to_perl_string );
99
- -
102#------------------------------
103#
104# Globals
105#
106#------------------------------
107
108### Decoders.
109my %DecoderFor = ();
110
111### Standard handlers.
112my %Handler =
113(
114 KEEP => sub {$_[0]},
115 IGNORE => sub {''},
116 WARN => sub { carp "ignoring text in character set `$_[1]'\n" },
117 DIE => sub { croak "can't handle text in character set `$_[1]'\n" },
118 );
119
120### Global default decoder. We init it below.
121my $Default;
122
123### Global UTF8 decoder.
124my $DefaultUTF8;
125
126#------------------------------
127
128=item default [DECODER]
129
130I<Class method.>
131Get/set the default DECODER object.
132
133=cut
134
135sub default {
136 my $class = shift;
137 if (@_) {
138 $Default = shift;
139 }
140 $Default;
141}
142
143#------------------------------
144
145=item supported CHARSET, [DECODER]
146
147I<Class method.>
148If just CHARSET is given, returns a decoder object which maps
149data into that character set (the character set is forced to
150all-uppercase).
151
152 $wd = supported MIME::WordDecoder "ISO-8859-1";
153
154If DECODER is given, installs such an object:
155
156 MIME::WordDecoder->supported("ISO-8859-1" =>
157 (new MIME::WordDecoder::ISO_8859 "1"));
158
159You should not override this method.
160
161=cut
162
163sub supported {
164 my ($class, $charset, $decoder) = @_;
165 $DecoderFor{uc($charset)} = $decoder if (@_ > 2);
166 $DecoderFor{uc($charset)};
167}
168
169#------------------------------
170
171=item new [\@HANDLERS]
172
173I<Class method, constructor.>
174If \@HANDLERS is given, then @HANDLERS is passed to handler()
175to initialize the internal map.
176
177=cut
178
179sub new {
180 my ($class, $h) = @_;
181 my $self = bless { MWD_Map=>{} }, $class;
182
183 ### Init the map:
184 $self->handler(@$h);
185
186 ### Add fallbacks:
187 $self->{MWD_Map}{'*'} ||= $Handler{WARN};
188 $self->{MWD_Map}{'raw'} ||= $self->{MWD_Map}{'US-ASCII'};
189 $self;
190}
191
192#------------------------------
193
194=item handler CHARSET=>\&SUBREF, ...
195
196I<Instance method.>
197Set the handler SUBREF for a given CHARSET, for as many pairs
198as you care to supply.
199
200When performing the translation of a MIME-encoded string, a
201given SUBREF will be invoked when translating a block of text
202in character set CHARSET. The subroutine will be invoked with
203the following arguments:
204
205 DATA - the data in the given character set.
206 CHARSET - the upcased character set name, which may prove useful
207 if you are using the same SUBREF for multiple CHARSETs.
208 DECODER - the decoder itself, if it contains configuration information
209 that your handler function needs.
210
211For example:
212
213 $wd = new MIME::WordDecoder;
214 $wd->handler('US-ASCII' => "KEEP");
215 $wd->handler('ISO-8859-1' => \&handle_latin1,
216 'ISO-8859-2' => \&handle_latin1,
217 '*' => "DIE");
218
219Notice that, much as with %SIG, the SUBREF can also be taken from
220a set of special keywords:
221
222 KEEP Pass data through unchanged.
223 IGNORE Ignore data in this character set, without warning.
224 WARN Ignore data in this character set, with warning.
225 DIE Fatal exception with "can't handle character set" message.
226
227The subroutine for the special CHARSET of 'raw' is used for raw
228(non-MIME-encoded) text, which is supposed to be US-ASCII.
229The handler for 'raw' defaults to whatever was specified for 'US-ASCII'
230at the time of construction.
231
232The subroutine for the special CHARSET of '*' is used for any
233unrecognized character set. The default action for '*' is WARN.
234
235=cut
236
237sub handler {
238 my $self = shift;
239
240 ### Copy the hash, and edit it:
241 while (@_) {
242 my $c = shift;
243 my $sub = shift;
244 $self->{MWD_Map}{$c} = $self->real_handler($sub);
245 }
246 $self;
247}
248
249#------------------------------
250
251=item decode STRING
252
253I<Instance method.>
254Decode a STRING which might contain MIME-encoded components into a
255local representation (e.g., UTF-8, etc.).
256
257=cut
258
259sub decode {
260 my ($self, $str) = @_;
261 defined($str) or return undef;
262 join('', map {
263 ### Get the data and (upcased) charset:
264 my $data = $_->[0];
265 my $charset = (defined($_->[1]) ? uc($_->[1]) : 'raw');
266 $charset =~ s/\*\w+\Z//; ### RFC2184 language suffix
267
268 ### Get the handler; guess if never seen before:
269 defined($self->{MWD_Map}{$charset}) or
270 $self->{MWD_Map}{$charset} =
271 ($self->real_handler($self->guess_handler($charset)) || 0);
272 my $subr = $self->{MWD_Map}{$charset} || $self->{MWD_Map}{'*'};
273
274 ### Map this chunk:
275 &$subr($data, $charset, $self);
276 } decode_mimewords($str));
277}
278
279#------------------------------
280#
281# guess_handler CHARSET
282#
283# Instance method.
284# An unrecognized charset has been seen. Guess a handler subref
285# for the given charset, returning false if there is none.
286# Successful mappings will be cached in the main map.
287#
288sub guess_handler {
289 undef;
290}
291
292#------------------------------
293#
294# real_handler HANDLER
295#
296# Instance method.
297# Translate the given handler, which might be a subref or a string.
298#
299sub real_handler {
300 my ($self, $sub) = @_;
301 (!$sub) or
302 (ref($sub) eq 'CODE') or
303 $sub = ($Handler{$sub} || croak "bad named handler: $sub\n");
304 $sub;
305}
306
307#------------------------------
308
309=item unmime STRING
310
311I<Function, exported.>
312Decode the given STRING using the default() decoder.
313See L<default()|/default>.
314
315You should consider using the UTF-8 decoder instead. It decodes
316MIME strings into Perl's internal string format.
317
318=cut
319
320sub unmime($) {
321 my $str = shift;
322 $Default->decode($str);
323}
324
325=item mime_to_perl_string
326
327I<Function, exported.>
328Decode the given STRING into an internal Perl Unicode string.
329You should use this function in preference to all others.
330
331The result of mime_to_perl_string is likely to have Perl's
332UTF8 flag set.
333
334=cut
335
336sub mime_to_perl_string($) {
337 my $str = shift;
338 $DecoderFor{'UTF-8'}->decode($str);
339}
340
341=back
342
343=cut
344
- -
349=head1 SUBCLASSES
350
351=over
352
353=cut
354
355#------------------------------------------------------------
356#------------------------------------------------------------
357
358=item MIME::WordDecoder::ISO_8859
359
360A simple decoder which keeps US-ASCII and the 7-bit characters
361of ISO-8859 character sets and UTF8, and also keeps 8-bit
362characters from the indicated character set.
363
364 ### Construct:
365 $wd = new MIME::WordDecoder::ISO_8859 2; ### ISO-8859-2
366
367 ### What to translate unknown characters to (can also use empty):
368 ### Default is "?".
369 $wd->unknown("?");
370
371 ### Collapse runs of unknown characters to a single unknown()?
372 ### Default is false.
373 $wd->collapse(1);
374
375
376According to B<http://czyborra.com/charsets/iso8859.html>
377(ca. November 2000):
378
379ISO 8859 is a full series of 10 (and soon even more) standardized
380multilingual single-byte coded (8bit) graphic character sets for
381writing in alphabetic languages:
382
383 1. Latin1 (West European)
384 2. Latin2 (East European)
385 3. Latin3 (South European)
386 4. Latin4 (North European)
387 5. Cyrillic
388 6. Arabic
389 7. Greek
390 8. Hebrew
391 9. Latin5 (Turkish)
392 10. Latin6 (Nordic)
393
394The ISO 8859 charsets are not even remotely as complete as the truly
395great Unicode but they have been around and usable for quite a while
396(first registered Internet charsets for use with MIME) and have
397already offered a major improvement over the plain 7bit US-ASCII.
398
399Characters 0 to 127 are always identical with US-ASCII and the
400positions 128 to 159 hold some less used control characters: the
401so-called C1 set from ISO 6429.
402
403=cut
404
405package MIME::WordDecoder::ISO_8859;
406
407use strict;
408use vars qw(@ISA);
409@ISA = qw( MIME::WordDecoder );
410
411
412#------------------------------
413#
414# HANDLERS
415#
416#------------------------------
417
418### Keep 7bit characters.
419### Turn all else to the special \x00.
420sub h_keep7bit {
421 local $_ = $_[0];
422# my $unknown = $_[2]->{MWDI_Unknown};
423
424 s{[\x80-\xFF]}{\x00}g;
425 $_;
426}
427
428### Note: should use Unicode::String, converting/manipulating
429### everything into full Unicode form.
430
431### Keep 7bit UTF8 characters (ASCII).
432### Keep ISO-8859-1 if this decoder is for Latin-1.
433### Turn all else to the special \x00.
434sub h_utf8 {
435 local $_ = $_[0];
436# my $unknown = $_[2]->{MWDI_Unknown};
437 my $latin1 = ($_[2]->{MWDI_Num} == 1);
438 #print STDERR "UTF8 in: <$_>\n";
439
440 local($1,$2,$3);
441 my $tgt = '';
442 while (m{\G(
443 ([\x00-\x7F]) | # 0xxxxxxx
444 ([\xC0-\xDF] [\x80-\xBF]) | # 110yyyyy 10xxxxxx
445 ([\xE0-\xEF] [\x80-\xBF]{2}) | # 1110zzzz 10yyyyyy 10xxxxxx
446 ([\xF0-\xF7] [\x80-\xBF]{3}) | # 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx
447 . # error; synch
448 )}gcsx and ($1 ne '')) {
449
450 if (defined($2)) { $tgt .= $2 }
451 elsif (defined($3) && $latin1) { $tgt .= "\x00" }
452 else { $tgt .= "\x00" }
453 }
454
455 #print STDERR "UTF8 out: <$tgt>\n";
456 $tgt;
457}
458
459### Keep characters which are 7bit in UTF8 (ASCII).
460### Keep ISO-8859-1 if this decoder is for Latin-1.
461### Turn all else to the special \x00.
462sub h_utf16 {
463 local $_ = $_[0];
464# my $unknown = $_[2]->{MWDI_Unknown};
465 my $latin1 = ($_[2]->{MWDI_Num} == 1);
466 #print STDERR "UTF16 in: <$_>\n";
467
468 local($1,$2,$3,$4,$5);
469 my $tgt = '';
470 while (m{\G(
471 ( \x00 ([\x00-\x7F])) | # 00000000 0xxxxxxx
472 ( \x00 ([\x80-\xFF])) | # 00000000 1xxxxxxx
473 ( [^\x00] [\x00-\xFF]) | # etc
474 )
475 }gcsx and ($1 ne '')) {
476
477 if (defined($2)) { $tgt .= $3 }
478 elsif (defined($4) && $latin1) { $tgt .= $5 }
479 else { $tgt .= "\x00" }
480 }
481
482 #print STDERR "UTF16 out: <$tgt>\n";
483 $tgt;
484}
485
486
487#------------------------------
488#
489# PUBLIC INTERFACE
490#
491#------------------------------
492
493#------------------------------
494#
495# new NUMBER
496#
497sub new {
498 my ($class, $num) = @_;
499
500 my $self = $class->SUPER::new();
501 $self->handler('raw' => 'KEEP',
502 'US-ASCII' => 'KEEP');
503
504 $self->{MWDI_Num} = $num;
505 $self->{MWDI_Unknown} = "?";
506 $self->{MWDI_Collapse} = 0;
507 $self;
508}
509
510#------------------------------
511#
512# guess_handler CHARSET
513#
514sub guess_handler {
515 my ($self, $charset) = @_;
516 return 'KEEP' if (($charset =~ /^ISO[-_]?8859[-_](\d+)$/) &&
517 ($1 eq $self->{MWDI_Num}));
518 return \&h_keep7bit if ($charset =~ /^ISO[-_]?8859/);
519 return \&h_utf8 if ($charset =~ /^UTF[-_]?8$/);
520 return \&h_utf16 if ($charset =~ /^UTF[-_]?16$/);
521 undef;
522}
523
524#------------------------------
525#
526# unknown [REPLACEMENT]
527#
528sub unknown {
529 my $self = shift;
530 $self->{MWDI_Unknown} = shift if @_;
531 $self->{MWDI_Unknown};
532}
533
534#------------------------------
535#
536# collapse [YESNO]
537#
538sub collapse {
539 my $self = shift;
540 $self->{MWDI_Collapse} = shift if @_;
541 $self->{MWDI_Collapse};
542}
543
544#------------------------------
545#
546# decode STRING
547#
548sub decode {
549 my $self = shift;
550
551 ### Do inherited action:
552 my $basic = $self->SUPER::decode(@_);
553 defined($basic) or return undef;
554
555 ### Translate/consolidate illegal characters:
556 $basic =~ tr{\x00}{\x00}c if $self->{MWDI_Collapse};
557 $basic =~ s{\x00}{$self->{MWDI_Unknown}}g;
558 $basic;
559}
560
561#------------------------------------------------------------
562#------------------------------------------------------------
563
564=item MIME::WordDecoder::US_ASCII
565
566A subclass of the ISO-8859-1 decoder which discards 8-bit characters.
567You're probably better off using ISO-8859-1.
568
569=cut
570
571package MIME::WordDecoder::US_ASCII;
572
573use strict;
574use vars qw(@ISA);
575@ISA = qw( MIME::WordDecoder::ISO_8859 );
576
577sub new {
578 my ($class) = @_;
579 return $class->SUPER::new("1");
580}
581
582sub decode {
583 my $self = shift;
584
585 ### Do inherited action:
586 my $basic = $self->SUPER::decode(@_);
587 defined($basic) or return undef;
588
589 ### Translate/consolidate 8-bit characters:
590 $basic =~ tr{\x80-\xFF}{}c if $self->{MWDI_Collapse};
591 $basic =~ s{[\x80-\xFF]}{$self->{MWDI_Unknown}}g;
592 $basic;
593}
594
595=back
596
597=cut
598
599package MIME::WordDecoder::UTF_8;
600use strict;
601use Encode qw();
602use Carp qw( carp );
603use vars qw(@ISA);
604
605@ISA = qw( MIME::WordDecoder );
606
607sub h_convert_to_utf8
608{
609 my ($data, $charset, $decoder) = @_;
610 $charset = 'US-ASCII' if ($charset eq 'raw');
611 my $enc = Encode::find_encoding($charset);
612 if (!$enc) {
613 carp "Unable to convert text in character set `$charset' to UTF-8... ignoring\n";
614 return '';
615 }
616 my $ans = $enc->decode($data, Encode::FB_PERLQQ);
617 return $ans;
618}
619
620sub new {
621 my ($class) = @_;
622 my $self = $class->SUPER::new();
623 $self->handler('*' => \&h_convert_to_utf8);
624}
625
626
627#------------------------------------------------------------
628#------------------------------------------------------------
629
630package MIME::WordDecoder;
631
632### Now we can init the default handler.
633$Default = (MIME::WordDecoder::ISO_8859->new('1'));
634
635
636### Add US-ASCII handler:
637$DecoderFor{"US-ASCII"} = MIME::WordDecoder::US_ASCII->new;
638
639### Add ISO-8859-{1..15} handlers:
640for (1..15) {
641 $DecoderFor{"ISO-8859-$_"} = MIME::WordDecoder::ISO_8859->new($_);
642}
643
644### UTF-8
645$DecoderFor{'UTF-8'} = MIME::WordDecoder::UTF_8->new();
646
6471; # end the module
648__END__