Filename | /usr/local/lib/perl5/site_perl/MIME/EncWords.pm |
Statements | Executed 0 statements in 0s |
Calls | P | F | Exclusive Time |
Inclusive Time |
Subroutine |
---|---|---|---|---|---|
0 | 0 | 0 | 0s | 0s | BEGIN@102 | MIME::EncWords::
0 | 0 | 0 | 0s | 0s | BEGIN@103 | MIME::EncWords::
0 | 0 | 0 | 0s | 0s | BEGIN@104 | MIME::EncWords::
0 | 0 | 0 | 0s | 0s | BEGIN@113 | MIME::EncWords::
0 | 0 | 0 | 0s | 0s | BEGIN@122 | MIME::EncWords::
0 | 0 | 0 | 0s | 0s | BEGIN@87 | MIME::EncWords::
0 | 0 | 0 | 0s | 0s | BEGIN@88 | MIME::EncWords::
0 | 0 | 0 | 0s | 0s | BEGIN@91 | MIME::EncWords::
0 | 0 | 0 | 0s | 0s | CORE:qr (opcode) | MIME::EncWords::
0 | 0 | 0 | 0s | 0s | CORE:regcomp (opcode) | MIME::EncWords::
0 | 0 | 0 | 0s | 0s | _clip_unsafe | MIME::EncWords::
0 | 0 | 0 | 0s | 0s | _convert | MIME::EncWords::
0 | 0 | 0 | 0s | 0s | _decode_B | MIME::EncWords::
0 | 0 | 0 | 0s | 0s | _decode_Q | MIME::EncWords::
0 | 0 | 0 | 0s | 0s | _encode_B | MIME::EncWords::
0 | 0 | 0 | 0s | 0s | _encode_Q | MIME::EncWords::
0 | 0 | 0 | 0s | 0s | _getparams | MIME::EncWords::
0 | 0 | 0 | 0s | 0s | _split | MIME::EncWords::
0 | 0 | 0 | 0s | 0s | _split_ascii | MIME::EncWords::
0 | 0 | 0 | 0s | 0s | _utf_to_unicode | MIME::EncWords::
0 | 0 | 0 | 0s | 0s | decode_mimewords | MIME::EncWords::
0 | 0 | 0 | 0s | 0s | encode_mimeword | MIME::EncWords::
0 | 0 | 0 | 0s | 0s | encode_mimewords | MIME::EncWords::
Line | State ments |
Time on line |
Calls | Time in subs |
Code |
---|---|---|---|---|---|
1 | #-*- perl -*- | ||||
2 | |||||
3 | package MIME::EncWords; | ||||
4 | require 5.005; | ||||
5 | |||||
6 | =head1 NAME | ||||
7 | |||||
8 | MIME::EncWords - deal with RFC 2047 encoded words (improved) | ||||
9 | |||||
10 | =head1 SYNOPSIS | ||||
11 | |||||
12 | I<L<MIME::EncWords> is aimed to be another implimentation | ||||
13 | of L<MIME::Words> so that it will achieve more exact conformance with | ||||
14 | RFC 2047 (formerly RFC 1522) specifications. Additionally, it contains | ||||
15 | some improvements. | ||||
16 | Following synopsis and descriptions are inherited from its inspirer, | ||||
17 | then added descriptions on improvements (B<**>) or changes and | ||||
18 | clarifications (B<*>).> | ||||
19 | |||||
20 | Before reading further, you should see L<MIME::Tools> to make sure that | ||||
21 | you understand where this module fits into the grand scheme of things. | ||||
22 | Go on, do it now. I'll wait. | ||||
23 | |||||
24 | Ready? Ok... | ||||
25 | |||||
26 | use MIME::EncWords qw(:all); | ||||
27 | |||||
28 | ### Decode the string into another string, forgetting the charsets: | ||||
29 | $decoded = decode_mimewords( | ||||
30 | 'To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= <keld@dkuug.dk>', | ||||
31 | ); | ||||
32 | |||||
33 | ### Split string into array of decoded [DATA,CHARSET] pairs: | ||||
34 | @decoded = decode_mimewords( | ||||
35 | 'To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= <keld@dkuug.dk>', | ||||
36 | ); | ||||
37 | |||||
38 | ### Encode a single unsafe word: | ||||
39 | $encoded = encode_mimeword("\xABFran\xE7ois\xBB"); | ||||
40 | |||||
41 | ### Encode a string, trying to find the unsafe words inside it: | ||||
42 | $encoded = encode_mimewords("Me and \xABFran\xE7ois\xBB in town"); | ||||
43 | |||||
44 | =head1 DESCRIPTION | ||||
45 | |||||
46 | Fellow Americans, you probably won't know what the hell this module | ||||
47 | is for. Europeans, Russians, et al, you probably do. C<:-)>. | ||||
48 | |||||
49 | For example, here's a valid MIME header you might get: | ||||
50 | |||||
51 | From: =?US-ASCII?Q?Keith_Moore?= <moore@cs.utk.edu> | ||||
52 | To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= <keld@dkuug.dk> | ||||
53 | CC: =?ISO-8859-1?Q?Andr=E9_?= Pirard <PIRARD@vm1.ulg.ac.be> | ||||
54 | Subject: =?ISO-8859-1?B?SWYgeW91IGNhbiByZWFkIHRoaXMgeW8=?= | ||||
55 | =?ISO-8859-2?B?dSB1bmRlcnN0YW5kIHRoZSBleGFtcGxlLg==?= | ||||
56 | =?US-ASCII?Q?.._cool!?= | ||||
57 | |||||
58 | The fields basically decode to (sorry, I can only approximate the | ||||
59 | Latin characters with 7 bit sequences /o and 'e): | ||||
60 | |||||
61 | From: Keith Moore <moore@cs.utk.edu> | ||||
62 | To: Keld J/orn Simonsen <keld@dkuug.dk> | ||||
63 | CC: Andr'e Pirard <PIRARD@vm1.ulg.ac.be> | ||||
64 | Subject: If you can read this you understand the example... cool! | ||||
65 | |||||
66 | B<Supplement>: Fellow Americans, Europeans, you probably won't know | ||||
67 | what the hell this module is for. East Asians, et al, you probably do. | ||||
68 | C<(^_^)>. | ||||
69 | |||||
70 | For example, here's a valid MIME header you might get: | ||||
71 | |||||
72 | Subject: =?EUC-KR?B?sNTAuLinKGxhemluZXNzKSwgwvzB9ri7seIoaW1w?= | ||||
73 | =?EUC-KR?B?YXRpZW5jZSksILGzuLgoaHVicmlzKQ==?= | ||||
74 | |||||
75 | The fields basically decode to (sorry, I cannot approximate the | ||||
76 | non-Latin multibyte characters with any 7 bit sequences): | ||||
77 | |||||
78 | Subject: ???(laziness), ????(impatience), ??(hubris) | ||||
79 | |||||
80 | =head1 PUBLIC INTERFACE | ||||
81 | |||||
82 | =over 4 | ||||
83 | |||||
84 | =cut | ||||
85 | |||||
86 | ### Pragmas: | ||||
87 | use strict; | ||||
88 | use vars qw($VERSION @EXPORT_OK %EXPORT_TAGS @ISA $Config); | ||||
89 | |||||
90 | ### Exporting: | ||||
91 | use Exporter; | ||||
92 | |||||
93 | %EXPORT_TAGS = (all => [qw(decode_mimewords | ||||
94 | encode_mimeword | ||||
95 | encode_mimewords)]); | ||||
96 | Exporter::export_ok_tags(qw(all)); | ||||
97 | |||||
98 | ### Inheritance: | ||||
99 | @ISA = qw(Exporter); | ||||
100 | |||||
101 | ### Other modules: | ||||
102 | use Carp qw(croak carp); | ||||
103 | use MIME::Base64; | ||||
104 | use MIME::Charset qw(:trans); | ||||
105 | |||||
106 | my @ENCODE_SUBS = qw(FB_CROAK is_utf8 resolve_alias); | ||||
107 | if (MIME::Charset::USE_ENCODE) { | ||||
108 | eval "use ".MIME::Charset::USE_ENCODE." \@ENCODE_SUBS;"; # spent 0s executing statements in string eval | ||||
109 | if ($@) { # Perl 5.7.3 + Encode 0.40 | ||||
110 | eval "use ".MIME::Charset::USE_ENCODE." qw(is_utf8);"; | ||||
111 | require MIME::Charset::_Compat; | ||||
112 | for my $sub (@ENCODE_SUBS) { | ||||
113 | no strict "refs"; | ||||
114 | *{$sub} = \&{"MIME::Charset::_Compat::$sub"} | ||||
115 | unless $sub eq 'is_utf8'; | ||||
116 | } | ||||
117 | } | ||||
118 | } else { | ||||
119 | require Unicode::String; | ||||
120 | require MIME::Charset::_Compat; | ||||
121 | for my $sub (@ENCODE_SUBS) { | ||||
122 | no strict "refs"; | ||||
123 | *{$sub} = \&{"MIME::Charset::_Compat::$sub"}; | ||||
124 | } | ||||
125 | } | ||||
126 | |||||
127 | #------------------------------ | ||||
128 | # | ||||
129 | # Globals... | ||||
130 | # | ||||
131 | #------------------------------ | ||||
132 | |||||
133 | ### The package version, both in 1.23 style *and* usable by MakeMaker: | ||||
134 | $VERSION = '1.014.3'; | ||||
135 | |||||
136 | ### Public Configuration Attributes | ||||
137 | $Config = { | ||||
138 | %{$MIME::Charset::Config}, # Detect7bit, Replacement, Mapping | ||||
139 | Charset => 'ISO-8859-1', | ||||
140 | Encoding => 'A', | ||||
141 | Field => undef, | ||||
142 | Folding => "\n", | ||||
143 | MaxLineLen => 76, | ||||
144 | Minimal => 'YES', | ||||
145 | }; | ||||
146 | eval { require MIME::EncWords::Defaults; }; | ||||
147 | |||||
148 | ### Private Constants | ||||
149 | |||||
150 | my $PRINTABLE = "\\x21-\\x7E"; | ||||
151 | #my $NONPRINT = "\\x00-\\x1F\\x7F-\\xFF"; | ||||
152 | my $NONPRINT = qr{[^$PRINTABLE]}; # Improvement: Unicode support. | ||||
153 | my $UNSAFE = qr{[^\x01-\x20$PRINTABLE]}; | ||||
154 | my $WIDECHAR = qr{[^\x00-\xFF]}; | ||||
155 | my $ASCIITRANS = qr{^(?:HZ-GB-2312|UTF-7)$}i; | ||||
156 | my $ASCIIINCOMPAT = qr{^UTF-(?:16|32)(?:BE|LE)?$}i; | ||||
157 | my $DISPNAMESPECIAL = "\\x22(),:;<>\\x40\\x5C"; # RFC5322 name-addr specials. | ||||
158 | |||||
159 | #------------------------------ | ||||
160 | |||||
161 | # _utf_to_unicode CSETOBJ, STR | ||||
162 | # Private: Convert UTF-16*/32* to Unicode or UTF-8. | ||||
163 | sub _utf_to_unicode { | ||||
164 | my $csetobj = shift; | ||||
165 | my $str = shift; | ||||
166 | |||||
167 | return $str if is_utf8($str); | ||||
168 | |||||
169 | return $csetobj->decode($str) | ||||
170 | if MIME::Charset::USE_ENCODE(); | ||||
171 | |||||
172 | my $cset = $csetobj->as_string; | ||||
173 | my $unistr = Unicode::String->new(); | ||||
174 | if ($cset eq 'UTF-16' or $cset eq 'UTF-16BE') { | ||||
175 | $unistr->utf16($str); | ||||
176 | } elsif ($cset eq 'UTF-16LE') { | ||||
177 | $unistr->utf16le($str); | ||||
178 | } elsif ($cset eq 'UTF-32' or $cset eq 'UTF-32BE') { | ||||
179 | $unistr->utf32($str); | ||||
180 | } elsif ($cset eq 'UTF-32LE') { | ||||
181 | $unistr->utf32le($str); | ||||
182 | } else { | ||||
183 | croak "unknown transformation '$cset'"; | ||||
184 | } | ||||
185 | return $unistr->utf8; | ||||
186 | } | ||||
187 | |||||
188 | #------------------------------ | ||||
189 | |||||
190 | # _decode_B STRING | ||||
191 | # Private: used by _decode_header() to decode "B" encoding. | ||||
192 | # Improvement by this module: sanity check on encoded sequence. | ||||
193 | sub _decode_B { | ||||
194 | my $str = shift; | ||||
195 | unless ((length($str) % 4 == 0) and | ||||
196 | $str =~ m|^[A-Za-z0-9+/]+={0,2}$|) { | ||||
197 | return undef; | ||||
198 | } | ||||
199 | return decode_base64($str); | ||||
200 | } | ||||
201 | |||||
202 | # _decode_Q STRING | ||||
203 | # Private: used by _decode_header() to decode "Q" encoding, which is | ||||
204 | # almost, but not exactly, quoted-printable. :-P | ||||
205 | # Improvement by this module: sanity check on encoded sequence (>=1.012.3). | ||||
206 | sub _decode_Q { | ||||
207 | my $str = shift; | ||||
208 | if ($str =~ /=(?![0-9a-fA-F][0-9a-fA-F])/) { #XXX:" " and "\t" are allowed | ||||
209 | return undef; | ||||
210 | } | ||||
211 | $str =~ s/_/\x20/g; # RFC 2047, Q rule 2 | ||||
212 | $str =~ s/=([0-9a-fA-F]{2})/pack("C", hex($1))/ge; # RFC 2047, Q rule 1 | ||||
213 | $str; | ||||
214 | } | ||||
215 | |||||
216 | # _encode_B STRING | ||||
217 | # Private: used by encode_mimeword() to encode "B" encoding. | ||||
218 | sub _encode_B { | ||||
219 | my $str = shift; | ||||
220 | encode_base64($str, ''); | ||||
221 | } | ||||
222 | |||||
223 | # _encode_Q STRING | ||||
224 | # Private: used by encode_mimeword() to encode "Q" encoding, which is | ||||
225 | # almost, but not exactly, quoted-printable. :-P | ||||
226 | # Improvement by this module: Spaces are escaped by ``_''. | ||||
227 | sub _encode_Q { | ||||
228 | my $str = shift; | ||||
229 | # Restrict characters to those listed in RFC 2047 section 5 (3) | ||||
230 | $str =~ s{[^-!*+/0-9A-Za-z]}{ | ||||
231 | $& eq "\x20"? "_": sprintf("=%02X", ord($&)) | ||||
232 | }eog; | ||||
233 | $str; | ||||
234 | } | ||||
235 | |||||
236 | #------------------------------ | ||||
237 | |||||
238 | =item decode_mimewords ENCODED, [OPTS...] | ||||
239 | |||||
240 | I<Function.> | ||||
241 | Go through the string looking for RFC 2047-style "Q" | ||||
242 | (quoted-printable, sort of) or "B" (base64) encoding, and decode them. | ||||
243 | |||||
244 | B<In an array context,> splits the ENCODED string into a list of decoded | ||||
245 | C<[DATA, CHARSET]> pairs, and returns that list. Unencoded | ||||
246 | data are returned in a 1-element array C<[DATA]>, giving an effective | ||||
247 | CHARSET of C<undef>. | ||||
248 | |||||
249 | $enc = '=?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= <keld@dkuug.dk>'; | ||||
250 | foreach (decode_mimewords($enc)) { | ||||
251 | print "", ($_[1] || 'US-ASCII'), ": ", $_[0], "\n"; | ||||
252 | } | ||||
253 | |||||
254 | B<**> | ||||
255 | However, adjacent encoded-words with same charset will be concatenated | ||||
256 | to handle multibyte sequences safely. | ||||
257 | |||||
258 | B<**> | ||||
259 | Language information defined by RFC2231, section 5 will be additonal | ||||
260 | third element, if any. | ||||
261 | |||||
262 | B<*> | ||||
263 | Whitespaces surrounding unencoded data will not be stripped so that | ||||
264 | compatibility with L<MIME::Words> will be ensured. | ||||
265 | |||||
266 | B<In a scalar context,> joins the "data" elements of the above | ||||
267 | list together, and returns that. I<Warning: this is information-lossy,> | ||||
268 | and probably I<not> what you want, but if you know that all charsets | ||||
269 | in the ENCODED string are identical, it might be useful to you. | ||||
270 | (Before you use this, please see L<MIME::WordDecoder/unmime>, | ||||
271 | which is probably what you want.) | ||||
272 | B<**> | ||||
273 | See also "Charset" option below. | ||||
274 | |||||
275 | In the event of a syntax error, $@ will be set to a description | ||||
276 | of the error, but parsing will continue as best as possible (so as to | ||||
277 | get I<something> back when decoding headers). | ||||
278 | $@ will be false if no error was detected. | ||||
279 | |||||
280 | B<*> | ||||
281 | Malformed encoded-words will be kept encoded. | ||||
282 | In this case $@ will be set. | ||||
283 | |||||
284 | Any arguments past the ENCODED string are taken to define a hash of options. | ||||
285 | B<**> | ||||
286 | When Unicode/multibyte support is disabled | ||||
287 | (see L<MIME::Charset/USE_ENCODE>), | ||||
288 | these options will not have any effects. | ||||
289 | |||||
290 | =over 4 | ||||
291 | |||||
292 | =item Charset | ||||
293 | B<**> | ||||
294 | |||||
295 | Name of character set by which data elements in scalar context | ||||
296 | will be converted. | ||||
297 | The default is no conversion. | ||||
298 | If this option is specified as special value C<"_UNICODE_">, | ||||
299 | returned value will be Unicode string. | ||||
300 | |||||
301 | B<Note>: | ||||
302 | This feature is still information-lossy, I<except> when C<"_UNICODE_"> is | ||||
303 | specified. | ||||
304 | |||||
305 | =item Detect7bit | ||||
306 | B<**> | ||||
307 | |||||
308 | Try to detect 7-bit charset on unencoded portions. | ||||
309 | Default is C<"YES">. | ||||
310 | |||||
311 | =cut | ||||
312 | |||||
313 | #=item Field | ||||
314 | # | ||||
315 | #Name of the mail field this string came from. I<Currently ignored.> | ||||
316 | |||||
317 | =item Mapping | ||||
318 | B<**> | ||||
319 | |||||
320 | In scalar context, specify mappings actually used for charset names. | ||||
321 | C<"EXTENDED"> uses extended mappings. | ||||
322 | C<"STANDARD"> uses standardized strict mappings. | ||||
323 | Default is C<"EXTENDED">. | ||||
324 | |||||
325 | =back | ||||
326 | |||||
327 | =cut | ||||
328 | |||||
329 | sub decode_mimewords { | ||||
330 | my $encstr = shift; | ||||
331 | my %params = @_; | ||||
332 | my %Params = &_getparams(\%params, | ||||
333 | NoDefault => [qw(Charset)], # default is no conv. | ||||
334 | YesNo => [qw(Detect7bit)], | ||||
335 | Others => [qw(Mapping)], | ||||
336 | Obsoleted => [qw(Field)], | ||||
337 | ToUpper => [qw(Charset Mapping)], | ||||
338 | ); | ||||
339 | my $cset = MIME::Charset->new($Params{Charset}, | ||||
340 | Mapping => $Params{Mapping}); | ||||
341 | # unfolding: normalize linear-white-spaces and orphan newlines. | ||||
342 | $encstr =~ s/(?:[\r\n]+[\t ])*[\r\n]+([\t ]|\Z)/$1? " ": ""/eg; | ||||
343 | $encstr =~ s/[\r\n]+/ /g; | ||||
344 | |||||
345 | my @tokens; | ||||
346 | $@ = ''; ### error-return | ||||
347 | |||||
348 | ### Decode: | ||||
349 | my ($word, $charset, $language, $encoding, $enc, $dec); | ||||
350 | my $spc = ''; | ||||
351 | pos($encstr) = 0; | ||||
352 | while (1) { | ||||
353 | last if (pos($encstr) >= length($encstr)); | ||||
354 | my $pos = pos($encstr); ### save it | ||||
355 | |||||
356 | ### Case 1: are we looking at "=?..?..?="? | ||||
357 | if ($encstr =~ m{\G # from where we left off.. | ||||
358 | =\?([^?]*) # "=?" + charset + | ||||
359 | \?([bq]) # "?" + encoding + | ||||
360 | \?([^?]+) # "?" + data maybe with spcs + | ||||
361 | \?= # "?=" | ||||
362 | ([\r\n\t ]*) | ||||
363 | }xgi) { | ||||
364 | ($word, $charset, $encoding, $enc) = ($&, $1, lc($2), $3); | ||||
365 | my $tspc = $4; | ||||
366 | |||||
367 | # RFC 2231 section 5 extension | ||||
368 | if ($charset =~ s/^([^\*]*)\*(.*)/$1/) { | ||||
369 | $language = $2 || undef; | ||||
370 | $charset ||= undef; | ||||
371 | } else { | ||||
372 | $language = undef; | ||||
373 | } | ||||
374 | |||||
375 | if ($encoding eq 'q') { | ||||
376 | $dec = _decode_Q($enc); | ||||
377 | } else { | ||||
378 | $dec = _decode_B($enc); | ||||
379 | } | ||||
380 | unless (defined $dec) { | ||||
381 | $@ .= qq|Illegal sequence in "$word" (pos $pos)\n|; | ||||
382 | push @tokens, [$spc.$word]; | ||||
383 | $spc = ''; | ||||
384 | next; | ||||
385 | } | ||||
386 | |||||
387 | { local $@; | ||||
388 | if (scalar(@tokens) and | ||||
389 | lc($charset || "") eq lc($tokens[-1]->[1] || "") and | ||||
390 | resolve_alias($charset) and | ||||
391 | (!${tokens[-1]}[2] and !$language or | ||||
392 | lc(${tokens[-1]}[2]) eq lc($language))) { # Concat words if possible. | ||||
393 | $tokens[-1]->[0] .= $dec; | ||||
394 | } elsif ($language) { | ||||
395 | push @tokens, [$dec, $charset, $language]; | ||||
396 | } elsif ($charset) { | ||||
397 | push @tokens, [$dec, $charset]; | ||||
398 | } else { | ||||
399 | push @tokens, [$dec]; | ||||
400 | } | ||||
401 | $spc = $tspc; | ||||
402 | } | ||||
403 | next; | ||||
404 | } | ||||
405 | |||||
406 | ### Case 2: are we looking at a bad "=?..." prefix? | ||||
407 | ### We need this to detect problems for case 3, which stops at "=?": | ||||
408 | pos($encstr) = $pos; # reset the pointer. | ||||
409 | if ($encstr =~ m{\G=\?}xg) { | ||||
410 | $@ .= qq|unterminated "=?..?..?=" in "$encstr" (pos $pos)\n|; | ||||
411 | push @tokens, [$spc.'=?']; | ||||
412 | $spc = ''; | ||||
413 | next; | ||||
414 | } | ||||
415 | |||||
416 | ### Case 3: are we looking at ordinary text? | ||||
417 | pos($encstr) = $pos; # reset the pointer. | ||||
418 | if ($encstr =~ m{\G # from where we left off... | ||||
419 | (.*? # shortest possible string, | ||||
420 | \n*) # followed by 0 or more NLs, | ||||
421 | (?=(\Z|=\?)) # terminated by "=?" or EOS | ||||
422 | }xgs) { | ||||
423 | length($1) or croak "MIME::EncWords: internal logic err: empty token\n"; | ||||
424 | push @tokens, [$spc.$1]; | ||||
425 | $spc = ''; | ||||
426 | next; | ||||
427 | } | ||||
428 | |||||
429 | ### Case 4: bug! | ||||
430 | croak "MIME::EncWords: unexpected case:\n($encstr) pos $pos\n\t". | ||||
431 | "Please alert developer.\n"; | ||||
432 | } | ||||
433 | push @tokens, [$spc] if $spc; | ||||
434 | |||||
435 | # Detect 7-bit charset | ||||
436 | if ($Params{Detect7bit} ne "NO") { | ||||
437 | local $@; | ||||
438 | foreach my $t (@tokens) { | ||||
439 | unless ($t->[0] =~ $UNSAFE or $t->[1]) { | ||||
440 | my $charset = MIME::Charset::_detect_7bit_charset($t->[0]); | ||||
441 | if ($charset and $charset ne &MIME::Charset::default()) { | ||||
442 | $t->[1] = $charset; | ||||
443 | } | ||||
444 | } | ||||
445 | } | ||||
446 | } | ||||
447 | |||||
448 | if (wantarray) { | ||||
449 | @tokens; | ||||
450 | } else { | ||||
451 | join('', map { | ||||
452 | &_convert($_->[0], $_->[1], $cset, $Params{Mapping}) | ||||
453 | } @tokens); | ||||
454 | } | ||||
455 | } | ||||
456 | |||||
457 | #------------------------------ | ||||
458 | |||||
459 | # _convert RAW, FROMCHARSET, TOCHARSET, MAPPING | ||||
460 | # Private: used by decode_mimewords() to convert string by other charset | ||||
461 | # or to decode to Unicode. | ||||
462 | # When source charset is unknown and Unicode string is requested, at first | ||||
463 | # try well-formed UTF-8 then fallback to ISO-8859-1 so that almost all | ||||
464 | # non-ASCII bytes will be preserved. | ||||
465 | sub _convert($$$$) { | ||||
466 | my $s = shift; | ||||
467 | my $charset = shift; | ||||
468 | my $cset = shift; | ||||
469 | my $mapping = shift; | ||||
470 | return $s unless &MIME::Charset::USE_ENCODE; | ||||
471 | return $s unless $cset->as_string; | ||||
472 | croak "unsupported charset ``".$cset->as_string."''" | ||||
473 | unless $cset->decoder or $cset->as_string eq "_UNICODE_"; | ||||
474 | |||||
475 | local($@); | ||||
476 | $charset = MIME::Charset->new($charset, Mapping => $mapping); | ||||
477 | if ($charset->as_string and $charset->as_string eq $cset->as_string) { | ||||
478 | return $s; | ||||
479 | } | ||||
480 | # build charset object to transform string from $charset to $cset. | ||||
481 | $charset->encoder($cset); | ||||
482 | |||||
483 | my $converted = $s; | ||||
484 | if (is_utf8($s) or $s =~ $WIDECHAR) { | ||||
485 | if ($charset->output_charset ne "_UNICODE_") { | ||||
486 | $converted = $charset->encode($s); | ||||
487 | } | ||||
488 | } elsif ($charset->output_charset eq "_UNICODE_") { | ||||
489 | if (!$charset->decoder) { | ||||
490 | if ($s =~ $UNSAFE) { | ||||
491 | $@ = ''; | ||||
492 | eval { | ||||
493 | $charset = MIME::Charset->new("UTF-8", | ||||
494 | Mapping => 'STANDARD'); | ||||
495 | $converted = $charset->decode($converted, FB_CROAK()); | ||||
496 | }; | ||||
497 | if ($@) { | ||||
498 | $converted = $s; | ||||
499 | $charset = MIME::Charset->new("ISO-8859-1", | ||||
500 | Mapping => 'STANDARD'); | ||||
501 | $converted = $charset->decode($converted, 0); | ||||
502 | } | ||||
503 | } | ||||
504 | } else { | ||||
505 | $converted = $charset->decode($s); | ||||
506 | } | ||||
507 | } elsif ($charset->decoder) { | ||||
508 | $converted = $charset->encode($s); | ||||
509 | } | ||||
510 | return $converted; | ||||
511 | } | ||||
512 | |||||
513 | #------------------------------ | ||||
514 | |||||
515 | =item encode_mimeword RAW, [ENCODING], [CHARSET] | ||||
516 | |||||
517 | I<Function.> | ||||
518 | Encode a single RAW "word" that has unsafe characters. | ||||
519 | The "word" will be encoded in its entirety. | ||||
520 | |||||
521 | ### Encode "<<Franc,ois>>": | ||||
522 | $encoded = encode_mimeword("\xABFran\xE7ois\xBB"); | ||||
523 | |||||
524 | You may specify the ENCODING (C<"Q"> or C<"B">), which defaults to C<"Q">. | ||||
525 | B<**> | ||||
526 | You may also specify it as ``special'' value: C<"S"> to choose shorter | ||||
527 | one of either C<"Q"> or C<"B">. | ||||
528 | |||||
529 | You may specify the CHARSET, which defaults to C<iso-8859-1>. | ||||
530 | |||||
531 | B<*> | ||||
532 | Spaces will be escaped with ``_'' by C<"Q"> encoding. | ||||
533 | |||||
534 | =cut | ||||
535 | |||||
536 | sub encode_mimeword { | ||||
537 | my $word = shift; | ||||
538 | my $encoding = uc(shift || 'Q'); # not overridden. | ||||
539 | my $charset = shift || 'ISO-8859-1'; # ditto. | ||||
540 | my $language = uc(shift || ""); # ditto. | ||||
541 | |||||
542 | if (ref $charset) { | ||||
543 | if (is_utf8($word) or $word =~ /$WIDECHAR/) { | ||||
544 | $word = $charset->undecode($word, 0); | ||||
545 | } | ||||
546 | $charset = $charset->as_string; | ||||
547 | } else { | ||||
548 | $charset = uc($charset); | ||||
549 | } | ||||
550 | my $encstr; | ||||
551 | if ($encoding eq 'Q') { | ||||
552 | $encstr = &_encode_Q($word); | ||||
553 | } elsif ($encoding eq "S") { | ||||
554 | my ($B, $Q) = (&_encode_B($word), &_encode_Q($word)); | ||||
555 | if (length($B) < length($Q)) { | ||||
556 | $encoding = "B"; | ||||
557 | $encstr = $B; | ||||
558 | } else { | ||||
559 | $encoding = "Q"; | ||||
560 | $encstr = $Q; | ||||
561 | } | ||||
562 | } else { # "B" | ||||
563 | $encoding = "B"; | ||||
564 | $encstr = &_encode_B($word); | ||||
565 | } | ||||
566 | |||||
567 | if ($language) { | ||||
568 | return "=?$charset*$language?$encoding?$encstr?="; | ||||
569 | } else { | ||||
570 | return "=?$charset?$encoding?$encstr?="; | ||||
571 | } | ||||
572 | } | ||||
573 | |||||
574 | #------------------------------ | ||||
575 | |||||
576 | =item encode_mimewords RAW, [OPTS] | ||||
577 | |||||
578 | I<Function.> | ||||
579 | Given a RAW string, try to find and encode all "unsafe" sequences | ||||
580 | of characters: | ||||
581 | |||||
582 | ### Encode a string with some unsafe "words": | ||||
583 | $encoded = encode_mimewords("Me and \xABFran\xE7ois\xBB"); | ||||
584 | |||||
585 | Returns the encoded string. | ||||
586 | |||||
587 | B<**> | ||||
588 | RAW may be a Unicode string when Unicode/multibyte support is enabled | ||||
589 | (see L<MIME::Charset/USE_ENCODE>). | ||||
590 | Furthermore, RAW may be a reference to that returned | ||||
591 | by L</decode_mimewords> on array context. In latter case "Charset" | ||||
592 | option (see below) will be overridden (see also a note below). | ||||
593 | |||||
594 | B<Note>: | ||||
595 | B<*> | ||||
596 | When RAW is an arrayref, | ||||
597 | adjacent encoded-words (i.e. elements having non-ASCII charset element) | ||||
598 | are concatenated. Then they are split taking | ||||
599 | care of character boundaries of multibyte sequences when Unicode/multibyte | ||||
600 | support is enabled. | ||||
601 | Portions for unencoded data should include surrounding whitespace(s), or | ||||
602 | they will be merged into adjoining encoded-word(s). | ||||
603 | |||||
604 | Any arguments past the RAW string are taken to define a hash of options: | ||||
605 | |||||
606 | =over 4 | ||||
607 | |||||
608 | =item Charset | ||||
609 | |||||
610 | Encode all unsafe stuff with this charset. Default is 'ISO-8859-1', | ||||
611 | a.k.a. "Latin-1". | ||||
612 | |||||
613 | =item Detect7bit | ||||
614 | B<**> | ||||
615 | |||||
616 | When "Encoding" option (see below) is specified as C<"a"> and "Charset" | ||||
617 | option is unknown, try to detect 7-bit charset on given RAW string. | ||||
618 | Default is C<"YES">. | ||||
619 | When Unicode/multibyte support is disabled, | ||||
620 | this option will not have any effects | ||||
621 | (see L<MIME::Charset/USE_ENCODE>). | ||||
622 | |||||
623 | =item Encoding | ||||
624 | |||||
625 | The encoding to use, C<"q"> or C<"b">. | ||||
626 | B<**> | ||||
627 | You may also specify ``special'' values: C<"a"> will automatically choose | ||||
628 | recommended encoding to use (with charset conversion if alternative | ||||
629 | charset is recommended: see L<MIME::Charset>); | ||||
630 | C<"s"> will choose shorter one of either C<"q"> or C<"b">. | ||||
631 | B<Note>: | ||||
632 | B<*> | ||||
633 | As of release 1.005, The default was changed from C<"q"> | ||||
634 | (the default on MIME::Words) to C<"a">. | ||||
635 | |||||
636 | =item Field | ||||
637 | |||||
638 | Name of the mail field this string will be used in. | ||||
639 | B<**> | ||||
640 | Length of mail field name will be considered in the first line of | ||||
641 | encoded header. | ||||
642 | |||||
643 | =item Folding | ||||
644 | B<**> | ||||
645 | |||||
646 | A Sequence to fold encoded lines. The default is C<"\n">. | ||||
647 | If empty string C<""> is specified, encoded-words exceeding line length | ||||
648 | (see L</MaxLineLen> below) will be split by SPACE. | ||||
649 | |||||
650 | B<Note>: | ||||
651 | B<*> | ||||
652 | Though RFC 5322 (formerly RFC 2822) states that the lines in | ||||
653 | Internet messages are delimited by CRLF (C<"\r\n">), | ||||
654 | this module chose LF (C<"\n">) as a default to keep backward compatibility. | ||||
655 | When you use the default, you might need converting newlines | ||||
656 | before encoded headers are thrown into session. | ||||
657 | |||||
658 | =item Mapping | ||||
659 | B<**> | ||||
660 | |||||
661 | Specify mappings actually used for charset names. | ||||
662 | C<"EXTENDED"> uses extended mappings. | ||||
663 | C<"STANDARD"> uses standardized strict mappings. | ||||
664 | The default is C<"EXTENDED">. | ||||
665 | When Unicode/multibyte support is disabled, | ||||
666 | this option will not have any effects | ||||
667 | (see L<MIME::Charset/USE_ENCODE>). | ||||
668 | |||||
669 | =item MaxLineLen | ||||
670 | B<**> | ||||
671 | |||||
672 | Maximum line length excluding newline. | ||||
673 | The default is 76. | ||||
674 | Negative value means unlimited line length (as of release 1.012.3). | ||||
675 | |||||
676 | =item Minimal | ||||
677 | B<**> | ||||
678 | |||||
679 | Takes care of natural word separators (i.e. whitespaces) | ||||
680 | in the text to be encoded. | ||||
681 | If C<"NO"> is specified, this module will encode whole text | ||||
682 | (if encoding needed) not regarding whitespaces; | ||||
683 | encoded-words exceeding line length will be split based only on their | ||||
684 | lengths. | ||||
685 | Default is C<"YES"> by which minimal portions of text are encoded. | ||||
686 | If C<"DISPNAME"> is specified, portions including special characters | ||||
687 | described in RFC5322 (formerly RFC2822, RFC822) address specification | ||||
688 | (section 3.4) are also encoded. | ||||
689 | This is useful for encoding display-name of address fields. | ||||
690 | |||||
691 | B<Note>: | ||||
692 | As of release 0.040, default has been changed to C<"YES"> to ensure | ||||
693 | compatibility with MIME::Words. | ||||
694 | On earlier releases, this option was fixed to be C<"NO">. | ||||
695 | |||||
696 | B<Note>: | ||||
697 | C<"DISPNAME"> option was introduced at release 1.012. | ||||
698 | |||||
699 | =item Replacement | ||||
700 | B<**> | ||||
701 | |||||
702 | See L<MIME::Charset/Error Handling>. | ||||
703 | |||||
704 | =back | ||||
705 | |||||
706 | =cut | ||||
707 | |||||
708 | sub encode_mimewords { | ||||
709 | my $words = shift; | ||||
710 | my %params = @_; | ||||
711 | my %Params = &_getparams(\%params, | ||||
712 | YesNo => [qw(Detect7bit)], | ||||
713 | Others => [qw(Charset Encoding Field Folding | ||||
714 | Mapping MaxLineLen Minimal | ||||
715 | Replacement)], | ||||
716 | ToUpper => [qw(Charset Encoding Mapping Minimal | ||||
717 | Replacement)], | ||||
718 | ); | ||||
719 | croak "unsupported encoding ``$Params{Encoding}''" | ||||
720 | unless $Params{Encoding} =~ /^[ABQS]$/; | ||||
721 | # newline and following WSP | ||||
722 | my ($fwsbrk, $fwsspc); | ||||
723 | if ($Params{Folding} =~ m/^([\r\n]*)([\t ]?)$/) { | ||||
724 | $fwsbrk = $1; | ||||
725 | $fwsspc = $2 || " "; | ||||
726 | } else { | ||||
727 | croak sprintf "illegal folding sequence ``\\x%*v02X''", '\\x', | ||||
728 | $Params{Folding}; | ||||
729 | } | ||||
730 | # charset objects | ||||
731 | my $charsetobj = MIME::Charset->new($Params{Charset}, | ||||
732 | Mapping => $Params{Mapping}); | ||||
733 | my $ascii = MIME::Charset->new("US-ASCII", Mapping => 'STANDARD'); | ||||
734 | $ascii->encoder($ascii); | ||||
735 | # lengths | ||||
736 | my $firstlinelen = $Params{MaxLineLen} - | ||||
737 | ($Params{Field}? length("$Params{Field}: "): 0); | ||||
738 | my $maxrestlen = $Params{MaxLineLen} - length($fwsspc); | ||||
739 | # minimal encoding flag | ||||
740 | if (!$Params{Minimal}) { | ||||
741 | $Params{Minimal} = 'NO'; | ||||
742 | } elsif ($Params{Minimal} !~ /^(NO|DISPNAME)$/) { | ||||
743 | $Params{Minimal} = 'YES'; | ||||
744 | } | ||||
745 | # unsafe ASCII sequences | ||||
746 | my $UNSAFEASCII = ($maxrestlen <= 1)? | ||||
747 | qr{(?: =\? )}ox: | ||||
748 | qr{(?: =\? | [$PRINTABLE]{$Params{MaxLineLen}} )}x; | ||||
749 | $UNSAFEASCII = qr{(?: [$DISPNAMESPECIAL] | $UNSAFEASCII )}x | ||||
750 | if $Params{Minimal} eq 'DISPNAME'; | ||||
751 | |||||
752 | unless (ref($words) eq "ARRAY") { | ||||
753 | # workaround for UTF-16* & UTF-32*: force UTF-8. | ||||
754 | if ($charsetobj->as_string =~ /$ASCIIINCOMPAT/) { | ||||
755 | $words = _utf_to_unicode($charsetobj, $words); | ||||
756 | $charsetobj = MIME::Charset->new('UTF-8'); | ||||
757 | } | ||||
758 | |||||
759 | my @words = (); | ||||
760 | # unfolding: normalize linear-white-spaces and orphan newlines. | ||||
761 | $words =~ s/(?:[\r\n]+[\t ])*[\r\n]+([\t ]|\Z)/$1? " ": ""/eg; | ||||
762 | $words =~ s/[\r\n]+/ /g; | ||||
763 | # split if required | ||||
764 | if ($Params{Minimal} =~ /YES|DISPNAME/) { | ||||
765 | my ($spc, $unsafe_last) = ('', 0); | ||||
766 | foreach my $w (split(/([\t ]+)/, $words)) { | ||||
767 | next unless scalar(@words) or length($w); # skip garbage | ||||
768 | if ($w =~ /[\t ]/) { | ||||
769 | $spc = $w; | ||||
770 | next; | ||||
771 | } | ||||
772 | |||||
773 | # workaround for ``ASCII transformation'' charsets | ||||
774 | my $u = $w; | ||||
775 | if ($charsetobj->as_string =~ /$ASCIITRANS/) { | ||||
776 | if (MIME::Charset::USE_ENCODE) { | ||||
777 | if (is_utf8($w) or $w =~ /$WIDECHAR/) { | ||||
778 | $w = $charsetobj->undecode($u); | ||||
779 | } else { | ||||
780 | $u = $charsetobj->decode($w); | ||||
781 | } | ||||
782 | } elsif ($w =~ /[+~]/) { #FIXME: for pre-Encode environment | ||||
783 | $u = "x$w"; | ||||
784 | } | ||||
785 | } | ||||
786 | if (scalar(@words)) { | ||||
787 | if (($w =~ /$NONPRINT|$UNSAFEASCII/ or $u ne $w) xor | ||||
788 | $unsafe_last) { | ||||
789 | if ($unsafe_last) { | ||||
790 | push @words, $spc.$w; | ||||
791 | } else { | ||||
792 | $words[-1] .= $spc; | ||||
793 | push @words, $w; | ||||
794 | } | ||||
795 | $unsafe_last = not $unsafe_last; | ||||
796 | } else { | ||||
797 | $words[-1] .= $spc.$w; | ||||
798 | } | ||||
799 | } else { | ||||
800 | push @words, $spc.$w; | ||||
801 | $unsafe_last = | ||||
802 | ($w =~ /$NONPRINT|$UNSAFEASCII/ or $u ne $w); | ||||
803 | } | ||||
804 | $spc = ''; | ||||
805 | } | ||||
806 | if ($spc) { | ||||
807 | if (scalar(@words)) { | ||||
808 | $words[-1] .= $spc; | ||||
809 | } else { # only WSPs | ||||
810 | push @words, $spc; | ||||
811 | } | ||||
812 | } | ||||
813 | } else { | ||||
814 | @words = ($words); | ||||
815 | } | ||||
816 | $words = [map { [$_, $Params{Charset}] } @words]; | ||||
817 | } | ||||
818 | |||||
819 | # Translate / concatenate words. | ||||
820 | my @triplets; | ||||
821 | foreach (@$words) { | ||||
822 | my ($s, $cset) = @$_; | ||||
823 | next unless length($s); | ||||
824 | my $csetobj = MIME::Charset->new($cset || "", | ||||
825 | Mapping => $Params{Mapping}); | ||||
826 | |||||
827 | # workaround for UTF-16*/UTF-32*: force UTF-8 | ||||
828 | if ($csetobj->as_string and $csetobj->as_string =~ /$ASCIIINCOMPAT/) { | ||||
829 | $s = _utf_to_unicode($csetobj, $s); | ||||
830 | $csetobj = MIME::Charset->new('UTF-8'); | ||||
831 | } | ||||
832 | |||||
833 | # determine charset and encoding | ||||
834 | # try defaults only if 7-bit charset detection is not required | ||||
835 | my $enc; | ||||
836 | my $obj = $csetobj; | ||||
837 | unless ($obj->as_string) { | ||||
838 | if ($Params{Encoding} ne "A" or $Params{Detect7bit} eq "NO" or | ||||
839 | $s =~ /$UNSAFE/) { | ||||
840 | $obj = $charsetobj; | ||||
841 | } | ||||
842 | } | ||||
843 | ($s, $cset, $enc) = | ||||
844 | $obj->header_encode($s, | ||||
845 | Detect7bit => $Params{Detect7bit}, | ||||
846 | Replacement => $Params{Replacement}, | ||||
847 | Encoding => $Params{Encoding}); | ||||
848 | # Resolve 'S' encoding based on global length. See (*). | ||||
849 | $enc = 'S' | ||||
850 | if defined $enc and | ||||
851 | ($Params{Encoding} eq 'S' or | ||||
852 | $Params{Encoding} eq 'A' and $obj->header_encoding eq 'S'); | ||||
853 | |||||
854 | # pure ASCII | ||||
855 | if ($cset eq "US-ASCII" and !$enc and $s =~ /$UNSAFEASCII/) { | ||||
856 | # pure ASCII with unsafe sequences should be encoded | ||||
857 | $cset = $csetobj->output_charset || | ||||
858 | $charsetobj->output_charset || | ||||
859 | $ascii->output_charset; | ||||
860 | $csetobj = MIME::Charset->new($cset, | ||||
861 | Mapping => $Params{Mapping}); | ||||
862 | # Preserve original Encoding option unless it was 'A'. | ||||
863 | $enc = ($Params{Encoding} eq 'A') ? | ||||
864 | ($csetobj->header_encoding || 'Q') : | ||||
865 | $Params{Encoding}; | ||||
866 | } else { | ||||
867 | $csetobj = MIME::Charset->new($cset, | ||||
868 | Mapping => $Params{Mapping}); | ||||
869 | } | ||||
870 | |||||
871 | # Now no charset translations are needed. | ||||
872 | $csetobj->encoder($csetobj); | ||||
873 | |||||
874 | # Concatenate adjacent ``words'' so that multibyte sequences will | ||||
875 | # be handled safely. | ||||
876 | # Note: Encoded-word and unencoded text must not adjoin without | ||||
877 | # separating whitespace(s). | ||||
878 | if (scalar(@triplets)) { | ||||
879 | my ($last, $lastenc, $lastcsetobj) = @{$triplets[-1]}; | ||||
880 | if ($csetobj->decoder and | ||||
881 | ($lastcsetobj->as_string || "") eq $csetobj->as_string and | ||||
882 | ($lastenc || "") eq ($enc || "")) { | ||||
883 | $triplets[-1]->[0] .= $s; | ||||
884 | next; | ||||
885 | } elsif (!$lastenc and $enc and $last !~ /[\r\n\t ]$/) { | ||||
886 | if ($last =~ /^(.*)([\r\n\t ])([$PRINTABLE]+)$/s) { | ||||
887 | $triplets[-1]->[0] = $1.$2; | ||||
888 | $s = $3.$s; | ||||
889 | } elsif ($lastcsetobj->as_string eq "US-ASCII") { | ||||
890 | $triplets[-1]->[0] .= $s; | ||||
891 | $triplets[-1]->[1] = $enc; | ||||
892 | $triplets[-1]->[2] = $csetobj; | ||||
893 | next; | ||||
894 | } | ||||
895 | } elsif ($lastenc and !$enc and $s !~ /^[\r\n\t ]/) { | ||||
896 | if ($s =~ /^([$PRINTABLE]+)([\r\n\t ])(.*)$/s) { | ||||
897 | $triplets[-1]->[0] .= $1; | ||||
898 | $s = $2.$3; | ||||
899 | } elsif ($csetobj->as_string eq "US-ASCII") { | ||||
900 | $triplets[-1]->[0] .= $s; | ||||
901 | next; | ||||
902 | } | ||||
903 | } | ||||
904 | } | ||||
905 | push @triplets, [$s, $enc, $csetobj]; | ||||
906 | } | ||||
907 | |||||
908 | # (*) Resolve 'S' encoding based on global length. | ||||
909 | my @s_enc = grep { $_->[1] and $_->[1] eq 'S' } @triplets; | ||||
910 | if (scalar @s_enc) { | ||||
911 | my $enc; | ||||
912 | my $b = scalar grep { $_->[1] and $_->[1] eq 'B' } @triplets; | ||||
913 | my $q = scalar grep { $_->[1] and $_->[1] eq 'Q' } @triplets; | ||||
914 | # 'A' chooses 'B' or 'Q' when all other encoded-words have same enc. | ||||
915 | if ($Params{Encoding} eq 'A' and $b and ! $q) { | ||||
916 | $enc = 'B'; | ||||
917 | } elsif ($Params{Encoding} eq 'A' and ! $b and $q) { | ||||
918 | $enc = 'Q'; | ||||
919 | # Otherwise, assuming 'Q', when characters to be encoded are more than | ||||
920 | # 6th of total (plus a little fraction), 'B' will win. | ||||
921 | # Note: This might give 'Q' so great advantage... | ||||
922 | } else { | ||||
923 | my @no_enc = grep { ! $_->[1] } @triplets; | ||||
924 | my $total = length join('', map { $_->[0] } (@no_enc, @s_enc)); | ||||
925 | my $q = scalar(() = join('', map { $_->[0] } @s_enc) =~ | ||||
926 | m{[^- !*+/0-9A-Za-z]}g); | ||||
927 | if ($total + 8 < $q * 6) { | ||||
928 | $enc = 'B'; | ||||
929 | } else { | ||||
930 | $enc = 'Q'; | ||||
931 | } | ||||
932 | } | ||||
933 | foreach (@triplets) { | ||||
934 | $_->[1] = $enc if $_->[1] and $_->[1] eq 'S'; | ||||
935 | } | ||||
936 | } | ||||
937 | |||||
938 | # chop leading FWS | ||||
939 | while (scalar(@triplets) and $triplets[0]->[0] =~ s/^[\r\n\t ]+//) { | ||||
940 | shift @triplets unless length($triplets[0]->[0]); | ||||
941 | } | ||||
942 | |||||
943 | # Split long ``words''. | ||||
944 | my @splitwords; | ||||
945 | my $restlen; | ||||
946 | if ($Params{MaxLineLen} < 0) { | ||||
947 | @splitwords = @triplets; | ||||
948 | } else { | ||||
949 | $restlen = $firstlinelen; | ||||
950 | foreach (@triplets) { | ||||
951 | my ($s, $enc, $csetobj) = @$_; | ||||
952 | |||||
953 | my @s = &_split($s, $enc, $csetobj, $restlen, $maxrestlen); | ||||
954 | push @splitwords, @s; | ||||
955 | my ($last, $lastenc, $lastcsetobj) = @{$s[-1]}; | ||||
956 | my $lastlen; | ||||
957 | if ($lastenc) { | ||||
958 | $lastlen = $lastcsetobj->encoded_header_len($last, $lastenc); | ||||
959 | } else { | ||||
960 | $lastlen = length($last); | ||||
961 | } | ||||
962 | $restlen = $maxrestlen if scalar @s > 1; # has split; new line(s) fed | ||||
963 | $restlen -= $lastlen; | ||||
964 | $restlen = $maxrestlen if $restlen <= 1; | ||||
965 | } | ||||
966 | } | ||||
967 | |||||
968 | # Do encoding. | ||||
969 | my @lines; | ||||
970 | $restlen = $firstlinelen; | ||||
971 | foreach (@splitwords) { | ||||
972 | my ($str, $encoding, $charsetobj) = @$_; | ||||
973 | next unless length($str); | ||||
974 | |||||
975 | my $s; | ||||
976 | if (!$encoding) { | ||||
977 | $s = $str; | ||||
978 | } else { | ||||
979 | $s = encode_mimeword($str, $encoding, $charsetobj); | ||||
980 | } | ||||
981 | |||||
982 | my $spc = (scalar(@lines) and $lines[-1] =~ /[\r\n\t ]$/ or | ||||
983 | $s =~ /^[\r\n\t ]/)? '': ' '; | ||||
984 | if (!scalar(@lines)) { | ||||
985 | push @lines, $s; | ||||
986 | } elsif ($Params{MaxLineLen} < 0) { | ||||
987 | $lines[-1] .= $spc.$s; | ||||
988 | } elsif (length($lines[-1].$spc.$s) <= $restlen) { | ||||
989 | $lines[-1] .= $spc.$s; | ||||
990 | } else { | ||||
991 | if ($lines[-1] =~ s/([\r\n\t ]+)$//) { | ||||
992 | $s = $1.$s; | ||||
993 | } | ||||
994 | $s =~ s/^[\r\n]*[\t ]//; # strip only one WSP replaced by FWS | ||||
995 | push @lines, $s; | ||||
996 | $restlen = $maxrestlen; | ||||
997 | } | ||||
998 | } | ||||
999 | |||||
1000 | join($fwsbrk.$fwsspc, @lines); | ||||
1001 | } | ||||
1002 | |||||
1003 | #------------------------------ | ||||
1004 | |||||
1005 | # _split RAW, ENCODING, CHARSET_OBJECT, ROOM_OF_FIRST_LINE, MAXRESTLEN | ||||
1006 | # Private: used by encode_mimewords() to split a string into | ||||
1007 | # (encoded or non-encoded) words. | ||||
1008 | # Returns an array of arrayrefs [SUBSTRING, ENCODING, CHARSET]. | ||||
1009 | sub _split { | ||||
1010 | my $str = shift; | ||||
1011 | my $encoding = shift; | ||||
1012 | my $charset = shift; | ||||
1013 | my $restlen = shift; | ||||
1014 | my $maxrestlen = shift; | ||||
1015 | |||||
1016 | if (!$charset->as_string or $charset->as_string eq '8BIT') {# Undecodable. | ||||
1017 | $str =~ s/[\r\n]+[\t ]*|\x00/ /g; # Eliminate hostile characters. | ||||
1018 | return ([$str, undef, $charset]); | ||||
1019 | } | ||||
1020 | if (!$encoding and $charset->as_string eq 'US-ASCII') { # Pure ASCII. | ||||
1021 | return &_split_ascii($str, $restlen, $maxrestlen); | ||||
1022 | } | ||||
1023 | if (!$charset->decoder and MIME::Charset::USE_ENCODE) { # Unsupported. | ||||
1024 | return ([$str, $encoding, $charset]); | ||||
1025 | } | ||||
1026 | |||||
1027 | my (@splitwords, $ustr, $first); | ||||
1028 | while (length($str)) { | ||||
1029 | if ($charset->encoded_header_len($str, $encoding) <= $restlen) { | ||||
1030 | push @splitwords, [$str, $encoding, $charset]; | ||||
1031 | last; | ||||
1032 | } | ||||
1033 | $ustr = $str; | ||||
1034 | if (!(is_utf8($ustr) or $ustr =~ /$WIDECHAR/) and | ||||
1035 | MIME::Charset::USE_ENCODE) { | ||||
1036 | $ustr = $charset->decode($ustr); | ||||
1037 | } | ||||
1038 | ($first, $str) = &_clip_unsafe($ustr, $encoding, $charset, $restlen); | ||||
1039 | # retry splitting if failed | ||||
1040 | if ($first and !$str and | ||||
1041 | $maxrestlen < $charset->encoded_header_len($first, $encoding)) { | ||||
1042 | ($first, $str) = &_clip_unsafe($ustr, $encoding, $charset, | ||||
1043 | $maxrestlen); | ||||
1044 | } | ||||
1045 | push @splitwords, [$first, $encoding, $charset]; | ||||
1046 | $restlen = $maxrestlen; | ||||
1047 | } | ||||
1048 | return @splitwords; | ||||
1049 | } | ||||
1050 | |||||
1051 | # _split_ascii RAW, ROOM_OF_FIRST_LINE, MAXRESTLEN | ||||
1052 | # Private: used by encode_mimewords() to split an US-ASCII string into | ||||
1053 | # (encoded or non-encoded) words. | ||||
1054 | # Returns an array of arrayrefs [SUBSTRING, undef, "US-ASCII"]. | ||||
1055 | sub _split_ascii { | ||||
1056 | my $s = shift; | ||||
1057 | my $restlen = shift; | ||||
1058 | my $maxrestlen = shift; | ||||
1059 | $restlen ||= $maxrestlen; | ||||
1060 | |||||
1061 | my @splitwords; | ||||
1062 | my $ascii = MIME::Charset->new("US-ASCII", Mapping => 'STANDARD'); | ||||
1063 | foreach my $line (split(/(?:[\t ]*[\r\n]+)+/, $s)) { | ||||
1064 | my $spc = ''; | ||||
1065 | foreach my $word (split(/([\t ]+)/, $line)) { | ||||
1066 | # skip first garbage | ||||
1067 | next unless scalar(@splitwords) or defined $word; | ||||
1068 | if ($word =~ /[\t ]/) { | ||||
1069 | $spc = $word; | ||||
1070 | next; | ||||
1071 | } | ||||
1072 | |||||
1073 | my $cont = $spc.$word; | ||||
1074 | my $elen = length($cont); | ||||
1075 | next unless $elen; | ||||
1076 | if (scalar(@splitwords)) { | ||||
1077 | # Concatenate adjacent words so that encoded-word and | ||||
1078 | # unencoded text will adjoin with separating whitespace. | ||||
1079 | if ($elen <= $restlen) { | ||||
1080 | $splitwords[-1]->[0] .= $cont; | ||||
1081 | $restlen -= $elen; | ||||
1082 | } else { | ||||
1083 | push @splitwords, [$cont, undef, $ascii]; | ||||
1084 | $restlen = $maxrestlen - $elen; | ||||
1085 | } | ||||
1086 | } else { | ||||
1087 | push @splitwords, [$cont, undef, $ascii]; | ||||
1088 | $restlen -= $elen; | ||||
1089 | } | ||||
1090 | $spc = ''; | ||||
1091 | } | ||||
1092 | if ($spc) { | ||||
1093 | if (scalar(@splitwords)) { | ||||
1094 | $splitwords[-1]->[0] .= $spc; | ||||
1095 | $restlen -= length($spc); | ||||
1096 | } else { # only WSPs | ||||
1097 | push @splitwords, [$spc, undef, $ascii]; | ||||
1098 | $restlen = $maxrestlen - length($spc); | ||||
1099 | } | ||||
1100 | } | ||||
1101 | } | ||||
1102 | return @splitwords; | ||||
1103 | } | ||||
1104 | |||||
1105 | # _clip_unsafe UNICODE, ENCODING, CHARSET_OBJECT, ROOM_OF_FIRST_LINE | ||||
1106 | # Private: used by encode_mimewords() to bite off one encodable | ||||
1107 | # ``word'' from a Unicode string. | ||||
1108 | # Note: When Unicode/multibyte support is not enabled, character | ||||
1109 | # boundaries of multibyte string shall be broken! | ||||
1110 | sub _clip_unsafe { | ||||
1111 | my $ustr = shift; | ||||
1112 | my $encoding = shift; | ||||
1113 | my $charset = shift; | ||||
1114 | my $restlen = shift; | ||||
1115 | return ("", "") unless length($ustr); | ||||
1116 | |||||
1117 | # Seek maximal division point. | ||||
1118 | my ($shorter, $longer) = (0, length($ustr)); | ||||
1119 | while ($shorter < $longer) { | ||||
1120 | my $cur = ($shorter + $longer + 1) >> 1; | ||||
1121 | my $enc = substr($ustr, 0, $cur); | ||||
1122 | if (MIME::Charset::USE_ENCODE ne '') { | ||||
1123 | $enc = $charset->undecode($enc); | ||||
1124 | } | ||||
1125 | my $elen = $charset->encoded_header_len($enc, $encoding); | ||||
1126 | if ($elen <= $restlen) { | ||||
1127 | $shorter = $cur; | ||||
1128 | } else { | ||||
1129 | $longer = $cur - 1; | ||||
1130 | } | ||||
1131 | } | ||||
1132 | |||||
1133 | # Make sure that combined characters won't be divided. | ||||
1134 | my ($fenc, $renc); | ||||
1135 | my $max = length($ustr); | ||||
1136 | while (1) { | ||||
1137 | $@ = ''; | ||||
1138 | eval { | ||||
1139 | ($fenc, $renc) = | ||||
1140 | (substr($ustr, 0, $shorter), substr($ustr, $shorter)); | ||||
1141 | if (MIME::Charset::USE_ENCODE ne '') { | ||||
1142 | # FIXME: croak if $renc =~ /^\p{M}/ | ||||
1143 | $fenc = $charset->undecode($fenc, FB_CROAK()); | ||||
1144 | $renc = $charset->undecode($renc, FB_CROAK()); | ||||
1145 | } | ||||
1146 | }; | ||||
1147 | last unless ($@); | ||||
1148 | |||||
1149 | $shorter++; | ||||
1150 | unless ($shorter < $max) { # Unencodable character(s) may be included. | ||||
1151 | return ($charset->undecode($ustr), ""); | ||||
1152 | } | ||||
1153 | } | ||||
1154 | |||||
1155 | if (length($fenc)) { | ||||
1156 | return ($fenc, $renc); | ||||
1157 | } else { | ||||
1158 | return ($renc, ""); | ||||
1159 | } | ||||
1160 | } | ||||
1161 | |||||
1162 | #------------------------------ | ||||
1163 | |||||
1164 | # _getparams HASHREF, OPTS | ||||
1165 | # Private: used to get option parameters. | ||||
1166 | sub _getparams { | ||||
1167 | my $params = shift; | ||||
1168 | my %params = @_; | ||||
1169 | my %Params; | ||||
1170 | my %GotParams; | ||||
1171 | foreach my $k (qw(NoDefault YesNo Others Obsoleted ToUpper)) { | ||||
1172 | $Params{$k} = $params{$k} || []; | ||||
1173 | } | ||||
1174 | foreach my $k (keys %$params) { | ||||
1175 | my $supported = 0; | ||||
1176 | foreach my $i (@{$Params{NoDefault}}, @{$Params{YesNo}}, | ||||
1177 | @{$Params{Others}}, @{$Params{Obsoleted}}) { | ||||
1178 | if (lc $i eq lc $k) { | ||||
1179 | $GotParams{$i} = $params->{$k}; | ||||
1180 | $supported = 1; | ||||
1181 | last; | ||||
1182 | } | ||||
1183 | } | ||||
1184 | carp "unknown or deprecated option ``$k''" unless $supported; | ||||
1185 | } | ||||
1186 | # get defaults | ||||
1187 | foreach my $i (@{$Params{YesNo}}, @{$Params{Others}}) { | ||||
1188 | $GotParams{$i} = $Config->{$i} unless defined $GotParams{$i}; | ||||
1189 | } | ||||
1190 | # yesno params | ||||
1191 | foreach my $i (@{$Params{YesNo}}) { | ||||
1192 | if (!$GotParams{$i} or uc $GotParams{$i} eq "NO") { | ||||
1193 | $GotParams{$i} = "NO"; | ||||
1194 | } else { | ||||
1195 | $GotParams{$i} = "YES"; | ||||
1196 | } | ||||
1197 | } | ||||
1198 | # normalize case | ||||
1199 | foreach my $i (@{$Params{ToUpper}}) { | ||||
1200 | $GotParams{$i} &&= uc $GotParams{$i}; | ||||
1201 | } | ||||
1202 | return %GotParams; | ||||
1203 | } | ||||
1204 | |||||
1205 | #------------------------------ | ||||
1206 | |||||
1207 | =back | ||||
1208 | |||||
1209 | =head2 Configuration Files | ||||
1210 | B<**> | ||||
1211 | |||||
1212 | Built-in defaults of option parameters for L</decode_mimewords> | ||||
1213 | (except 'Charset' option) and | ||||
1214 | L</encode_mimewords> can be overridden by configuration files: | ||||
1215 | F<MIME/Charset/Defaults.pm> and F<MIME/EncWords/Defaults.pm>. | ||||
1216 | For more details read F<MIME/EncWords/Defaults.pm.sample>. | ||||
1217 | |||||
1218 | =head1 VERSION | ||||
1219 | |||||
1220 | Consult C<$VERSION> variable. | ||||
1221 | |||||
1222 | Development versions of this module may be found at | ||||
1223 | L<http://hatuka.nezumi.nu/repos/MIME-EncWords/>. | ||||
1224 | |||||
1225 | =head1 SEE ALSO | ||||
1226 | |||||
1227 | L<MIME::Charset>, | ||||
1228 | L<MIME::Tools> | ||||
1229 | |||||
1230 | =head1 AUTHORS | ||||
1231 | |||||
1232 | The original version of function decode_mimewords() is derived from | ||||
1233 | L<MIME::Words> module that was written by: | ||||
1234 | Eryq (F<eryq@zeegee.com>), ZeeGee Software Inc (F<http://www.zeegee.com>). | ||||
1235 | David F. Skoll (dfs@roaringpenguin.com) http://www.roaringpenguin.com | ||||
1236 | |||||
1237 | Other stuff are rewritten or added by: | ||||
1238 | Hatuka*nezumi - IKEDA Soji <hatuka(at)nezumi.nu>. | ||||
1239 | |||||
1240 | This program is free software; you can redistribute | ||||
1241 | it and/or modify it under the same terms as Perl itself. | ||||
1242 | |||||
1243 | =cut | ||||
1244 | |||||
1245 | 1; |