Filename | /usr/local/lib/perl5/site_perl/MIME/Charset.pm |
Statements | Executed 0 statements in 0s |
Calls | P | F | Exclusive Time |
Inclusive Time |
Subroutine |
---|---|---|---|---|---|
0 | 0 | 0 | 0s | 0s | BEGIN@110 | MIME::Charset::
0 | 0 | 0 | 0s | 0s | BEGIN@112 | MIME::Charset::
0 | 0 | 0 | 0s | 0s | BEGIN@122 | MIME::Charset::
0 | 0 | 0 | 0s | 0s | BEGIN@130 | MIME::Charset::
0 | 0 | 0 | 0s | 0s | BEGIN@4 | MIME::Charset::
0 | 0 | 0 | 0s | 0s | BEGIN@97 | MIME::Charset::
0 | 0 | 0 | 0s | 0s | BEGIN@98 | MIME::Charset::
0 | 0 | 0 | 0s | 0s | BEGIN@99 | MIME::Charset::
0 | 0 | 0 | 0s | 0s | CORE:qr (opcode) | MIME::Charset::
0 | 0 | 0 | 0s | 0s | _detect_7bit_charset | MIME::Charset::
0 | 0 | 0 | 0s | 0s | _enclen_B | MIME::Charset::
0 | 0 | 0 | 0s | 0s | _enclen_Q | MIME::Charset::
0 | 0 | 0 | 0s | 0s | _find_encoder | MIME::Charset::
0 | 0 | 0 | 0s | 0s | _resolve_S | MIME::Charset::
0 | 0 | 0 | 0s | 0s | _text_encode | MIME::Charset::
0 | 0 | 0 | 0s | 0s | alias | MIME::Charset::
0 | 0 | 0 | 0s | 0s | as_string | MIME::Charset::
0 | 0 | 0 | 0s | 0s | body_encode | MIME::Charset::
0 | 0 | 0 | 0s | 0s | body_encoding | MIME::Charset::
0 | 0 | 0 | 0s | 0s | canonical_charset | MIME::Charset::
0 | 0 | 0 | 0s | 0s | decode | MIME::Charset::
0 | 0 | 0 | 0s | 0s | decoder | MIME::Charset::
0 | 0 | 0 | 0s | 0s | default | MIME::Charset::
0 | 0 | 0 | 0s | 0s | detect_7bit_charset | MIME::Charset::
0 | 0 | 0 | 0s | 0s | dup | MIME::Charset::
0 | 0 | 0 | 0s | 0s | encode | MIME::Charset::
0 | 0 | 0 | 0s | 0s | encoded_header_len | MIME::Charset::
0 | 0 | 0 | 0s | 0s | encoder | MIME::Charset::
0 | 0 | 0 | 0s | 0s | fallback | MIME::Charset::
0 | 0 | 0 | 0s | 0s | header_encode | MIME::Charset::
0 | 0 | 0 | 0s | 0s | header_encoding | MIME::Charset::
0 | 0 | 0 | 0s | 0s | new | MIME::Charset::
0 | 0 | 0 | 0s | 0s | output_charset | MIME::Charset::
0 | 0 | 0 | 0s | 0s | recommended | MIME::Charset::
0 | 0 | 0 | 0s | 0s | undecode | MIME::Charset::
Line | State ments |
Time on line |
Calls | Time in subs |
Code |
---|---|---|---|---|---|
1 | #-*- perl -*- | ||||
2 | |||||
3 | package MIME::Charset; | ||||
4 | use 5.005; | ||||
5 | |||||
6 | =head1 NAME | ||||
7 | |||||
8 | MIME::Charset - Charset Information for MIME | ||||
9 | |||||
10 | =head1 SYNOPSIS | ||||
11 | |||||
12 | use MIME::Charset: | ||||
13 | |||||
14 | $charset = MIME::Charset->new("euc-jp"); | ||||
15 | |||||
16 | Getting charset information: | ||||
17 | |||||
18 | $benc = $charset->body_encoding; # e.g. "Q" | ||||
19 | $cset = $charset->as_string; # e.g. "US-ASCII" | ||||
20 | $henc = $charset->header_encoding; # e.g. "S" | ||||
21 | $cset = $charset->output_charset; # e.g. "ISO-2022-JP" | ||||
22 | |||||
23 | Translating text data: | ||||
24 | |||||
25 | ($text, $charset, $encoding) = | ||||
26 | $charset->header_encode( | ||||
27 | "\xc9\xc2\xc5\xaa\xc0\xde\xc3\xef\xc5\xaa". | ||||
28 | "\xc7\xd1\xca\xaa\xbd\xd0\xce\xcf\xb4\xef", | ||||
29 | Charset => 'euc-jp'); | ||||
30 | # ...returns e.g. (<converted>, "ISO-2022-JP", "B"). | ||||
31 | |||||
32 | ($text, $charset, $encoding) = | ||||
33 | $charset->body_encode( | ||||
34 | "Collectioneur path\xe9tiquement ". | ||||
35 | "\xe9clectique de d\xe9chets", | ||||
36 | Charset => 'latin1'); | ||||
37 | # ...returns e.g. (<original>, "ISO-8859-1", "QUOTED-PRINTABLE"). | ||||
38 | |||||
39 | $len = $charset->encoded_header_len( | ||||
40 | "Perl\xe8\xa8\x80\xe8\xaa\x9e", | ||||
41 | Charset => 'utf-8', | ||||
42 | Encoding => "b"); | ||||
43 | # ...returns e.g. 28. | ||||
44 | |||||
45 | Manipulating module defaults: | ||||
46 | |||||
47 | MIME::Charset::alias("csEUCKR", "euc-kr"); | ||||
48 | MIME::Charset::default("iso-8859-1"); | ||||
49 | MIME::Charset::fallback("us-ascii"); | ||||
50 | |||||
51 | Non-OO functions (may be deprecated in near future): | ||||
52 | |||||
53 | use MIME::Charset qw(:info); | ||||
54 | |||||
55 | $benc = body_encoding("iso-8859-2"); # "Q" | ||||
56 | $cset = canonical_charset("ANSI X3.4-1968"); # "US-ASCII" | ||||
57 | $henc = header_encoding("utf-8"); # "S" | ||||
58 | $cset = output_charset("shift_jis"); # "ISO-2022-JP" | ||||
59 | |||||
60 | use MIME::Charset qw(:trans); | ||||
61 | |||||
62 | ($text, $charset, $encoding) = | ||||
63 | header_encode( | ||||
64 | "\xc9\xc2\xc5\xaa\xc0\xde\xc3\xef\xc5\xaa". | ||||
65 | "\xc7\xd1\xca\xaa\xbd\xd0\xce\xcf\xb4\xef", | ||||
66 | "euc-jp"); | ||||
67 | # ...returns (<converted>, "ISO-2022-JP", "B"); | ||||
68 | |||||
69 | ($text, $charset, $encoding) = | ||||
70 | body_encode( | ||||
71 | "Collectioneur path\xe9tiquement ". | ||||
72 | "\xe9clectique de d\xe9chets", | ||||
73 | "latin1"); | ||||
74 | # ...returns (<original>, "ISO-8859-1", "QUOTED-PRINTABLE"); | ||||
75 | |||||
76 | $len = encoded_header_len( | ||||
77 | "Perl\xe8\xa8\x80\xe8\xaa\x9e", "b", "utf-8"); # 28 | ||||
78 | |||||
79 | =head1 DESCRIPTION | ||||
80 | |||||
81 | MIME::Charset provides information about character sets used for | ||||
82 | MIME messages on Internet. | ||||
83 | |||||
84 | =head2 Definitions | ||||
85 | |||||
86 | The B<charset> is ``character set'' used in MIME to refer to a | ||||
87 | method of converting a sequence of octets into a sequence of characters. | ||||
88 | It includes both concepts of ``coded character set'' (CCS) and | ||||
89 | ``character encoding scheme'' (CES) of ISO/IEC. | ||||
90 | |||||
91 | The B<encoding> is that used in MIME to refer to a method of representing | ||||
92 | a body part or a header body as sequence(s) of printable US-ASCII | ||||
93 | characters. | ||||
94 | |||||
95 | =cut | ||||
96 | |||||
97 | use strict; | ||||
98 | use vars qw(@ISA $VERSION @EXPORT @EXPORT_OK %EXPORT_TAGS $Config); | ||||
99 | use Exporter; | ||||
100 | @ISA = qw(Exporter); | ||||
101 | @EXPORT = qw(body_encoding canonical_charset header_encoding output_charset | ||||
102 | body_encode encoded_header_len header_encode); | ||||
103 | @EXPORT_OK = qw(alias default fallback recommended); | ||||
104 | %EXPORT_TAGS = ( | ||||
105 | "info" => [qw(body_encoding header_encoding | ||||
106 | canonical_charset output_charset)], | ||||
107 | "trans" =>[ qw(body_encode encoded_header_len | ||||
108 | header_encode)], | ||||
109 | ); | ||||
110 | use Carp qw(croak); | ||||
111 | |||||
112 | use constant USE_ENCODE => ($] >= 5.007003)? 'Encode': ''; | ||||
113 | |||||
114 | my @ENCODE_SUBS = qw(FB_CROAK FB_PERLQQ FB_HTMLCREF FB_XMLCREF | ||||
115 | is_utf8 resolve_alias); | ||||
116 | if (USE_ENCODE) { | ||||
117 | eval "use ".USE_ENCODE." \@ENCODE_SUBS;"; # spent 0s executing statements in string eval | ||||
118 | if ($@) { # Perl 5.7.3 + Encode 0.40 | ||||
119 | eval "use ".USE_ENCODE." qw(is_utf8);"; | ||||
120 | require MIME::Charset::_Compat; | ||||
121 | for my $sub (@ENCODE_SUBS) { | ||||
122 | no strict "refs"; | ||||
123 | *{$sub} = \&{"MIME::Charset::_Compat::$sub"} | ||||
124 | unless $sub eq 'is_utf8'; | ||||
125 | } | ||||
126 | } | ||||
127 | } else { | ||||
128 | require MIME::Charset::_Compat; | ||||
129 | for my $sub (@ENCODE_SUBS) { | ||||
130 | no strict "refs"; | ||||
131 | *{$sub} = \&{"MIME::Charset::_Compat::$sub"}; | ||||
132 | } | ||||
133 | } | ||||
134 | |||||
135 | $VERSION = '1.012.2'; | ||||
136 | |||||
137 | ######## Private Attributes ######## | ||||
138 | |||||
139 | my $DEFAULT_CHARSET = 'US-ASCII'; | ||||
140 | my $FALLBACK_CHARSET = 'UTF-8'; | ||||
141 | |||||
142 | # This table was initially borrowed from Python email package. | ||||
143 | |||||
144 | my %CHARSETS = (# input header enc body enc output conv | ||||
145 | 'ISO-8859-1' => ['Q', 'Q', undef], | ||||
146 | 'ISO-8859-2' => ['Q', 'Q', undef], | ||||
147 | 'ISO-8859-3' => ['Q', 'Q', undef], | ||||
148 | 'ISO-8859-4' => ['Q', 'Q', undef], | ||||
149 | # ISO-8859-5 is Cyrillic, and not especially used | ||||
150 | # ISO-8859-6 is Arabic, also not particularly used | ||||
151 | # ISO-8859-7 is Greek, 'Q' will not make it readable | ||||
152 | # ISO-8859-8 is Hebrew, 'Q' will not make it readable | ||||
153 | 'ISO-8859-9' => ['Q', 'Q', undef], | ||||
154 | 'ISO-8859-10' => ['Q', 'Q', undef], | ||||
155 | # ISO-8859-11 is Thai, 'Q' will not make it readable | ||||
156 | 'ISO-8859-13' => ['Q', 'Q', undef], | ||||
157 | 'ISO-8859-14' => ['Q', 'Q', undef], | ||||
158 | 'ISO-8859-15' => ['Q', 'Q', undef], | ||||
159 | 'ISO-8859-16' => ['Q', 'Q', undef], | ||||
160 | 'WINDOWS-1252' => ['Q', 'Q', undef], | ||||
161 | 'VISCII' => ['Q', 'Q', undef], | ||||
162 | 'US-ASCII' => [undef, undef, undef], | ||||
163 | 'BIG5' => ['B', 'B', undef], | ||||
164 | 'GB2312' => ['B', 'B', undef], | ||||
165 | 'HZ-GB-2312' => ['B', undef, undef], | ||||
166 | 'EUC-JP' => ['B', undef, 'ISO-2022-JP'], | ||||
167 | 'SHIFT_JIS' => ['B', undef, 'ISO-2022-JP'], | ||||
168 | 'ISO-2022-JP' => ['B', undef, undef], | ||||
169 | 'ISO-2022-JP-1' => ['B', undef, undef], | ||||
170 | 'ISO-2022-JP-2' => ['B', undef, undef], | ||||
171 | 'EUC-JISX0213' => ['B', undef, 'ISO-2022-JP-3'], | ||||
172 | 'SHIFT_JISX0213' => ['B', undef, 'ISO-2022-JP-3'], | ||||
173 | 'ISO-2022-JP-3' => ['B', undef, undef], | ||||
174 | 'EUC-JIS-2004' => ['B', undef, 'ISO-2022-JP-2004'], | ||||
175 | 'SHIFT_JIS-2004' => ['B', undef, 'ISO-2022-JP-2004'], | ||||
176 | 'ISO-2022-JP-2004' => ['B', undef, undef], | ||||
177 | 'KOI8-R' => ['B', 'B', undef], | ||||
178 | 'TIS-620' => ['B', 'B', undef], # cf. Mew | ||||
179 | 'UTF-16' => ['B', 'B', undef], | ||||
180 | 'UTF-16BE' => ['B', 'B', undef], | ||||
181 | 'UTF-16LE' => ['B', 'B', undef], | ||||
182 | 'UTF-32' => ['B', 'B', undef], | ||||
183 | 'UTF-32BE' => ['B', 'B', undef], | ||||
184 | 'UTF-32LE' => ['B', 'B', undef], | ||||
185 | 'UTF-7' => ['Q', undef, undef], | ||||
186 | 'UTF-8' => ['S', 'S', undef], | ||||
187 | 'GSM03.38' => [undef, undef, undef], # not for MIME | ||||
188 | # We're making this one up to represent raw unencoded 8bit | ||||
189 | '8BIT' => [undef, 'B', 'ISO-8859-1'], | ||||
190 | ); | ||||
191 | |||||
192 | # Fix some unexpected or unpreferred names returned by | ||||
193 | # Encode::resolve_alias() or used by somebodies else. | ||||
194 | my %CHARSET_ALIASES = (# unpreferred preferred | ||||
195 | "ASCII" => "US-ASCII", | ||||
196 | "BIG5-ETEN" => "BIG5", | ||||
197 | "CP1250" => "WINDOWS-1250", | ||||
198 | "CP1251" => "WINDOWS-1251", | ||||
199 | "CP1252" => "WINDOWS-1252", | ||||
200 | "CP1253" => "WINDOWS-1253", | ||||
201 | "CP1254" => "WINDOWS-1254", | ||||
202 | "CP1255" => "WINDOWS-1255", | ||||
203 | "CP1256" => "WINDOWS-1256", | ||||
204 | "CP1257" => "WINDOWS-1257", | ||||
205 | "CP1258" => "WINDOWS-1258", | ||||
206 | "CP874" => "WINDOWS-874", | ||||
207 | "CP936" => "GBK", | ||||
208 | "CP949" => "KS_C_5601-1987", | ||||
209 | "EUC-CN" => "GB2312", | ||||
210 | "HZ" => "HZ-GB-2312", # RFC 1842 | ||||
211 | "KS_C_5601" => "KS_C_5601-1987", | ||||
212 | "SHIFTJIS" => "SHIFT_JIS", | ||||
213 | "SHIFTJISX0213" => "SHIFT_JISX0213", | ||||
214 | "TIS620" => "TIS-620", # IANA MIBenum 2259 | ||||
215 | "UNICODE-1-1-UTF-7" => "UTF-7", # RFC 1642 (obs.) | ||||
216 | "UTF8" => "UTF-8", | ||||
217 | "UTF-8-STRICT" => "UTF-8", # Perl internal use | ||||
218 | "GSM0338" => "GSM03.38", # not for MIME | ||||
219 | ); | ||||
220 | |||||
221 | # Some vendors encode characters beyond standardized mappings using extended | ||||
222 | # encoders. Some other standard encoders need additional encode modules. | ||||
223 | my %ENCODERS = ( | ||||
224 | 'EXTENDED' => { | ||||
225 | 'ISO-8859-1' => [['cp1252'], ], # Encode::Byte | ||||
226 | 'ISO-8859-2' => [['cp1250'], ], # Encode::Byte | ||||
227 | 'ISO-8859-5' => [['cp1251'], ], # Encode::Byte | ||||
228 | 'ISO-8859-6' => [ | ||||
229 | ['cp1256'], # Encode::Byte | ||||
230 | # ['cp1006'], # ditto, for Farsi | ||||
231 | ], | ||||
232 | 'ISO-8859-6-I'=>[['cp1256'], ], # ditto | ||||
233 | 'ISO-8859-7' => [['cp1253'], ], # Encode::Byte | ||||
234 | 'ISO-8859-8' => [['cp1255'], ], # Encode::Byte | ||||
235 | 'ISO-8859-8-I'=>[['cp1255'], ], # ditto | ||||
236 | 'ISO-8859-9' => [['cp1254'], ], # Encode::Byte | ||||
237 | 'ISO-8859-13'=> [['cp1257'], ], # Encode::Byte | ||||
238 | 'GB2312' => [ | ||||
239 | ['gb18030', 'Encode::HanExtra'], | ||||
240 | ['cp936'], # Encode::CN | ||||
241 | ], | ||||
242 | 'EUC-JP' => [ | ||||
243 | ['eucJP-ascii', 'Encode::EUCJPASCII'], | ||||
244 | # ['cp51932', 'Encode::EUCJPMS'], | ||||
245 | ], | ||||
246 | 'ISO-2022-JP'=> [ | ||||
247 | ['x-iso2022jp-ascii', | ||||
248 | 'Encode::EUCJPASCII'], | ||||
249 | # ['iso-2022-jp-ms','Encode::ISO2022JPMS'], | ||||
250 | # ['cp50220', 'Encode::EUCJPMS'], | ||||
251 | # ['cp50221', 'Encode::EUCJPMS'], | ||||
252 | ['iso-2022-jp-1'], # Encode::JP (note*) | ||||
253 | ], | ||||
254 | 'SHIFT_JIS' => [ | ||||
255 | ['cp932'], # Encode::JP | ||||
256 | ], | ||||
257 | 'EUC-JISX0213' => [['euc-jis-2004', 'Encode::JISX0213'], ], | ||||
258 | 'ISO-2022-JP-3' => [['iso-2022-jp-2004', 'Encode::JISX0213'], ], | ||||
259 | 'SHIFT_JISX0213'=> [['shift_jis-2004', 'Encode::ShiftJIS2004'], ], | ||||
260 | 'EUC-KR' => [['cp949'], ], # Encode::KR | ||||
261 | 'BIG5' => [ | ||||
262 | # ['big5plus', 'Encode::HanExtra'], | ||||
263 | # ['big5-2003', 'Encode::HanExtra'], | ||||
264 | ['cp950'], # Encode::TW | ||||
265 | # ['big5-1984', 'Encode::HanExtra'], | ||||
266 | ], | ||||
267 | 'TIS-620' => [['cp874'], ], # Encode::Byte | ||||
268 | 'UTF-8' => [['utf8'], ], # Special name on Perl | ||||
269 | }, | ||||
270 | 'STANDARD' => { | ||||
271 | 'ISO-8859-6-E' => [['iso-8859-6'],],# Encode::Byte | ||||
272 | 'ISO-8859-6-I' => [['iso-8859-6'],],# ditto | ||||
273 | 'ISO-8859-8-E' => [['iso-8859-8'],],# Encode::Byte | ||||
274 | 'ISO-8859-8-I' => [['iso-8859-8'],],# ditto | ||||
275 | 'GB18030' => [['gb18030', 'Encode::HanExtra'], ], | ||||
276 | 'ISO-2022-JP-2' => [['iso-2022-jp-2','Encode::ISO2022JP2'], ], | ||||
277 | 'EUC-JISX0213' => [['euc-jisx0213', 'Encode::JISX0213'], ], | ||||
278 | 'ISO-2022-JP-3' => [['iso-2022-jp-3', 'Encode::JISX0213'], ], | ||||
279 | 'EUC-JIS-2004' => [['euc-jis-2004', 'Encode::JISX0213'], ], | ||||
280 | 'ISO-2022-JP-2004' => [['iso-2022-jp-2004', 'Encode::JISX0213'], ], | ||||
281 | 'SHIFT_JIS-2004'=> [['shift_jis-2004', 'Encode::ShiftJIS2004'], ], | ||||
282 | 'EUC-TW' => [['euc-tw', 'Encode::HanExtra'], ], | ||||
283 | 'HZ-GB-2312' => [['hz'], ], # Encode::CN | ||||
284 | 'TIS-620' => [['tis620'], ], # (note*) | ||||
285 | 'UTF-16' => [['x-utf16auto', 'MIME::Charset::UTF'],], | ||||
286 | 'UTF-32' => [['x-utf32auto', 'MIME::Charset::UTF'],], | ||||
287 | 'GSM03.38' => [['gsm0338'], ], # Encode::GSM0338 | ||||
288 | |||||
289 | # (note*) ISO-8859-11 was not registered by IANA. | ||||
290 | # L<Encode> treats it as canonical name of ``tis-?620''. | ||||
291 | }, | ||||
292 | ); | ||||
293 | |||||
294 | # ISO-2022-* escape sequences etc. to detect charset from unencoded data. | ||||
295 | my @ESCAPE_SEQS = ( | ||||
296 | # ISO-2022-* sequences | ||||
297 | # escape seq, possible charset | ||||
298 | # Following sequences are commonly used. | ||||
299 | ["\033\$\@", "ISO-2022-JP"], # RFC 1468 | ||||
300 | ["\033\$B", "ISO-2022-JP"], # ditto | ||||
301 | ["\033(J", "ISO-2022-JP"], # ditto | ||||
302 | ["\033(I", "ISO-2022-JP"], # ditto (nonstandard) | ||||
303 | ["\033\$(D", "ISO-2022-JP"], # RFC 2237 (note*) | ||||
304 | # Following sequences are less commonly used. | ||||
305 | ["\033.A", "ISO-2022-JP-2"], # RFC 1554 | ||||
306 | ["\033.F", "ISO-2022-JP-2"], # ditto | ||||
307 | ["\033\$(C", "ISO-2022-JP-2"], # ditto | ||||
308 | ["\033\$(O", "ISO-2022-JP-3"], # JIS X 0213:2000 | ||||
309 | ["\033\$(P", "ISO-2022-JP-2004"], # JIS X 0213:2000/2004 | ||||
310 | ["\033\$(Q", "ISO-2022-JP-2004"], # JIS X 0213:2004 | ||||
311 | ["\033\$)C", "ISO-2022-KR"], # RFC 1557 | ||||
312 | ["\033\$)A", "ISO-2022-CN"], # RFC 1922 | ||||
313 | ["\033\$A", "ISO-2022-CN"], # ditto (nonstandard) | ||||
314 | ["\033\$)G", "ISO-2022-CN"], # ditto | ||||
315 | ["\033\$*H", "ISO-2022-CN"], # ditto | ||||
316 | # Other sequences will be used with appropriate charset | ||||
317 | # parameters, or hardly used. | ||||
318 | |||||
319 | # note*: This RFC defines ISO-2022-JP-1, superset of | ||||
320 | # ISO-2022-JP. But that charset name is rarely used. | ||||
321 | # OTOH many of encoders for ISO-2022-JP recognize this | ||||
322 | # sequence so that comatibility with EUC-JP will be | ||||
323 | # guaranteed. | ||||
324 | |||||
325 | # Singlebyte 7-bit sequences | ||||
326 | # escape seq, possible charset | ||||
327 | ["\033e", "GSM03.38"], # ESTI GSM 03.38 (note*) | ||||
328 | ["\033\012", "GSM03.38"], # ditto | ||||
329 | ["\033<", "GSM03.38"], # ditto | ||||
330 | ["\033/", "GSM03.38"], # ditto | ||||
331 | ["\033>", "GSM03.38"], # ditto | ||||
332 | ["\033\024", "GSM03.38"], # ditto | ||||
333 | ["\033(", "GSM03.38"], # ditto | ||||
334 | ["\033\@", "GSM03.38"], # ditto | ||||
335 | ["\033)", "GSM03.38"], # ditto | ||||
336 | ["\033=", "GSM03.38"], # ditto | ||||
337 | |||||
338 | # note*: This is not used for MIME message. | ||||
339 | ); | ||||
340 | |||||
341 | ######## Public Configuration Attributes ######## | ||||
342 | |||||
343 | $Config = { | ||||
344 | Detect7bit => 'YES', | ||||
345 | Mapping => 'EXTENDED', | ||||
346 | Replacement => 'DEFAULT', | ||||
347 | }; | ||||
348 | local @INC = @INC; | ||||
349 | pop @INC if $INC[-1] eq '.'; | ||||
350 | eval { require MIME::Charset::Defaults; }; | ||||
351 | |||||
352 | ######## Private Constants ######## | ||||
353 | |||||
354 | my $NON7BITRE = qr{ | ||||
355 | [^\x01-\x7e] | ||||
356 | }x; | ||||
357 | |||||
358 | my $NONASCIIRE = qr{ | ||||
359 | [^\x09\x0a\x0d\x20\x21-\x7e] | ||||
360 | }x; | ||||
361 | |||||
362 | my $ISO2022RE = qr{ | ||||
363 | ISO-2022-.+ | ||||
364 | }ix; | ||||
365 | |||||
366 | my $ASCIITRANSRE = qr{ | ||||
367 | HZ-GB-2312 | UTF-7 | ||||
368 | }ix; | ||||
369 | |||||
370 | |||||
371 | ######## Public Functions ######## | ||||
372 | |||||
373 | =head2 Constructor | ||||
374 | |||||
375 | =over | ||||
376 | |||||
377 | =item $charset = MIME::Charset->new([CHARSET [, OPTS]]) | ||||
378 | |||||
379 | Create charset object. | ||||
380 | |||||
381 | OPTS may accept following key-value pair. | ||||
382 | B<NOTE>: | ||||
383 | When Unicode/multibyte support is disabled (see L<"USE_ENCODE">), | ||||
384 | conversion will not be performed. So this option do not have any effects. | ||||
385 | |||||
386 | =over 4 | ||||
387 | |||||
388 | =item Mapping => MAPTYPE | ||||
389 | |||||
390 | Whether to extend mappings actually used for charset names or not. | ||||
391 | C<"EXTENDED"> uses extended mappings. | ||||
392 | C<"STANDARD"> uses standardized strict mappings. | ||||
393 | Default is C<"EXTENDED">. | ||||
394 | |||||
395 | =back | ||||
396 | |||||
397 | =cut | ||||
398 | |||||
399 | sub new { | ||||
400 | my $class = shift; | ||||
401 | my $charset = shift; | ||||
402 | return bless {}, $class unless $charset; | ||||
403 | return bless {}, $class if 75 < length $charset; # w/a for CPAN RT #65796. | ||||
404 | my %params = @_; | ||||
405 | my $mapping = uc($params{'Mapping'} || $Config->{Mapping}); | ||||
406 | |||||
407 | if ($charset =~ /\bhz.?gb.?2312$/i) { | ||||
408 | # workaround: "HZ-GB-2312" mistakenly treated as "EUC-CN" by Encode | ||||
409 | # (2.12). | ||||
410 | $charset = "HZ-GB-2312"; | ||||
411 | } elsif ($charset =~ /\btis-?620$/i) { | ||||
412 | # workaround: "TIS620" treated as ISO-8859-11 by Encode. | ||||
413 | # And "TIS-620" not known by some versions of Encode (cf. | ||||
414 | # CPAN RT #20781). | ||||
415 | $charset = "TIS-620"; | ||||
416 | } else { | ||||
417 | $charset = resolve_alias($charset) || $charset | ||||
418 | } | ||||
419 | $charset = $CHARSET_ALIASES{uc($charset)} || uc($charset); | ||||
420 | my ($henc, $benc, $outcset); | ||||
421 | my $spec = $CHARSETS{$charset}; | ||||
422 | if ($spec) { | ||||
423 | ($henc, $benc, $outcset) = | ||||
424 | ($$spec[0], $$spec[1], USE_ENCODE? $$spec[2]: undef); | ||||
425 | } else { | ||||
426 | ($henc, $benc, $outcset) = ('S', 'B', undef); | ||||
427 | } | ||||
428 | my ($decoder, $encoder); | ||||
429 | if (USE_ENCODE) { | ||||
430 | $decoder = _find_encoder($charset, $mapping); | ||||
431 | $encoder = _find_encoder($outcset, $mapping); | ||||
432 | } else { | ||||
433 | $decoder = $encoder = undef; | ||||
434 | } | ||||
435 | |||||
436 | bless { | ||||
437 | InputCharset => $charset, | ||||
438 | Decoder => $decoder, | ||||
439 | HeaderEncoding => $henc, | ||||
440 | BodyEncoding => $benc, | ||||
441 | OutputCharset => ($outcset || $charset), | ||||
442 | Encoder => ($encoder || $decoder), | ||||
443 | }, $class; | ||||
444 | } | ||||
445 | |||||
446 | my %encoder_cache = (); | ||||
447 | |||||
448 | sub _find_encoder($$) { | ||||
449 | my $charset = uc(shift || ""); | ||||
450 | return undef unless $charset; | ||||
451 | my $mapping = uc(shift); | ||||
452 | my ($spec, $name, $module, $encoder); | ||||
453 | |||||
454 | local($@); | ||||
455 | $encoder = $encoder_cache{$charset, $mapping}; | ||||
456 | return $encoder if ref $encoder; | ||||
457 | |||||
458 | foreach my $m (('EXTENDED', 'STANDARD')) { | ||||
459 | next if $m eq 'EXTENDED' and $mapping ne 'EXTENDED'; | ||||
460 | $spec = $ENCODERS{$m}->{$charset}; | ||||
461 | next unless $spec; | ||||
462 | foreach my $s (@{$spec}) { | ||||
463 | ($name, $module) = @{$s}; | ||||
464 | if ($module) { | ||||
465 | next unless eval "require $module;"; | ||||
466 | } | ||||
467 | $encoder = Encode::find_encoding($name); | ||||
468 | last if ref $encoder; | ||||
469 | } | ||||
470 | last if ref $encoder; | ||||
471 | } | ||||
472 | $encoder ||= Encode::find_encoding($charset); | ||||
473 | $encoder_cache{$charset, $mapping} = $encoder if $encoder; | ||||
474 | return $encoder; | ||||
475 | } | ||||
476 | |||||
477 | =back | ||||
478 | |||||
479 | =head2 Getting Information of Charsets | ||||
480 | |||||
481 | =over | ||||
482 | |||||
483 | =item $charset->body_encoding | ||||
484 | |||||
485 | =item body_encoding CHARSET | ||||
486 | |||||
487 | Get recommended transfer-encoding of CHARSET for message body. | ||||
488 | |||||
489 | Returned value will be one of C<"B"> (BASE64), C<"Q"> (QUOTED-PRINTABLE), | ||||
490 | C<"S"> (shorter one of either) or | ||||
491 | C<undef> (might not be transfer-encoded; either 7BIT or 8BIT). This may | ||||
492 | not be same as encoding for message header. | ||||
493 | |||||
494 | =cut | ||||
495 | |||||
496 | sub body_encoding($) { | ||||
497 | my $self = shift; | ||||
498 | return undef unless $self; | ||||
499 | $self = __PACKAGE__->new($self) unless ref $self; | ||||
500 | $self->{BodyEncoding}; | ||||
501 | } | ||||
502 | |||||
503 | =item $charset->as_string | ||||
504 | |||||
505 | =item canonical_charset CHARSET | ||||
506 | |||||
507 | Get canonical name for charset. | ||||
508 | |||||
509 | =cut | ||||
510 | |||||
511 | sub canonical_charset($) { | ||||
512 | my $self = shift; | ||||
513 | return undef unless $self; | ||||
514 | $self = __PACKAGE__->new($self) unless ref $self; | ||||
515 | $self->{InputCharset}; | ||||
516 | } | ||||
517 | |||||
518 | sub as_string($) { | ||||
519 | my $self = shift; | ||||
520 | $self->{InputCharset}; | ||||
521 | } | ||||
522 | |||||
523 | =item $charset->decoder | ||||
524 | |||||
525 | Get L<"Encode::Encoding"> object to decode strings to Unicode by charset. | ||||
526 | If charset is not specified or not known by this module, | ||||
527 | undef will be returned. | ||||
528 | |||||
529 | =cut | ||||
530 | |||||
531 | sub decoder($) { | ||||
532 | my $self = shift; | ||||
533 | $self->{Decoder}; | ||||
534 | } | ||||
535 | |||||
536 | =item $charset->dup | ||||
537 | |||||
538 | Get a copy of charset object. | ||||
539 | |||||
540 | =cut | ||||
541 | |||||
542 | sub dup($) { | ||||
543 | my $self = shift; | ||||
544 | my $obj = __PACKAGE__->new(undef); | ||||
545 | %{$obj} = %{$self}; | ||||
546 | $obj; | ||||
547 | } | ||||
548 | |||||
549 | =item $charset->encoder([CHARSET]) | ||||
550 | |||||
551 | Get L<"Encode::Encoding"> object to encode Unicode string using compatible | ||||
552 | charset recommended to be used for messages on Internet. | ||||
553 | |||||
554 | If optional CHARSET is specified, replace encoder (and output charset | ||||
555 | name) of $charset object with those of CHARSET, therefore, | ||||
556 | $charset object will be a converter between original charset and | ||||
557 | new CHARSET. | ||||
558 | |||||
559 | =cut | ||||
560 | |||||
561 | sub encoder($$;) { | ||||
562 | my $self = shift; | ||||
563 | my $charset = shift; | ||||
564 | if ($charset) { | ||||
565 | $charset = __PACKAGE__->new($charset) unless ref $charset; | ||||
566 | $self->{OutputCharset} = $charset->{InputCharset}; | ||||
567 | $self->{Encoder} = $charset->{Decoder}; | ||||
568 | $self->{BodyEncoding} = $charset->{BodyEncoding}; | ||||
569 | $self->{HeaderEncoding} = $charset->{HeaderEncoding}; | ||||
570 | } | ||||
571 | $self->{Encoder}; | ||||
572 | } | ||||
573 | |||||
574 | =item $charset->header_encoding | ||||
575 | |||||
576 | =item header_encoding CHARSET | ||||
577 | |||||
578 | Get recommended encoding scheme of CHARSET for message header. | ||||
579 | |||||
580 | Returned value will be one of C<"B">, C<"Q">, C<"S"> (shorter one of either) | ||||
581 | or C<undef> (might not be encoded). This may not be same as encoding | ||||
582 | for message body. | ||||
583 | |||||
584 | =cut | ||||
585 | |||||
586 | sub header_encoding($) { | ||||
587 | my $self = shift; | ||||
588 | return undef unless $self; | ||||
589 | $self = __PACKAGE__->new($self) unless ref $self; | ||||
590 | $self->{HeaderEncoding}; | ||||
591 | } | ||||
592 | |||||
593 | =item $charset->output_charset | ||||
594 | |||||
595 | =item output_charset CHARSET | ||||
596 | |||||
597 | Get a charset which is compatible with given CHARSET and is recommended | ||||
598 | to be used for MIME messages on Internet (if it is known by this module). | ||||
599 | |||||
600 | When Unicode/multibyte support is disabled (see L<"USE_ENCODE">), | ||||
601 | this function will simply | ||||
602 | return the result of L<"canonical_charset">. | ||||
603 | |||||
604 | =cut | ||||
605 | |||||
606 | sub output_charset($) { | ||||
607 | my $self = shift; | ||||
608 | return undef unless $self; | ||||
609 | $self = __PACKAGE__->new($self) unless ref $self; | ||||
610 | $self->{OutputCharset}; | ||||
611 | } | ||||
612 | |||||
613 | =back | ||||
614 | |||||
615 | =head2 Translating Text Data | ||||
616 | |||||
617 | =over | ||||
618 | |||||
619 | =item $charset->body_encode(STRING [, OPTS]) | ||||
620 | |||||
621 | =item body_encode STRING, CHARSET [, OPTS] | ||||
622 | |||||
623 | Get converted (if needed) data of STRING and recommended transfer-encoding | ||||
624 | of that data for message body. CHARSET is the charset by which STRING | ||||
625 | is encoded. | ||||
626 | |||||
627 | OPTS may accept following key-value pairs. | ||||
628 | B<NOTE>: | ||||
629 | When Unicode/multibyte support is disabled (see L<"USE_ENCODE">), | ||||
630 | conversion will not be performed. So these options do not have any effects. | ||||
631 | |||||
632 | =over 4 | ||||
633 | |||||
634 | =item Detect7bit => YESNO | ||||
635 | |||||
636 | Try auto-detecting 7-bit charset when CHARSET is not given. | ||||
637 | Default is C<"YES">. | ||||
638 | |||||
639 | =item Replacement => REPLACEMENT | ||||
640 | |||||
641 | Specifies error handling scheme. See L<"Error Handling">. | ||||
642 | |||||
643 | =back | ||||
644 | |||||
645 | 3-item list of (I<converted string>, I<charset for output>, | ||||
646 | I<transfer-encoding>) will be returned. | ||||
647 | I<Transfer-encoding> will be either C<"BASE64">, C<"QUOTED-PRINTABLE">, | ||||
648 | C<"7BIT"> or C<"8BIT">. If I<charset for output> could not be determined | ||||
649 | and I<converted string> contains non-ASCII byte(s), I<charset for output> will | ||||
650 | be C<undef> and I<transfer-encoding> will be C<"BASE64">. | ||||
651 | I<Charset for output> will be C<"US-ASCII"> if and only if string does not | ||||
652 | contain any non-ASCII bytes. | ||||
653 | |||||
654 | =cut | ||||
655 | |||||
656 | sub body_encode { | ||||
657 | my $self = shift; | ||||
658 | my $text; | ||||
659 | if (ref $self) { | ||||
660 | $text = shift; | ||||
661 | } else { | ||||
662 | $text = $self; | ||||
663 | $self = __PACKAGE__->new(shift); | ||||
664 | } | ||||
665 | my ($encoded, $charset) = $self->_text_encode($text, @_); | ||||
666 | return ($encoded, undef, 'BASE64') | ||||
667 | unless $charset and $charset->{InputCharset}; | ||||
668 | my $cset = $charset->{OutputCharset}; | ||||
669 | |||||
670 | # Determine transfer-encoding. | ||||
671 | my $enc = $charset->{BodyEncoding}; | ||||
672 | |||||
673 | if (!$enc and $encoded !~ /\x00/) { # Eliminate hostile NUL character. | ||||
674 | if ($encoded =~ $NON7BITRE) { # String contains 8bit char(s). | ||||
675 | $enc = '8BIT'; | ||||
676 | } elsif ($cset =~ /^($ISO2022RE|$ASCIITRANSRE)$/) { # 7BIT. | ||||
677 | $enc = '7BIT'; | ||||
678 | } else { # Pure ASCII. | ||||
679 | $enc = '7BIT'; | ||||
680 | $cset = 'US-ASCII'; | ||||
681 | } | ||||
682 | } elsif ($enc eq 'S') { | ||||
683 | $enc = _resolve_S($encoded, 1); | ||||
684 | } elsif ($enc eq 'B') { | ||||
685 | $enc = 'BASE64'; | ||||
686 | } elsif ($enc eq 'Q') { | ||||
687 | $enc = 'QUOTED-PRINTABLE'; | ||||
688 | } else { | ||||
689 | $enc = 'BASE64'; | ||||
690 | } | ||||
691 | return ($encoded, $cset, $enc); | ||||
692 | } | ||||
693 | |||||
694 | =item $charset->decode(STRING [,CHECK]) | ||||
695 | |||||
696 | Decode STRING to Unicode. | ||||
697 | |||||
698 | B<Note>: | ||||
699 | When Unicode/multibyte support is disabled (see L<"USE_ENCODE">), | ||||
700 | this function will die. | ||||
701 | |||||
702 | =cut | ||||
703 | |||||
704 | sub decode($$$;) { | ||||
705 | my $self = shift; | ||||
706 | my $s = shift; | ||||
707 | my $check = shift || 0; | ||||
708 | $self->{Decoder}->decode($s, $check); | ||||
709 | } | ||||
710 | |||||
711 | =item detect_7bit_charset STRING | ||||
712 | |||||
713 | Guess 7-bit charset that may encode a string STRING. | ||||
714 | If STRING contains any 8-bit bytes, C<undef> will be returned. | ||||
715 | Otherwise, Default Charset will be returned for unknown charset. | ||||
716 | |||||
717 | =cut | ||||
718 | |||||
719 | sub detect_7bit_charset($) { | ||||
720 | return $DEFAULT_CHARSET unless &USE_ENCODE; | ||||
721 | my $s = shift; | ||||
722 | return $DEFAULT_CHARSET unless $s; | ||||
723 | |||||
724 | # Non-7bit string | ||||
725 | return undef if $s =~ $NON7BITRE; | ||||
726 | |||||
727 | # Try to detect 7-bit escape sequences. | ||||
728 | foreach (@ESCAPE_SEQS) { | ||||
729 | my ($seq, $cset) = @$_; | ||||
730 | if (index($s, $seq) >= 0) { | ||||
731 | my $decoder = __PACKAGE__->new($cset); | ||||
732 | next unless $decoder->{Decoder}; | ||||
733 | eval { | ||||
734 | my $dummy = $s; | ||||
735 | $decoder->decode($dummy, FB_CROAK()); | ||||
736 | }; | ||||
737 | if ($@) { | ||||
738 | next; | ||||
739 | } | ||||
740 | return $decoder->{InputCharset}; | ||||
741 | } | ||||
742 | } | ||||
743 | |||||
744 | # How about HZ, VIQR, UTF-7, ...? | ||||
745 | |||||
746 | return $DEFAULT_CHARSET; | ||||
747 | } | ||||
748 | |||||
749 | sub _detect_7bit_charset { | ||||
750 | detect_7bit_charset(@_); | ||||
751 | } | ||||
752 | |||||
753 | =item $charset->encode(STRING [, CHECK]) | ||||
754 | |||||
755 | Encode STRING (Unicode or non-Unicode) using compatible charset recommended | ||||
756 | to be used for messages on Internet (if this module knows it). | ||||
757 | Note that string will be decoded to Unicode then encoded even if compatible charset | ||||
758 | was equal to original charset. | ||||
759 | |||||
760 | B<Note>: | ||||
761 | When Unicode/multibyte support is disabled (see L<"USE_ENCODE">), | ||||
762 | this function will die. | ||||
763 | |||||
764 | =cut | ||||
765 | |||||
766 | sub encode($$$;) { | ||||
767 | my $self = shift; | ||||
768 | my $s = shift; | ||||
769 | my $check = shift || 0; | ||||
770 | |||||
771 | unless (is_utf8($s) or $s =~ /[^\x00-\xFF]/) { | ||||
772 | $s = $self->{Decoder}->decode($s, ($check & 0x1)? FB_CROAK(): 0); | ||||
773 | } | ||||
774 | my $enc = $self->{Encoder}->encode($s, $check); | ||||
775 | Encode::_utf8_off($enc) if is_utf8($enc); # workaround for RT #35120 | ||||
776 | $enc; | ||||
777 | } | ||||
778 | |||||
779 | =item $charset->encoded_header_len(STRING [, ENCODING]) | ||||
780 | |||||
781 | =item encoded_header_len STRING, ENCODING, CHARSET | ||||
782 | |||||
783 | Get length of encoded STRING for message header | ||||
784 | (without folding). | ||||
785 | |||||
786 | ENCODING may be one of C<"B">, C<"Q"> or C<"S"> (shorter | ||||
787 | one of either C<"B"> or C<"Q">). | ||||
788 | |||||
789 | =cut | ||||
790 | |||||
791 | sub encoded_header_len($$$;) { | ||||
792 | my $self = shift; | ||||
793 | my ($encoding, $s); | ||||
794 | if (ref $self) { | ||||
795 | $s = shift; | ||||
796 | $encoding = uc(shift || $self->{HeaderEncoding}); | ||||
797 | } else { | ||||
798 | $s = $self; | ||||
799 | $encoding = uc(shift); | ||||
800 | $self = shift; | ||||
801 | $self = __PACKAGE__->new($self) unless ref $self; | ||||
802 | } | ||||
803 | |||||
804 | #FIXME:$encoding === undef | ||||
805 | |||||
806 | my $enclen; | ||||
807 | if ($encoding eq 'Q') { | ||||
808 | $enclen = _enclen_Q($s); | ||||
809 | } elsif ($encoding eq 'S' and _resolve_S($s) eq 'Q') { | ||||
810 | $enclen = _enclen_Q($s); | ||||
811 | } else { # "B" | ||||
812 | $enclen = _enclen_B($s); | ||||
813 | } | ||||
814 | |||||
815 | length($self->{OutputCharset})+$enclen+7; | ||||
816 | } | ||||
817 | |||||
818 | sub _enclen_B($) { | ||||
819 | int((length(shift) + 2) / 3) * 4; | ||||
820 | } | ||||
821 | |||||
822 | sub _enclen_Q($;$) { | ||||
823 | my $s = shift; | ||||
824 | my $in_body = shift; | ||||
825 | my @o; | ||||
826 | if ($in_body) { | ||||
827 | @o = ($s =~ m{([^-\t\r\n !*+/0-9A-Za-z])}go); | ||||
828 | } else { | ||||
829 | @o = ($s =~ m{([^- !*+/0-9A-Za-z])}gos); | ||||
830 | } | ||||
831 | length($s) + scalar(@o) * 2; | ||||
832 | } | ||||
833 | |||||
834 | sub _resolve_S($;$) { | ||||
835 | my $s = shift; | ||||
836 | my $in_body = shift; | ||||
837 | my $e; | ||||
838 | if ($in_body) { | ||||
839 | $e = scalar(() = $s =~ m{[^-\t\r\n !*+/0-9A-Za-z]}g); | ||||
840 | return (length($s) + 8 < $e * 6) ? 'BASE64' : 'QUOTED-PRINTABLE'; | ||||
841 | } else { | ||||
842 | $e = scalar(() = $s =~ m{[^- !*+/0-9A-Za-z]}g); | ||||
843 | return (length($s) + 8 < $e * 6) ? 'B' : 'Q'; | ||||
844 | } | ||||
845 | } | ||||
846 | |||||
847 | =item $charset->header_encode(STRING [, OPTS]) | ||||
848 | |||||
849 | =item header_encode STRING, CHARSET [, OPTS] | ||||
850 | |||||
851 | Get converted (if needed) data of STRING and recommended encoding scheme of | ||||
852 | that data for message headers. CHARSET is the charset by which STRING | ||||
853 | is encoded. | ||||
854 | |||||
855 | OPTS may accept following key-value pairs. | ||||
856 | B<NOTE>: | ||||
857 | When Unicode/multibyte support is disabled (see L<"USE_ENCODE">), | ||||
858 | conversion will not be performed. So these options do not have any effects. | ||||
859 | |||||
860 | =over 4 | ||||
861 | |||||
862 | =item Detect7bit => YESNO | ||||
863 | |||||
864 | Try auto-detecting 7-bit charset when CHARSET is not given. | ||||
865 | Default is C<"YES">. | ||||
866 | |||||
867 | =item Replacement => REPLACEMENT | ||||
868 | |||||
869 | Specifies error handling scheme. See L<"Error Handling">. | ||||
870 | |||||
871 | =back | ||||
872 | |||||
873 | 3-item list of (I<converted string>, I<charset for output>, | ||||
874 | I<encoding scheme>) will be returned. I<Encoding scheme> will be | ||||
875 | either C<"B">, C<"Q"> or C<undef> (might not be encoded). | ||||
876 | If I<charset for output> could not be determined and I<converted string> | ||||
877 | contains non-ASCII byte(s), I<charset for output> will be C<"8BIT"> | ||||
878 | (this is I<not> charset name but a special value to represent unencodable | ||||
879 | data) and I<encoding scheme> will be C<undef> (should not be encoded). | ||||
880 | I<Charset for output> will be C<"US-ASCII"> if and only if string does not | ||||
881 | contain any non-ASCII bytes. | ||||
882 | |||||
883 | =cut | ||||
884 | |||||
885 | sub header_encode { | ||||
886 | my $self = shift; | ||||
887 | my $text; | ||||
888 | if (ref $self) { | ||||
889 | $text = shift; | ||||
890 | } else { | ||||
891 | $text = $self; | ||||
892 | $self = __PACKAGE__->new(shift); | ||||
893 | } | ||||
894 | my ($encoded, $charset) = $self->_text_encode($text, @_); | ||||
895 | return ($encoded, '8BIT', undef) | ||||
896 | unless $charset and $charset->{InputCharset}; | ||||
897 | my $cset = $charset->{OutputCharset}; | ||||
898 | |||||
899 | # Determine encoding scheme. | ||||
900 | my $enc = $charset->{HeaderEncoding}; | ||||
901 | |||||
902 | if (!$enc and $encoded !~ $NON7BITRE) { | ||||
903 | unless ($cset =~ /^($ISO2022RE|$ASCIITRANSRE)$/) { # 7BIT. | ||||
904 | $cset = 'US-ASCII'; | ||||
905 | } | ||||
906 | } elsif ($enc eq 'S') { | ||||
907 | $enc = _resolve_S($encoded); | ||||
908 | } elsif ($enc !~ /^[BQ]$/) { | ||||
909 | $enc = 'B'; | ||||
910 | } | ||||
911 | return ($encoded, $cset, $enc); | ||||
912 | } | ||||
913 | |||||
914 | sub _text_encode { | ||||
915 | my $charset = shift; | ||||
916 | my $s = shift; | ||||
917 | my %params = @_; | ||||
918 | my $replacement = uc($params{'Replacement'} || $Config->{Replacement}); | ||||
919 | my $detect7bit = uc($params{'Detect7bit'} || $Config->{Detect7bit}); | ||||
920 | my $encoding = $params{'Encoding'} || | ||||
921 | (exists $params{'Encoding'}? undef: 'A'); # undocumented | ||||
922 | |||||
923 | if (!$encoding or $encoding ne 'A') { # no 7-bit auto-detection | ||||
924 | $detect7bit = 'NO'; | ||||
925 | } | ||||
926 | unless ($charset->{InputCharset}) { | ||||
927 | if ($s =~ $NON7BITRE) { | ||||
928 | return ($s, undef); | ||||
929 | } elsif ($detect7bit ne "NO") { | ||||
930 | $charset = __PACKAGE__->new(&detect_7bit_charset($s)); | ||||
931 | } else { | ||||
932 | $charset = __PACKAGE__->new($DEFAULT_CHARSET, | ||||
933 | Mapping => 'STANDARD'); | ||||
934 | } | ||||
935 | } | ||||
936 | if (!$encoding or $encoding ne 'A') { # no conversion | ||||
937 | $charset = $charset->dup; | ||||
938 | $charset->encoder($charset); | ||||
939 | $charset->{HeaderEncoding} = $encoding; | ||||
940 | $charset->{BodyEncoding} = $encoding; | ||||
941 | } | ||||
942 | my $check = ($replacement and $replacement =~ /^\d+$/)? | ||||
943 | $replacement: | ||||
944 | { | ||||
945 | 'CROAK' => FB_CROAK(), | ||||
946 | 'STRICT' => FB_CROAK(), | ||||
947 | 'FALLBACK' => FB_CROAK(), # special | ||||
948 | 'PERLQQ' => FB_PERLQQ(), | ||||
949 | 'HTMLCREF' => FB_HTMLCREF(), | ||||
950 | 'XMLCREF' => FB_XMLCREF(), | ||||
951 | }->{$replacement || ""} || 0; | ||||
952 | |||||
953 | # Encode data by output charset if required. If failed, fallback to | ||||
954 | # fallback charset. | ||||
955 | my $encoded; | ||||
956 | if (is_utf8($s) or $s =~ /[^\x00-\xFF]/ or | ||||
957 | ($charset->{InputCharset} || "") ne ($charset->{OutputCharset} || "")) { | ||||
958 | if ($check & 0x1) { # CROAK or FALLBACK | ||||
959 | eval { | ||||
960 | $encoded = $s; | ||||
961 | $encoded = $charset->encode($encoded, FB_CROAK()); | ||||
962 | }; | ||||
963 | if ($@) { | ||||
964 | if ($replacement eq "FALLBACK" and $FALLBACK_CHARSET) { | ||||
965 | my $cset = __PACKAGE__->new($FALLBACK_CHARSET, | ||||
966 | Mapping => 'STANDARD'); | ||||
967 | # croak unknown charset | ||||
968 | croak "unknown charset ``$FALLBACK_CHARSET''" | ||||
969 | unless $cset->{Decoder}; | ||||
970 | # charset translation | ||||
971 | $charset = $charset->dup; | ||||
972 | $charset->encoder($cset); | ||||
973 | $encoded = $s; | ||||
974 | $encoded = $charset->encode($encoded, 0); | ||||
975 | # replace input & output charsets with fallback charset | ||||
976 | $cset->encoder($cset); | ||||
977 | $charset = $cset; | ||||
978 | } else { | ||||
979 | $@ =~ s/ at .+$//; | ||||
980 | croak $@; | ||||
981 | } | ||||
982 | } | ||||
983 | } else { | ||||
984 | $encoded = $s; | ||||
985 | $encoded = $charset->encode($encoded, $check); | ||||
986 | } | ||||
987 | } else { | ||||
988 | $encoded = $s; | ||||
989 | } | ||||
990 | |||||
991 | if ($encoded !~ /$NONASCIIRE/) { # maybe ASCII | ||||
992 | # check ``ASCII transformation'' charsets | ||||
993 | if ($charset->{OutputCharset} =~ /^($ASCIITRANSRE)$/) { | ||||
994 | my $u = $encoded; | ||||
995 | if (USE_ENCODE) { | ||||
996 | $u = $charset->encoder->decode($encoded); # dec. by output | ||||
997 | } elsif ($encoded =~ /[+~]/) { # workaround for pre-Encode env. | ||||
998 | $u = "x$u"; | ||||
999 | } | ||||
1000 | if ($u eq $encoded) { | ||||
1001 | $charset = $charset->dup; | ||||
1002 | $charset->encoder($DEFAULT_CHARSET); | ||||
1003 | } | ||||
1004 | } elsif ($charset->{OutputCharset} ne "US-ASCII") { | ||||
1005 | $charset = $charset->dup; | ||||
1006 | $charset->encoder($DEFAULT_CHARSET); | ||||
1007 | } | ||||
1008 | } | ||||
1009 | |||||
1010 | return ($encoded, $charset); | ||||
1011 | } | ||||
1012 | |||||
1013 | =item $charset->undecode(STRING [,CHECK]) | ||||
1014 | |||||
1015 | Encode Unicode string STRING to byte string by input charset of $charset. | ||||
1016 | This is equivalent to C<$charset-E<gt>decoder-E<gt>encode()>. | ||||
1017 | |||||
1018 | B<Note>: | ||||
1019 | When Unicode/multibyte support is disabled (see L<"USE_ENCODE">), | ||||
1020 | this function will die. | ||||
1021 | |||||
1022 | =cut | ||||
1023 | |||||
1024 | sub undecode($$$;) { | ||||
1025 | my $self = shift; | ||||
1026 | my $s = shift; | ||||
1027 | my $check = shift || 0; | ||||
1028 | my $enc = $self->{Decoder}->encode($s, $check); | ||||
1029 | Encode::_utf8_off($enc); # workaround for RT #35120 | ||||
1030 | $enc; | ||||
1031 | } | ||||
1032 | |||||
1033 | =back | ||||
1034 | |||||
1035 | =head2 Manipulating Module Defaults | ||||
1036 | |||||
1037 | =over | ||||
1038 | |||||
1039 | =item alias ALIAS [, CHARSET] | ||||
1040 | |||||
1041 | Get/set charset alias for canonical names determined by | ||||
1042 | L<"canonical_charset">. | ||||
1043 | |||||
1044 | If CHARSET is given and isn't false, ALIAS will be assigned as an alias of | ||||
1045 | CHARSET. Otherwise, alias won't be changed. In both cases, | ||||
1046 | current charset name that ALIAS is assigned will be returned. | ||||
1047 | |||||
1048 | =cut | ||||
1049 | |||||
1050 | sub alias ($;$) { | ||||
1051 | my $alias = uc(shift); | ||||
1052 | my $charset = uc(shift); | ||||
1053 | |||||
1054 | return $CHARSET_ALIASES{$alias} unless $charset; | ||||
1055 | |||||
1056 | $CHARSET_ALIASES{$alias} = $charset; | ||||
1057 | return $charset; | ||||
1058 | } | ||||
1059 | |||||
1060 | =item default [CHARSET] | ||||
1061 | |||||
1062 | Get/set default charset. | ||||
1063 | |||||
1064 | B<Default charset> is used by this module when charset context is | ||||
1065 | unknown. Modules using this module are recommended to use this | ||||
1066 | charset when charset context is unknown or implicit default is | ||||
1067 | expected. By default, it is C<"US-ASCII">. | ||||
1068 | |||||
1069 | If CHARSET is given and isn't false, it will be set to default charset. | ||||
1070 | Otherwise, default charset won't be changed. In both cases, | ||||
1071 | current default charset will be returned. | ||||
1072 | |||||
1073 | B<NOTE>: Default charset I<should not> be changed. | ||||
1074 | |||||
1075 | =cut | ||||
1076 | |||||
1077 | sub default(;$) { | ||||
1078 | my $charset = &canonical_charset(shift); | ||||
1079 | |||||
1080 | if ($charset) { | ||||
1081 | croak "Unknown charset '$charset'" | ||||
1082 | unless resolve_alias($charset); | ||||
1083 | $DEFAULT_CHARSET = $charset; | ||||
1084 | } | ||||
1085 | return $DEFAULT_CHARSET; | ||||
1086 | } | ||||
1087 | |||||
1088 | =item fallback [CHARSET] | ||||
1089 | |||||
1090 | Get/set fallback charset. | ||||
1091 | |||||
1092 | B<Fallback charset> is used by this module when conversion by given | ||||
1093 | charset is failed and C<"FALLBACK"> error handling scheme is specified. | ||||
1094 | Modules using this module may use this charset as last resort of charset | ||||
1095 | for conversion. By default, it is C<"UTF-8">. | ||||
1096 | |||||
1097 | If CHARSET is given and isn't false, it will be set to fallback charset. | ||||
1098 | If CHARSET is C<"NONE">, fallback charset will be undefined. | ||||
1099 | Otherwise, fallback charset won't be changed. In any cases, | ||||
1100 | current fallback charset will be returned. | ||||
1101 | |||||
1102 | B<NOTE>: It I<is> useful that C<"US-ASCII"> is specified as fallback charset, | ||||
1103 | since result of conversion will be readable without charset information. | ||||
1104 | |||||
1105 | =cut | ||||
1106 | |||||
1107 | sub fallback(;$) { | ||||
1108 | my $charset = &canonical_charset(shift); | ||||
1109 | |||||
1110 | if ($charset eq "NONE") { | ||||
1111 | $FALLBACK_CHARSET = undef; | ||||
1112 | } elsif ($charset) { | ||||
1113 | croak "Unknown charset '$charset'" | ||||
1114 | unless resolve_alias($charset); | ||||
1115 | $FALLBACK_CHARSET = $charset; | ||||
1116 | } | ||||
1117 | return $FALLBACK_CHARSET; | ||||
1118 | } | ||||
1119 | |||||
1120 | =item recommended CHARSET [, HEADERENC, BODYENC [, ENCCHARSET]] | ||||
1121 | |||||
1122 | Get/set charset profiles. | ||||
1123 | |||||
1124 | If optional arguments are given and any of them are not false, profiles | ||||
1125 | for CHARSET will be set by those arguments. Otherwise, profiles | ||||
1126 | won't be changed. In both cases, current profiles for CHARSET will be | ||||
1127 | returned as 3-item list of (HEADERENC, BODYENC, ENCCHARSET). | ||||
1128 | |||||
1129 | HEADERENC is recommended encoding scheme for message header. | ||||
1130 | It may be one of C<"B">, C<"Q">, C<"S"> (shorter one of either) or | ||||
1131 | C<undef> (might not be encoded). | ||||
1132 | |||||
1133 | BODYENC is recommended transfer-encoding for message body. It may be | ||||
1134 | one of C<"B">, C<"Q">, C<"S"> (shorter one of either) or | ||||
1135 | C<undef> (might not be transfer-encoded). | ||||
1136 | |||||
1137 | ENCCHARSET is a charset which is compatible with given CHARSET and | ||||
1138 | is recommended to be used for MIME messages on Internet. | ||||
1139 | If conversion is not needed (or this module doesn't know appropriate | ||||
1140 | charset), ENCCHARSET is C<undef>. | ||||
1141 | |||||
1142 | B<NOTE>: This function in the future releases can accept more optional | ||||
1143 | arguments (for example, properties to handle character widths, line folding | ||||
1144 | behavior, ...). So format of returned value may probably be changed. | ||||
1145 | Use L<"header_encoding">, L<"body_encoding"> or L<"output_charset"> to get | ||||
1146 | particular profile. | ||||
1147 | |||||
1148 | =cut | ||||
1149 | |||||
1150 | sub recommended ($;$;$;$) { | ||||
1151 | my $charset = &canonical_charset(shift); | ||||
1152 | my $henc = uc(shift) || undef; | ||||
1153 | my $benc = uc(shift) || undef; | ||||
1154 | my $cset = &canonical_charset(shift); | ||||
1155 | |||||
1156 | croak "CHARSET is not specified" unless $charset; | ||||
1157 | croak "Unknown header encoding" unless !$henc or $henc =~ /^[BQS]$/; | ||||
1158 | croak "Unknown body encoding" unless !$benc or $benc =~ /^[BQ]$/; | ||||
1159 | |||||
1160 | if ($henc or $benc or $cset) { | ||||
1161 | $cset = undef if $charset eq $cset; | ||||
1162 | my @spec = ($henc, $benc, USE_ENCODE? $cset: undef); | ||||
1163 | $CHARSETS{$charset} = \@spec; | ||||
1164 | return @spec; | ||||
1165 | } else { | ||||
1166 | $charset = __PACKAGE__->new($charset) unless ref $charset; | ||||
1167 | return map { $charset->{$_} } qw(HeaderEncoding BodyEncoding | ||||
1168 | OutputCharset); | ||||
1169 | } | ||||
1170 | } | ||||
1171 | |||||
1172 | =back | ||||
1173 | |||||
1174 | =head2 Constants | ||||
1175 | |||||
1176 | =over | ||||
1177 | |||||
1178 | =item USE_ENCODE | ||||
1179 | |||||
1180 | Unicode/multibyte support flag. | ||||
1181 | Non-empty string will be set when Unicode and multibyte support is enabled. | ||||
1182 | Currently, this flag will be non-empty on Perl 5.7.3 or later and | ||||
1183 | empty string on earlier versions of Perl. | ||||
1184 | |||||
1185 | =back | ||||
1186 | |||||
1187 | =head2 Error Handling | ||||
1188 | |||||
1189 | L<"body_encode"> and L<"header_encode"> accept following C<Replacement> | ||||
1190 | options: | ||||
1191 | |||||
1192 | =over | ||||
1193 | |||||
1194 | =item C<"DEFAULT"> | ||||
1195 | |||||
1196 | Put a substitution character in place of a malformed character. | ||||
1197 | For UCM-based encodings, <subchar> will be used. | ||||
1198 | |||||
1199 | =item C<"FALLBACK"> | ||||
1200 | |||||
1201 | Try C<"DEFAULT"> scheme using I<fallback charset> (see L<"fallback">). | ||||
1202 | When fallback charset is undefined and conversion causes error, | ||||
1203 | code will die on error with an error message. | ||||
1204 | |||||
1205 | =item C<"CROAK"> | ||||
1206 | |||||
1207 | Code will die on error immediately with an error message. | ||||
1208 | Therefore, you should trap the fatal error with eval{} unless you | ||||
1209 | really want to let it die on error. | ||||
1210 | Synonym is C<"STRICT">. | ||||
1211 | |||||
1212 | =item C<"PERLQQ"> | ||||
1213 | |||||
1214 | =item C<"HTMLCREF"> | ||||
1215 | |||||
1216 | =item C<"XMLCREF"> | ||||
1217 | |||||
1218 | Use C<FB_PERLQQ>, C<FB_HTMLCREF> or C<FB_XMLCREF> | ||||
1219 | scheme defined by L<Encode> module. | ||||
1220 | |||||
1221 | =item numeric values | ||||
1222 | |||||
1223 | Numeric values are also allowed. | ||||
1224 | For more details see L<Encode/Handling Malformed Data>. | ||||
1225 | |||||
1226 | =back | ||||
1227 | |||||
1228 | If error handling scheme is not specified or unknown scheme is specified, | ||||
1229 | C<"DEFAULT"> will be assumed. | ||||
1230 | |||||
1231 | =head2 Configuration File | ||||
1232 | |||||
1233 | Built-in defaults for option parameters can be overridden by configuration | ||||
1234 | file: F<MIME/Charset/Defaults.pm>. | ||||
1235 | For more details read F<MIME/Charset/Defaults.pm.sample>. | ||||
1236 | |||||
1237 | =head1 VERSION | ||||
1238 | |||||
1239 | Consult $VERSION variable. | ||||
1240 | |||||
1241 | Development versions of this module may be found at | ||||
1242 | L<http://hatuka.nezumi.nu/repos/MIME-Charset/>. | ||||
1243 | |||||
1244 | =head2 Incompatible Changes | ||||
1245 | |||||
1246 | =over 4 | ||||
1247 | |||||
1248 | =item Release 1.001 | ||||
1249 | |||||
1250 | =over 4 | ||||
1251 | |||||
1252 | =item * | ||||
1253 | |||||
1254 | new() method returns an object when CHARSET argument is not specified. | ||||
1255 | |||||
1256 | =back | ||||
1257 | |||||
1258 | =item Release 1.005 | ||||
1259 | |||||
1260 | =over 4 | ||||
1261 | |||||
1262 | =item * | ||||
1263 | |||||
1264 | Restrict characters in encoded-word according to RFC 2047 section 5 (3). | ||||
1265 | This also affects return value of encoded_header_len() method. | ||||
1266 | |||||
1267 | =back | ||||
1268 | |||||
1269 | =item Release 1.008.2 | ||||
1270 | |||||
1271 | =over 4 | ||||
1272 | |||||
1273 | =item * | ||||
1274 | |||||
1275 | body_encoding() method may also returns C<"S">. | ||||
1276 | |||||
1277 | =item * | ||||
1278 | |||||
1279 | Return value of body_encode() method for UTF-8 may include | ||||
1280 | C<"QUOTED-PRINTABLE"> encoding item that in earlier versions was fixed to | ||||
1281 | C<"BASE64">. | ||||
1282 | |||||
1283 | =back | ||||
1284 | |||||
1285 | =back | ||||
1286 | |||||
1287 | =head1 SEE ALSO | ||||
1288 | |||||
1289 | Multipurpose Internet Mail Extensions (MIME). | ||||
1290 | |||||
1291 | =head1 AUTHOR | ||||
1292 | |||||
1293 | Hatuka*nezumi - IKEDA Soji <hatuka(at)nezumi.nu> | ||||
1294 | |||||
1295 | =head1 COPYRIGHT | ||||
1296 | |||||
1297 | Copyright (C) 2006-2017 Hatuka*nezumi - IKEDA Soji. | ||||
1298 | This program is free software; you can redistribute it and/or modify it | ||||
1299 | under the same terms as Perl itself. | ||||
1300 | |||||
1301 | =cut | ||||
1302 | |||||
1303 | 1; |