Filename | /usr/local/libexec/sympa/Sympa/Tools/Text.pm |
Statements | Executed 5865 statements in 30.5ms |
Calls | P | F | Exclusive Time |
Inclusive Time |
Subroutine |
---|---|---|---|---|---|
345 | 1 | 1 | 6.15ms | 6.15ms | CORE:readline (opcode) | Sympa::Tools::Text::
345 | 1 | 1 | 6.02ms | 53.4ms | slurp | Sympa::Tools::Text::
345 | 1 | 1 | 5.21ms | 35.1ms | canonic_text | Sympa::Tools::Text::
345 | 1 | 1 | 5.20ms | 5.20ms | CORE:open (opcode) | Sympa::Tools::Text::
345 | 1 | 1 | 1.70ms | 1.70ms | CORE:subst (opcode) | Sympa::Tools::Text::
345 | 1 | 1 | 907µs | 907µs | CORE:close (opcode) | Sympa::Tools::Text::
0 | 0 | 0 | 0s | 0s | BEGIN@132 | Sympa::Tools::Text::
0 | 0 | 0 | 0s | 0s | BEGIN@30 | Sympa::Tools::Text::
0 | 0 | 0 | 0s | 0s | BEGIN@31 | Sympa::Tools::Text::
0 | 0 | 0 | 0s | 0s | BEGIN@32 | Sympa::Tools::Text::
0 | 0 | 0 | 0s | 0s | BEGIN@33 | Sympa::Tools::Text::
0 | 0 | 0 | 0s | 0s | BEGIN@34 | Sympa::Tools::Text::
0 | 0 | 0 | 0s | 0s | BEGIN@35 | Sympa::Tools::Text::
0 | 0 | 0 | 0s | 0s | BEGIN@36 | Sympa::Tools::Text::
0 | 0 | 0 | 0s | 0s | BEGIN@37 | Sympa::Tools::Text::
0 | 0 | 0 | 0s | 0s | BEGIN@38 | Sympa::Tools::Text::
0 | 0 | 0 | 0s | 0s | BEGIN@39 | Sympa::Tools::Text::
0 | 0 | 0 | 0s | 0s | BEGIN@40 | Sympa::Tools::Text::
0 | 0 | 0 | 0s | 0s | BEGIN@41 | Sympa::Tools::Text::
0 | 0 | 0 | 0s | 0s | BEGIN@42 | Sympa::Tools::Text::
0 | 0 | 0 | 0s | 0s | BEGIN@44 | Sympa::Tools::Text::
0 | 0 | 0 | 0s | 0s | BEGIN@45 | Sympa::Tools::Text::
0 | 0 | 0 | 0s | 0s | CORE:match (opcode) | Sympa::Tools::Text::
0 | 0 | 0 | 0s | 0s | CORE:regcomp (opcode) | Sympa::Tools::Text::
0 | 0 | 0 | 0s | 0s | __ANON__ (xsub) | Sympa::Tools::Text::
0 | 0 | 0 | 0s | 0s | __ANON__[:178] | Sympa::Tools::Text::
0 | 0 | 0 | 0s | 0s | __ANON__[:188] | Sympa::Tools::Text::
0 | 0 | 0 | 0s | 0s | _gc_length | Sympa::Tools::Text::
0 | 0 | 0 | 0s | 0s | _url_query_string | Sympa::Tools::Text::
0 | 0 | 0 | 0s | 0s | addrencode | Sympa::Tools::Text::
0 | 0 | 0 | 0s | 0s | canonic_email | Sympa::Tools::Text::
0 | 0 | 0 | 0s | 0s | canonic_message_id | Sympa::Tools::Text::
0 | 0 | 0 | 0s | 0s | clip | Sympa::Tools::Text::
0 | 0 | 0 | 0s | 0s | decode_filesystem_safe | Sympa::Tools::Text::
0 | 0 | 0 | 0s | 0s | decode_html | Sympa::Tools::Text::
0 | 0 | 0 | 0s | 0s | encode_filesystem_safe | Sympa::Tools::Text::
0 | 0 | 0 | 0s | 0s | encode_html | Sympa::Tools::Text::
0 | 0 | 0 | 0s | 0s | encode_uri | Sympa::Tools::Text::
0 | 0 | 0 | 0s | 0s | escape_chars | Sympa::Tools::Text::
0 | 0 | 0 | 0s | 0s | foldcase | Sympa::Tools::Text::
0 | 0 | 0 | 0s | 0s | guessed_to_utf8 | Sympa::Tools::Text::
0 | 0 | 0 | 0s | 0s | mailtourl | Sympa::Tools::Text::
0 | 0 | 0 | 0s | 0s | pad | Sympa::Tools::Text::
0 | 0 | 0 | 0s | 0s | qdecode_filename | Sympa::Tools::Text::
0 | 0 | 0 | 0s | 0s | qencode_filename | Sympa::Tools::Text::
0 | 0 | 0 | 0s | 0s | unescape_chars | Sympa::Tools::Text::
0 | 0 | 0 | 0s | 0s | valid_email | Sympa::Tools::Text::
0 | 0 | 0 | 0s | 0s | weburl | Sympa::Tools::Text::
0 | 0 | 0 | 0s | 0s | wrap_text | Sympa::Tools::Text::
Line | State ments |
Time on line |
Calls | Time in subs |
Code |
---|---|---|---|---|---|
1 | # -*- indent-tabs-mode: nil; -*- | ||||
2 | # vim:ft=perl:et:sw=4 | ||||
3 | # $Id$ | ||||
4 | |||||
5 | # Sympa - SYsteme de Multi-Postage Automatique | ||||
6 | # | ||||
7 | # Copyright (c) 1997, 1998, 1999 Institut Pasteur & Christophe Wolfhugel | ||||
8 | # Copyright (c) 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, | ||||
9 | # 2006, 2007, 2008, 2009, 2010, 2011 Comite Reseau des Universites | ||||
10 | # Copyright (c) 2011, 2012, 2013, 2014, 2015, 2016, 2017 GIP RENATER | ||||
11 | # Copyright 2018, 2020 The Sympa Community. See the AUTHORS.md | ||||
12 | # file at the top-level directory of this distribution and at | ||||
13 | # <https://github.com/sympa-community/sympa.git>. | ||||
14 | # | ||||
15 | # This program is free software; you can redistribute it and/or modify | ||||
16 | # it under the terms of the GNU General Public License as published by | ||||
17 | # the Free Software Foundation; either version 2 of the License, or | ||||
18 | # (at your option) any later version. | ||||
19 | # | ||||
20 | # This program is distributed in the hope that it will be useful, | ||||
21 | # but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
22 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||||
23 | # GNU General Public License for more details. | ||||
24 | # | ||||
25 | # You should have received a copy of the GNU General Public License | ||||
26 | # along with this program. If not, see <http://www.gnu.org/licenses/>. | ||||
27 | |||||
28 | package Sympa::Tools::Text; | ||||
29 | |||||
30 | use strict; | ||||
31 | use warnings; | ||||
32 | use feature qw(fc); | ||||
33 | use Encode qw(); | ||||
34 | use English qw(-no_match_vars); | ||||
35 | use Encode::MIME::Header; # 'MIME-Q' encoding. | ||||
36 | use HTML::Entities qw(); | ||||
37 | use MIME::EncWords; | ||||
38 | use Text::LineFold; | ||||
39 | use Unicode::GCString; | ||||
40 | use URI::Escape qw(); | ||||
41 | BEGIN { eval 'use Unicode::Normalize qw()'; } # spent 0s executing statements in string eval | ||||
42 | BEGIN { eval 'use Unicode::UTF8 qw()'; } # spent 0s executing statements in string eval | ||||
43 | |||||
44 | use Sympa::Language; | ||||
45 | use Sympa::Regexps; | ||||
46 | |||||
47 | # Old name: tools::addrencode(). | ||||
48 | sub addrencode { | ||||
49 | my $addr = shift; | ||||
50 | my $phrase = (shift || ''); | ||||
51 | my $charset = (shift || 'utf8'); | ||||
52 | my $comment = (shift || ''); | ||||
53 | |||||
54 | return undef unless $addr =~ /\S/; | ||||
55 | |||||
56 | if ($phrase =~ /[^\s\x21-\x7E]/) { | ||||
57 | $phrase = MIME::EncWords::encode_mimewords( | ||||
58 | Encode::decode('utf8', $phrase), | ||||
59 | 'Encoding' => 'A', | ||||
60 | 'Charset' => $charset, | ||||
61 | 'Replacement' => 'FALLBACK', | ||||
62 | 'Field' => 'Resent-Sender', # almost longest | ||||
63 | 'Minimal' => 'DISPNAME', # needs MIME::EncWords >= 1.012. | ||||
64 | ); | ||||
65 | } elsif ($phrase =~ /\S/) { | ||||
66 | $phrase =~ s/([\\\"])/\\$1/g; | ||||
67 | $phrase = '"' . $phrase . '"'; | ||||
68 | } | ||||
69 | if ($comment =~ /[^\s\x21-\x27\x2A-\x5B\x5D-\x7E]/) { | ||||
70 | $comment = MIME::EncWords::encode_mimewords( | ||||
71 | Encode::decode('utf8', $comment), | ||||
72 | 'Encoding' => 'A', | ||||
73 | 'Charset' => $charset, | ||||
74 | 'Replacement' => 'FALLBACK', | ||||
75 | 'Minimal' => 'DISPNAME', | ||||
76 | ); | ||||
77 | } elsif ($comment =~ /\S/) { | ||||
78 | $comment =~ s/([\\\"])/\\$1/g; | ||||
79 | } | ||||
80 | |||||
81 | return | ||||
82 | ($phrase =~ /\S/ ? "$phrase " : '') | ||||
83 | . ($comment =~ /\S/ ? "($comment) " : '') | ||||
84 | . "<$addr>"; | ||||
85 | } | ||||
86 | |||||
87 | # Old names: tools::clean_email(), tools::get_canonical_email(). | ||||
88 | sub canonic_email { | ||||
89 | my $email = shift; | ||||
90 | |||||
91 | return undef unless defined $email; | ||||
92 | |||||
93 | # Remove leading and trailing white spaces. | ||||
94 | $email =~ s/\A\s+//; | ||||
95 | $email =~ s/\s+\z//; | ||||
96 | |||||
97 | # Lower-case. | ||||
98 | $email =~ tr/A-Z/a-z/; | ||||
99 | |||||
100 | return (length $email) ? $email : undef; | ||||
101 | } | ||||
102 | |||||
103 | # Old name: tools::clean_msg_id(). | ||||
104 | sub canonic_message_id { | ||||
105 | my $msg_id = shift; | ||||
106 | |||||
107 | return $msg_id unless defined $msg_id; | ||||
108 | |||||
109 | chomp $msg_id; | ||||
110 | |||||
111 | if ($msg_id =~ /\<(.+)\>/) { | ||||
112 | $msg_id = $1; | ||||
113 | } | ||||
114 | |||||
115 | return $msg_id; | ||||
116 | } | ||||
117 | |||||
118 | # spent 35.1ms (5.21+29.9) within Sympa::Tools::Text::canonic_text which was called 345 times, avg 102µs/call:
# 345 times (5.21ms+29.9ms) by Sympa::Tools::Text::slurp at line 160, avg 102µs/call | ||||
119 | 345 | 134µs | my $text = shift; | ||
120 | |||||
121 | 345 | 134µs | return undef unless defined $text; | ||
122 | |||||
123 | # Normalize text. See also discussion on | ||||
124 | # https://listes.renater.fr/sympa/arc/sympa-developpers/2018-03/thrd1.html | ||||
125 | # | ||||
126 | # N.B.: Corresponding modules are optional by now, and should be | ||||
127 | # mandatory in the future. | ||||
128 | 345 | 73µs | my $utext; | ||
129 | 345 | 1.36ms | 345 | 338µs | if (Encode::is_utf8($text)) { # spent 338µs making 345 calls to Encode::is_utf8, avg 981ns/call |
130 | $utext = $text; | ||||
131 | } elsif ($Unicode::UTF8::VERSION) { | ||||
132 | no warnings 'utf8'; | ||||
133 | $utext = Unicode::UTF8::decode_utf8($text); | ||||
134 | } else { | ||||
135 | 345 | 5.14ms | 1035 | 9.29ms | $utext = Encode::decode_utf8($text); # spent 5.27ms making 345 calls to Encode::decode_utf8, avg 15µs/call
# spent 3.12ms making 345 calls to Encode::utf8::decode, avg 9µs/call
# spent 900µs making 345 calls to Encode::Encoding::renewed, avg 3µs/call |
136 | } | ||||
137 | 345 | 998µs | 345 | 22.0ms | if ($Unicode::Normalize::VERSION) { # spent 22.0ms making 345 calls to Unicode::Normalize::normalize, avg 64µs/call |
138 | $utext = Unicode::Normalize::normalize('NFC', $utext); | ||||
139 | } | ||||
140 | |||||
141 | # Remove DOS linefeeds (^M) that cause problems with Outlook 98, AOL, | ||||
142 | # and EIMS: | ||||
143 | 345 | 2.25ms | 345 | 1.70ms | $utext =~ s/\r\n|\r/\n/g; # spent 1.70ms making 345 calls to Sympa::Tools::Text::CORE:subst, avg 5µs/call |
144 | |||||
145 | 345 | 626µs | 345 | 133µs | if (Encode::is_utf8($text)) { # spent 133µs making 345 calls to Encode::is_utf8, avg 386ns/call |
146 | return $utext; | ||||
147 | } else { | ||||
148 | 345 | 1.54ms | 345 | 424µs | return Encode::encode_utf8($utext); # spent 424µs making 345 calls to Encode::encode_utf8, avg 1µs/call |
149 | } | ||||
150 | } | ||||
151 | |||||
152 | # spent 53.4ms (6.02+47.3) within Sympa::Tools::Text::slurp which was called 345 times, avg 155µs/call:
# 345 times (6.02ms+47.3ms) by Sympa::Robot::load_topics at line 210 of /usr/local/libexec/sympa/Sympa/Robot.pm, avg 155µs/call | ||||
153 | 345 | 138µs | my $path = shift; | ||
154 | |||||
155 | 345 | 89µs | my $ifh; | ||
156 | 345 | 6.85ms | 345 | 5.20ms | return undef unless open $ifh, '<', $path; # spent 5.20ms making 345 calls to Sympa::Tools::Text::CORE:open, avg 15µs/call |
157 | 1035 | 8.08ms | 345 | 6.15ms | my $text = do { local $RS; <$ifh> }; # spent 6.15ms making 345 calls to Sympa::Tools::Text::CORE:readline, avg 18µs/call |
158 | 345 | 1.51ms | 345 | 907µs | close $ifh; # spent 907µs making 345 calls to Sympa::Tools::Text::CORE:close, avg 3µs/call |
159 | |||||
160 | 345 | 1.57ms | 345 | 35.1ms | return canonic_text($text); # spent 35.1ms making 345 calls to Sympa::Tools::Text::canonic_text, avg 102µs/call |
161 | } | ||||
162 | |||||
163 | sub wrap_text { | ||||
164 | my $text = shift; | ||||
165 | my $init = shift; | ||||
166 | my $subs = shift; | ||||
167 | my $cols = shift; | ||||
168 | |||||
169 | $init //= ''; | ||||
170 | $subs //= ''; | ||||
171 | $cols //= 78; | ||||
172 | return $text unless $cols; | ||||
173 | |||||
174 | my $email_re = Sympa::Regexps::email(); | ||||
175 | my $linefold = Text::LineFold->new( | ||||
176 | Language => Sympa::Language->instance->get_lang, | ||||
177 | Prep => 'NONBREAKURI', | ||||
178 | prep => [$email_re, sub { shift; @_ }], | ||||
179 | ColumnsMax => $cols, | ||||
180 | Format => sub { | ||||
181 | shift; | ||||
182 | my $event = shift; | ||||
183 | my $str = shift; | ||||
184 | if ($event =~ /^eo/) { return "\n"; } | ||||
185 | if ($event =~ /^so[tp]/) { return $init . $str; } | ||||
186 | if ($event eq 'sol') { return $subs . $str; } | ||||
187 | undef; | ||||
188 | }, | ||||
189 | ); | ||||
190 | |||||
191 | my $t = Encode::is_utf8($text) ? $text : Encode::decode_utf8($text); | ||||
192 | |||||
193 | my $ret = ''; | ||||
194 | while (1000 < length $t) { | ||||
195 | my $s = substr $t, 0, 1000; | ||||
196 | $ret .= $linefold->break_partial($s); | ||||
197 | $t = substr $t, 1000; | ||||
198 | } | ||||
199 | $ret .= $linefold->break_partial($t) if length $t; | ||||
200 | $ret .= $linefold->break_partial(undef); | ||||
201 | |||||
202 | return Encode::is_utf8($text) ? $ret : Encode::encode_utf8($ret); | ||||
203 | } | ||||
204 | |||||
205 | sub decode_filesystem_safe { | ||||
206 | my $str = shift; | ||||
207 | return '' unless defined $str and length $str; | ||||
208 | |||||
209 | $str = Encode::encode_utf8($str) if Encode::is_utf8($str); | ||||
210 | # On case-insensitive filesystem "_XX" along with "_xx" should be decoded. | ||||
211 | $str =~ s/_([0-9A-Fa-f]{2})/chr hex "0x$1"/eg; | ||||
212 | return $str; | ||||
213 | } | ||||
214 | |||||
215 | sub decode_html { | ||||
216 | my $str = shift; | ||||
217 | |||||
218 | Encode::encode_utf8( | ||||
219 | HTML::Entities::decode_entities(Encode::decode_utf8($str))); | ||||
220 | } | ||||
221 | |||||
222 | sub encode_filesystem_safe { | ||||
223 | my $str = shift; | ||||
224 | return '' unless defined $str and length $str; | ||||
225 | |||||
226 | $str = Encode::encode_utf8($str) if Encode::is_utf8($str); | ||||
227 | $str =~ s/([^-+.0-9\@A-Za-z])/sprintf '_%02x', ord $1/eg; | ||||
228 | return $str; | ||||
229 | } | ||||
230 | |||||
231 | sub encode_html { | ||||
232 | my $str = shift; | ||||
233 | my $additional_unsafe = shift || ''; | ||||
234 | |||||
235 | HTML::Entities::encode_entities($str, '<>&"' . $additional_unsafe); | ||||
236 | } | ||||
237 | |||||
238 | sub encode_uri { | ||||
239 | my $str = shift; | ||||
240 | my %options = @_; | ||||
241 | |||||
242 | # Note: URI-1.35 (URI::Escape 3.28) or later is required. | ||||
243 | return Encode::encode_utf8( | ||||
244 | URI::Escape::uri_escape_utf8( | ||||
245 | Encode::decode_utf8($str), | ||||
246 | '^-A-Za-z0-9._~' . (exists $options{omit} ? $options{omit} : '') | ||||
247 | ) | ||||
248 | ); | ||||
249 | } | ||||
250 | |||||
251 | # Old name: tools::escape_chars(). | ||||
252 | sub escape_chars { | ||||
253 | my $s = shift; | ||||
254 | my $except = shift; ## Exceptions | ||||
255 | my $ord_except = ord $except if defined $except; | ||||
256 | |||||
257 | ## Escape chars | ||||
258 | ## !"#$%&'()+,:;<=>?[] AND accented chars | ||||
259 | ## escape % first | ||||
260 | foreach my $i ( | ||||
261 | 0x25, | ||||
262 | 0x20 .. 0x24, | ||||
263 | 0x26 .. 0x2c, | ||||
264 | 0x3a .. 0x3f, | ||||
265 | 0x5b, 0x5d, | ||||
266 | 0x80 .. 0x9f, | ||||
267 | 0xa0 .. 0xff | ||||
268 | ) { | ||||
269 | next if defined $ord_except and $i == $ord_except; | ||||
270 | my $hex_i = sprintf "%lx", $i; | ||||
271 | $s =~ s/\x$hex_i/%$hex_i/g; | ||||
272 | } | ||||
273 | ## Special traetment for '/' | ||||
274 | $s =~ s/\//%a5/g unless defined $except and $except eq '/'; | ||||
275 | |||||
276 | return $s; | ||||
277 | } | ||||
278 | |||||
279 | # Old name: tt2::escape_url(). | ||||
280 | # DEPRECATED. Use Sympa::Tools::Text::escape_uri() or | ||||
281 | # Sympa::Tools::Text::mailtourl(). | ||||
282 | #sub escape_url; | ||||
283 | |||||
284 | sub foldcase { | ||||
285 | my $str = shift; | ||||
286 | |||||
287 | return '' unless defined $str and length $str; | ||||
288 | return Encode::encode_utf8(fc(Encode::decode_utf8($str))); | ||||
289 | } | ||||
290 | |||||
291 | my %legacy_charsets = ( | ||||
292 | 'ar' => [qw(iso-8859-6)], | ||||
293 | 'bs' => [qw(iso-8859-2)], | ||||
294 | 'cs' => [qw(iso-8859-2)], | ||||
295 | 'eo' => [qw(iso-8859-3)], | ||||
296 | 'et' => [qw(iso-8859-4)], | ||||
297 | 'he' => [qw(iso-8859-8)], | ||||
298 | 'hr' => [qw(iso-8859-2)], | ||||
299 | 'hu' => [qw(iso-8859-2)], | ||||
300 | 'ja' => [qw(euc-jp cp932 MacJapanese)], | ||||
301 | 'kl' => [qw(iso-8859-4)], | ||||
302 | 'ko' => [qw(cp949)], | ||||
303 | 'lt' => [qw(iso-8859-4)], | ||||
304 | 'lv' => [qw(iso-8859-4)], | ||||
305 | 'mt' => [qw(iso-8859-3)], | ||||
306 | 'pl' => [qw(iso-8859-2)], | ||||
307 | 'ro' => [qw(iso-8859-2)], | ||||
308 | 'ru' => [qw(koi8-r cp1251)], # cp866? MacCyrillic? | ||||
309 | 'sk' => [qw(iso-8859-2)], | ||||
310 | 'sl' => [qw(iso-8859-2)], | ||||
311 | 'th' => [qw(iso-8859-11 cp874 MacThai)], | ||||
312 | 'tr' => [qw(iso-8859-9)], | ||||
313 | 'uk' => [qw(koi8-u)], # MacUkrainian? | ||||
314 | 'zh-CN' => [qw(euc-cn)], | ||||
315 | 'zh-TW' => [qw(big5-eten)], | ||||
316 | ); | ||||
317 | |||||
318 | sub guessed_to_utf8 { | ||||
319 | my $text = shift; | ||||
320 | my @langs = @_; | ||||
321 | |||||
322 | return Encode::encode_utf8($text) if Encode::is_utf8($text); | ||||
323 | return $text | ||||
324 | unless defined $text | ||||
325 | and length $text | ||||
326 | and $text =~ /[^\x00-\x7F]/; | ||||
327 | |||||
328 | my $utf8; | ||||
329 | if ($Unicode::UTF8::VERSION) { | ||||
330 | $utf8 = | ||||
331 | eval { Unicode::UTF8::decode_utf8($text, Encode::FB_CROAK()) }; | ||||
332 | } | ||||
333 | unless (defined $utf8) { | ||||
334 | foreach my $charset (map { $_ ? @$_ : () } @legacy_charsets{@langs}) { | ||||
335 | $utf8 = | ||||
336 | eval { Encode::decode($charset, $text, Encode::FB_CROAK()) }; | ||||
337 | last if defined $utf8; | ||||
338 | } | ||||
339 | } | ||||
340 | unless (defined $utf8) { | ||||
341 | $utf8 = Encode::decode('iso-8859-1', $text); | ||||
342 | } | ||||
343 | |||||
344 | # Apply NFC: e.g. for modified-NFD by Mac OS X. | ||||
345 | $utf8 = Unicode::Normalize::normalize('NFC', $utf8) | ||||
346 | if $Unicode::Normalize::VERSION; | ||||
347 | |||||
348 | return Encode::encode_utf8($utf8); | ||||
349 | } | ||||
350 | |||||
351 | sub mailtourl { | ||||
352 | my $text = shift; | ||||
353 | my %options = @_; | ||||
354 | |||||
355 | my $dtext = | ||||
356 | (not defined $text) ? '' | ||||
357 | : $options{decode_html} ? Sympa::Tools::Text::decode_html($text) | ||||
358 | : $text; | ||||
359 | $dtext =~ s/\A\s+//; | ||||
360 | $dtext =~ s/\s+\z//; | ||||
361 | $dtext =~ s/(?:\r\n|\r|\n)(?=[ \t])//g; | ||||
362 | $dtext =~ s/\r\n|\r|\n/ /g; | ||||
363 | |||||
364 | # The ``@'' in email address should not be encoded because some MUAs | ||||
365 | # aren't able to decode ``%40'' in e-mail address of mailto: URL. | ||||
366 | # Contrary, ``@'' in query component should be encoded because some | ||||
367 | # MUAs take it for a delimiter to separate URL from the rest. | ||||
368 | my ($format, $utext, $qsep); | ||||
369 | if ($dtext =~ /[()<>\[\]:;,\"\s]/) { | ||||
370 | # Use "to" header if source text includes any of RFC 5322 | ||||
371 | # "specials", minus ``@'' and ``\'', plus whitespaces. | ||||
372 | $format = 'mailto:?to=%s%s'; | ||||
373 | $utext = Sympa::Tools::Text::encode_uri($dtext); | ||||
374 | $qsep = '&'; | ||||
375 | } else { | ||||
376 | $format = 'mailto:%s%s'; | ||||
377 | $utext = Sympa::Tools::Text::encode_uri($dtext, omit => '@'); | ||||
378 | $qsep = '?'; | ||||
379 | } | ||||
380 | my $qstring = _url_query_string( | ||||
381 | $options{query}, | ||||
382 | decode_html => $options{decode_html}, | ||||
383 | leadchar => $qsep, | ||||
384 | sepchar => '&', | ||||
385 | trim_values => 1, | ||||
386 | ); | ||||
387 | |||||
388 | return sprintf $format, $utext, $qstring; | ||||
389 | } | ||||
390 | |||||
391 | sub _url_query_string { | ||||
392 | my $query = shift; | ||||
393 | my %options = @_; | ||||
394 | |||||
395 | unless (ref $query eq 'HASH' and %$query) { | ||||
396 | return ''; | ||||
397 | } else { | ||||
398 | my $decode_html = $options{decode_html}; | ||||
399 | my $trim_values = $options{trim_values}; | ||||
400 | return ($options{leadchar} || '?') . join( | ||||
401 | ($options{sepchar} || ';'), | ||||
402 | map { | ||||
403 | my ($dkey, $dval) = map { | ||||
404 | (not defined $_) ? '' | ||||
405 | : $decode_html ? Sympa::Tools::Text::decode_html($_) | ||||
406 | : $_; | ||||
407 | } ($_, $query->{$_}); | ||||
408 | if ($trim_values and lc $dkey ne 'body') { | ||||
409 | $dval =~ s/\A\s+//; | ||||
410 | $dval =~ s/\s+\z//; | ||||
411 | $dval =~ s/(?:\r\n|\r|\n)(?=[ \t])//g; | ||||
412 | $dval =~ s/\r\n|\r|\n/ /g; | ||||
413 | } | ||||
414 | |||||
415 | sprintf '%s=%s', | ||||
416 | Sympa::Tools::Text::encode_uri($dkey), | ||||
417 | Sympa::Tools::Text::encode_uri($dval); | ||||
418 | } sort keys %$query | ||||
419 | ); | ||||
420 | } | ||||
421 | } | ||||
422 | |||||
423 | sub pad { | ||||
424 | my $str = shift; | ||||
425 | my $width = shift; | ||||
426 | |||||
427 | return $str unless $width and defined $str; | ||||
428 | |||||
429 | my $ustr = Encode::is_utf8($str) ? $str : Encode::decode_utf8($str); | ||||
430 | my $cols = Unicode::GCString->new($ustr)->columns; | ||||
431 | |||||
432 | unless ($cols < abs $width) { | ||||
433 | return $str; | ||||
434 | } elsif ($width < 0) { | ||||
435 | return $str . (' ' x (-$width - $cols)); | ||||
436 | } else { | ||||
437 | return (' ' x ($width - $cols)) . $str; | ||||
438 | } | ||||
439 | } | ||||
440 | |||||
441 | # Old name: tools::qdecode_filename(). | ||||
442 | sub qdecode_filename { | ||||
443 | my $filename = shift; | ||||
444 | |||||
445 | ## We don't use MIME::Words here because it does not encode properly | ||||
446 | ## Unicode | ||||
447 | ## Check if string is already Q-encoded first | ||||
448 | #if ($filename =~ /\=\?UTF-8\?/) { | ||||
449 | $filename = Encode::encode_utf8(Encode::decode('MIME-Q', $filename)); | ||||
450 | #} | ||||
451 | |||||
452 | return $filename; | ||||
453 | } | ||||
454 | |||||
455 | # Old name: tools::qencode_filename(). | ||||
456 | sub qencode_filename { | ||||
457 | my $filename = shift; | ||||
458 | |||||
459 | ## We don't use MIME::Words here because it does not encode properly | ||||
460 | ## Unicode | ||||
461 | ## Check if string is already Q-encoded first | ||||
462 | ## Also check if the string contains 8bit chars | ||||
463 | unless ($filename =~ /\=\?UTF-8\?/ | ||||
464 | || $filename =~ /^[\x00-\x7f]*$/) { | ||||
465 | |||||
466 | ## Don't encode elements such as .desc. or .url or .moderate | ||||
467 | ## or .extension | ||||
468 | my $part = $filename; | ||||
469 | my ($leading, $trailing); | ||||
470 | $leading = $1 if ($part =~ s/^(\.desc\.)//); ## leading .desc | ||||
471 | $trailing = $1 if ($part =~ s/((\.\w+)+)$//); ## trailing .xx | ||||
472 | |||||
473 | my $encoded_part = MIME::EncWords::encode_mimewords( | ||||
474 | $part, | ||||
475 | Charset => 'utf8', | ||||
476 | Encoding => 'q', | ||||
477 | MaxLineLen => 1000, | ||||
478 | Minimal => 'NO' | ||||
479 | ); | ||||
480 | |||||
481 | $filename = $leading . $encoded_part . $trailing; | ||||
482 | } | ||||
483 | |||||
484 | return $filename; | ||||
485 | } | ||||
486 | |||||
487 | sub clip { | ||||
488 | my $string = shift; | ||||
489 | return undef unless @_; | ||||
490 | my $length = shift; | ||||
491 | |||||
492 | my ($gcstr, $blen); | ||||
493 | if (ref $string eq 'Unicode::GCString') { | ||||
494 | $gcstr = $string; | ||||
495 | $blen = length Encode::encode_utf8($string->as_string); | ||||
496 | } elsif (Encode::is_utf8($string)) { | ||||
497 | $gcstr = Unicode::GCString->new($string); | ||||
498 | $blen = length Encode::encode_utf8($string); | ||||
499 | } else { | ||||
500 | $gcstr = Unicode::GCString->new(Encode::decode_utf8($string)); | ||||
501 | $blen = length $string; | ||||
502 | } | ||||
503 | |||||
504 | $length += $blen if $length < 0; | ||||
505 | return '' if $length < 0; # out of range | ||||
506 | return $string if $blen <= $length; | ||||
507 | |||||
508 | my $result = $gcstr->substr(0, _gc_length($gcstr, $length)); | ||||
509 | |||||
510 | if (ref $string eq 'Unicode::GCString') { | ||||
511 | return $result; | ||||
512 | } elsif (Encode::is_utf8($string)) { | ||||
513 | return $result->as_string; | ||||
514 | } else { | ||||
515 | return Encode::encode_utf8($result->as_string); | ||||
516 | } | ||||
517 | } | ||||
518 | |||||
519 | sub _gc_length { | ||||
520 | my $gcstr = shift; | ||||
521 | my $length = shift; | ||||
522 | |||||
523 | return 0 unless $gcstr->length; | ||||
524 | return 0 unless $length; | ||||
525 | |||||
526 | my ($shorter, $longer) = (0, $gcstr->length); | ||||
527 | while ($shorter < $longer) { | ||||
528 | my $cur = ($shorter + $longer + 1) >> 1; | ||||
529 | my $elen = | ||||
530 | length Encode::encode_utf8($gcstr->substr(0, $cur)->as_string); | ||||
531 | if ($elen <= $length) { | ||||
532 | $shorter = $cur; | ||||
533 | } else { | ||||
534 | $longer = $cur - 1; | ||||
535 | } | ||||
536 | } | ||||
537 | |||||
538 | return $shorter; | ||||
539 | } | ||||
540 | |||||
541 | # Old name: tools::unescape_chars(). | ||||
542 | sub unescape_chars { | ||||
543 | my $s = shift; | ||||
544 | |||||
545 | $s =~ s/%a5/\//g; ## Special traetment for '/' | ||||
546 | foreach my $i (0x20 .. 0x2c, 0x3a .. 0x3f, 0x5b, 0x5d, 0x80 .. 0x9f, | ||||
547 | 0xa0 .. 0xff) { | ||||
548 | my $hex_i = sprintf "%lx", $i; | ||||
549 | my $hex_s = sprintf "%c", $i; | ||||
550 | $s =~ s/%$hex_i/$hex_s/g; | ||||
551 | } | ||||
552 | |||||
553 | return $s; | ||||
554 | } | ||||
555 | |||||
556 | # Old name: tools::valid_email(). | ||||
557 | sub valid_email { | ||||
558 | my $email = shift; | ||||
559 | |||||
560 | my $email_re = Sympa::Regexps::email(); | ||||
561 | return undef unless $email =~ /^${email_re}$/; | ||||
562 | |||||
563 | # Forbidden characters. | ||||
564 | return undef if $email =~ /[\|\$\*\?\!]/; | ||||
565 | |||||
566 | return 1; | ||||
567 | } | ||||
568 | |||||
569 | sub weburl { | ||||
570 | my $base = shift; | ||||
571 | my $paths = shift; | ||||
572 | my %options = @_; | ||||
573 | |||||
574 | my @paths = map { | ||||
575 | Sympa::Tools::Text::encode_uri( | ||||
576 | (not defined $_) ? '' | ||||
577 | : $options{decode_html} ? Sympa::Tools::Text::decode_html($_) | ||||
578 | : $_ | ||||
579 | ); | ||||
580 | } @{$paths || []}; | ||||
581 | |||||
582 | my $qstring = _url_query_string( | ||||
583 | $options{query}, | ||||
584 | decode_html => $options{decode_html}, | ||||
585 | sepchar => '&', | ||||
586 | ); | ||||
587 | |||||
588 | my $fstring; | ||||
589 | my $fragment = $options{fragment}; | ||||
590 | if (defined $fragment) { | ||||
591 | $fstring = '#' | ||||
592 | . Sympa::Tools::Text::encode_uri( | ||||
593 | $options{decode_html} | ||||
594 | ? Sympa::Tools::Text::decode_html($fragment) | ||||
595 | : $fragment | ||||
596 | ); | ||||
597 | } else { | ||||
598 | $fstring = ''; | ||||
599 | } | ||||
600 | |||||
601 | return sprintf '%s%s%s', join('/', grep { defined $_ } ($base, @paths)), | ||||
602 | $qstring, $fstring; | ||||
603 | } | ||||
604 | |||||
605 | 1; | ||||
606 | __END__ | ||||
# spent 907µs within Sympa::Tools::Text::CORE:close which was called 345 times, avg 3µs/call:
# 345 times (907µs+0s) by Sympa::Tools::Text::slurp at line 158, avg 3µs/call | |||||
# spent 5.20ms within Sympa::Tools::Text::CORE:open which was called 345 times, avg 15µs/call:
# 345 times (5.20ms+0s) by Sympa::Tools::Text::slurp at line 156, avg 15µs/call | |||||
# spent 6.15ms within Sympa::Tools::Text::CORE:readline which was called 345 times, avg 18µs/call:
# 345 times (6.15ms+0s) by Sympa::Tools::Text::slurp at line 157, avg 18µs/call | |||||
# spent 1.70ms within Sympa::Tools::Text::CORE:subst which was called 345 times, avg 5µs/call:
# 345 times (1.70ms+0s) by Sympa::Tools::Text::canonic_text at line 143, avg 5µs/call |