From 83bbd0422ae8c839c9182529f0fbf82900387379 Mon Sep 17 00:00:00 2001 From: "Michael G. Schwern" Date: Tue, 9 Aug 2016 10:27:37 -0700 Subject: [PATCH] Choose a utf-8 encoding that will not break threads. ...or fork() on Windows (implemented with threads). I've deliberately avoided tests for which particular utf-8 encoding is used in what scenario because that behavior is deliberately left undefined for future-proofing. All utf8::all guarantees is threads still work. --- lib/utf8/all.pm | 33 ++++++++++++++++++++++++++++++++- t/lexical-again.t | 5 ++--- t/open.t | 6 +++--- t/threads.t | 33 +++++++++++++++++++++++++++++++++ t/utf8.t | 4 +--- 5 files changed, 71 insertions(+), 10 deletions(-) create mode 100644 t/threads.t diff --git a/lib/utf8/all.pm b/lib/utf8/all.pm index 5574c4a..cdeaf93 100644 --- a/lib/utf8/all.pm +++ b/lib/utf8/all.pm @@ -110,15 +110,21 @@ L for fully utf-8 aware Cwd functions. use Import::Into; use parent qw(Encode charnames utf8 open warnings feature); use Symbol qw(qualify_to_ref); +use Config; # Holds the pointers to the original version of redefined functions state %_orig_functions; sub import { + my $class = shift; + # Enable features/pragmas in calling package my $target = caller; + + my $utf8_encoding = $class->_choose_utf8_encoding; + 'utf8'->import::into($target); - 'open'->import::into($target, qw{:encoding(UTF-8) :std}); + 'open'->import::into($target => $utf8_encoding, ':std'); 'charnames'->import::into($target, qw{:full :short}); 'warnings'->import::into($target, qw{FATAL utf8}); 'feature'->import::into($target, qw{unicode_strings}) if $^V >= v5.11.0; @@ -208,6 +214,31 @@ sub _utf8_glob { } } +sub _choose_utf8_encoding { + # No threads? No problem. + return ':encoding(UTF-8)' if !$Config{usethreads} && !$Config{useithreads}; + + # 5.24.0 seems to have fixed the major utf8 issues. + return ':encoding(UTF-8)' if $^V ge 5.24.0; + + # A safe default. + return ':utf8'; +} + +=head1 WHICH UTF-8 ENCODING? + +I. Perl's unicode has bugs. utf8::all will try to work around them. + +As of this writing, Perl has several ways to do utf-8. It has to do +with whether "unassigned" code points are considered errors or +not. The details are in L. Perl also has lots of Unicode bugs, particularly with threads and +strict utf-8 encoding (ie. C<:encoding(UTF-8)>). + +utf8::all will prefer the strictest encoding available, but it may +choose a less strict utf-8 encoding if it detects your Perl is +vulnerable to Unicode bugs. This should have no effect on how valid +utf-8 is handled. + =head1 INTERACTION WITH AUTODIE If you use L, which is a great idea, you need to use at least version diff --git a/t/lexical-again.t b/t/lexical-again.t index a1f4a05..6f2877a 100644 --- a/t/lexical-again.t +++ b/t/lexical-again.t @@ -2,7 +2,7 @@ # no utf8::all should disable its effects lexically # Note: Changes to @ARGV, STDIN, STDOU, and STDERR are always global! -use Test::More tests => 17; +use Test::More; use PerlIO; my $expected_unicode = "\x{30c6}\x{30b9}\x{30c8}"; # Unicode characters @@ -52,7 +52,6 @@ for my $fh (keys %handles) { my @layers = PerlIO::get_layers($handles{$fh}); ok(grep(m/utf8/, @layers), "$fh: utf8 does appear in the perlio layers") or diag explain { $fh => \@layers }; - ok(grep(m/utf-8-strict/, @layers), "$fh: utf-8-strict does appear in the perlio layers") - or diag explain { $fh => \@layers }; } +done_testing; diff --git a/t/open.t b/t/open.t index b782395..227fa8d 100644 --- a/t/open.t +++ b/t/open.t @@ -2,14 +2,14 @@ # Test opening an actual file use utf8::all; use PerlIO; -use Test::More tests => 4; +use Test::More; ok open my $in, '<', 'corpus/testfile'; my @layers = PerlIO::get_layers($in); ok(grep(m/utf8/, @layers), 'utf8 appears in the perlio layers') or diag explain { $fh => \@layers }; -ok(grep(m/utf-8-strict/, @layers), 'utf-8-strict appears in the perlio layers') - or diag explain { $fh => \@layers }; my $contents = do { local $/; <$in>}; is $contents, "\x{30c6}\x{30b9}\x{30c8}\n", 'unicode retrieved OK'; + +done_testing; diff --git a/t/threads.t b/t/threads.t new file mode 100644 index 0000000..4ff1d27 --- /dev/null +++ b/t/threads.t @@ -0,0 +1,33 @@ +#!perl + +# Test that utf8::all is choosing the right encoding to not +# tickle thread bugs. + +use strict; +use warnings; + +# This is loaded before threads. It will not be aware of tests run in +# a thread. +use Test::More 0.96; +use Config; + +BEGIN { + plan skip_all => "Requires threads" + if !$Config{usethreads}; +} + +# Deliberately before loading threads so we don't cheat and check +# if threads are loaded, that would be brittle. +use utf8::all; + +use threads; +use threads::shared; + +note "basic utf8 + threads bug"; { + my $ok :shared = 0; + my $t = threads->create(sub { $ok = 1; }); + $t->join(); + ok $ok, "threads ok with utf8::all"; +} + +done_testing; diff --git a/t/utf8.t b/t/utf8.t index b767890..fba32b2 100644 --- a/t/utf8.t +++ b/t/utf8.t @@ -19,8 +19,6 @@ use Test::More; my @layers = PerlIO::get_layers($fh); ok(grep(m/utf8/, @layers), 'utf8 appears in the perlio layers') or diag explain { $fh => \@layers }; - ok(grep(m/utf-8-strict/, @layers), 'utf-8-strict appears in the perlio layers') - or diag explain { $fh => \@layers }; } } @@ -33,7 +31,7 @@ use Test::More; END { unlink "perlio_test2" } my @layers = PerlIO::get_layers($test_fh); - SKIP: { + SKIP: { # If we have the Perl Unicode flag set that adds the UTF-8 layer, # we need to skip this test. skip 'Perl Unicode flag set that always adds UTF-8 layer to output', 1 if (${^UNICODE} & 16);