#!/usr/bin/perl
# Stand-alone check of how canonic_text() treats ill-formed UTF-8 input.
# Prints "Match" when the canonicalized bytes equal the expected bytes and
# prints nothing otherwise.

use strict;
use warnings;

use Encode qw();

# Optional modules: load them if they are installed, carry on if they are
# not.  canonic_text() checks the modules' $VERSION variables to find out
# whether they were loaded.  A block eval around require is equivalent to
# "use Module qw()" (nothing is imported) without needing a string eval.
BEGIN { eval { require Unicode::UTF8 }; }    # Must exist for Match to work
BEGIN { eval { require Unicode::Normalize }; }

# Stolen from src/lib/Sympa/Tools/Text.pm
#
# canonic_text($text)
#
# Returns a canonical form of $text:
#   - the text is decoded as UTF-8 unless it already is a character string
#     (Unicode::UTF8 is preferred over Encode when it is loaded),
#   - it is normalized to NFC when Unicode::Normalize is loaded,
#   - CR LF pairs and lone CRs are replaced with LF.
#
# Parameter:
#   $text - character string, or byte string holding UTF-8.
#
# Returns:
#   undef when $text is undefined.  Otherwise the canonical text in the
#   same form as the input: a character string if the input was one, a
#   UTF-8 encoded byte string if not.
sub canonic_text {
    my $text = shift;

    # Kept as an explicit "return undef" (not a bare "return") so that
    # list-context callers of this copied function see the same result.
    return undef unless defined $text;

    # Remember the input form once; it selects both the decoding step and
    # the form of the return value.
    my $is_decoded = Encode::is_utf8($text);

    # Normalize text. See also discussion on
    # https://listes.renater.fr/sympa/arc/sympa-developpers/2018-03/thrd1.html
    #
    # N.B.: Corresponding modules are optional by now, and should be
    # mandatory in the future.
    my $utext;
    if ($is_decoded) {
        $utext = $text;
    }
    elsif ($Unicode::UTF8::VERSION) {
        # Warnings of the 'utf8' category are silenced while decoding.
        no warnings 'utf8';
        $utext = Unicode::UTF8::decode_utf8($text);
    }
    else {
        $utext = Encode::decode_utf8($text);
    }

    if ($Unicode::Normalize::VERSION) {
        $utext = Unicode::Normalize::normalize('NFC', $utext);
    }

    # Remove DOS linefeeds (^M) that cause problems with Outlook 98, AOL,
    # and EIMS:
    $utext =~ s/\r\n|\r/\n/g;

    return $is_decoded ? $utext : Encode::encode_utf8($utext);
}

# Derived from t/Tools_Text.t
#
# Four lines of bytes that are not well-formed UTF-8 for interchange.
# (Not named $a/$b: those are the special variables used by sort.)
my $got = canonic_text(
    "\xED\xA0\x80\n"                # three-byte form of surrogate U+D800
        . "\xF4\x8F\xBF\xBE\n"      # four-byte form of U+10FFFE
        . "\xF4\x90\x80\x80\n"      # four-byte form of 0x110000
        . "\xF8\x88\x80\x80\x80\n"  # five-byte sequence
);

# Expected result: U+FFFD replacement characters, one input line per line.
my $expected = Encode::encode_utf8(
    "\x{FFFD}\x{FFFD}\x{FFFD}\n"
        . "\x{FFFD}\n"
        . "\x{FFFD}\x{FFFD}\x{FFFD}\x{FFFD}\n"
        . "\x{FFFD}\x{FFFD}\x{FFFD}\x{FFFD}\x{FFFD}\n"
);

if ($got eq $expected) {
    print "Match\n";
}