libunicode-string-perl/t/encoding.t

#!perl -w

use strict;
use Test qw(plan ok);

plan tests => 59;

use Unicode::String qw(latin1 ucs4 utf32le utf16 utf16le utf8 utf7);

#use Devel::Dump;

$SIG{__WARN__} = sub { print "$_[0]"; };

my $u = latin1("abcæøå");
#Dump($u);

#---- Test Latin1 encoding ----

ok($u->latin1, "abcæøå");
ok($u->length, 6);
ok($u->ucs4, "\0\0\0a\0\0\0b\0\0\0c\0\0\0æ\0\0\0ø\0\0\0å");
ok($u->utf32, "\0\0\0a\0\0\0b\0\0\0c\0\0\0æ\0\0\0ø\0\0\0å");
ok($u->utf32be, "\0\0\0a\0\0\0b\0\0\0c\0\0\0æ\0\0\0ø\0\0\0å");
ok($u->utf32le, "a\0\0\0b\0\0\0c\0\0\0æ\0\0\0ø\0\0\0å\0\0\0");
ok($u->utf16, "\0a\0b\0c\0æ\0ø\0å");
ok($u->utf16be, "\0a\0b\0c\0æ\0ø\0å");
ok($u->ucs2, "\0a\0b\0c\0æ\0ø\0å");
ok($u->utf16le, "a\0b\0c\0æ\0ø\0å\0");
ok($u->utf8, "abcÃ¦Ã¸Ã¥");
ok($u->utf7, "abc+AOYA+ADl-");
ok($u->hex, "U+0061 U+0062 U+0063 U+00e6 U+00f8 U+00e5");

$u = latin1("abc");
$a = $u->latin1("def");
$b = $u->latin1;
$u->latin1("ghi");

ok($a, "abc");
ok($b, "def");
ok($u->latin1, "ghi");


$u = utf16("aa\0bcc\0d");

print "Expect 2 lines of warnings...\n";
my $x = $u->latin1;

ok($x, "bd");

#---- Test UCS4 encoding ----

$x = "\0\0\0a\0\0bb\0\3cc\0\1\0\2\0\0\0\0";
$u = ucs4($x);

ok($u->length, 7);
ok($u->hex, "U+0061 U+6262 U+d898 U+df63 U+d800 U+dc02 U+0000");
ok($u->ucs4, $x);

$a = $u->ucs4("");
ok($a, $x);
ok($u->length, 0);

$u = utf32le("a\0\0\0" . "bb\0\0" . "cc\3\0" . "\2\0\1\0" . "\0\0\0\0");
ok($u->length, 7);
ok($u->hex, "U+0061 U+6262 U+d898 U+df63 U+d800 U+dc02 U+0000");
ok($u->ucs4, $x);

print "Expect 2 lines of warnings...\n";
$u->ucs4("    \0\x10\xff\xff\0\x11\0\0\0\0\0\0");
ok($u->hex, "U+dbff U+dfff U+0000");

#--- Test UTF8 encoding ---

$u = utf8("");
ok($u->length, 0);
ok($u->utf8, "");

$u = utf8("abc");
my $old = $u->utf8("def");
ok($old, "abc");
ok($u->latin1, "def");

$u = utf16("\0a\0å\1\0\7a\0aa");
$x = unpack("H*", $u->utf8);
ok($x, "61c3a5c480dda161e68480");

my $u2 = utf8($u->utf8);
ok($u->utf16, $u2->utf16);

# Test surrogates and utf8
print "Surrogates...\n";

$u = ucs4("\0\1\0\0\0\x10\xFF\xFF");
$x = unpack("H*", $u->utf8);
ok($x, "f0908080f48fbfbf");

$u->utf8(pack("H*", $x));
ok($u->ucs4, "\0\1\0\0\0\x10\xFF\xFF");

print "Expect a warning with this incomplete surrogate pair...\n";
$u = utf16("\xd8\x00");
$u2 = utf8($u->utf8);
ok($u2->hex, "U+d800");

print "...and lots of noice from this...\n";
$u = utf8("¤¤a\xf7¤¤¤b\xf8¤¤¤¤c\xfc¤¤¤¤¤d\xfd\xfe\xffef");
print $u->hex, "\n";

ok($u->utf8, "abcdef");


#--- Test UTF7 encoding ---

# Examples from RFC 1642...
#
#      Example. The Unicode sequence "A<NOT IDENTICAL TO><ALPHA>."
#      (hexadecimal 0041,2262,0391,002E) may be encoded as follows:
#
#            A+ImIDkQ.
#
#      Example. The Unicode sequence "Hi Mom <WHITE SMILING FACE>!"
#      (hexadecimal 0048, 0069, 0020, 004D, 006F, 004D, 0020, 263A, 0021)
#      may be encoded as follows:
#
#            Hi Mom +Jjo-!

$u = utf7("A+ImIDkQ.");
ok($u->hex, "U+0041 U+2262 U+0391 U+002e");

my $utf7 = $u->utf7("Hi Mom +Jjo-!");
ok($utf7, qr/^A\+ImIDkQ-?\.$/);

ok($u->hex, "U+0048 U+0069 U+0020 U+004d U+006f U+006d U+0020 U+263a U+0021");
ok($u->utf7 eq "Hi Mom +Jjo-!" || $u->utf7 eq "Hi Mom +JjoAIQ-");

#      Example. The Unicode sequence representing the Han characters for
#      the Japanese word "nihongo" (hexadecimal 65E5,672C,8A9E) may be
#      encoded as follows:

$u = utf7("+ZeVnLIqe-");
ok($u->hex, "U+65e5 U+672c U+8a9e");
ok($u->utf7, "+ZeVnLIqe-");

# Appendix A -- Examples
#
#   Here is a longer example, taken from a document originally in Big5
#   code. It has been condensed for brevity. There are two versions: the
#   first uses optional characters from set O (and thus may not pass
#   through some mail gateways), and the second uses no optional
#   characters.

my $text = <<'EOT';
   Below is the full Chinese text of the Analects (+itaKng-).

   The sources for the text are:

   "The sayings of Confucius," James R. Ware, trans.  +U/BTFw-:
   +ZYeB9FH6ckh5Pg-, 1980.  (Chinese text with English translation)

   +Vttm+E6UfZM-, +W4tRQ066bOg-, +UxdOrA-:  +Ti1XC2b4Xpc-, 1990.

   "The Chinese Classics with a Translation, Critical and
   Exegetical Notes, Prolegomena, and Copius Indexes," James
   Legge, trans., Taipei:  Southern Materials Center Publishing,
   Inc., 1991.  (Chinese text with English translation)

   Big Five and GB versions of the text are being made available
   separately.

   Neither the Big Five nor GB contain all the characters used in
   this text.  Missing characters have been indicated using their
   Unicode/ISO 10646 code points.  "U+-" followed by four
   hexadecimal digits indicates a Unicode/10646 code (e.g.,
   U+-9F08).  There is no good solution to the problem of the small
   size of the Big Five/GB character sets; this represents the
   solution I find personally most satisfactory.

   (omitted...)

   I have tried to minimize this problem by using variant
   characters where they were available and the character
   actually in the text was not.  Only variants listed as such in
   the +XrdxmVtXUXg- were used.

   (omitted...)

   John H. Jenkins
   +TpVPXGBG-
   John_Jenkins@taligent.com
   5 January 1993
EOT

$u = utf7($text);
my $utf = $u->utf7;

unless ($utf eq $text) {
   print $u->length, " $utf\n";
   open(F, ">utf7-$$.orig"); print F $text;
   open(F, ">utf7-$$.enc");  print F $utf;
   close(F);
   system("diff -u0 utf7-$$.orig utf7-$$.enc");
   unlink("utf7-$$.orig", "utf7-$$.enc");
}

ok($utf, $text);

# Test encoding of different encoding byte lengths
for my $len (1 .. 6) {
   $u = Unicode::String->new;
   $u->pack(map {1000 + $_} 1 .. $len);
   $u2 = utf7($u->utf7);
   ok($u->utf16, $u2->utf16);
}

$Unicode::String::UTF7_OPTIONAL_DIRECT_CHARS = 0;

$u = latin1("a=4!æøå");
$utf = $u->utf7;

ok($utf7 !~ /[=!]/);
ok(utf7($utf)->latin1, "a=4!æøå");

#--- Swapped bytes ---

$u = utf16("ÿþa\0b\0c\0");
ok($u->hex, "U+feff U+0061 U+0062 U+0063");
ok($u->latin1, "abc");

$u = utf16("þÿ\0a\0b\0c");
ok($u->hex, "U+feff U+0061 U+0062 U+0063");
ok($u->latin1, "abc");

$u = utf16le("ÿþa\0b\0c\0");
ok($u->hex, "U+feff U+0061 U+0062 U+0063");
ok($u->latin1, "abc");

$u = utf16le("þÿ\0a\0b\0c");
ok($u->hex, "U+feff U+0061 U+0062 U+0063");
ok($u->latin1, "abc");