997 lines
22 KiB
Perl
997 lines
22 KiB
Perl
package Unicode::String;
|
||
|
||
# Copyright 1997-1999, Gisle Aas.
|
||
|
||
use strict;
|
||
use vars qw($VERSION @ISA @EXPORT_OK $UTF7_OPTIONAL_DIRECT_CHARS);
|
||
use Carp;
|
||
|
||
require Exporter;
|
||
require DynaLoader;
|
||
@ISA = qw(Exporter DynaLoader);
|
||
|
||
@EXPORT_OK = qw(
|
||
utf16 utf16le utf16be ucs2
|
||
utf8
|
||
utf7
|
||
ucs4 utf32 utf32be utf32le
|
||
latin1
|
||
uchr uhex
|
||
|
||
byteswap2 byteswap4
|
||
);
|
||
|
||
$VERSION = '2.10';
|
||
|
||
$UTF7_OPTIONAL_DIRECT_CHARS ||= 1;
|
||
|
||
bootstrap Unicode::String $VERSION;
|
||
|
||
use overload '""' => \&as_string,
|
||
'bool' => \&as_bool,
|
||
'0+' => \&as_num,
|
||
'.=' => \&append,
|
||
'.' => \&concat,
|
||
'x' => \&repeat,
|
||
'=' => \©,
|
||
'fallback' => 1;
|
||
|
||
my %stringify = (
|
||
unicode => \&utf16,
|
||
utf16 => \&utf16,
|
||
utf16be => \&utf16,
|
||
utf16le => \&utf16le,
|
||
ucs2 => \&utf16,
|
||
utf8 => \&utf8,
|
||
utf7 => \&utf7,
|
||
ucs4 => \&ucs4,
|
||
utf32 => \&ucs4,
|
||
utf32be => \&ucs4,
|
||
utf32le => \&utf32le,
|
||
latin1 => \&latin1,
|
||
'hex' => \&hex,
|
||
);
|
||
|
||
my $stringify_as = \&utf8;
|
||
|
||
# some aliases
|
||
*ucs2 = \&utf16;
|
||
*utf16be = \&utf16;
|
||
*utf32 = \&ucs4;
|
||
*utf32be = \&ucs4;
|
||
*uhex = \&hex;
|
||
*uchr = \&chr;
|
||
|
||
sub new
|
||
{
|
||
#_dump_arg("new", @_);
|
||
my $class = shift;
|
||
my $str;
|
||
my $self = bless \$str, $class;
|
||
&$stringify_as($self, shift) if @_;
|
||
$self;
|
||
}
|
||
|
||
|
||
sub repeat
|
||
{
|
||
my($self, $count) = @_;
|
||
my $class = ref($self);
|
||
my $str = $$self x $count;
|
||
bless \$str, $class;
|
||
}
|
||
|
||
|
||
sub _dump_arg
|
||
{
|
||
my $func = shift;
|
||
print "$func(";
|
||
print join(",", map { if (defined $_) {
|
||
my $x = overload::StrVal($_);
|
||
$x =~ s/\n/\\n/g;
|
||
$x = '""' unless length $x;
|
||
$x;
|
||
} else {
|
||
"undef"
|
||
}
|
||
} @_);
|
||
print ")\n";
|
||
}
|
||
|
||
|
||
sub concat
|
||
{
|
||
#_dump_arg("concat", @_);
|
||
my($self, $other, $reversed) = @_;
|
||
my $class = ref($self);
|
||
unless (UNIVERSAL::isa($other, 'Unicode::String')) {
|
||
$other = Unicode::String->new($other);
|
||
}
|
||
my $str = $reversed ? $$other . $$self : $$self . $$other;
|
||
bless \$str, $class;
|
||
}
|
||
|
||
|
||
sub append
|
||
{
|
||
#_dump_arg("append", @_);
|
||
my($self, $other) = @_;
|
||
unless (UNIVERSAL::isa($other, 'Unicode::String')) {
|
||
$other = Unicode::String->new($other);
|
||
}
|
||
$$self .= $$other;
|
||
$self;
|
||
}
|
||
|
||
|
||
sub copy
|
||
{
|
||
my($self) = @_;
|
||
my $class = ref($self);
|
||
my $copy = $$self;
|
||
bless \$copy, $class;
|
||
}
|
||
|
||
|
||
sub as_string
|
||
{
|
||
#_dump_arg("as_string", @_);
|
||
&$stringify_as($_[0]);
|
||
}
|
||
|
||
|
||
sub as_bool
|
||
{
|
||
# This is different from perl's normal behaviour by not letting
|
||
# a U+0030 ("0") be false.
|
||
my $self = shift;
|
||
$$self ? 1 : "";
|
||
}
|
||
|
||
|
||
sub as_num
|
||
{
|
||
# Should be able to use the numeric property from Unidata
|
||
# in order to parse a large number of numbers. Currently we
|
||
# only convert it to a plain string and let perl's normal
|
||
# num-converter do the job.
|
||
my $self = shift;
|
||
my $str = $self->utf8;
|
||
$str + 0;
|
||
}
|
||
|
||
|
||
sub stringify_as
|
||
{
|
||
my $class;
|
||
if (@_ > 1) {
|
||
$class = shift;
|
||
$class = ref($class) if ref($class);
|
||
} else {
|
||
$class = "Unicode::String";
|
||
}
|
||
my $old = $stringify_as;
|
||
if (@_) {
|
||
my $as = shift;
|
||
croak("Don't know how to stringify as '$as'")
|
||
unless exists $stringify{$as};
|
||
$stringify_as = $stringify{$as};
|
||
}
|
||
$old;
|
||
}
|
||
|
||
|
||
sub utf16
|
||
{
|
||
my $self = shift;
|
||
unless (ref $self) {
|
||
my $u = new Unicode::String;
|
||
$u->utf16($self);
|
||
return $u;
|
||
}
|
||
my $old = $$self;
|
||
if (@_) {
|
||
$$self = shift;
|
||
if ((length($$self) % 2) != 0) {
|
||
warn "Uneven UTF16 data" if $^W;
|
||
$$self .= "\0";
|
||
}
|
||
if ($$self =~ /^\xFF\xFE/) {
|
||
# the string needs byte swapping
|
||
$$self = byteswap2($$self);
|
||
}
|
||
}
|
||
$old;
|
||
}
|
||
|
||
|
||
sub utf16le
|
||
{
|
||
my $self = shift;
|
||
unless (ref $self) {
|
||
my $u = new Unicode::String;
|
||
$u->utf16(byteswap2($self));
|
||
return $u;
|
||
}
|
||
my $old = byteswap2($$self);
|
||
if (@_) {
|
||
$self->utf16(byteswap2(shift));
|
||
}
|
||
$old;
|
||
}
|
||
|
||
|
||
sub utf32le
|
||
{
|
||
my $self = shift;
|
||
unless (ref $self) {
|
||
my $u = new Unicode::String;
|
||
$u->ucs4(byteswap4($self));
|
||
return $u;
|
||
}
|
||
my $old = byteswap4($self->ucs4);
|
||
if (@_) {
|
||
$self->ucs4(byteswap4(shift));
|
||
}
|
||
$old;
|
||
}
|
||
|
||
|
||
sub utf7 # rfc1642
|
||
{
|
||
my $self = shift;
|
||
unless (ref $self) {
|
||
# act as ctor
|
||
my $u = new Unicode::String;
|
||
$u->utf7($self);
|
||
return $u;
|
||
}
|
||
my $old;
|
||
if (defined wantarray) {
|
||
# encode into $old
|
||
$old = "";
|
||
pos($$self) = 0;
|
||
my $len = length($$self);
|
||
while (pos($$self) < $len) {
|
||
if (($UTF7_OPTIONAL_DIRECT_CHARS &&
|
||
$$self =~ /\G((?:\0[A-Za-z0-9\'\(\)\,\-\.\/\:\?\!\"\#\$\%\&\*\;\<\=\>\@\[\]\^\_\`\{\|\}\s])+)/gc)
|
||
|| $$self =~ /\G((?:\0[A-Za-z0-9\'\(\)\,\-\.\/\:\?\s])+)/gc)
|
||
{
|
||
#print "Plain ", utf16($1)->latin1, "\n";
|
||
$old .= utf16($1)->latin1;
|
||
}
|
||
elsif (($UTF7_OPTIONAL_DIRECT_CHARS &&
|
||
$$self =~ /\G((?:[^\0].|\0[^A-Za-z0-9\'\(\)\,\-\.\/\:\?\!\"\#\$\%\&\*\;\<\=\>\@\[\]\^\_\`\{\|\}\s])+)/gsc)
|
||
|| $$self =~ /\G((?:[^\0].|\0[^A-Za-z0-9\'\(\)\,\-\.\/\:\?\s])+)/gsc)
|
||
{
|
||
#print "Unplain ", utf16($1)->hex, "\n";
|
||
if ($1 eq "\0+") {
|
||
$old .= "+-";
|
||
} else {
|
||
require MIME::Base64;
|
||
my $base64 = MIME::Base64::encode($1, '');
|
||
$base64 =~ s/=+$//;
|
||
$old .= "+$base64-";
|
||
# XXX should we determine when the final "-" is
|
||
# unnecessary? depends on next char not being part
|
||
# of the base64 char set.
|
||
}
|
||
} else {
|
||
die "This should not happen, pos=" . pos($$self) .
|
||
": " . $self->hex . "\n";
|
||
}
|
||
}
|
||
}
|
||
|
||
if (@_) {
|
||
# decode
|
||
my $len = length($_[0]);
|
||
$$self = "";
|
||
pos($_[0]) = 0;
|
||
while (pos($_[0]) < $len) {
|
||
if ($_[0] =~ /\G([^+]+)/gc) {
|
||
$self->append(latin1($1));
|
||
} elsif ($_[0] =~ /\G\+-/gc) {
|
||
$$self .= "\0+";
|
||
} elsif ($_[0] =~ /\G\+([A-Za-z0-9+\/]+)-?/gc) {
|
||
my $base64 = $1;
|
||
my $pad = length($base64) % 4;
|
||
$base64 .= "=" x (4 - $pad) if $pad;
|
||
require MIME::Base64;
|
||
$$self .= MIME::Base64::decode($base64);
|
||
if ((length($$self) % 2) != 0) {
|
||
warn "Uneven UTF7 base64-data" if $^W;
|
||
chop($$self); # correct it
|
||
}
|
||
} elsif ($_[0] =~ /\G\+/gc) {
|
||
warn "Bad UTF7 data escape" if $^W;
|
||
$$self .= "\0+";
|
||
} else {
|
||
die "This should not happen " . pos($_[0]);
|
||
}
|
||
}
|
||
}
|
||
$old;
|
||
}
|
||
|
||
|
||
sub hex
|
||
{
|
||
my $self = shift;
|
||
unless (ref $self) {
|
||
my $u = new Unicode::String;
|
||
$u->hex($self);
|
||
return $u;
|
||
}
|
||
my $old;
|
||
if (defined($$self) && defined wantarray) {
|
||
$old = unpack("H*", $$self);
|
||
$old =~ s/(....)/U+$1 /g;
|
||
$old =~ s/\s+$//;
|
||
}
|
||
if (@_) {
|
||
my $new = shift;
|
||
$new =~ tr/0-9A-Fa-f//cd; # leave only hex chars
|
||
croak("Hex string length must be multiple of four")
|
||
unless (length($new) % 4) == 0;
|
||
$$self = pack("H*", $new);
|
||
}
|
||
$old;
|
||
}
|
||
|
||
|
||
sub length
|
||
{
|
||
my $self = shift;
|
||
int(length($$self) / 2);
|
||
}
|
||
|
||
sub byteswap
|
||
{
|
||
my $self = shift;
|
||
byteswap2($$self);
|
||
$self;
|
||
}
|
||
|
||
sub unpack
|
||
{
|
||
my $self = shift;
|
||
unpack("n*", $$self)
|
||
}
|
||
|
||
|
||
sub pack
|
||
{
|
||
my $self = shift;
|
||
$$self = pack("n*", @_);
|
||
$self;
|
||
}
|
||
|
||
|
||
sub ord
|
||
{
|
||
my $self = shift;
|
||
return () unless defined $$self;
|
||
|
||
my $array = wantarray;
|
||
my @ret;
|
||
my @chars;
|
||
if ($array) {
|
||
@chars = CORE::unpack("n*", $$self);
|
||
} else {
|
||
@chars = CORE::unpack("n2", $$self);
|
||
}
|
||
|
||
while (@chars) {
|
||
my $first = shift(@chars);
|
||
if ($first >= 0xD800 && $first <= 0xDFFF) { # surrogate
|
||
my $second = shift(@chars);
|
||
#print "F=$first S=$second\n";
|
||
if ($first >= 0xDC00 || $second < 0xDC00 || $second > 0xDFFF) {
|
||
carp(sprintf("Bad surrogate pair (U+%04x U+%04x)",
|
||
$first, $second));
|
||
unshift(@chars, $second);
|
||
next;
|
||
}
|
||
push(@ret, ($first-0xD800)*0x400 + ($second-0xDC00) + 0x10000);
|
||
} else {
|
||
push(@ret, $first);
|
||
}
|
||
last unless $array;
|
||
}
|
||
$array ? @ret : $ret[0];
|
||
}
|
||
|
||
|
||
sub name
|
||
{
|
||
my $self = shift;
|
||
require Unicode::CharName;
|
||
if (wantarray) {
|
||
return map { Unicode::CharName::uname($_) } $self->ord;
|
||
} else {
|
||
return Unicode::CharName::uname(scalar($self->ord));
|
||
}
|
||
}
|
||
|
||
|
||
sub chr
|
||
{
|
||
my($self,$val) = @_;
|
||
unless (ref $self) {
|
||
# act as ctor
|
||
my $u = new Unicode::String;
|
||
return $u->uchr($self);
|
||
}
|
||
if ($val > 0xFFFF) {
|
||
# must be represented by a surrogate pair
|
||
return undef if $val > 0x10FFFF; # Unicode limit
|
||
$val -= 0x10000;
|
||
my $h = int($val / 0x400) + 0xD800;
|
||
my $l = ($val % 0x400) + 0xDC00;
|
||
$$self = CORE::pack("n2", $h, $l);
|
||
} else {
|
||
$$self = CORE::pack("n", $val);
|
||
}
|
||
$self;
|
||
}
|
||
|
||
|
||
sub substr
|
||
{
|
||
my($self, $offset, $length, $substitute) = @_;
|
||
$offset ||= 0;
|
||
$offset *= 2;
|
||
my $substr;
|
||
if (defined $substitute) {
|
||
unless (UNIVERSAL::isa($substitute, 'Unicode::String')) {
|
||
$substitute = Unicode::String->new($substitute);
|
||
}
|
||
if (defined $length) {
|
||
$substr = substr($$self, $offset, $length*2) = $$substitute;
|
||
} else {
|
||
$substr = substr($$self, $offset) = $$substitute;
|
||
}
|
||
} else {
|
||
if (defined $length) {
|
||
$substr = substr($$self, $offset, $length*2);
|
||
} else {
|
||
$substr = substr($$self, $offset);
|
||
}
|
||
}
|
||
bless \$substr, ref($self);
|
||
}
|
||
|
||
|
||
sub index
|
||
{
|
||
my($self, $other, $pos) = @_;
|
||
$pos ||= 0;
|
||
$pos *= 2;
|
||
$other = Unicode::String->new($other) unless ref($other);
|
||
$pos++ while ($pos = index($$self, $$other, $pos)) > 0 && ($pos%2) != 0;
|
||
$pos /= 2 if $pos > 0;
|
||
$pos;
|
||
}
|
||
|
||
|
||
sub rindex
|
||
{
|
||
my($self, $other, $pos) = @_;
|
||
$pos ||= 0;
|
||
die "NYI";
|
||
}
|
||
|
||
|
||
sub chop
|
||
{
|
||
my $self = shift;
|
||
if (CORE::length $$self) {
|
||
my $chop = chop($$self);
|
||
$chop = chop($$self) . $chop;
|
||
return bless \$chop, ref($self);
|
||
}
|
||
undef;
|
||
}
|
||
|
||
|
||
# XXX: Ideas to be implemented
|
||
sub scan;
|
||
sub reverse;
|
||
|
||
sub lc;
|
||
sub lcfirst;
|
||
sub uc;
|
||
sub ucfirst;
|
||
|
||
sub split;
|
||
sub sprintf;
|
||
sub study;
|
||
sub tr;
|
||
|
||
|
||
1;
|
||
|
||
__END__
|
||
|
||
=head1 NAME
|
||
|
||
Unicode::String - String of Unicode characters (UTF-16BE)
|
||
|
||
=head1 SYNOPSIS
|
||
|
||
use Unicode::String qw(utf8 latin1 utf16be);
|
||
|
||
$u = utf8("string");
|
||
$u = latin1("string");
|
||
$u = utf16be("\0s\0t\0r\0i\0n\0g");
|
||
|
||
print $u->utf32be; # 4 byte characters
|
||
print $u->utf16le; # 2 byte characters + surrogates
|
||
print $u->utf8; # 1-4 byte characters
|
||
|
||
=head1 DESCRIPTION
|
||
|
||
A C<Unicode::String> object represents a sequence of Unicode
|
||
characters. Methods are provided to convert between various external
|
||
formats (encodings) and C<Unicode::String> objects, and methods are
|
||
provided for common string manipulations.
|
||
|
||
The functions utf32be(), utf32le(), utf16be(), utf16le(), utf8(),
|
||
utf7(), latin1(), uhex(), uchr() can be imported from the
|
||
C<Unicode::String> module and will work as constructors initializing
|
||
strings of the corresponding encoding.
|
||
|
||
The C<Unicode::String> objects overload various operators, which means
|
||
that they in most cases can be treated like plain strings.
|
||
|
||
Internally a C<Unicode::String> object is represented by a string of 2
|
||
byte numbers in network byte order (big-endian). This representation
|
||
is not visible by the API provided, but it might be useful to know in
|
||
order to predict the efficiency of the provided methods.
|
||
|
||
=head2 METHODS
|
||
|
||
=head2 Class methods
|
||
|
||
The following class methods are available:
|
||
|
||
=over 4
|
||
|
||
=item Unicode::String->stringify_as
|
||
|
||
=item Unicode::String->stringify_as( $enc )
|
||
|
||
This method is used to specify which encoding will be used when
|
||
C<Unicode::String> objects are implicitly converted to and from plain
|
||
strings.
|
||
|
||
If an argument is provided it sets the current encoding. The argument
|
||
should have one of the following: "ucs4", "utf32", "utf32be",
|
||
"utf32le", "ucs2", "utf16", "utf16be", "utf16le", "utf8", "utf7",
|
||
"latin1" or "hex". The default is "utf8".
|
||
|
||
The stringify_as() method returns a reference to the current encoding
|
||
function.
|
||
|
||
=item $us = Unicode::String->new
|
||
|
||
=item $us = Unicode::String->new( $initial_value )
|
||
|
||
This is the object constructor. Without argument, it creates an empty
|
||
C<Unicode::String> object. If an $initial_value argument is given, it
|
||
is decoded according to the specified stringify_as() encoding, UTF-8
|
||
by default.
|
||
|
||
In general it is recommended to import and use one of the encoding
|
||
specific constructor functions instead of invoking this method.
|
||
|
||
=back
|
||
|
||
=head2 Encoding methods
|
||
|
||
These methods get or set the value of the C<Unicode::String> object by
|
||
passing strings in the corresponding encoding. If a new value is
|
||
passed as argument it will set the value of the C<Unicode::String>,
|
||
and the previous value is returned. If no argument is passed then the
|
||
current value is returned.
|
||
|
||
To illustrate the encodings we show how the 2 character sample string
|
||
of "<22>m" (micro meter) is encoded for each one.
|
||
|
||
=over 4
|
||
|
||
=item $us->utf32be
|
||
|
||
=item $us->utf32be( $newval )
|
||
|
||
The string passed should be in the UTF-32 encoding with bytes in big
|
||
endian order. The sample "<22>m" is "\0\0\0\xB5\0\0\0m" in this encoding.
|
||
|
||
Alternative names for this method are utf32() and ucs4().
|
||
|
||
=item $us->utf32le
|
||
|
||
=item $us->utf32le( $newval )
|
||
|
||
The string passed should be in the UTF-32 encoding with bytes in little
|
||
endian order. The sample "<22>m" is is "\xB5\0\0\0m\0\0\0" in this encoding.
|
||
|
||
=item $us->utf16be
|
||
|
||
=item $us->utf16be( $newval )
|
||
|
||
The string passed should be in the UTF-16 encoding with bytes in big
|
||
endian order. The sample "<22>m" is "\0\xB5\0m" in this encoding.
|
||
|
||
Alternative names for this method are utf16() and ucs2().
|
||
|
||
If the string passed to utf16be() starts with the Unicode byte order
|
||
mark in little endian order, the result is as if utf16le() was called
|
||
instead.
|
||
|
||
=item $us->utf16le
|
||
|
||
=item $us->utf16le( $newval )
|
||
|
||
The string passed should be in the UTF-16 encoding with bytes in
|
||
little endian order. The sample "<22>m" is is "\xB5\0m\0" in this
|
||
encoding. This is the encoding used by the Microsoft Windows API.
|
||
|
||
If the string passed to utf16le() starts with the Unicode byte order
|
||
mark in big endian order, the result is as if utf16le() was called
|
||
instead.
|
||
|
||
=item $us->utf8
|
||
|
||
=item $us->utf8( $newval )
|
||
|
||
The string passed should be in the UTF-8 encoding. The sample "<22>m" is
|
||
"\xC2\xB5m" in this encoding.
|
||
|
||
=item $us->utf7
|
||
|
||
=item $us->utf7( $newval )
|
||
|
||
The string passed should be in the UTF-7 encoding. The sample "<22>m" is
|
||
"+ALU-m" in this encoding.
|
||
|
||
|
||
The UTF-7 encoding only use plain US-ASCII characters for the
|
||
encoding. This makes it safe for transport through 8-bit stripping
|
||
protocols. Characters outside the US-ASCII range are base64-encoded
|
||
and '+' is used as an escape character. The UTF-7 encoding is
|
||
described in RFC 1642.
|
||
|
||
If the (global) variable $Unicode::String::UTF7_OPTIONAL_DIRECT_CHARS
|
||
is TRUE, then a wider range of characters are encoded as themselves.
|
||
It is even TRUE by default. The characters affected by this are:
|
||
|
||
! " # $ % & * ; < = > @ [ ] ^ _ ` { | }
|
||
|
||
=item $us->latin1
|
||
|
||
=item $us->latin1( $newval )
|
||
|
||
The string passed should be in the ISO-8859-1 encoding. The sample "<22>m" is
|
||
"\xB5m" in this encoding.
|
||
|
||
Characters outside the "\x00" .. "\xFF" range are simply removed from
|
||
the return value of the latin1() method. If you want more control
|
||
over the mapping from Unicode to ISO-8859-1, use the C<Unicode::Map8>
|
||
class. This is also the way to deal with other 8-bit character sets.
|
||
|
||
=item $us->hex
|
||
|
||
=item $us->hex( $newval )
|
||
|
||
The string passed should be plain ASCII where each Unicode character
|
||
is represented by the "U+XXXX" string and separated by a single space
|
||
character. The "U+" prefix is optional when setting the value. The
|
||
sample "<22>m" is "U+00b5 U+006d" in this encoding.
|
||
|
||
=back
|
||
|
||
=head2 String Operations
|
||
|
||
The following methods are available:
|
||
|
||
=over 4
|
||
|
||
=item $us->as_string
|
||
|
||
Converts a C<Unicode::String> to a plain string according to the
|
||
setting of stringify_as(). The default stringify_as() encoding is
|
||
"utf8".
|
||
|
||
=item $us->as_num
|
||
|
||
Converts a C<Unicode::String> to a number. Currently only the digits
|
||
in the range 0x30 .. 0x39 are recognized. The plan is to eventually
|
||
support all Unicode digit characters.
|
||
|
||
=item $us->as_bool
|
||
|
||
Converts a C<Unicode::String> to a boolean value. Only the empty
|
||
string is FALSE. A string consisting of only the character U+0030 is
|
||
considered TRUE, even if Perl consider "0" to be FALSE.
|
||
|
||
=item $us->repeat( $count )
|
||
|
||
Returns a new C<Unicode::String> where the content of $us is repeated
|
||
$count times. This operation is also overloaded as:
|
||
|
||
$us x $count
|
||
|
||
=item $us->concat( $other_string )
|
||
|
||
Concatenates the string $us and the string $other_string. If
|
||
$other_string is not an C<Unicode::String> object, then it is first
|
||
passed to the Unicode::String->new constructor function. This
|
||
operation is also overloaded as:
|
||
|
||
$us . $other_string
|
||
|
||
|
||
=item $us->append( $other_string )
|
||
|
||
Appends the string $other_string to the value of $us. If
|
||
$other_string is not an C<Unicode::String> object, then it is first
|
||
passed to the Unicode::String->new constructor function. This
|
||
operation is also overloaded as:
|
||
|
||
$us .= $other_string
|
||
|
||
=item $us->copy
|
||
|
||
Returns a copy of the current C<Unicode::String> object. This
|
||
operation is overloaded as the assignment operator.
|
||
|
||
=item $us->length
|
||
|
||
Returns the length of the C<Unicode::String>. Surrogate pairs are
|
||
still counted as 2.
|
||
|
||
=item $us->byteswap
|
||
|
||
This method will swap the bytes in the internal representation of the
|
||
C<Unicode::String> object.
|
||
|
||
Unicode reserve the character U+FEFF character as a byte order mark.
|
||
This works because the swapped character, U+FFFE, is reserved to not
|
||
be valid. For strings that have the byte order mark as the first
|
||
character, we can guaranty to get the byte order right with the
|
||
following code:
|
||
|
||
$ustr->byteswap if $ustr->ord == 0xFFFE;
|
||
|
||
=item $us->unpack
|
||
|
||
Returns a list of integers each representing an UCS-2 character code.
|
||
|
||
=item $us->pack( @uchr )
|
||
|
||
Sets the value of $us as a sequence of UCS-2 characters with the
|
||
characters codes given as parameter.
|
||
|
||
=item $us->ord
|
||
|
||
Returns the character code of the first character in $us. The ord()
|
||
method deals with surrogate pairs, which gives us a result-range of
|
||
0x0 .. 0x10FFFF. If the $us string is empty, undef is returned.
|
||
|
||
=item $us->chr( $code )
|
||
|
||
Sets the value of $us to be a string containing the character assigned
|
||
code $code. The argument $code must be an integer in the range 0x0
|
||
.. 0x10FFFF. If the code is greater than 0xFFFF then a surrogate pair
|
||
created.
|
||
|
||
=item $us->name
|
||
|
||
In scalar context returns the official Unicode name of the first
|
||
character in $us. In array context returns the name of all characters
|
||
in $us. Also see L<Unicode::CharName>.
|
||
|
||
=item $us->substr( $offset )
|
||
|
||
=item $us->substr( $offset, $length )
|
||
|
||
=item $us->substr( $offset, $length, $subst )
|
||
|
||
Returns a sub-string of $us. Works similar to the builtin substr()
|
||
function.
|
||
|
||
=item $us->index( $other )
|
||
|
||
=item $us->index( $other, $pos )
|
||
|
||
Locates the position of $other within $us, possibly starting the
|
||
search at position $pos.
|
||
|
||
=item $us->chop
|
||
|
||
Chops off the last character of $us and returns it (as a
|
||
C<Unicode::String> object).
|
||
|
||
=back
|
||
|
||
=head1 FUNCTIONS
|
||
|
||
The following functions are provided. None of these are exported by default.
|
||
|
||
=over 4
|
||
|
||
=item byteswap2( $str, ... )
|
||
|
||
This function will swap 2 and 2 bytes in the strings passed as
|
||
arguments. If this function is called in void context,
|
||
then it will modify its arguments in-place. Otherwise, the swapped
|
||
strings are returned.
|
||
|
||
=item byteswap4( $str, ... )
|
||
|
||
The byteswap4 function works similar to byteswap2, but will reverse
|
||
the order of 4 and 4 bytes.
|
||
|
||
=item latin1( $str )
|
||
|
||
=item utf7( $str )
|
||
|
||
=item utf8( $str )
|
||
|
||
=item utf16le( $str )
|
||
|
||
=item utf16be( $str )
|
||
|
||
=item utf32le( $str )
|
||
|
||
=item utf32be( $str )
|
||
|
||
Constructor functions for the various Unicode encodings. These return
|
||
new C<Unicode::String> objects. The provided argument should be
|
||
encoded correspondingly.
|
||
|
||
=item uhex( $str )
|
||
|
||
Constructs a new C<Unicode::String> object from a string of hex
|
||
values. See hex() method above for description of the format.
|
||
|
||
=item uchar( $num )
|
||
|
||
Constructs a new one character C<Unicode::String> object from a
|
||
Unicode character code. This works similar to perl's builtin chr()
|
||
function.
|
||
|
||
=back
|
||
|
||
=head1 SEE ALSO
|
||
|
||
L<Unicode::CharName>,
|
||
L<Unicode::Map8>
|
||
|
||
L<http://www.unicode.org/>
|
||
|
||
L<perlunicode>
|
||
|
||
=head1 COPYRIGHT
|
||
|
||
Copyright 1997-2000,2005 Gisle Aas.
|
||
|
||
This library is free software; you can redistribute it and/or
|
||
modify it under the same terms as Perl itself.
|
||
|
||
=cut
|
||
|
||
|
||
#
|
||
# Some old code that is not used any more (because the methods are
|
||
# now implemented as XS) and which I did not want to throw away yet.
|
||
#
|
||
|
||
sub ucs4_inperl
|
||
{
|
||
my $self = shift;
|
||
unless (ref $self) {
|
||
my $u = new Unicode::String;
|
||
$u->ucs4($self);
|
||
return $u;
|
||
}
|
||
my $old = pack("N*", $self->ord);
|
||
if (@_) {
|
||
$$self = "";
|
||
for (unpack("N*", shift)) {
|
||
$self->append(uchr($_));
|
||
}
|
||
}
|
||
$old;
|
||
}
|
||
|
||
|
||
sub utf8_inperl
|
||
{
|
||
my $self = shift;
|
||
unless (ref $self) {
|
||
# act as ctor
|
||
my $u = new Unicode::String;
|
||
$u->utf8($self);
|
||
return $u;
|
||
}
|
||
|
||
my $old;
|
||
if (defined($$self) && defined wantarray) {
|
||
# encode UTF-8
|
||
my $uc;
|
||
for $uc (unpack("n*", $$self)) {
|
||
if ($uc < 0x80) {
|
||
# 1 byte representation
|
||
$old .= chr($uc);
|
||
} elsif ($uc < 0x800) {
|
||
# 2 byte representation
|
||
$old .= chr(0xC0 | ($uc >> 6)) .
|
||
chr(0x80 | ($uc & 0x3F));
|
||
} else {
|
||
# 3 byte representation
|
||
$old .= chr(0xE0 | ($uc >> 12)) .
|
||
chr(0x80 | (($uc >> 6) & 0x3F)) .
|
||
chr(0x80 | ($uc & 0x3F));
|
||
}
|
||
}
|
||
}
|
||
|
||
if (@_) {
|
||
if (defined $_[0]) {
|
||
$$self = "";
|
||
my $bytes = shift;
|
||
$bytes =~ s/^[\200-\277]+//; # can't start with 10xxxxxx
|
||
while (length $bytes) {
|
||
if ($bytes =~ s/^([\000-\177]+)//) {
|
||
$$self .= pack("n*", unpack("C*", $1));
|
||
} elsif ($bytes =~ s/^([\300-\337])([\200-\277])//) {
|
||
my($b1,$b2) = (ord($1), ord($2));
|
||
$$self .= pack("n", (($b1 & 0x1F) << 6) | ($b2 & 0x3F));
|
||
} elsif ($bytes =~ s/^([\340-\357])([\200-\277])([\200-\277])//) {
|
||
my($b1,$b2,$b3) = (ord($1), ord($2), ord($3));
|
||
$$self .= pack("n", (($b1 & 0x0F) << 12) |
|
||
(($b2 & 0x3F) << 6) |
|
||
($b3 & 0x3F));
|
||
} else {
|
||
croak "Bad UTF-8 data";
|
||
}
|
||
}
|
||
} else {
|
||
$$self = undef;
|
||
}
|
||
}
|
||
|
||
$old;
|
||
}
|
||
|
||
|
||
|
||
|
||
sub latin1_inperl
|
||
{
|
||
my $self = shift;
|
||
unless (ref $self) {
|
||
# act as ctor
|
||
my $u = new Unicode::String;
|
||
$u->latin1($self);
|
||
return $u;
|
||
}
|
||
|
||
my $old;
|
||
# XXX: should really check that none of the chars > 256
|
||
$old = pack("C*", unpack("n*", $$self)) if defined $$self;
|
||
|
||
if (@_) {
|
||
# set the value
|
||
if (defined $_[0]) {
|
||
$$self = pack("n*", unpack("C*", $_[0]));
|
||
} else {
|
||
$$self = undef;
|
||
}
|
||
}
|
||
$old;
|
||
}
|