forked from openkylin/libunicode-string-perl
Compare commits
No commits in common. "openkylin/yangtze" and "pristine-tar" have entirely different histories.
openkylin/
...
pristine-t
121
Changes
121
Changes
|
@ -1,121 +0,0 @@
|
|||
2016-09-08 Gisle Aas <gisle@ActiveState.com>
|
||||
|
||||
Release 2.10
|
||||
|
||||
Applied patch from Michael Joyce that is required to make the
|
||||
test pass for perl-5.24
|
||||
|
||||
|
||||
2005-10-26 Gisle Aas <gisle@ActiveState.com>
|
||||
|
||||
Release 2.09
|
||||
|
||||
Simplified and reworked documentation.
|
||||
|
||||
Unicode::CharName::uname() did not work for chars > 0xFFFF.
|
||||
|
||||
|
||||
|
||||
2005-10-25 Gisle Aas <gisle@ActiveState.com>
|
||||
|
||||
Release 2.08
|
||||
|
||||
Unicode::String now provide utf16le() and utf32le() constructors.
|
||||
Corresponding methods also available.
|
||||
|
||||
Unicode::CharName updated with Unicode 4.1 data.
|
||||
|
||||
|
||||
|
||||
2003-03-10 Gisle Aas <gisle@ActiveState.com>
|
||||
|
||||
Release 2.07
|
||||
|
||||
Don't let the $^W test get confused by lexical warnings.
|
||||
|
||||
|
||||
|
||||
2000-05-19 Gisle Aas <gisle@ActiveState.com>
|
||||
|
||||
Release 2.06
|
||||
|
||||
Make it compile with perl5.6.0.
|
||||
|
||||
Typo fixes spotted by alexander smishlajev <als@turnhere.com>
|
||||
when translating the module to python.
|
||||
|
||||
|
||||
|
||||
1999-11-17 Gisle Aas <gisle@aas.no>
|
||||
|
||||
Release 2.05
|
||||
|
||||
Operator "." failed when a normal string was concatenated with
|
||||
and Unicode::String object. Patch by Ilya Ketris <ilya@gde.to>.
|
||||
|
||||
|
||||
|
||||
1999-09-14 Gisle Aas <gisle@aas.no>
|
||||
|
||||
Release 2.04
|
||||
|
||||
Added a byteswap method. Also the functions byteswap2() and
|
||||
byteswap4() are made available.
|
||||
|
||||
|
||||
|
||||
1999-07-28 Gisle Aas <gisle@aas.no>
|
||||
|
||||
Release 2.03
|
||||
|
||||
Tweaks to allow compilation on Win32 systems. Patch by
|
||||
Jan Dubois <jan.dubois@ibm.net>.
|
||||
|
||||
|
||||
|
||||
1999-05-01 Gisle Aas <gisle@aas.no>
|
||||
|
||||
Release 2.02
|
||||
|
||||
Avoid "Ambiguous call resolved as CORE::..." warnings from
|
||||
newer perls.
|
||||
|
||||
|
||||
|
||||
1998-01-20 Gisle Aas <aas@sn.no>
|
||||
|
||||
Release 2.01
|
||||
|
||||
Improved the documentation of Unicode::String
|
||||
|
||||
Only warn when $^W is TRUE
|
||||
|
||||
|
||||
|
||||
1998-01-19 Gisle Aas <aas@sn.no>
|
||||
|
||||
Release Unicode-String-2.00
|
||||
|
||||
Renamed package to Unicode-String (moved String modules up one
|
||||
level in the distribution)
|
||||
|
||||
Handle assignment of utf16 strings with uneven byte length
|
||||
|
||||
The $u->length now always returns an integer value as it should
|
||||
|
||||
The $u->utf7 encoder bailed out on U+XX0A characters
|
||||
|
||||
|
||||
|
||||
1997-07-10 Gisle Aas <aas@sn.no>
|
||||
|
||||
Release Unicode-19970710
|
||||
|
||||
Now most basic conversion routines and string manipulation works.
|
||||
Documentation is starting to appear.
|
||||
|
||||
|
||||
|
||||
1997-02-11 Gisle Aas <aas@sn.no>
|
||||
|
||||
First version of this module posted to perl5-porters.
|
15
MANIFEST
15
MANIFEST
|
@ -1,15 +0,0 @@
|
|||
Changes
|
||||
MANIFEST
|
||||
Makefile.PL
|
||||
README
|
||||
String.pm
|
||||
String.xs
|
||||
lib/Unicode/CharName.pm
|
||||
t/byteswap.t
|
||||
t/charname.t
|
||||
t/encoding.t
|
||||
t/ord.t
|
||||
t/op.t
|
||||
t/stringify.t
|
||||
META.yml Module meta-data (added by MakeMaker)
|
||||
META.json Module JSON meta-data (added by MakeMaker)
|
42
META.json
42
META.json
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"abstract" : "unknown",
|
||||
"author" : [
|
||||
"unknown"
|
||||
],
|
||||
"dynamic_config" : 1,
|
||||
"generated_by" : "ExtUtils::MakeMaker version 7.1001, CPAN::Meta::Converter version 2.150005",
|
||||
"license" : [
|
||||
"unknown"
|
||||
],
|
||||
"meta-spec" : {
|
||||
"url" : "http://search.cpan.org/perldoc?CPAN::Meta::Spec",
|
||||
"version" : "2"
|
||||
},
|
||||
"name" : "Unicode-String",
|
||||
"no_index" : {
|
||||
"directory" : [
|
||||
"t",
|
||||
"inc"
|
||||
]
|
||||
},
|
||||
"prereqs" : {
|
||||
"build" : {
|
||||
"requires" : {
|
||||
"ExtUtils::MakeMaker" : "0"
|
||||
}
|
||||
},
|
||||
"configure" : {
|
||||
"requires" : {
|
||||
"ExtUtils::MakeMaker" : "0"
|
||||
}
|
||||
},
|
||||
"runtime" : {
|
||||
"requires" : {
|
||||
"MIME::Base64" : "2"
|
||||
}
|
||||
}
|
||||
},
|
||||
"release_status" : "stable",
|
||||
"version" : "2.10",
|
||||
"x_serialization_backend" : "JSON::PP version 2.27300"
|
||||
}
|
23
META.yml
23
META.yml
|
@ -1,23 +0,0 @@
|
|||
---
|
||||
abstract: unknown
|
||||
author:
|
||||
- unknown
|
||||
build_requires:
|
||||
ExtUtils::MakeMaker: '0'
|
||||
configure_requires:
|
||||
ExtUtils::MakeMaker: '0'
|
||||
dynamic_config: 1
|
||||
generated_by: 'ExtUtils::MakeMaker version 7.1001, CPAN::Meta::Converter version 2.150005'
|
||||
license: unknown
|
||||
meta-spec:
|
||||
url: http://module-build.sourceforge.net/META-spec-v1.4.html
|
||||
version: '1.4'
|
||||
name: Unicode-String
|
||||
no_index:
|
||||
directory:
|
||||
- t
|
||||
- inc
|
||||
requires:
|
||||
MIME::Base64: '2'
|
||||
version: '2.10'
|
||||
x_serialization_backend: 'CPAN::Meta::YAML version 0.018'
|
11
Makefile.PL
11
Makefile.PL
|
@ -1,11 +0,0 @@
|
|||
require 5.004;
|
||||
|
||||
use ExtUtils::MakeMaker;
|
||||
WriteMakefile(
|
||||
'NAME' => 'Unicode::String',
|
||||
'VERSION_FROM' => 'String.pm',
|
||||
'PREREQ_PM' => {
|
||||
'MIME::Base64' => 2.00,
|
||||
},
|
||||
dist => { COMPRESS => 'gzip -9f', SUFFIX => 'gz', },
|
||||
);
|
79
README
79
README
|
@ -1,79 +0,0 @@
|
|||
These are experimental modules to handle various Unicode issues. They
|
||||
were made before perl included native UTF8 support.
|
||||
|
||||
More information on what Unicode is and can do for you are to be found
|
||||
at http://www.unicode.org
|
||||
|
||||
The current set of modules are:
|
||||
|
||||
Unicode::String - represent strings of Unicode chars
|
||||
Unicode::CharName - look up character names
|
||||
Unicode::Map8 - mapping tables towards 8-bit char sets
|
||||
|
||||
(the Unicode::Map8 module is distributed separately)
|
||||
|
||||
|
||||
Some of ideas to investigate for the Unicode modules are:
|
||||
|
||||
o Depreciation because of perl's own utf8 support.
|
||||
|
||||
o Composition/decomposition support:
|
||||
$u->decomp; # will decomposite as much as possible: "å" --> "a°"
|
||||
$u->comp; # will composite as much as possible: "a°" --> "å"
|
||||
|
||||
Need separate routines or a special argument to distinguish
|
||||
between compatibility decomposition and canonical decomposition.
|
||||
The last one is a subset of the first one.
|
||||
|
||||
o General Unicode string to number convertion (based on unidata
|
||||
number attributes)
|
||||
|
||||
o Case convertions (lc, uc, ucfirst) last one should use title-case
|
||||
|
||||
o Fast lookup of Unicode attributes (unidata lookup using XS)
|
||||
$u->isletter, $u->isupper, $u->islower,.... why do we need them when
|
||||
perl does not need them for normal text??
|
||||
|
||||
o There might be some support for the private area (i.e. adding case
|
||||
convertion and char properties to chars within the area).
|
||||
|
||||
o Unicode tr-function, sprintf-function
|
||||
|
||||
o Unicode string comparison functions: cmp(), le, eq,...
|
||||
|
||||
o Unicode regular expressions: m// s/// split(//,..)
|
||||
|
||||
o Unicode filehandles (automatic convertion from UTF-7/UTF-8/8-bit
|
||||
char set when reading,writing to filehandles)
|
||||
|
||||
o Fast convertion to other large char sets (east-asien). I don't
|
||||
know anything about this.
|
||||
|
||||
|
||||
EXAMPLES
|
||||
|
||||
The following are examples of use of the current modules:
|
||||
|
||||
use Unicode::String qw(latin1 utf8);
|
||||
|
||||
$u = utf8("this is a string\n");
|
||||
print $u->ucs4;
|
||||
print $u->utf16;
|
||||
print $u->utf8;
|
||||
print $u->utf7;
|
||||
print $u->latin1;
|
||||
print $u->hex;
|
||||
|
||||
print latin1("naïve\n")->utf8;
|
||||
|
||||
use Unicode::CharName qw(uname);
|
||||
print uname(ord('$')), "\n";
|
||||
|
||||
|
||||
|
||||
COPYRIGHT
|
||||
|
||||
© 1997-2000,2005 Gisle Aas. All rights reserved.
|
||||
|
||||
This library is free software; you can redistribute it and/or modify
|
||||
it under the same terms as Perl itself.
|
996
String.pm
996
String.pm
|
@ -1,996 +0,0 @@
|
|||
package Unicode::String;
|
||||
|
||||
# Copyright 1997-1999, Gisle Aas.
|
||||
|
||||
use strict;
|
||||
use vars qw($VERSION @ISA @EXPORT_OK $UTF7_OPTIONAL_DIRECT_CHARS);
|
||||
use Carp;
|
||||
|
||||
require Exporter;
|
||||
require DynaLoader;
|
||||
@ISA = qw(Exporter DynaLoader);
|
||||
|
||||
@EXPORT_OK = qw(
|
||||
utf16 utf16le utf16be ucs2
|
||||
utf8
|
||||
utf7
|
||||
ucs4 utf32 utf32be utf32le
|
||||
latin1
|
||||
uchr uhex
|
||||
|
||||
byteswap2 byteswap4
|
||||
);
|
||||
|
||||
$VERSION = '2.10';
|
||||
|
||||
$UTF7_OPTIONAL_DIRECT_CHARS ||= 1;
|
||||
|
||||
bootstrap Unicode::String $VERSION;
|
||||
|
||||
use overload '""' => \&as_string,
|
||||
'bool' => \&as_bool,
|
||||
'0+' => \&as_num,
|
||||
'.=' => \&append,
|
||||
'.' => \&concat,
|
||||
'x' => \&repeat,
|
||||
'=' => \©,
|
||||
'fallback' => 1;
|
||||
|
||||
my %stringify = (
|
||||
unicode => \&utf16,
|
||||
utf16 => \&utf16,
|
||||
utf16be => \&utf16,
|
||||
utf16le => \&utf16le,
|
||||
ucs2 => \&utf16,
|
||||
utf8 => \&utf8,
|
||||
utf7 => \&utf7,
|
||||
ucs4 => \&ucs4,
|
||||
utf32 => \&ucs4,
|
||||
utf32be => \&ucs4,
|
||||
utf32le => \&utf32le,
|
||||
latin1 => \&latin1,
|
||||
'hex' => \&hex,
|
||||
);
|
||||
|
||||
my $stringify_as = \&utf8;
|
||||
|
||||
# some aliases
|
||||
*ucs2 = \&utf16;
|
||||
*utf16be = \&utf16;
|
||||
*utf32 = \&ucs4;
|
||||
*utf32be = \&ucs4;
|
||||
*uhex = \&hex;
|
||||
*uchr = \&chr;
|
||||
|
||||
sub new
|
||||
{
|
||||
#_dump_arg("new", @_);
|
||||
my $class = shift;
|
||||
my $str;
|
||||
my $self = bless \$str, $class;
|
||||
&$stringify_as($self, shift) if @_;
|
||||
$self;
|
||||
}
|
||||
|
||||
|
||||
sub repeat
|
||||
{
|
||||
my($self, $count) = @_;
|
||||
my $class = ref($self);
|
||||
my $str = $$self x $count;
|
||||
bless \$str, $class;
|
||||
}
|
||||
|
||||
|
||||
sub _dump_arg
|
||||
{
|
||||
my $func = shift;
|
||||
print "$func(";
|
||||
print join(",", map { if (defined $_) {
|
||||
my $x = overload::StrVal($_);
|
||||
$x =~ s/\n/\\n/g;
|
||||
$x = '""' unless length $x;
|
||||
$x;
|
||||
} else {
|
||||
"undef"
|
||||
}
|
||||
} @_);
|
||||
print ")\n";
|
||||
}
|
||||
|
||||
|
||||
sub concat
|
||||
{
|
||||
#_dump_arg("concat", @_);
|
||||
my($self, $other, $reversed) = @_;
|
||||
my $class = ref($self);
|
||||
unless (UNIVERSAL::isa($other, 'Unicode::String')) {
|
||||
$other = Unicode::String->new($other);
|
||||
}
|
||||
my $str = $reversed ? $$other . $$self : $$self . $$other;
|
||||
bless \$str, $class;
|
||||
}
|
||||
|
||||
|
||||
sub append
|
||||
{
|
||||
#_dump_arg("append", @_);
|
||||
my($self, $other) = @_;
|
||||
unless (UNIVERSAL::isa($other, 'Unicode::String')) {
|
||||
$other = Unicode::String->new($other);
|
||||
}
|
||||
$$self .= $$other;
|
||||
$self;
|
||||
}
|
||||
|
||||
|
||||
sub copy
|
||||
{
|
||||
my($self) = @_;
|
||||
my $class = ref($self);
|
||||
my $copy = $$self;
|
||||
bless \$copy, $class;
|
||||
}
|
||||
|
||||
|
||||
sub as_string
|
||||
{
|
||||
#_dump_arg("as_string", @_);
|
||||
&$stringify_as($_[0]);
|
||||
}
|
||||
|
||||
|
||||
sub as_bool
|
||||
{
|
||||
# This is different from perl's normal behaviour by not letting
|
||||
# a U+0030 ("0") be false.
|
||||
my $self = shift;
|
||||
$$self ? 1 : "";
|
||||
}
|
||||
|
||||
|
||||
sub as_num
|
||||
{
|
||||
# Should be able to use the numeric property from Unidata
|
||||
# in order to parse a large number of numbers. Currently we
|
||||
# only convert it to a plain string and let perl's normal
|
||||
# num-converter do the job.
|
||||
my $self = shift;
|
||||
my $str = $self->utf8;
|
||||
$str + 0;
|
||||
}
|
||||
|
||||
|
||||
sub stringify_as
|
||||
{
|
||||
my $class;
|
||||
if (@_ > 1) {
|
||||
$class = shift;
|
||||
$class = ref($class) if ref($class);
|
||||
} else {
|
||||
$class = "Unicode::String";
|
||||
}
|
||||
my $old = $stringify_as;
|
||||
if (@_) {
|
||||
my $as = shift;
|
||||
croak("Don't know how to stringify as '$as'")
|
||||
unless exists $stringify{$as};
|
||||
$stringify_as = $stringify{$as};
|
||||
}
|
||||
$old;
|
||||
}
|
||||
|
||||
|
||||
sub utf16
|
||||
{
|
||||
my $self = shift;
|
||||
unless (ref $self) {
|
||||
my $u = new Unicode::String;
|
||||
$u->utf16($self);
|
||||
return $u;
|
||||
}
|
||||
my $old = $$self;
|
||||
if (@_) {
|
||||
$$self = shift;
|
||||
if ((length($$self) % 2) != 0) {
|
||||
warn "Uneven UTF16 data" if $^W;
|
||||
$$self .= "\0";
|
||||
}
|
||||
if ($$self =~ /^\xFF\xFE/) {
|
||||
# the string needs byte swapping
|
||||
$$self = byteswap2($$self);
|
||||
}
|
||||
}
|
||||
$old;
|
||||
}
|
||||
|
||||
|
||||
sub utf16le
|
||||
{
|
||||
my $self = shift;
|
||||
unless (ref $self) {
|
||||
my $u = new Unicode::String;
|
||||
$u->utf16(byteswap2($self));
|
||||
return $u;
|
||||
}
|
||||
my $old = byteswap2($$self);
|
||||
if (@_) {
|
||||
$self->utf16(byteswap2(shift));
|
||||
}
|
||||
$old;
|
||||
}
|
||||
|
||||
|
||||
sub utf32le
|
||||
{
|
||||
my $self = shift;
|
||||
unless (ref $self) {
|
||||
my $u = new Unicode::String;
|
||||
$u->ucs4(byteswap4($self));
|
||||
return $u;
|
||||
}
|
||||
my $old = byteswap4($self->ucs4);
|
||||
if (@_) {
|
||||
$self->ucs4(byteswap4(shift));
|
||||
}
|
||||
$old;
|
||||
}
|
||||
|
||||
|
||||
sub utf7 # rfc1642
|
||||
{
|
||||
my $self = shift;
|
||||
unless (ref $self) {
|
||||
# act as ctor
|
||||
my $u = new Unicode::String;
|
||||
$u->utf7($self);
|
||||
return $u;
|
||||
}
|
||||
my $old;
|
||||
if (defined wantarray) {
|
||||
# encode into $old
|
||||
$old = "";
|
||||
pos($$self) = 0;
|
||||
my $len = length($$self);
|
||||
while (pos($$self) < $len) {
|
||||
if (($UTF7_OPTIONAL_DIRECT_CHARS &&
|
||||
$$self =~ /\G((?:\0[A-Za-z0-9\'\(\)\,\-\.\/\:\?\!\"\#\$\%\&\*\;\<\=\>\@\[\]\^\_\`\{\|\}\s])+)/gc)
|
||||
|| $$self =~ /\G((?:\0[A-Za-z0-9\'\(\)\,\-\.\/\:\?\s])+)/gc)
|
||||
{
|
||||
#print "Plain ", utf16($1)->latin1, "\n";
|
||||
$old .= utf16($1)->latin1;
|
||||
}
|
||||
elsif (($UTF7_OPTIONAL_DIRECT_CHARS &&
|
||||
$$self =~ /\G((?:[^\0].|\0[^A-Za-z0-9\'\(\)\,\-\.\/\:\?\!\"\#\$\%\&\*\;\<\=\>\@\[\]\^\_\`\{\|\}\s])+)/gsc)
|
||||
|| $$self =~ /\G((?:[^\0].|\0[^A-Za-z0-9\'\(\)\,\-\.\/\:\?\s])+)/gsc)
|
||||
{
|
||||
#print "Unplain ", utf16($1)->hex, "\n";
|
||||
if ($1 eq "\0+") {
|
||||
$old .= "+-";
|
||||
} else {
|
||||
require MIME::Base64;
|
||||
my $base64 = MIME::Base64::encode($1, '');
|
||||
$base64 =~ s/=+$//;
|
||||
$old .= "+$base64-";
|
||||
# XXX should we determine when the final "-" is
|
||||
# unnecessary? depends on next char not being part
|
||||
# of the base64 char set.
|
||||
}
|
||||
} else {
|
||||
die "This should not happen, pos=" . pos($$self) .
|
||||
": " . $self->hex . "\n";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (@_) {
|
||||
# decode
|
||||
my $len = length($_[0]);
|
||||
$$self = "";
|
||||
pos($_[0]) = 0;
|
||||
while (pos($_[0]) < $len) {
|
||||
if ($_[0] =~ /\G([^+]+)/gc) {
|
||||
$self->append(latin1($1));
|
||||
} elsif ($_[0] =~ /\G\+-/gc) {
|
||||
$$self .= "\0+";
|
||||
} elsif ($_[0] =~ /\G\+([A-Za-z0-9+\/]+)-?/gc) {
|
||||
my $base64 = $1;
|
||||
my $pad = length($base64) % 4;
|
||||
$base64 .= "=" x (4 - $pad) if $pad;
|
||||
require MIME::Base64;
|
||||
$$self .= MIME::Base64::decode($base64);
|
||||
if ((length($$self) % 2) != 0) {
|
||||
warn "Uneven UTF7 base64-data" if $^W;
|
||||
chop($$self); # correct it
|
||||
}
|
||||
} elsif ($_[0] =~ /\G\+/gc) {
|
||||
warn "Bad UTF7 data escape" if $^W;
|
||||
$$self .= "\0+";
|
||||
} else {
|
||||
die "This should not happen " . pos($_[0]);
|
||||
}
|
||||
}
|
||||
}
|
||||
$old;
|
||||
}
|
||||
|
||||
|
||||
sub hex
|
||||
{
|
||||
my $self = shift;
|
||||
unless (ref $self) {
|
||||
my $u = new Unicode::String;
|
||||
$u->hex($self);
|
||||
return $u;
|
||||
}
|
||||
my $old;
|
||||
if (defined($$self) && defined wantarray) {
|
||||
$old = unpack("H*", $$self);
|
||||
$old =~ s/(....)/U+$1 /g;
|
||||
$old =~ s/\s+$//;
|
||||
}
|
||||
if (@_) {
|
||||
my $new = shift;
|
||||
$new =~ tr/0-9A-Fa-f//cd; # leave only hex chars
|
||||
croak("Hex string length must be multiple of four")
|
||||
unless (length($new) % 4) == 0;
|
||||
$$self = pack("H*", $new);
|
||||
}
|
||||
$old;
|
||||
}
|
||||
|
||||
|
||||
sub length
|
||||
{
|
||||
my $self = shift;
|
||||
int(length($$self) / 2);
|
||||
}
|
||||
|
||||
sub byteswap
|
||||
{
|
||||
my $self = shift;
|
||||
byteswap2($$self);
|
||||
$self;
|
||||
}
|
||||
|
||||
sub unpack
|
||||
{
|
||||
my $self = shift;
|
||||
unpack("n*", $$self)
|
||||
}
|
||||
|
||||
|
||||
sub pack
|
||||
{
|
||||
my $self = shift;
|
||||
$$self = pack("n*", @_);
|
||||
$self;
|
||||
}
|
||||
|
||||
|
||||
sub ord
|
||||
{
|
||||
my $self = shift;
|
||||
return () unless defined $$self;
|
||||
|
||||
my $array = wantarray;
|
||||
my @ret;
|
||||
my @chars;
|
||||
if ($array) {
|
||||
@chars = CORE::unpack("n*", $$self);
|
||||
} else {
|
||||
@chars = CORE::unpack("n2", $$self);
|
||||
}
|
||||
|
||||
while (@chars) {
|
||||
my $first = shift(@chars);
|
||||
if ($first >= 0xD800 && $first <= 0xDFFF) { # surrogate
|
||||
my $second = shift(@chars);
|
||||
#print "F=$first S=$second\n";
|
||||
if ($first >= 0xDC00 || $second < 0xDC00 || $second > 0xDFFF) {
|
||||
carp(sprintf("Bad surrogate pair (U+%04x U+%04x)",
|
||||
$first, $second));
|
||||
unshift(@chars, $second);
|
||||
next;
|
||||
}
|
||||
push(@ret, ($first-0xD800)*0x400 + ($second-0xDC00) + 0x10000);
|
||||
} else {
|
||||
push(@ret, $first);
|
||||
}
|
||||
last unless $array;
|
||||
}
|
||||
$array ? @ret : $ret[0];
|
||||
}
|
||||
|
||||
|
||||
sub name
|
||||
{
|
||||
my $self = shift;
|
||||
require Unicode::CharName;
|
||||
if (wantarray) {
|
||||
return map { Unicode::CharName::uname($_) } $self->ord;
|
||||
} else {
|
||||
return Unicode::CharName::uname(scalar($self->ord));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
sub chr
|
||||
{
|
||||
my($self,$val) = @_;
|
||||
unless (ref $self) {
|
||||
# act as ctor
|
||||
my $u = new Unicode::String;
|
||||
return $u->uchr($self);
|
||||
}
|
||||
if ($val > 0xFFFF) {
|
||||
# must be represented by a surrogate pair
|
||||
return undef if $val > 0x10FFFF; # Unicode limit
|
||||
$val -= 0x10000;
|
||||
my $h = int($val / 0x400) + 0xD800;
|
||||
my $l = ($val % 0x400) + 0xDC00;
|
||||
$$self = CORE::pack("n2", $h, $l);
|
||||
} else {
|
||||
$$self = CORE::pack("n", $val);
|
||||
}
|
||||
$self;
|
||||
}
|
||||
|
||||
|
||||
sub substr
|
||||
{
|
||||
my($self, $offset, $length, $substitute) = @_;
|
||||
$offset ||= 0;
|
||||
$offset *= 2;
|
||||
my $substr;
|
||||
if (defined $substitute) {
|
||||
unless (UNIVERSAL::isa($substitute, 'Unicode::String')) {
|
||||
$substitute = Unicode::String->new($substitute);
|
||||
}
|
||||
if (defined $length) {
|
||||
$substr = substr($$self, $offset, $length*2) = $$substitute;
|
||||
} else {
|
||||
$substr = substr($$self, $offset) = $$substitute;
|
||||
}
|
||||
} else {
|
||||
if (defined $length) {
|
||||
$substr = substr($$self, $offset, $length*2);
|
||||
} else {
|
||||
$substr = substr($$self, $offset);
|
||||
}
|
||||
}
|
||||
bless \$substr, ref($self);
|
||||
}
|
||||
|
||||
|
||||
sub index
|
||||
{
|
||||
my($self, $other, $pos) = @_;
|
||||
$pos ||= 0;
|
||||
$pos *= 2;
|
||||
$other = Unicode::String->new($other) unless ref($other);
|
||||
$pos++ while ($pos = index($$self, $$other, $pos)) > 0 && ($pos%2) != 0;
|
||||
$pos /= 2 if $pos > 0;
|
||||
$pos;
|
||||
}
|
||||
|
||||
|
||||
sub rindex
|
||||
{
|
||||
my($self, $other, $pos) = @_;
|
||||
$pos ||= 0;
|
||||
die "NYI";
|
||||
}
|
||||
|
||||
|
||||
sub chop
|
||||
{
|
||||
my $self = shift;
|
||||
if (CORE::length $$self) {
|
||||
my $chop = chop($$self);
|
||||
$chop = chop($$self) . $chop;
|
||||
return bless \$chop, ref($self);
|
||||
}
|
||||
undef;
|
||||
}
|
||||
|
||||
|
||||
# XXX: Ideas to be implemented
|
||||
sub scan;
|
||||
sub reverse;
|
||||
|
||||
sub lc;
|
||||
sub lcfirst;
|
||||
sub uc;
|
||||
sub ucfirst;
|
||||
|
||||
sub split;
|
||||
sub sprintf;
|
||||
sub study;
|
||||
sub tr;
|
||||
|
||||
|
||||
1;
|
||||
|
||||
__END__
|
||||
|
||||
=head1 NAME
|
||||
|
||||
Unicode::String - String of Unicode characters (UTF-16BE)
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
use Unicode::String qw(utf8 latin1 utf16be);
|
||||
|
||||
$u = utf8("string");
|
||||
$u = latin1("string");
|
||||
$u = utf16be("\0s\0t\0r\0i\0n\0g");
|
||||
|
||||
print $u->utf32be; # 4 byte characters
|
||||
print $u->utf16le; # 2 byte characters + surrogates
|
||||
print $u->utf8; # 1-4 byte characters
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
A C<Unicode::String> object represents a sequence of Unicode
|
||||
characters. Methods are provided to convert between various external
|
||||
formats (encodings) and C<Unicode::String> objects, and methods are
|
||||
provided for common string manipulations.
|
||||
|
||||
The functions utf32be(), utf32le(), utf16be(), utf16le(), utf8(),
|
||||
utf7(), latin1(), uhex(), uchr() can be imported from the
|
||||
C<Unicode::String> module and will work as constructors initializing
|
||||
strings of the corresponding encoding.
|
||||
|
||||
The C<Unicode::String> objects overload various operators, which means
|
||||
that they in most cases can be treated like plain strings.
|
||||
|
||||
Internally a C<Unicode::String> object is represented by a string of 2
|
||||
byte numbers in network byte order (big-endian). This representation
|
||||
is not visible by the API provided, but it might be useful to know in
|
||||
order to predict the efficiency of the provided methods.
|
||||
|
||||
=head2 METHODS
|
||||
|
||||
=head2 Class methods
|
||||
|
||||
The following class methods are available:
|
||||
|
||||
=over 4
|
||||
|
||||
=item Unicode::String->stringify_as
|
||||
|
||||
=item Unicode::String->stringify_as( $enc )
|
||||
|
||||
This method is used to specify which encoding will be used when
|
||||
C<Unicode::String> objects are implicitly converted to and from plain
|
||||
strings.
|
||||
|
||||
If an argument is provided it sets the current encoding. The argument
|
||||
should have one of the following: "ucs4", "utf32", "utf32be",
|
||||
"utf32le", "ucs2", "utf16", "utf16be", "utf16le", "utf8", "utf7",
|
||||
"latin1" or "hex". The default is "utf8".
|
||||
|
||||
The stringify_as() method returns a reference to the current encoding
|
||||
function.
|
||||
|
||||
=item $us = Unicode::String->new
|
||||
|
||||
=item $us = Unicode::String->new( $initial_value )
|
||||
|
||||
This is the object constructor. Without argument, it creates an empty
|
||||
C<Unicode::String> object. If an $initial_value argument is given, it
|
||||
is decoded according to the specified stringify_as() encoding, UTF-8
|
||||
by default.
|
||||
|
||||
In general it is recommended to import and use one of the encoding
|
||||
specific constructor functions instead of invoking this method.
|
||||
|
||||
=back
|
||||
|
||||
=head2 Encoding methods
|
||||
|
||||
These methods get or set the value of the C<Unicode::String> object by
|
||||
passing strings in the corresponding encoding. If a new value is
|
||||
passed as argument it will set the value of the C<Unicode::String>,
|
||||
and the previous value is returned. If no argument is passed then the
|
||||
current value is returned.
|
||||
|
||||
To illustrate the encodings we show how the 2 character sample string
|
||||
of "µm" (micro meter) is encoded for each one.
|
||||
|
||||
=over 4
|
||||
|
||||
=item $us->utf32be
|
||||
|
||||
=item $us->utf32be( $newval )
|
||||
|
||||
The string passed should be in the UTF-32 encoding with bytes in big
|
||||
endian order. The sample "µm" is "\0\0\0\xB5\0\0\0m" in this encoding.
|
||||
|
||||
Alternative names for this method are utf32() and ucs4().
|
||||
|
||||
=item $us->utf32le
|
||||
|
||||
=item $us->utf32le( $newval )
|
||||
|
||||
The string passed should be in the UTF-32 encoding with bytes in little
|
||||
endian order. The sample "µm" is is "\xB5\0\0\0m\0\0\0" in this encoding.
|
||||
|
||||
=item $us->utf16be
|
||||
|
||||
=item $us->utf16be( $newval )
|
||||
|
||||
The string passed should be in the UTF-16 encoding with bytes in big
|
||||
endian order. The sample "µm" is "\0\xB5\0m" in this encoding.
|
||||
|
||||
Alternative names for this method are utf16() and ucs2().
|
||||
|
||||
If the string passed to utf16be() starts with the Unicode byte order
|
||||
mark in little endian order, the result is as if utf16le() was called
|
||||
instead.
|
||||
|
||||
=item $us->utf16le
|
||||
|
||||
=item $us->utf16le( $newval )
|
||||
|
||||
The string passed should be in the UTF-16 encoding with bytes in
|
||||
little endian order. The sample "µm" is is "\xB5\0m\0" in this
|
||||
encoding. This is the encoding used by the Microsoft Windows API.
|
||||
|
||||
If the string passed to utf16le() starts with the Unicode byte order
|
||||
mark in big endian order, the result is as if utf16le() was called
|
||||
instead.
|
||||
|
||||
=item $us->utf8
|
||||
|
||||
=item $us->utf8( $newval )
|
||||
|
||||
The string passed should be in the UTF-8 encoding. The sample "µm" is
|
||||
"\xC2\xB5m" in this encoding.
|
||||
|
||||
=item $us->utf7
|
||||
|
||||
=item $us->utf7( $newval )
|
||||
|
||||
The string passed should be in the UTF-7 encoding. The sample "µm" is
|
||||
"+ALU-m" in this encoding.
|
||||
|
||||
|
||||
The UTF-7 encoding only use plain US-ASCII characters for the
|
||||
encoding. This makes it safe for transport through 8-bit stripping
|
||||
protocols. Characters outside the US-ASCII range are base64-encoded
|
||||
and '+' is used as an escape character. The UTF-7 encoding is
|
||||
described in RFC 1642.
|
||||
|
||||
If the (global) variable $Unicode::String::UTF7_OPTIONAL_DIRECT_CHARS
|
||||
is TRUE, then a wider range of characters are encoded as themselves.
|
||||
It is even TRUE by default. The characters affected by this are:
|
||||
|
||||
! " # $ % & * ; < = > @ [ ] ^ _ ` { | }
|
||||
|
||||
=item $us->latin1
|
||||
|
||||
=item $us->latin1( $newval )
|
||||
|
||||
The string passed should be in the ISO-8859-1 encoding. The sample "µm" is
|
||||
"\xB5m" in this encoding.
|
||||
|
||||
Characters outside the "\x00" .. "\xFF" range are simply removed from
|
||||
the return value of the latin1() method. If you want more control
|
||||
over the mapping from Unicode to ISO-8859-1, use the C<Unicode::Map8>
|
||||
class. This is also the way to deal with other 8-bit character sets.
|
||||
|
||||
=item $us->hex
|
||||
|
||||
=item $us->hex( $newval )
|
||||
|
||||
The string passed should be plain ASCII where each Unicode character
|
||||
is represented by the "U+XXXX" string and separated by a single space
|
||||
character. The "U+" prefix is optional when setting the value. The
|
||||
sample "µm" is "U+00b5 U+006d" in this encoding.
|
||||
|
||||
=back
|
||||
|
||||
=head2 String Operations
|
||||
|
||||
The following methods are available:
|
||||
|
||||
=over 4
|
||||
|
||||
=item $us->as_string
|
||||
|
||||
Converts a C<Unicode::String> to a plain string according to the
|
||||
setting of stringify_as(). The default stringify_as() encoding is
|
||||
"utf8".
|
||||
|
||||
=item $us->as_num
|
||||
|
||||
Converts a C<Unicode::String> to a number. Currently only the digits
|
||||
in the range 0x30 .. 0x39 are recognized. The plan is to eventually
|
||||
support all Unicode digit characters.
|
||||
|
||||
=item $us->as_bool
|
||||
|
||||
Converts a C<Unicode::String> to a boolean value. Only the empty
|
||||
string is FALSE. A string consisting of only the character U+0030 is
|
||||
considered TRUE, even if Perl consider "0" to be FALSE.
|
||||
|
||||
=item $us->repeat( $count )
|
||||
|
||||
Returns a new C<Unicode::String> where the content of $us is repeated
|
||||
$count times. This operation is also overloaded as:
|
||||
|
||||
$us x $count
|
||||
|
||||
=item $us->concat( $other_string )
|
||||
|
||||
Concatenates the string $us and the string $other_string. If
|
||||
$other_string is not an C<Unicode::String> object, then it is first
|
||||
passed to the Unicode::String->new constructor function. This
|
||||
operation is also overloaded as:
|
||||
|
||||
$us . $other_string
|
||||
|
||||
|
||||
=item $us->append( $other_string )
|
||||
|
||||
Appends the string $other_string to the value of $us. If
|
||||
$other_string is not an C<Unicode::String> object, then it is first
|
||||
passed to the Unicode::String->new constructor function. This
|
||||
operation is also overloaded as:
|
||||
|
||||
$us .= $other_string
|
||||
|
||||
=item $us->copy
|
||||
|
||||
Returns a copy of the current C<Unicode::String> object. This
|
||||
operation is overloaded as the assignment operator.
|
||||
|
||||
=item $us->length
|
||||
|
||||
Returns the length of the C<Unicode::String>. Surrogate pairs are
|
||||
still counted as 2.
|
||||
|
||||
=item $us->byteswap
|
||||
|
||||
This method will swap the bytes in the internal representation of the
|
||||
C<Unicode::String> object.
|
||||
|
||||
Unicode reserve the character U+FEFF character as a byte order mark.
|
||||
This works because the swapped character, U+FFFE, is reserved to not
|
||||
be valid. For strings that have the byte order mark as the first
|
||||
character, we can guaranty to get the byte order right with the
|
||||
following code:
|
||||
|
||||
$ustr->byteswap if $ustr->ord == 0xFFFE;
|
||||
|
||||
=item $us->unpack
|
||||
|
||||
Returns a list of integers each representing an UCS-2 character code.
|
||||
|
||||
=item $us->pack( @uchr )
|
||||
|
||||
Sets the value of $us as a sequence of UCS-2 characters with the
|
||||
characters codes given as parameter.
|
||||
|
||||
=item $us->ord
|
||||
|
||||
Returns the character code of the first character in $us. The ord()
|
||||
method deals with surrogate pairs, which gives us a result-range of
|
||||
0x0 .. 0x10FFFF. If the $us string is empty, undef is returned.
|
||||
|
||||
=item $us->chr( $code )
|
||||
|
||||
Sets the value of $us to be a string containing the character assigned
|
||||
code $code. The argument $code must be an integer in the range 0x0
|
||||
.. 0x10FFFF. If the code is greater than 0xFFFF then a surrogate pair
|
||||
created.
|
||||
|
||||
=item $us->name
|
||||
|
||||
In scalar context returns the official Unicode name of the first
|
||||
character in $us. In array context returns the name of all characters
|
||||
in $us. Also see L<Unicode::CharName>.
|
||||
|
||||
=item $us->substr( $offset )
|
||||
|
||||
=item $us->substr( $offset, $length )
|
||||
|
||||
=item $us->substr( $offset, $length, $subst )
|
||||
|
||||
Returns a sub-string of $us. Works similar to the builtin substr()
|
||||
function.
|
||||
|
||||
=item $us->index( $other )
|
||||
|
||||
=item $us->index( $other, $pos )
|
||||
|
||||
Locates the position of $other within $us, possibly starting the
|
||||
search at position $pos.
|
||||
|
||||
=item $us->chop
|
||||
|
||||
Chops off the last character of $us and returns it (as a
|
||||
C<Unicode::String> object).
|
||||
|
||||
=back
|
||||
|
||||
=head1 FUNCTIONS
|
||||
|
||||
The following functions are provided. None of these are exported by default.
|
||||
|
||||
=over 4
|
||||
|
||||
=item byteswap2( $str, ... )
|
||||
|
||||
This function will swap 2 and 2 bytes in the strings passed as
|
||||
arguments. If this function is called in void context,
|
||||
then it will modify its arguments in-place. Otherwise, the swapped
|
||||
strings are returned.
|
||||
|
||||
=item byteswap4( $str, ... )
|
||||
|
||||
The byteswap4 function works similar to byteswap2, but will reverse
|
||||
the order of 4 and 4 bytes.
|
||||
|
||||
=item latin1( $str )
|
||||
|
||||
=item utf7( $str )
|
||||
|
||||
=item utf8( $str )
|
||||
|
||||
=item utf16le( $str )
|
||||
|
||||
=item utf16be( $str )
|
||||
|
||||
=item utf32le( $str )
|
||||
|
||||
=item utf32be( $str )
|
||||
|
||||
Constructor functions for the various Unicode encodings. These return
|
||||
new C<Unicode::String> objects. The provided argument should be
|
||||
encoded correspondingly.
|
||||
|
||||
=item uhex( $str )
|
||||
|
||||
Constructs a new C<Unicode::String> object from a string of hex
|
||||
values. See hex() method above for description of the format.
|
||||
|
||||
=item uchar( $num )
|
||||
|
||||
Constructs a new one character C<Unicode::String> object from a
|
||||
Unicode character code. This works similar to perl's builtin chr()
|
||||
function.
|
||||
|
||||
=back
|
||||
|
||||
=head1 SEE ALSO
|
||||
|
||||
L<Unicode::CharName>,
|
||||
L<Unicode::Map8>
|
||||
|
||||
L<http://www.unicode.org/>
|
||||
|
||||
L<perlunicode>
|
||||
|
||||
=head1 COPYRIGHT
|
||||
|
||||
Copyright 1997-2000,2005 Gisle Aas.
|
||||
|
||||
This library is free software; you can redistribute it and/or
|
||||
modify it under the same terms as Perl itself.
|
||||
|
||||
=cut
|
||||
|
||||
|
||||
#
|
||||
# Some old code that is not used any more (because the methods are
|
||||
# now implemented as XS) and which I did not want to throw away yet.
|
||||
#
|
||||
|
||||
sub ucs4_inperl
|
||||
{
|
||||
my $self = shift;
|
||||
unless (ref $self) {
|
||||
my $u = new Unicode::String;
|
||||
$u->ucs4($self);
|
||||
return $u;
|
||||
}
|
||||
my $old = pack("N*", $self->ord);
|
||||
if (@_) {
|
||||
$$self = "";
|
||||
for (unpack("N*", shift)) {
|
||||
$self->append(uchr($_));
|
||||
}
|
||||
}
|
||||
$old;
|
||||
}
|
||||
|
||||
|
||||
sub utf8_inperl
|
||||
{
|
||||
my $self = shift;
|
||||
unless (ref $self) {
|
||||
# act as ctor
|
||||
my $u = new Unicode::String;
|
||||
$u->utf8($self);
|
||||
return $u;
|
||||
}
|
||||
|
||||
my $old;
|
||||
if (defined($$self) && defined wantarray) {
|
||||
# encode UTF-8
|
||||
my $uc;
|
||||
for $uc (unpack("n*", $$self)) {
|
||||
if ($uc < 0x80) {
|
||||
# 1 byte representation
|
||||
$old .= chr($uc);
|
||||
} elsif ($uc < 0x800) {
|
||||
# 2 byte representation
|
||||
$old .= chr(0xC0 | ($uc >> 6)) .
|
||||
chr(0x80 | ($uc & 0x3F));
|
||||
} else {
|
||||
# 3 byte representation
|
||||
$old .= chr(0xE0 | ($uc >> 12)) .
|
||||
chr(0x80 | (($uc >> 6) & 0x3F)) .
|
||||
chr(0x80 | ($uc & 0x3F));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (@_) {
|
||||
if (defined $_[0]) {
|
||||
$$self = "";
|
||||
my $bytes = shift;
|
||||
$bytes =~ s/^[\200-\277]+//; # can't start with 10xxxxxx
|
||||
while (length $bytes) {
|
||||
if ($bytes =~ s/^([\000-\177]+)//) {
|
||||
$$self .= pack("n*", unpack("C*", $1));
|
||||
} elsif ($bytes =~ s/^([\300-\337])([\200-\277])//) {
|
||||
my($b1,$b2) = (ord($1), ord($2));
|
||||
$$self .= pack("n", (($b1 & 0x1F) << 6) | ($b2 & 0x3F));
|
||||
} elsif ($bytes =~ s/^([\340-\357])([\200-\277])([\200-\277])//) {
|
||||
my($b1,$b2,$b3) = (ord($1), ord($2), ord($3));
|
||||
$$self .= pack("n", (($b1 & 0x0F) << 12) |
|
||||
(($b2 & 0x3F) << 6) |
|
||||
($b3 & 0x3F));
|
||||
} else {
|
||||
croak "Bad UTF-8 data";
|
||||
}
|
||||
}
|
||||
} else {
|
||||
$$self = undef;
|
||||
}
|
||||
}
|
||||
|
||||
$old;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
sub latin1_inperl
|
||||
{
|
||||
my $self = shift;
|
||||
unless (ref $self) {
|
||||
# act as ctor
|
||||
my $u = new Unicode::String;
|
||||
$u->latin1($self);
|
||||
return $u;
|
||||
}
|
||||
|
||||
my $old;
|
||||
# XXX: should really check that none of the chars > 256
|
||||
$old = pack("C*", unpack("n*", $$self)) if defined $$self;
|
||||
|
||||
if (@_) {
|
||||
# set the value
|
||||
if (defined $_[0]) {
|
||||
$$self = pack("n*", unpack("C*", $_[0]));
|
||||
} else {
|
||||
$$self = undef;
|
||||
}
|
||||
}
|
||||
$old;
|
||||
}
|
414
String.xs
414
String.xs
|
@ -1,414 +0,0 @@
|
|||
/* $Id$
|
||||
*
|
||||
* Copyright 1997-1999, Gisle Aas.
|
||||
*
|
||||
* This library is free software; you can redistribute it and/or
|
||||
* modify it under the same terms as Perl itself.
|
||||
*/
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
#include "EXTERN.h"
|
||||
#include "perl.h"
|
||||
#include "XSUB.h"
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#include "patchlevel.h"
|
||||
#if PATCHLEVEL <= 4 && !defined(PL_dowarn)
|
||||
#define PL_dowarn dowarn
|
||||
#endif
|
||||
|
||||
#ifdef G_WARN_ON
|
||||
#define DOWARN (PL_dowarn & G_WARN_ON)
|
||||
#else
|
||||
#define DOWARN PL_dowarn
|
||||
#endif
|
||||
|
||||
|
||||
MODULE = Unicode::String PACKAGE = Unicode::String
|
||||
|
||||
PROTOTYPES: DISABLE
|
||||
|
||||
SV*
|
||||
latin1(self,...)
|
||||
SV* self
|
||||
|
||||
PREINIT:
|
||||
SV* newsv;
|
||||
SV* str;
|
||||
|
||||
CODE:
|
||||
RETVAL = 0;
|
||||
if (!sv_isobject(self)) {
|
||||
newsv = self;
|
||||
RETVAL = self = newSV(0);
|
||||
newSVrv(self, "Unicode::String");
|
||||
} else if (items > 1) {
|
||||
newsv = ST(1);
|
||||
} else {
|
||||
newsv = 0;
|
||||
}
|
||||
|
||||
str = SvRV(self);
|
||||
if (GIMME_V != G_VOID && !RETVAL) {
|
||||
U8 *beg, *s;
|
||||
STRLEN len;
|
||||
U16* usp = (U16*)SvPV(str,len);
|
||||
len /= 2;
|
||||
RETVAL = newSV(len+1);
|
||||
SvPOK_on(RETVAL);
|
||||
beg = s = (U8*)SvPVX(RETVAL);
|
||||
while (len--) {
|
||||
U16 us = ntohs(*usp++);
|
||||
if (us > 255) {
|
||||
if (us == 0xFEFF) {
|
||||
/* ignore BYTE ORDER MARK */
|
||||
} else {
|
||||
if (DOWARN) warn("Data outside latin1 range (pos=%d, ch=U+%x)", s - beg, us);
|
||||
}
|
||||
} else {
|
||||
*s++ = us;
|
||||
}
|
||||
}
|
||||
SvCUR_set(RETVAL, s - beg);
|
||||
*s='\0';
|
||||
}
|
||||
|
||||
if (newsv) {
|
||||
U16 *usp;
|
||||
STRLEN len;
|
||||
STRLEN my_na;
|
||||
U8 *s = (U8*)SvPV(newsv, len);
|
||||
SvGROW(str, len*2 + 2);
|
||||
SvPOK_on(str);
|
||||
SvCUR_set(str,len*2);
|
||||
usp = (U16*)SvPV(str,my_na);
|
||||
while (len--) {
|
||||
*usp++ = htons((U16)*s++);
|
||||
}
|
||||
*usp = 0;
|
||||
}
|
||||
if (!RETVAL)
|
||||
RETVAL = newSViv(0);
|
||||
|
||||
OUTPUT:
|
||||
RETVAL
|
||||
|
||||
SV*
|
||||
ucs4(self,...)
|
||||
SV* self
|
||||
|
||||
PREINIT:
|
||||
SV* newsv;
|
||||
SV* str;
|
||||
|
||||
CODE:
|
||||
RETVAL = 0;
|
||||
if (!sv_isobject(self)) {
|
||||
newsv = self;
|
||||
RETVAL = self = newSV(0);
|
||||
newSVrv(self, "Unicode::String");
|
||||
} else if (items > 1) {
|
||||
newsv = ST(1);
|
||||
} else {
|
||||
newsv = 0;
|
||||
}
|
||||
|
||||
str = SvRV(self);
|
||||
|
||||
if (GIMME_V != G_VOID && !RETVAL) {
|
||||
U32* to, *beg;
|
||||
STRLEN len; /* source length */
|
||||
U16* from = (U16*)SvPV(str, len);
|
||||
STRLEN my_na;
|
||||
len /= 2;
|
||||
RETVAL = newSV(len*4 + 1);
|
||||
SvPOK_on(RETVAL);
|
||||
beg = to = (U32*)SvPV(RETVAL, my_na);
|
||||
while (len--) {
|
||||
U16 us = ntohs(*from++);
|
||||
if (us >= 0xD800 && us <= 0xDFFF) { /* surrogate */
|
||||
U16 low = len ? ntohs(*from) : 0;
|
||||
if (us >= 0xDC00 || low < 0xDC00 || low > 0xDFFF) {
|
||||
/* bad surrogate pair */
|
||||
if (DOWARN) warn("Bad surrogate pair U+%04x U+%04x", us, low);
|
||||
} else {
|
||||
len--; from++;
|
||||
*to++ = htonl((us-0xD800)*0x400 + low-0xDC00 + 0x10000);
|
||||
}
|
||||
} else {
|
||||
*to++ = htonl(us);
|
||||
}
|
||||
}
|
||||
SvCUR_set(RETVAL, (to - beg) * 4);
|
||||
SvPVX(RETVAL)[SvCUR(RETVAL)] = '\0';
|
||||
}
|
||||
|
||||
if (newsv) {
|
||||
STRLEN len;
|
||||
U32* from = (U32*)SvPV(newsv, len);
|
||||
len /= 4;
|
||||
SvGROW(str, len*2 + 1); /* enough if we don't need surrogates */
|
||||
SvPOK_on(str);
|
||||
SvCUR_set(str, 0);
|
||||
while (len--) {
|
||||
U32 uc = ntohl(*from++); /* XXX should look for swapped FEFF */
|
||||
if (uc > 0xFFFF) {
|
||||
if (uc > 0x10FFFF) {
|
||||
/* can't be represented */
|
||||
if (DOWARN) warn("UCS4 char (0x%08x) can not be encoded as UTF16", uc);
|
||||
} else {
|
||||
/* generate two surrogates */
|
||||
U16 high, low;
|
||||
uc -= 0x10000;
|
||||
high = htons(uc/0x400 + 0xD800);
|
||||
low = htons(uc%0x400 + 0xDC00);
|
||||
sv_catpvn(str, (char*)&high, 2);
|
||||
sv_catpvn(str, (char*)&low, 2);
|
||||
}
|
||||
} else {
|
||||
U16 s = htons(uc);
|
||||
sv_catpvn(str, (char*)&s, 2);
|
||||
}
|
||||
}
|
||||
/* ensure '\0' termination of string */
|
||||
SvGROW(str, SvCUR(str)+1);
|
||||
SvPVX(str)[SvCUR(str)] = '\0';
|
||||
}
|
||||
|
||||
if (!RETVAL)
|
||||
RETVAL = newSViv(0);
|
||||
|
||||
OUTPUT:
|
||||
RETVAL
|
||||
|
||||
|
||||
SV*
|
||||
utf8(self,...)
|
||||
SV* self
|
||||
|
||||
PREINIT:
|
||||
SV* newsv;
|
||||
SV* str;
|
||||
|
||||
CODE:
|
||||
RETVAL = 0;
|
||||
if (!sv_isobject(self)) {
|
||||
newsv = self;
|
||||
RETVAL = self = newSV(0);
|
||||
newSVrv(self, "Unicode::String");
|
||||
} else if (items > 1) {
|
||||
newsv = ST(1);
|
||||
} else {
|
||||
newsv = 0;
|
||||
}
|
||||
|
||||
str = SvRV(self);
|
||||
if (GIMME_V != G_VOID && !RETVAL) {
|
||||
/* encode str */
|
||||
STRLEN len;
|
||||
U16* from = (U16*)SvPV(str, len);
|
||||
len /= 2;
|
||||
RETVAL = newSV(len*1.2 + 1); /* guess osuitable for euro-text */
|
||||
SvPOK_on(RETVAL);
|
||||
SvCUR_set(RETVAL, 0);
|
||||
while (len--) {
|
||||
register U32 us = ntohs(*from++);
|
||||
if (us >= 0xD800 && us <= 0xDFFF) { /* surrogate */
|
||||
U16 low = len ? ntohs(*from) : 0;
|
||||
if (us >= 0xDC00 || low < 0xDC00 || low > 0xDFFF) {
|
||||
/* bad surrogate pair */
|
||||
if (DOWARN) warn("Bad surrogate pair U+%04x U+%04x", us, low);
|
||||
} else {
|
||||
len--; from++;
|
||||
us = (us-0xD800)*0x400 + low-0xDC00 + 0x10000;
|
||||
}
|
||||
}
|
||||
if (us < 0x80) {
|
||||
U8 c = us;
|
||||
sv_catpvn(RETVAL, (char*)&c, 1);
|
||||
} else if (us < 0x800) {
|
||||
U8 c[2];
|
||||
c[1] = (us & 0077) | 0200;
|
||||
c[0] = (us >> 6) | 0300;
|
||||
sv_catpvn(RETVAL, (char*)c, 2);
|
||||
} else if (us < 0x10000) {
|
||||
U8 c[3];
|
||||
c[2] = (us & 0077) | 0200; us >>= 6;
|
||||
c[1] = (us & 0077) | 0200; us >>= 6;
|
||||
c[0] = us | 0340;
|
||||
sv_catpvn(RETVAL, (char*)c, 3);
|
||||
} else if (us < 0x200000) {
|
||||
U8 c[4];
|
||||
c[3] = (us & 0077) | 0200; us >>= 6;
|
||||
c[2] = (us & 0077) | 0200; us >>= 6;
|
||||
c[1] = (us & 0077) | 0200; us >>= 6;
|
||||
c[0] = us | 0360;
|
||||
sv_catpvn(RETVAL, (char*)c, 4);
|
||||
} else {
|
||||
/* this can't really happen since we start with utf16 */
|
||||
if (DOWARN) warn("Large char (%08X) ignored", us);
|
||||
}
|
||||
}
|
||||
/* ensure '\0' termination of string */
|
||||
SvGROW(str, SvCUR(str)+1);
|
||||
SvPVX(str)[SvCUR(str)] = '\0';
|
||||
}
|
||||
|
||||
if (newsv) {
|
||||
/* decode new */
|
||||
STRLEN len;
|
||||
U8* from = (U8*)SvPV(newsv, len);
|
||||
SvGROW(str, len + 1); /* must be at least this big */
|
||||
SvPOK_on(str);
|
||||
SvCUR_set(str, 0);
|
||||
while (len--) {
|
||||
U8 s[2];
|
||||
U8 u = *from++;
|
||||
if (u < 0x80) {
|
||||
s[0] = '\0';
|
||||
s[1] = u;
|
||||
sv_catpvn(str, (char*)s, 2);
|
||||
} else if ((u & 0340) == 0300) {
|
||||
/* 2 bytes to decode */
|
||||
if (!len) {
|
||||
if (DOWARN) warn("Missing second byte of utf8 encoded char");
|
||||
} else {
|
||||
U8 u2 = *from;
|
||||
if ((u2 & 0300) != 0200) {
|
||||
if (DOWARN) warn("Bad second byte of utf8 encoded char");
|
||||
} else {
|
||||
from++; len--; /* consume it */
|
||||
s[0] = (u & 0037) >> 2;
|
||||
s[1] = ((u & 0003) << 6) | (u2 & 0077);
|
||||
sv_catpvn(str, (char*)s, 2);
|
||||
}
|
||||
}
|
||||
} else if ((u & 0360) == 0340) {
|
||||
/* 3 bytes to decode */
|
||||
if (len < 2) {
|
||||
if (DOWARN) warn("Missing 2nd or 3rd byte of utf8 encoded char");
|
||||
} else {
|
||||
U8 u2 = from[0];
|
||||
U8 u3 = from[1];
|
||||
if ((u2 & 0300) != 0200 || (u3 & 0300) != 0200) {
|
||||
if (DOWARN) warn("Bad 2nd or 3rd byte of utf8 encoded char");
|
||||
} else {
|
||||
from += 2; len -= 2; /* consume them */
|
||||
s[0] = (u << 4) | (u2 & 0077) >> 2;
|
||||
s[1] = (u2 << 6) | (u3 & 0077);
|
||||
sv_catpvn(str, (char*)s, 2);
|
||||
}
|
||||
}
|
||||
} else if ((u & 0370) == 0360) {
|
||||
/* 4 bytes to decode, encoded using surrogates */
|
||||
if (len < 3) {
|
||||
if (DOWARN) warn("Missing 2nd, 3rd or 4th byte of utf8 encoded char");
|
||||
} else {
|
||||
if ((from[0] & 0300) != 0200 ||
|
||||
(from[1] & 0300) != 0200 ||
|
||||
(from[2] & 0300) != 0200)
|
||||
{
|
||||
if (DOWARN) warn("Bad 2nd, 3rd or 4th byte of utf8 encoded char");
|
||||
} else {
|
||||
U32 c = (u & 0007) << 6;
|
||||
c |= (from[0] & 0077); c <<= 6;
|
||||
c |= (from[1] & 0077); c <<= 6;
|
||||
c |= (from[2] & 0077);
|
||||
from += 3; len -= 3;
|
||||
/* c must now be encoded as two surrogates */
|
||||
if (c > 0x10FFFF) {
|
||||
if (DOWARN) warn("Can't represent 0x%08X as utf16", c);
|
||||
} else {
|
||||
/* generate two surrogates */
|
||||
U16 high, low;
|
||||
c -= 0x10000;
|
||||
high = htons(c/0x400 + 0xD800);
|
||||
low = htons(c%0x400 + 0xDC00);
|
||||
sv_catpvn(str, (char*)&high, 2);
|
||||
sv_catpvn(str, (char*)&low, 2);
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if ((u & 0374) == 0370) {
|
||||
/* 5 bytes to decode, can't happend */
|
||||
if (DOWARN) warn("Can't represent 5 byte encoded chars");
|
||||
} else {
|
||||
if (DOWARN) warn("Bad utf8 byte (0x%02X) ignored", u);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!RETVAL)
|
||||
RETVAL = newSViv(0);
|
||||
|
||||
OUTPUT:
|
||||
RETVAL
|
||||
|
||||
void
|
||||
byteswap2(...)
|
||||
ALIAS:
|
||||
Unicode::String::byteswap2 = 2
|
||||
Unicode::String::byteswap4 = 4
|
||||
|
||||
PREINIT:
|
||||
int i;
|
||||
char c;
|
||||
STRLEN len;
|
||||
char* str;
|
||||
|
||||
PPCODE:
|
||||
for (i = 0; i < items; i++) {
|
||||
SV* sv = ST(i);
|
||||
STRLEN len;
|
||||
char* src = SvPV(sv, len);
|
||||
char* dest;
|
||||
|
||||
if (GIMME_V != G_VOID) {
|
||||
SV* dest_sv = sv_2mortal(newSV(len+1));
|
||||
SvCUR_set(dest_sv, len);
|
||||
*SvEND(dest_sv) = 0;
|
||||
SvPOK_on(dest_sv);
|
||||
PUSHs(dest_sv);
|
||||
dest = SvPVX(dest_sv);
|
||||
} else {
|
||||
if (SvREADONLY(sv)) {
|
||||
die("byteswap argument #%d is readonly", i+1);
|
||||
continue; /* probably not */
|
||||
}
|
||||
dest = src;
|
||||
}
|
||||
|
||||
if (ix == 2) {
|
||||
while (len >= 2) {
|
||||
char tmp = *src++;
|
||||
*dest++ = *src++;
|
||||
*dest++ = tmp;
|
||||
len -= 2;
|
||||
}
|
||||
}
|
||||
else { /* ix == 4 */
|
||||
while (len >= 4) {
|
||||
char tmp1 = *src++;
|
||||
char tmp2 = *src++;
|
||||
*dest++ = src[1];
|
||||
*dest++ = src[0];
|
||||
src += 2;
|
||||
*dest++ = tmp2;
|
||||
*dest++ = tmp1;
|
||||
len -= 4;
|
||||
}
|
||||
}
|
||||
|
||||
if (len) {
|
||||
if (DOWARN)
|
||||
warn("byteswap argument #%d not long enough", i+1);
|
||||
|
||||
/* this will be a no-op unless dest/src are different */
|
||||
while (len--)
|
||||
*dest++ = *src++;
|
||||
}
|
||||
}
|
|
@ -1,8 +0,0 @@
|
|||
libunicode-string-perl for Debian
|
||||
|
||||
Please edit this to provide information specific to
|
||||
this libunicode-string-perl Debian package.
|
||||
|
||||
(Automatically generated by debmake Version 4.3.1)
|
||||
|
||||
-- denghao <denghao@kylinos.cn> Mon, 19 Sep 2022 11:13:20 +0300
|
|
@ -1,5 +0,0 @@
|
|||
libunicode-string-perl (2.10-ok1) yangtze; urgency=medium
|
||||
|
||||
* Build for openkylin.
|
||||
|
||||
-- denghao <denghao@kylinos.cn> Mon, 19 Sep 2022 11:13:20 +0300
|
|
@ -1,30 +0,0 @@
|
|||
Source: libunicode-string-perl
|
||||
Section: perl
|
||||
Priority: optional
|
||||
Maintainer: OpenKylin Developers <packaging@lists.openkylin.top>
|
||||
Build-Depends: debhelper-compat (=13),
|
||||
perl-xs-dev,
|
||||
perl:native
|
||||
Testsuite: autopkgtest-pkg-perl
|
||||
Standards-Version: 3.9.8
|
||||
Vcs-Browser: https://gitee.com/openkylin/libunicode-string-perl
|
||||
Vcs-Git: https://gitee.com/openkylin/libunicode-string-perl.git
|
||||
Homepage: https://metacpan.org/release/Unicode-String
|
||||
|
||||
Package: libunicode-string-perl
|
||||
Architecture: any
|
||||
Depends: ${misc:Depends},
|
||||
${perl:Depends},
|
||||
${shlibs:Depends}
|
||||
Description: Older Perl modules for Unicode string manipulation
|
||||
This package provides Unicode::String, which manipulates strings of
|
||||
characters in various Unicode representations, and Unicode::CharName,
|
||||
which maps Unicode characters to official character names.
|
||||
.
|
||||
These modules predate native Unicode support inside Perl. Normally, the
|
||||
integrated Perl Unicode support and modules such as Encode should be used
|
||||
instead of these modules. They are provided primarily for their
|
||||
additional utility functions and to support other Perl code with
|
||||
dependencies on these modules. Be aware that Unicode::CharName only
|
||||
includes Unicode characters up to Unicode 4.1 and the character name
|
||||
tables included with Perl are more complete.
|
|
@ -1,43 +0,0 @@
|
|||
Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
|
||||
Upstream-Name: libunicode-string-perl
|
||||
Source: <url://example.com>
|
||||
#
|
||||
# Please double check copyright with the licensecheck(1) command.
|
||||
|
||||
Files: Changes
|
||||
MANIFEST
|
||||
META.json
|
||||
META.yml
|
||||
Makefile.PL
|
||||
t/byteswap.t
|
||||
t/charname.t
|
||||
t/encoding.t
|
||||
t/op.t
|
||||
t/ord.t
|
||||
t/stringify.t
|
||||
Copyright: __NO_COPYRIGHT_NOR_LICENSE__
|
||||
License: __NO_COPYRIGHT_NOR_LICENSE__
|
||||
|
||||
Files: README
|
||||
String.xs
|
||||
Copyright: 1997-2005 Gisle Aas.
|
||||
License: __UNKNOWN__
|
||||
This library is free software; you can redistribute it and/or
|
||||
modify it under the same terms as Perl itself.
|
||||
|
||||
Files: lib/Unicode/CharName.pm
|
||||
Copyright: 1997-2005 Gisle Aas.
|
||||
License: __UNKNOWN__
|
||||
This library is free software; you can redistribute it and/or
|
||||
modify it under the same terms as Perl itself.
|
||||
.
|
||||
Name table extracted from the Unicode 4.1 Character
|
||||
Database. Copyright (c) 1991-2005 Unicode, Inc.
|
||||
|
||||
Files: String.pm
|
||||
Copyright: 1997-1999 Gisle Aas.
|
||||
License: __NO_LICENSE__
|
||||
|
||||
#----------------------------------------------------------------------------
|
||||
# Files marked as NO_LICENSE_TEXT_FOUND may be covered by the following
|
||||
# license/copyright files.
|
|
@ -1 +0,0 @@
|
|||
README
|
|
@ -1 +0,0 @@
|
|||
# You must remove unused comment lines for the released package.
|
|
@ -1,6 +0,0 @@
|
|||
#!/usr/bin/make -f
|
||||
|
||||
export DEB_BUILD_MAINT_OPTIONS = hardening=+all
|
||||
|
||||
%:
|
||||
dh $@
|
|
@ -1 +0,0 @@
|
|||
3.0 (native)
|
|
@ -1,2 +0,0 @@
|
|||
version=4
|
||||
https://metacpan.org/release/Unicode-String .*/Unicode-String-v?@ANY_VERSION@@ARCHIVE_EXT@$
|
16680
lib/Unicode/CharName.pm
16680
lib/Unicode/CharName.pm
File diff suppressed because it is too large
Load Diff
Binary file not shown.
|
@ -0,0 +1 @@
|
|||
de151d53d3ec9ed5d88ebd60ab58b0ec06bac020
|
49
t/byteswap.t
49
t/byteswap.t
|
@ -1,49 +0,0 @@
|
|||
print "1..5\n";
|
||||
|
||||
use Unicode::String qw(latin1 byteswap2 byteswap4);
|
||||
|
||||
$a = "12345678";
|
||||
$b = "abcdefg";
|
||||
|
||||
my $warn = 0;
|
||||
$SIG{__WARN__} = sub { print @_; $warn++; };
|
||||
|
||||
|
||||
@a = byteswap2($a, $b);
|
||||
|
||||
print "not " unless @a == 2 &&
|
||||
$a eq "12345678" &&
|
||||
$b eq "abcdefg" &&
|
||||
$a[0] eq "21436587" &&
|
||||
$a[1] eq "badcfeg" &&
|
||||
$warn == ($^W ? 1 : 0);
|
||||
print "ok 1\n";
|
||||
$warn = 0;
|
||||
|
||||
@a = byteswap4($a, $b);
|
||||
|
||||
print "not " unless @a == 2 &&
|
||||
$a eq "12345678" &&
|
||||
$b eq "abcdefg" &&
|
||||
$a[0] eq "43218765" &&
|
||||
$a[1] eq "dcbaefg" &&
|
||||
$warn == ($^W ? 1 : 0);
|
||||
print "ok 2\n";
|
||||
|
||||
# Try in-place change
|
||||
byteswap2($a);
|
||||
print "not " unless $a eq "21436587";
|
||||
print "ok 3\n";
|
||||
|
||||
print "not " unless byteswap2(byteswap2($a)) eq $a;
|
||||
print "ok 4\n";
|
||||
|
||||
# Try object method
|
||||
|
||||
$u = latin1("abc");
|
||||
#print $u->hex, "\n";
|
||||
$u->byteswap;
|
||||
#print $u->hex, "\n";
|
||||
|
||||
print "not " unless $u->ucs2 eq "a\0b\0c\0";
|
||||
print "ok 5\n";
|
49
t/charname.t
49
t/charname.t
|
@ -1,49 +0,0 @@
|
|||
print "1..13\n";
|
||||
|
||||
use Unicode::CharName qw(ublock uname);
|
||||
|
||||
#print uname(ord("å")), "\n";
|
||||
#print ublock(ord("å")), "\n";
|
||||
|
||||
# Test a few simple names
|
||||
print "not " unless uname(ord("\$")) eq "DOLLAR SIGN";
|
||||
print "ok 1\n";
|
||||
|
||||
print "not " unless uname(ord("å")) eq "LATIN SMALL LETTER A WITH RING ABOVE";
|
||||
print "ok 2\n";
|
||||
|
||||
# Test ideograph name generation
|
||||
print "not " unless uname(0x7C80) eq "CJK UNIFIED IDEOGRAPH 7C80";
|
||||
print "ok 3\n";
|
||||
|
||||
# Test Hangul Syllable name generation
|
||||
print "not " unless uname(0x1111) eq "HANGUL CHOSEONG PHIEUPH";
|
||||
print "ok 4\n";
|
||||
|
||||
print "not " unless uname(0xD4DB) eq "HANGUL SYLLABLE PWILH";
|
||||
print "ok 5\n";
|
||||
|
||||
# Some various stuff
|
||||
print "not " unless uname(0xF000) eq "<private>";
|
||||
print "ok 6\n";
|
||||
print "not " unless uname(0) eq "<control>";
|
||||
print "ok 7\n";
|
||||
print "not " unless uname(0xD800) eq "<surrogate>";
|
||||
print "ok 8\n";
|
||||
|
||||
# Test ublock function
|
||||
|
||||
print "not " unless ublock(ord("a")) eq "Basic Latin";
|
||||
print "ok 9\n";
|
||||
|
||||
print "not " unless ublock(0x2190) eq "Arrows" and ublock(0x21FF) eq "Arrows";
|
||||
print "ok 10\n";
|
||||
|
||||
print "not " unless ublock(0xFFFF) eq "Specials";
|
||||
print "ok 11\n";
|
||||
|
||||
print "not " if defined ublock(0x30000);
|
||||
print "ok 12\n";
|
||||
|
||||
print "not " unless uname(0x1d1cf) eq "MUSICAL SYMBOL CROIX";
|
||||
print "ok 13\n";
|
240
t/encoding.t
240
t/encoding.t
|
@ -1,240 +0,0 @@
|
|||
#!perl -w
|
||||
|
||||
use strict;
|
||||
use Test qw(plan ok);
|
||||
|
||||
plan tests => 59;
|
||||
|
||||
use Unicode::String qw(latin1 ucs4 utf32le utf16 utf16le utf8 utf7);
|
||||
|
||||
#use Devel::Dump;
|
||||
|
||||
$SIG{__WARN__} = sub { print "$_[0]"; };
|
||||
|
||||
my $u = latin1("abcæøå");
|
||||
#Dump($u);
|
||||
|
||||
#---- Test Latin1 encoding ----
|
||||
|
||||
ok($u->latin1, "abcæøå");
|
||||
ok($u->length, 6);
|
||||
ok($u->ucs4, "\0\0\0a\0\0\0b\0\0\0c\0\0\0æ\0\0\0ø\0\0\0å");
|
||||
ok($u->utf32, "\0\0\0a\0\0\0b\0\0\0c\0\0\0æ\0\0\0ø\0\0\0å");
|
||||
ok($u->utf32be, "\0\0\0a\0\0\0b\0\0\0c\0\0\0æ\0\0\0ø\0\0\0å");
|
||||
ok($u->utf32le, "a\0\0\0b\0\0\0c\0\0\0æ\0\0\0ø\0\0\0å\0\0\0");
|
||||
ok($u->utf16, "\0a\0b\0c\0æ\0ø\0å");
|
||||
ok($u->utf16be, "\0a\0b\0c\0æ\0ø\0å");
|
||||
ok($u->ucs2, "\0a\0b\0c\0æ\0ø\0å");
|
||||
ok($u->utf16le, "a\0b\0c\0æ\0ø\0å\0");
|
||||
ok($u->utf8, "abcæøå");
|
||||
ok($u->utf7, "abc+AOYA+ADl-");
|
||||
ok($u->hex, "U+0061 U+0062 U+0063 U+00e6 U+00f8 U+00e5");
|
||||
|
||||
$u = latin1("abc");
|
||||
$a = $u->latin1("def");
|
||||
$b = $u->latin1;
|
||||
$u->latin1("ghi");
|
||||
|
||||
ok($a, "abc");
|
||||
ok($b, "def");
|
||||
ok($u->latin1, "ghi");
|
||||
|
||||
|
||||
$u = utf16("aa\0bcc\0d");
|
||||
|
||||
print "Expect 2 lines of warnings...\n";
|
||||
my $x = $u->latin1;
|
||||
|
||||
ok($x, "bd");
|
||||
|
||||
#---- Test UCS4 encoding ----
|
||||
|
||||
$x = "\0\0\0a\0\0bb\0\3cc\0\1\0\2\0\0\0\0";
|
||||
$u = ucs4($x);
|
||||
|
||||
ok($u->length, 7);
|
||||
ok($u->hex, "U+0061 U+6262 U+d898 U+df63 U+d800 U+dc02 U+0000");
|
||||
ok($u->ucs4, $x);
|
||||
|
||||
$a = $u->ucs4("");
|
||||
ok($a, $x);
|
||||
ok($u->length, 0);
|
||||
|
||||
$u = utf32le("a\0\0\0" . "bb\0\0" . "cc\3\0" . "\2\0\1\0" . "\0\0\0\0");
|
||||
ok($u->length, 7);
|
||||
ok($u->hex, "U+0061 U+6262 U+d898 U+df63 U+d800 U+dc02 U+0000");
|
||||
ok($u->ucs4, $x);
|
||||
|
||||
print "Expect 2 lines of warnings...\n";
|
||||
$u->ucs4(" \0\x10\xff\xff\0\x11\0\0\0\0\0\0");
|
||||
ok($u->hex, "U+dbff U+dfff U+0000");
|
||||
|
||||
#--- Test UTF8 encoding ---
|
||||
|
||||
$u = utf8("");
|
||||
ok($u->length, 0);
|
||||
ok($u->utf8, "");
|
||||
|
||||
$u = utf8("abc");
|
||||
my $old = $u->utf8("def");
|
||||
ok($old, "abc");
|
||||
ok($u->latin1, "def");
|
||||
|
||||
$u = utf16("\0a\0å\1\0\7a\0aa");
|
||||
$x = unpack("H*", $u->utf8);
|
||||
ok($x, "61c3a5c480dda161e68480");
|
||||
|
||||
my $u2 = utf8($u->utf8);
|
||||
ok($u->utf16, $u2->utf16);
|
||||
|
||||
# Test surrogates and utf8
|
||||
print "Surrogates...\n";
|
||||
|
||||
$u = ucs4("\0\1\0\0\0\x10\xFF\xFF");
|
||||
$x = unpack("H*", $u->utf8);
|
||||
ok($x, "f0908080f48fbfbf");
|
||||
|
||||
$u->utf8(pack("H*", $x));
|
||||
ok($u->ucs4, "\0\1\0\0\0\x10\xFF\xFF");
|
||||
|
||||
print "Expect a warning with this incomplete surrogate pair...\n";
|
||||
$u = utf16("\xd8\x00");
|
||||
$u2 = utf8($u->utf8);
|
||||
ok($u2->hex, "U+d800");
|
||||
|
||||
print "...and lots of noice from this...\n";
|
||||
$u = utf8("¤¤a\xf7¤¤¤b\xf8¤¤¤¤c\xfc¤¤¤¤¤d\xfd\xfe\xffef");
|
||||
print $u->hex, "\n";
|
||||
|
||||
ok($u->utf8, "abcdef");
|
||||
|
||||
|
||||
#--- Test UTF7 encoding ---
|
||||
|
||||
# Examples from RFC 1642...
|
||||
#
|
||||
# Example. The Unicode sequence "A<NOT IDENTICAL TO><ALPHA>."
|
||||
# (hexadecimal 0041,2262,0391,002E) may be encoded as follows:
|
||||
#
|
||||
# A+ImIDkQ.
|
||||
#
|
||||
# Example. The Unicode sequence "Hi Mom <WHITE SMILING FACE>!"
|
||||
# (hexadecimal 0048, 0069, 0020, 004D, 006F, 004D, 0020, 263A, 0021)
|
||||
# may be encoded as follows:
|
||||
#
|
||||
# Hi Mom +Jjo-!
|
||||
|
||||
$u = utf7("A+ImIDkQ.");
|
||||
ok($u->hex, "U+0041 U+2262 U+0391 U+002e");
|
||||
|
||||
my $utf7 = $u->utf7("Hi Mom +Jjo-!");
|
||||
ok($utf7, qr/^A\+ImIDkQ-?\.$/);
|
||||
|
||||
ok($u->hex, "U+0048 U+0069 U+0020 U+004d U+006f U+006d U+0020 U+263a U+0021");
|
||||
ok($u->utf7 eq "Hi Mom +Jjo-!" || $u->utf7 eq "Hi Mom +JjoAIQ-");
|
||||
|
||||
# Example. The Unicode sequence representing the Han characters for
|
||||
# the Japanese word "nihongo" (hexadecimal 65E5,672C,8A9E) may be
|
||||
# encoded as follows:
|
||||
|
||||
$u = utf7("+ZeVnLIqe-");
|
||||
ok($u->hex, "U+65e5 U+672c U+8a9e");
|
||||
ok($u->utf7, "+ZeVnLIqe-");
|
||||
|
||||
# Appendix A -- Examples
|
||||
#
|
||||
# Here is a longer example, taken from a document originally in Big5
|
||||
# code. It has been condensed for brevity. There are two versions: the
|
||||
# first uses optional characters from set O (and thus may not pass
|
||||
# through some mail gateways), and the second uses no optional
|
||||
# characters.
|
||||
|
||||
my $text = <<'EOT';
|
||||
Below is the full Chinese text of the Analects (+itaKng-).
|
||||
|
||||
The sources for the text are:
|
||||
|
||||
"The sayings of Confucius," James R. Ware, trans. +U/BTFw-:
|
||||
+ZYeB9FH6ckh5Pg-, 1980. (Chinese text with English translation)
|
||||
|
||||
+Vttm+E6UfZM-, +W4tRQ066bOg-, +UxdOrA-: +Ti1XC2b4Xpc-, 1990.
|
||||
|
||||
"The Chinese Classics with a Translation, Critical and
|
||||
Exegetical Notes, Prolegomena, and Copius Indexes," James
|
||||
Legge, trans., Taipei: Southern Materials Center Publishing,
|
||||
Inc., 1991. (Chinese text with English translation)
|
||||
|
||||
Big Five and GB versions of the text are being made available
|
||||
separately.
|
||||
|
||||
Neither the Big Five nor GB contain all the characters used in
|
||||
this text. Missing characters have been indicated using their
|
||||
Unicode/ISO 10646 code points. "U+-" followed by four
|
||||
hexadecimal digits indicates a Unicode/10646 code (e.g.,
|
||||
U+-9F08). There is no good solution to the problem of the small
|
||||
size of the Big Five/GB character sets; this represents the
|
||||
solution I find personally most satisfactory.
|
||||
|
||||
(omitted...)
|
||||
|
||||
I have tried to minimize this problem by using variant
|
||||
characters where they were available and the character
|
||||
actually in the text was not. Only variants listed as such in
|
||||
the +XrdxmVtXUXg- were used.
|
||||
|
||||
(omitted...)
|
||||
|
||||
John H. Jenkins
|
||||
+TpVPXGBG-
|
||||
John_Jenkins@taligent.com
|
||||
5 January 1993
|
||||
EOT
|
||||
|
||||
$u = utf7($text);
|
||||
my $utf = $u->utf7;
|
||||
|
||||
unless ($utf eq $text) {
|
||||
print $u->length, " $utf\n";
|
||||
open(F, ">utf7-$$.orig"); print F $text;
|
||||
open(F, ">utf7-$$.enc"); print F $utf;
|
||||
close(F);
|
||||
system("diff -u0 utf7-$$.orig utf7-$$.enc");
|
||||
unlink("utf7-$$.orig", "utf7-$$.enc");
|
||||
}
|
||||
|
||||
ok($utf, $text);
|
||||
|
||||
# Test encoding of different encoding byte lengths
|
||||
for my $len (1 .. 6) {
|
||||
$u = Unicode::String->new;
|
||||
$u->pack(map {1000 + $_} 1 .. $len);
|
||||
$u2 = utf7($u->utf7);
|
||||
ok($u->utf16, $u2->utf16);
|
||||
}
|
||||
|
||||
$Unicode::String::UTF7_OPTIONAL_DIRECT_CHARS = 0;
|
||||
|
||||
$u = latin1("a=4!æøå");
|
||||
$utf = $u->utf7;
|
||||
|
||||
ok($utf7 !~ /[=!]/);
|
||||
ok(utf7($utf)->latin1, "a=4!æøå");
|
||||
|
||||
#--- Swapped bytes ---
|
||||
|
||||
$u = utf16("ÿþa\0b\0c\0");
|
||||
ok($u->hex, "U+feff U+0061 U+0062 U+0063");
|
||||
ok($u->latin1, "abc");
|
||||
|
||||
$u = utf16("þÿ\0a\0b\0c");
|
||||
ok($u->hex, "U+feff U+0061 U+0062 U+0063");
|
||||
ok($u->latin1, "abc");
|
||||
|
||||
$u = utf16le("ÿþa\0b\0c\0");
|
||||
ok($u->hex, "U+feff U+0061 U+0062 U+0063");
|
||||
ok($u->latin1, "abc");
|
||||
|
||||
$u = utf16le("þÿ\0a\0b\0c");
|
||||
ok($u->hex, "U+feff U+0061 U+0062 U+0063");
|
||||
ok($u->latin1, "abc");
|
||||
|
103
t/op.t
103
t/op.t
|
@ -1,103 +0,0 @@
|
|||
# Test string operations
|
||||
|
||||
print "1..22\n";
|
||||
|
||||
use Unicode::String qw(utf8 utf16);
|
||||
|
||||
$u = utf8("abc");
|
||||
$u->append(utf8("def"));
|
||||
|
||||
print "not " unless $u->utf8 eq "abcdef";
|
||||
print "ok 1\n";
|
||||
|
||||
$x = $u->copy;
|
||||
print "not " unless overload::StrVal($x) ne overload::StrVal($u);
|
||||
print "ok 2\n";
|
||||
|
||||
print $u->hex, "\n";
|
||||
print $x->hex, "\n";
|
||||
|
||||
print "not " unless $x->hex eq $u->hex;
|
||||
print "ok 3\n";
|
||||
|
||||
$x->append(utf8("g"));
|
||||
|
||||
print "not " unless $x->utf8 eq "abcdefg";
|
||||
print "ok 4\n";
|
||||
|
||||
$y = $x->repeat(3);
|
||||
print $u->hex, "\n";
|
||||
print $x->hex, "\n";
|
||||
print $y->utf8, "\n";
|
||||
|
||||
print "not " unless $y->utf8 eq ("abcdefg" x 3);
|
||||
print "ok 5\n";
|
||||
|
||||
$y = $x->concat($u);
|
||||
print "not " unless $y->utf8 eq "abcdefgabcdef" &&
|
||||
$x->utf8 eq "abcdefg" && $u->utf8 eq "abcdef";
|
||||
print "ok 6\n";
|
||||
|
||||
$x = utf8("123");
|
||||
print "not " unless $x->as_num == 123;
|
||||
print "ok 7\n";
|
||||
|
||||
$x = utf8("");
|
||||
print "not " if $x->as_bool;
|
||||
print "ok 8\n";
|
||||
|
||||
$x = utf8("0");
|
||||
print "not " if !$x->as_bool;
|
||||
print "ok 9\n";
|
||||
|
||||
$x = utf8("abc");
|
||||
print "not " if !$x->as_bool;
|
||||
print "ok 10\n";
|
||||
|
||||
#--- substr ---
|
||||
|
||||
$y = $x->substr(0,1);
|
||||
print "not " unless $y->utf8 eq "a" && $x->utf8 eq "abc";
|
||||
print "ok 11\n";
|
||||
|
||||
$y = $x->substr(1);
|
||||
print "not " unless $y->utf8 eq "bc" && $x->utf8 eq "abc";
|
||||
print "ok 12\n";
|
||||
|
||||
$x = utf8("0123456789");
|
||||
print "not " unless $x->substr(2,3)->utf8 eq "234";
|
||||
print "ok 13\n";
|
||||
|
||||
print "not " unless $x->substr->utf8 eq $x->utf8;
|
||||
print "ok 14\n";
|
||||
|
||||
print "not " unless $x->substr(3,0)->length == 0;
|
||||
print "ok 15\n";
|
||||
|
||||
print "not " unless $x->substr(8, 100)->utf8 eq "89";
|
||||
print "ok 16\n";
|
||||
|
||||
#--- index/rindex ---
|
||||
|
||||
print "not " unless $x->index(utf8("345")) == 3;
|
||||
print "ok 17\n";
|
||||
|
||||
print "not " unless $x->index(utf8("356")) == -1;
|
||||
print "ok 18\n";
|
||||
|
||||
print "not " unless $x->index(utf8("0")) == 0;
|
||||
print "ok 19\n";
|
||||
|
||||
print "not " unless $x->index(utf8("0"), 1) == -1;
|
||||
print "ok 20\n";
|
||||
|
||||
# Test some more interresting patterns
|
||||
$x = utf16("abbaabbbaa");
|
||||
print "not " unless $x->index(utf16("bb")) == 3;
|
||||
print "ok 21\n";
|
||||
|
||||
#--- overload -->
|
||||
print "not " unless (utf8("a") . utf8("b"))->utf8 eq "ab" &&
|
||||
("a" . utf8("b"))->utf8 eq "ab" &&
|
||||
(utf8("a") . "b")->utf8 eq "ab";
|
||||
print "ok 22\n";
|
70
t/ord.t
70
t/ord.t
|
@ -1,70 +0,0 @@
|
|||
# Test the ord/chr functions
|
||||
|
||||
print "1..10\n";
|
||||
|
||||
use Unicode::String qw(latin1 uchr utf16);
|
||||
|
||||
$u = uchr(ord("å"));
|
||||
|
||||
print $u->ord, "\n", ord("å"), "\n";
|
||||
|
||||
|
||||
print "not " unless $u->ord == ord("å");
|
||||
print "ok 1\n";
|
||||
|
||||
print "not " unless latin1("\0")->ord == 0 && latin1("A")->ord == 65;
|
||||
print "ok 2\n";
|
||||
|
||||
print "not " unless uchr(0xFFFF)->ord == 0xFFFF;
|
||||
print "ok 3\n";
|
||||
|
||||
# Test surrogates
|
||||
|
||||
$u = uchr(0x10000);
|
||||
print $u->hex, "\n";
|
||||
|
||||
print "not " unless $u->ord == 0x10000;
|
||||
print "ok 4\n";
|
||||
|
||||
$u = uchr(0x10FFFF);
|
||||
print $u->hex, "\n";
|
||||
print "not " unless $u->ord == 0x10FFFF;
|
||||
print "ok 5\n";
|
||||
|
||||
$u = utf16("\xd8\x01\xdc\01");
|
||||
print $u->hex, "\n";
|
||||
print "not " unless $u->ord == 0x10401;
|
||||
print "ok 6\n";
|
||||
|
||||
# Try $u->ord in array context
|
||||
|
||||
@ord = latin1("perl")->ord;
|
||||
|
||||
print "not " unless "@ord" eq "112 101 114 108";
|
||||
print "ok 7\n";
|
||||
|
||||
$u = utf16("\0a\xd8\x01\xdc\01\0b");
|
||||
print $u->hex, "\n";
|
||||
@ord = map { sprintf("U+%04x", $_) } $u->ord;
|
||||
print "@ord\n";
|
||||
|
||||
print "not " unless "@ord" eq "U+0061 U+10401 U+0062";
|
||||
print "ok 8\n";
|
||||
|
||||
# Try some illegal stuff
|
||||
$u = utf16("\0a\xdc\01\xd8\x01\0b"); # reversed surrogate
|
||||
print $u->hex, "\n";
|
||||
|
||||
print "not " unless $u->ord == ord("a");
|
||||
print "ok 9\n";
|
||||
|
||||
{
|
||||
local($SIG{__WARN__}) = sub {};
|
||||
@ord = map { sprintf("U+%04x", $_) } $u->ord;
|
||||
}
|
||||
|
||||
print "@ord\n";
|
||||
|
||||
print "not " unless "@ord" eq "U+0061 U+0062";
|
||||
print "ok 10\n";
|
||||
|
|
@ -1,32 +0,0 @@
|
|||
print "1..3\n";
|
||||
|
||||
use Unicode::String qw(latin1);
|
||||
|
||||
Unicode::String::stringify_as("hex");
|
||||
|
||||
$u = latin1("gjøk");
|
||||
|
||||
print $u->hex, "\n";
|
||||
|
||||
print "not " unless "$u" eq "U+0067 U+006a U+00f8 U+006b";
|
||||
print "ok 1\n";
|
||||
|
||||
Unicode::String::stringify_as("utf8");
|
||||
|
||||
print $u->hex, "\n";
|
||||
$str = "$u";
|
||||
|
||||
|
||||
print "$str\n";
|
||||
|
||||
print "not " unless "$u" eq "gjøk";
|
||||
print "ok 2\n";
|
||||
|
||||
eval {
|
||||
Unicode::String::stringify_as("xyzzy");
|
||||
};
|
||||
|
||||
print $@;
|
||||
print "not " unless $@;
|
||||
print "ok 3\n";
|
||||
|
Loading…
Reference in New Issue