105 lines
2.8 KiB
Perl
Executable File
105 lines
2.8 KiB
Perl
Executable File
#!/usr/bin/perl
|
|
#
|
|
# Create a character mapping for GB2312 encoding.
|
|
# usage: mkCSGB2312.pl
|
|
#
|
|
# Requires the map file GB2312.TXT (mapping actually GB2312-80) in the
|
|
# current directory, produces the map file CSGB2312.TXT
|
|
#
|
|
# Copyright (C) 2000 Martin Schwartz. All rights reserved.
|
|
# This program is free software; you can redistribute it and/or
|
|
# modify it under the same terms as Perl itself.
|
|
#
|
|
# Contact: Martin Schwartz <martin@nacho.de>
|
|
#
|
|
|
|
# Header comment block for the generated map file; _createInfo prints it
# verbatim to CSGB2312.TXT ahead of the mapping tables.
# The heredoc is single-quoted: the text contains nothing to interpolate
# and must be emitted exactly as written.
my $info = <<'END';
#
# GB2312 to Unicode table; a mixed one byte, two byte mapping.
#
# NOTE: This file is generated automatically from GB2312.TXT by mkCSGB2312
# It is constructed from the mappings of:
#
# - ISO8859-1 characters 0x0000 .. 0x00FF
#
# - GB2312-80 characters in EUC form.
#
# Actually GB2312 should not incorporate the whole ISO8859-1 set, but only the
# Unicode characters 0x0020 to 0x007f. World's usage is different...
# As an effect of this a round trip conversion GB2312 -> UTF16 -> GB2312 will
# produce differences if the original GB2312 encoding contains one or more
# of these ISO-8859-1 one byte characters:
#
# 0xA4, 0xA7, 0xA8, 0xB0, 0xB1, 0xD7, 0xE0, 0xE1, 0xE8, 0xE9,
# 0xEA, 0xEC, 0xED, 0xF2, 0xF3, 0xF7, 0xF9, 0xFA, 0xFC
#
# Anyway these differences shouldn't cause rendering problems, since the
# translation back to GB2312 for these characters will utilize an original
# character of set GB2312-80.
#
# martin [2000-Jun-19]
#
END
|
|
|
|
use strict;
|
|
|
|
# Script entry point: announce the job, run the generation steps, then
# report where the result was saved.
main: {
    print "Creating GB2312 encoding, based on GB2312-80 encoding.\n";

    # The steps run strictly in this order: open both files, write the
    # header, write the mapping tables, close both files.
    for my $step ( \&_open, \&_createInfo, \&_createMapping, \&_close ) {
        $step->();
    }

    print "Done. Saved as CSGB2312.TXT\n";
}
|
|
|
|
# Open the input map GB2312.TXT for reading and create (or truncate) the
# output map CSGB2312.TXT for writing, both in the current directory.
# Dies with the system error text if either file cannot be opened.
#
# The handles stay package-level barewords (GB2312, CSGB2312) because
# the other subs in this file use them by name.
sub _open {
    # Three-argument open keeps the mode separate from the filename.
    open( GB2312, '<', 'GB2312.TXT' )
        or die "Can't open input file GB2312.TXT! ($!)";

    open( CSGB2312, '>', 'CSGB2312.TXT' )
        or die "Can't open output file CSGB2312.TXT! ($!)";
}
|
|
|
|
# Write the explanatory header text held in $info to the output map.
# Expects the CSGB2312 handle to be open for writing already.
sub _createInfo {
    print {*CSGB2312} $info;
}
|
|
|
|
# Write the two mapping tables to the CSGB2312 handle:
#   1. every one-byte code 0x00..0xff mapped to the same Unicode value;
#   2. every GB2312-80 entry read from the GB2312 handle, with the code
#      converted to EUC form.
# Expects both handles to be open already. Input lines that do not start
# with "0x" (comments, blanks) are skipped.
sub _createMapping {
    print CSGB2312 "\n# ISO-8859-1 characters (0x0000-0x00ff):\n\n";
    for my $code ( 0x00 .. 0xff ) {
        printf CSGB2312 "0x%02x\t0x%04x\n", $code, $code;
    }

    # Disabled alternative, kept for reference: emit only the one-byte
    # codes listed below instead of the whole 0x00..0xff range.
    # print CSGB2312 "\n\n# Unambiguous ISO-8859-1 characters:\n\n";
    # for (
    #     0x80..0xa3, 0xa5..0xa6, 0xa9..0xaf, 0xb2..0xd6,
    #     0xd8..0xdf, 0xe2..0xe7, 0xeb, 0xee..0xf1, 0xf4..0xf6,
    #     0xf8, 0xfb, 0xfd, 0xfe, 0xff
    # ) {
    #     printf CSGB2312 "0x%02x\t0x%04x\n", $_, $_;
    # }

    print CSGB2312 "\n\n# GB2312-80 characters:\n\n";
    while ( my $line = <GB2312> ) {
        next unless $line =~ /^0x/i;

        my ( $gb, $uni ) = $line =~ /(0x....)\s+(0x....)/;
        next unless $gb && $uni;

        # Setting the high bit of both bytes gives the EUC form.
        my $euc = hex($gb) | 0x8080;

        # $uni comes from the input file, so it is passed as an argument
        # rather than interpolated into the format: a '%' in the data
        # must not be read as a conversion specifier.
        printf CSGB2312 "0x%04x\t%s\n", $euc, $uni;
    }

    print CSGB2312 "\n# End of file\n";
}
|
|
|
|
# Close both map files, dying with the system error text if either close
# fails. Each message names the file that belongs to the failing handle.
sub _close {
    # CSGB2312 is the handle written to, i.e. the output file.
    close CSGB2312
        or die "Can't close output file CSGB2312.TXT! ($!)";

    # GB2312 is the handle read from, i.e. the input file.
    close GB2312
        or die "Can't close input file GB2312.TXT! ($!)";
}
|
|
|