mirror of https://gitee.com/openkylin/scowl.git
133 lines
4.1 KiB
Plaintext
133 lines
4.1 KiB
Plaintext
![]() |
#!/usr/bin/perl
|
|||
|
|
|||
|
use warnings;
|
|||
|
use strict;
|
|||
|
|
|||
|
use Getopt::Long;
|
|||
|
|
|||
|
Getopt::Long::Configure ("bundling");
|
|||
|
|
|||
|
my $with_variants;
|
|||
|
my $file_names_only;
|
|||
|
my $dir;
|
|||
|
my $no_implied;
|
|||
|
my $variants_str;
|
|||
|
my $accents;
|
|||
|
|
|||
|
sub usage() {
|
|||
|
print STDERR "$0 [-v#] [<options>] <spelling categories> <size>\n";
|
|||
|
exit 1;
|
|||
|
}
|
|||
|
|
|||
|
GetOptions ("dir|d=s", \$dir,
|
|||
|
"with-variants|v=i", => \$with_variants,
|
|||
|
"variants=s", \$variants_str,
|
|||
|
"file-names-only|f", \$file_names_only,
|
|||
|
"no-implied", \$no_implied,
|
|||
|
"accents=s", \$accents) or usage();
|
|||
|
$dir = "final" unless defined $dir;
|
|||
|
|
|||
|
die "Cannot specify both --file-names-only and --accents" if $file_names_only && $accents;
|
|||
|
|
|||
|
my %spelling_map = qw(en-us american en-gb british? en-gb-ise british
|
|||
|
en-gb-ize british_z en-gb-oed british_z en-ca canadian en-au australian);
|
|||
|
$spelling_map{$_} = $_
|
|||
|
foreach (qw(english special american british canadian
|
|||
|
variant_1 variant_2 variant_3
|
|||
|
british_variant_1 british_variant_2
|
|||
|
canadian_variant_1 canadian_variant_2
|
|||
|
australian_variant_1 australian_variant_2));
|
|||
|
|
|||
|
die "Cannot specify both --with-variants and --variants" if $with_variants && $variants_str;
|
|||
|
$variants_str = "non" unless defined $variants_str || defined $with_variants;
|
|||
|
$variants_str = join(',', 'non', 1..$with_variants) if defined $with_variants;
|
|||
|
my %use_variant_level;
|
|||
|
foreach (split ',', $variants_str) {
|
|||
|
die "Invalid variant num: $_" unless /^(non|[123])$/;
|
|||
|
$use_variant_level{$_} = 1;
|
|||
|
}
|
|||
|
|
|||
|
$no_implied = 1 unless $use_variant_level{non};
|
|||
|
|
|||
|
usage unless @ARGV >= 2;
|
|||
|
|
|||
|
my $SIZE = pop @ARGV;
|
|||
|
die "Invalid size: $SIZE\n" unless $SIZE =~ /^\d\d$/;
|
|||
|
|
|||
|
my @SPS = map {my $sp = $_;
|
|||
|
$sp =~ tr/_/-/;
|
|||
|
$sp = $spelling_map{lc $sp};
|
|||
|
die "Must specify en_GB-ise or en_GB-ize\n"
|
|||
|
if $sp eq 'british?' && $use_variant_level{non};
|
|||
|
$sp = 'british' if $sp eq 'british?';
|
|||
|
die "Unknown spelling category: $_\n" unless defined $sp;
|
|||
|
$sp;} @ARGV;
|
|||
|
my %SPS;
|
|||
|
$SPS{$_} = 1 foreach (@SPS);
|
|||
|
|
|||
|
$SPS{english} = 1 unless $no_implied;
|
|||
|
$SPS{special} = 1 unless $no_implied;
|
|||
|
|
|||
|
$SPS{variant_1} = 1 if $SPS{american} && $use_variant_level{1};
|
|||
|
$SPS{variant_2} = 1 if $SPS{american} && $use_variant_level{2};
|
|||
|
$SPS{british_variant_1} = 1 if ($SPS{british} || $SPS{british_z}) && $use_variant_level{1};
|
|||
|
$SPS{british_variant_2} = 1 if ($SPS{british} || $SPS{british_z}) && $use_variant_level{2};
|
|||
|
$SPS{canadian_variant_1} = 1 if $SPS{canadian} && $use_variant_level{1};
|
|||
|
$SPS{canadian_variant_2} = 1 if $SPS{canadian} && $use_variant_level{2};
|
|||
|
$SPS{australian_variant_1} = 1 if $SPS{australian} && $use_variant_level{1};
|
|||
|
$SPS{australian_variant_2} = 1 if $SPS{australian} && $use_variant_level{2};
|
|||
|
$SPS{variant_3} = 1 if $use_variant_level{3};
|
|||
|
|
|||
|
unless ($use_variant_level{non}) {
|
|||
|
delete $SPS{$_} foreach (@SPS);
|
|||
|
}
|
|||
|
|
|||
|
opendir D, $dir or die "Unable to open dir $dir\n";
|
|||
|
|
|||
|
my @words;
|
|||
|
|
|||
|
sub deaccent($) {
|
|||
|
local $_ = $_[0];
|
|||
|
# from deaccent-toperl.cc
|
|||
|
# the line is encoded in iso-8859-1 to match the input encoding:
|
|||
|
tr/<2F><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>/AAAAAACEEEEIIIINOOOOOOUUUUYaaaaaaceeeeiiiinoooooouuuuyy/;
|
|||
|
return $_;
|
|||
|
}
|
|||
|
my $add_word;
|
|||
|
if (!defined($accents) || $accents eq 'keep') {
|
|||
|
$add_word = sub {push @words, $_[0]};
|
|||
|
} elsif ($accents eq 'strip') {
|
|||
|
$add_word = sub {push @words, deaccent($_[0])};
|
|||
|
} elsif ($accents eq 'both') {
|
|||
|
$add_word = sub {push @words, $_[0];
|
|||
|
my $new_word = deaccent($_[0]);
|
|||
|
push @words, $new_word if $new_word ne $_[0];}
|
|||
|
} else {
|
|||
|
die "--accents must be one of: keep, strip, both\n";
|
|||
|
}
|
|||
|
|
|||
|
my @files = readdir(D);
|
|||
|
|
|||
|
foreach (sort @files) {
|
|||
|
my ($sp, $type, $size) = /^(\w+)-([^.]+)\.(\d\d)/ or next;
|
|||
|
next unless $SPS{$sp};
|
|||
|
next unless $size <= $SIZE;
|
|||
|
if ($file_names_only) {
|
|||
|
print "$_\n";
|
|||
|
} else {
|
|||
|
open F, "$dir/$_" or die "Unable to open $dir/$_ for reading";
|
|||
|
while (<F>) {
|
|||
|
chop;
|
|||
|
$add_word->($_);
|
|||
|
}
|
|||
|
}
|
|||
|
}
|
|||
|
|
|||
|
my $prev = '';
|
|||
|
foreach (sort @words) {
|
|||
|
next if $_ eq $prev;
|
|||
|
print "$_\n";
|
|||
|
$prev = $_;
|
|||
|
}
|
|||
|
|