#!/usr/bin/env python3

# Run this script like so:
#
#   ./update-gtranslit.py /path/to/glibc/localedata/locales > gtranslit-data.h

import os
import sys

localedir = sys.argv[1]


# returns true if the name looks like a POSIX locale name
def looks_like_locale(name):
    name, _, variant = name.partition("@")

    if "_" not in name:
        return False

    lang, _, land = name.partition("_")

    # a two- or three-letter language code followed by a two-letter
    # territory code, eg. "sr_RS" or "ber_DZ"
    return (len(lang) == 2 or len(lang) == 3) and len(land) == 2
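
# Worked examples (illustrative, not executed):
#   looks_like_locale("sr_RS@latin")        -> True  ("sr" + "RS")
#   looks_like_locale("ber_DZ")             -> True  (three-letter language)
#   looks_like_locale("translit_combining") -> False (no ll_CC shape)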


# handles <U1234> style escapes
def unescape(string):
    chunks = []

    n = len(string)
    i = 0

    while i < n:
        start_escape = string.find("<", i)

        if start_escape == -1:
            chunks.append(string[i:])
            break

        # preserve any literal text preceding the escape
        if start_escape > i:
            chunks.append(string[i:start_escape])

        assert string[start_escape : (start_escape + 2)] == "<U"
        start_escape += 2

        end_escape = string.find(">", start_escape)
        assert end_escape != -1

        # the digits between "<U" and ">" are a hexadecimal codepoint
        chunks.append(chr(int(string[start_escape:end_escape], 16)))
        i = end_escape + 1

    return "".join(chunks)


# Checks if a string is ascii
def is_ascii(string):
    return all(ord(c) < 0x80 for c in string)


# A Mapping is a map from non-ascii strings to ascii strings.
#
# It corresponds to a sequence of one or more mapping lines:
#
#   <U00C4> "<U0041><U0308>";"<U0041><U0045>"
#
# in a file.
class Mapping:
    def __init__(self):
        self.serialised = None
        self.mapping = {}

    # Scans a string like
    #
    # <U00C4> "<U0041><U0308>";"<U0041><U0045>" % \
    # LATIN CAPITAL LETTER A WITH DIAERESIS.
    #
    # and adds the first all-ascii choice (or IGNORE) to the mapping
    # dictionary, with the origin string as the key. In the case of
    # IGNORE, stores the empty string.
    def consider_mapping_line(self, line):
        # appending " % comment" guarantees a third field, so the split
        # works even for lines that carry no comment of their own
        key, value, rest = (line + " % comment").split(maxsplit=2)

        key = unescape(key)

        for alternative in value.split(";"):
            # a quoted sequence of escapes, eg. "<U0041><U0045>"
            if alternative[0] == '"' and alternative[-1] == '"':
                unescaped = unescape(alternative[1:-1])
                if is_ascii(unescaped):
                    self.mapping[key] = unescaped
                    break

            # a bare escape, eg. <U0041>
            elif alternative[0] == "<" and alternative[-1] == ">":
                unescaped = unescape(alternative)
                if is_ascii(unescaped):
                    self.mapping[key] = unescaped
                    break

            elif alternative == "IGNORE":
                self.mapping[key] = ""
                break

    # Performs a normal dictionary merge, but ensures that there are no
    # conflicting entries between the original dictionary and the requested
    # changes
    def merge_mapping(self, changes):
        for key in changes.mapping:
            if key in self.mapping:
                assert self.mapping[key] == changes.mapping[key]

        self.mapping.update(changes.mapping)

    # Can't get much flatter...
    def get_flattened(self):
        return [self]

    def serialise(self, serialiser):
        if self.serialised is None:
            self.serialised = serialiser.add_mapping(self.mapping)

        return self.serialised
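
# Illustrative trace (not executed): given the line
#   <U00C4> "<U0041><U0308>";"<U0041><U0045>"
# consider_mapping_line() first tries "<U0041><U0308>" ("A" plus a
# combining diaeresis, not all-ascii), then settles on "<U0041><U0045>"
# and stores mapping["\u00c4"] = "AE".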


# A Chain is a sequence of mappings and chains.
#
# A chain contains another chain whenever "copy" or "include" is
# encountered in a source file.
#
# A chain contains a mapping whenever a sequence of mapping lines:
#
#   <U00C4> "<U0041><U0308>";"<U0041><U0045>"
#
# is encountered in a file.
#
# The order of lookup is reverse: later entries override earlier ones.
class Chain:
    def __init__(self, name):
        self.serialised = None
        self.name = name
        self.chain = []
        self.links = 0

        self.read_from_file(os.path.join(localedir, name))

    def read_from_file(self, filename):
        current_mapping = None
        in_lc_ctype = False
        in_translit = False

        # glibc locale files are ascii with escapes; surrogateescape
        # tolerates any stray high bytes without raising
        with open(filename, encoding="ascii", errors="surrogateescape") as fp:
            for line in fp:
                line = line.strip()

                if in_lc_ctype:
                    if line == "END LC_CTYPE":
                        break

                    if line.startswith("copy") or line.startswith("include"):
                        if current_mapping:
                            self.chain.append(current_mapping)

                        copyname = unescape(line.split('"', 3)[1])
                        copyfile = get_chain(copyname)
                        self.chain.append(copyfile)
                        copyfile.links += 1

                        current_mapping = None

                    elif line == "translit_start":
                        in_translit = True

                    elif line == "translit_end":
                        in_translit = False

                    elif in_translit and line.startswith("<U"):
                        if not current_mapping:
                            current_mapping = Mapping()

                        current_mapping.consider_mapping_line(line)

                    elif line == "" or line.startswith("%"):
                        pass

                    elif line.startswith("default_missing"):
                        # eg. "default_missing <U003F>"; not a mapping line
                        pass

                    elif in_translit:
                        # diagnostics go to stderr: stdout carries the
                        # generated header
                        print("unknown line:", line, file=sys.stderr)
                        assert False

                elif line == "LC_CTYPE":
                    in_lc_ctype = True

        if current_mapping:
            self.chain.append(current_mapping)
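
    # A minimal LC_CTYPE section of the shape parsed above (assumed,
    # following the glibc locale file format):
    #
    #   LC_CTYPE
    #   translit_start
    #   include "translit_combining";""
    #   <U00C4> "<U0041><U0308>";"<U0041><U0045>"
    #   translit_end
    #   END LC_CTYPE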

    # If there is only one link to this chain, we may as well just
    # return the contents of the chain so that they can be merged into
    # our sole parent directly. Otherwise, return ourselves.
    def get_flattened(self):
        if self.links == 1:
            return sum((item.get_flattened() for item in self.chain), [])
        else:
            return [self]

    def serialise(self, serialiser):
        if self.serialised is None:
            # Before we serialise, see if we can optimise a bit
            self.chain = sum((item.get_flattened() for item in self.chain), [])

            i = 0
            while i < len(self.chain) - 1:
                if isinstance(self.chain[i], Mapping) and isinstance(
                    self.chain[i + 1], Mapping
                ):
                    # We have two mappings in a row. Try to merge them.
                    self.chain[i].merge_mapping(self.chain[i + 1])
                    del self.chain[i + 1]
                else:
                    i += 1

            # If all that is left is one item, just serialise that directly
            if len(self.chain) == 1:
                self.serialised = self.chain[0].serialise(serialiser)
            else:
                ids = [item.serialise(serialiser) for item in self.chain]
                self.serialised = serialiser.add_chain(ids)

        return self.serialised
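
    # Illustrative example (not executed): if a chain's flattened contents
    # are two Mappings, they get merged into one and serialised directly,
    # yielding a plain mapping ID rather than a chain record.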


# Chain cache -- allows sharing of common chains
chains = {}


def get_chain(name):
    if name not in chains:
        chains[name] = Chain(name)

    return chains[name]


# Remove the country name from a locale, preserving variant
# eg: 'sr_RS@latin' -> 'sr@latin'
def remove_country(string):
    base, at, variant = string.partition("@")
    lang, _, land = base.partition("_")
    return lang + at + variant


# Packs a range as a 16-bit value: the top bit marks it as a range (so it
# cannot collide with a directly-stored character below 0x8000), the next
# three bits hold the length and the low twelve bits hold the start.
def encode_range(start, end):
    assert start <= end
    length = end - start

    assert start < 0x1000
    assert length < 0x8

    result = 0x8000 + (length << 12) + start

    assert result < 0x10000

    return result
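
# For example, encode_range(5, 7) == 0x8000 + (2 << 12) + 5 == 0xA005,
# ie. a two-character range starting at offset 5.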


def c_pair_array(array):
    return "{ " + ", ".join("{ %u, %u }" % pair for pair in array) + " };"


class Serialiser:
    def __init__(self):
        self.mappings = []
        self.chains = []
        self.locales = {}

    def add_mapping(self, mapping):
        if mapping in self.mappings:
            mapping_id = self.mappings.index(mapping)
        else:
            mapping_id = len(self.mappings)
            self.mappings.append(mapping)

        assert mapping_id < 128
        return mapping_id

    def add_chain(self, chain):
        if chain in self.chains:
            chain_id = self.chains.index(chain)
        else:
            chain_id = len(self.chains)
            self.chains.append(chain)

        assert chain_id < 128
        return 128 + chain_id

    def add_locale(self, name, item_id):
        self.locales[name] = item_id

    def add_default(self, item_id):
        self.default = item_id
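
    # Note the shared 8-bit ID space: add_mapping() hands out IDs 0..127
    # and add_chain() hands out 128..255, so the first mapping registered
    # gets ID 0 and the first chain gets ID 128.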

    def optimise_locales(self):
        # Check if all regions of a language/variant agree
        languages = list(set(remove_country(locale) for locale in self.locales))

        for language in languages:
            locales = [
                locale for locale in self.locales if remove_country(locale) == language
            ]

            item_id = self.locales[locales[0]]
            if all(self.locales[locale] == item_id for locale in locales):
                self.locales[language] = item_id
                for locale in locales:
                    del self.locales[locale]

        # Check if a variant is the same as the non-variant form
        # eg: 'de@euro' and 'de'
        for variant in list(locale for locale in self.locales if "@" in locale):
            base, _, _ = variant.partition("@")
            if base in self.locales and self.locales[base] == self.locales[variant]:
                del self.locales[variant]

        # Eliminate any entries that are just the same as the C locale
        for locale in list(self.locales):
            if self.locales[locale] == self.default:
                del self.locales[locale]
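
    # Illustrative example (not executed): if "de_AT" and "de_DE" map to
    # the same item they collapse into a single "de" entry, and if
    # "de@euro" then matches plain "de" the variant entry is dropped too.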

    def to_c(self):
        src_table = ""
        ascii_table = ""
        mappings_table = []
        mapping_ranges = []
        chains_table = []
        chain_starts = []
        locale_names = ""
        locale_index = []
        max_lookup = 0
        max_localename = 0

        for mapping in self.mappings:
            mapping_ranges.append((len(mappings_table), len(mapping)))

            for key in sorted(mapping):
                # a single character below 0x8000 is stored directly;
                # anything else becomes an encoded range into src_table
                # (range markers have the top bit set, so the two cases
                # cannot collide)
                if len(key) == 1 and ord(key[0]) < 0x8000:
                    src_range = ord(key[0])
                else:
                    existing = src_table.find(key)
                    if existing == -1:
                        start = len(src_table)
                        assert all(ord(c) <= 0x10FFFF for c in key)
                        src_table += key
                        src_range = encode_range(start, len(src_table))
                        max_lookup = max(max_lookup, len(key))
                    else:
                        src_range = encode_range(existing, existing + len(key))

                value = mapping[key]
                if len(value) == 1 and ord(value[0]) < 0x80:
                    ascii_range = ord(value[0])
                else:
                    existing = ascii_table.find(value)
                    if existing == -1:
                        start = len(ascii_table)
                        assert all(ord(c) < 0x80 for c in value)
                        ascii_table += value
                        ascii_range = encode_range(start, len(ascii_table))
                    else:
                        ascii_range = encode_range(existing, existing + len(value))

                mappings_table.append((src_range, ascii_range))

        for chain in self.chains:
            chain_starts.append(len(chains_table))

            # store in reverse (later entries override earlier ones, as
            # noted on the Chain class) and terminate with 0xff
            for item_id in reversed(chain):
                assert item_id < 0xFF
                chains_table.append(item_id)
            chains_table.append(0xFF)

        for locale in sorted(self.locales):
            max_localename = max(max_localename, len(locale))
            name_offset = len(locale_names)
            assert all(ord(c) <= 0x7F for c in locale)
            locale_names += locale + "\0"

            item_id = self.locales[locale]

            assert name_offset < 256
            assert item_id < 256
            locale_index.append((name_offset, item_id))
print("/* Generated by update-gtranslit.py */")
|
|
|
|
print("#define MAX_KEY_SIZE", max_lookup)
|
|
|
|
print("#define MAX_LOCALE_NAME", max_localename)
|
|
|
|
print(
|
|
|
|
"static const gunichar src_table[] = {",
|
|
|
|
", ".join(str(ord(c)) for c in src_table),
|
|
|
|
"};",
|
|
|
|
)
|
2022-06-29 16:02:05 +08:00
|
|
|
# cannot do this in plain ascii because of trigraphs... :(
|
2023-02-15 16:51:32 +08:00
|
|
|
print(
|
|
|
|
"static const gchar ascii_table[] = {",
|
|
|
|
", ".join(str(ord(c)) for c in ascii_table),
|
|
|
|
"};",
|
|
|
|
)
|
|
|
|
print(
|
|
|
|
"static const struct mapping_entry mappings_table[] =",
|
|
|
|
c_pair_array(mappings_table),
|
|
|
|
)
|
|
|
|
print(
|
|
|
|
"static const struct mapping_range mapping_ranges[] =",
|
|
|
|
c_pair_array(mapping_ranges),
|
|
|
|
)
|
|
|
|
print(
|
|
|
|
"static const guint8 chains_table[] = {",
|
|
|
|
", ".join(str(i) for i in chains_table),
|
|
|
|
"};",
|
|
|
|
)
|
|
|
|
print(
|
|
|
|
"static const guint8 chain_starts[] = {",
|
|
|
|
", ".join(str(i) for i in chain_starts),
|
|
|
|
"};",
|
|
|
|
)
|
|
|
|
print(
|
|
|
|
'static const gchar locale_names[] = "'
|
|
|
|
+ locale_names.replace("\0", "\\0")
|
|
|
|
+ '";'
|
|
|
|
)
|
|
|
|
print(
|
|
|
|
"static const struct locale_entry locale_index[] = ",
|
|
|
|
c_pair_array(locale_index),
|
|
|
|
)
|
|
|
|
print("static const guint8 default_item_id = %u;" % (self.default,))

    def dump(self):
        print(self.mappings)
        print(self.chains)
        print(self.locales)


locales = []
for name in os.listdir(localedir):
    if looks_like_locale(name):
        chain = get_chain(name)
        locales.append(chain)
        chain.links += 1

serialiser = Serialiser()

for locale in locales:
    serialiser.add_locale(locale.name, locale.serialise(serialiser))

# the default for the C locale: generic "i18n" transliterations plus the
# combining-character fallbacks from "translit_combining"
i18n = get_chain("i18n").serialise(serialiser)
combining = get_chain("translit_combining").serialise(serialiser)
serialiser.add_default(serialiser.add_chain([i18n, combining]))

serialiser.optimise_locales()

serialiser.to_c()