114 lines
4.2 KiB
Python
Executable File
114 lines
4.2 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""Generate NSS-compatible BMP Unicode case tables from UnicodeData.txt.

The generated C file exports the NSS symbol names expected by the imported
Unicode helpers:

    unicode_t NSSUniToLower[65536]
    unicode_t NSSUniToUpper[65536]

Only single-code-point BMP mappings fit into these NSS tables. Full Unicode
case mappings and locale-sensitive SpecialCasing.txt entries are intentionally
not encoded here.
"""
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
from pathlib import Path
|
|
|
|
BMP_SIZE = 0x10000
|
|
|
|
|
|
def _parse_code_point(text: str, line_no: int, line: str) -> int:
    """Parse one hexadecimal code point field, naming the line on failure."""
    try:
        value = int(text, 16)
    except ValueError as err:
        raise ValueError(
            f"Malformed UnicodeData line {line_no}: "
            f"invalid code point {text!r} in {line!r}"
        ) from err
    # int() also accepts a sign; a negative value would silently index the
    # tables from the end, so reject anything outside the Unicode code space.
    if not 0 <= value <= 0x10FFFF:
        raise ValueError(
            f"Malformed UnicodeData line {line_no}: "
            f"code point {text!r} out of range in {line!r}"
        )
    return value


def parse_unicode_data(path: Path) -> tuple[list[int], list[int], dict[str, int]]:
    """Build BMP simple-case tables from a UnicodeData.txt file.

    Both tables start as the identity mapping over the BMP (index == value).
    An entry is overwritten only when the record for that code point carries
    a simple uppercase mapping (field 12) or a simple lowercase mapping
    (field 13) whose target is itself below ``BMP_SIZE``.

    Args:
        path: Location of the semicolon-separated UnicodeData.txt file.

    Returns:
        ``(lower, upper, stats)``: two lists of ``BMP_SIZE`` code points and
        a dict counting the records read, the mappings stored in each table,
        and the records skipped because their code point is outside the BMP.

    Raises:
        ValueError: If a record has fewer than 15 fields, or a code point
            field is not hexadecimal or lies outside U+0000..U+10FFFF. The
            message includes the 1-based line number.
    """
    lower = list(range(BMP_SIZE))
    upper = list(range(BMP_SIZE))
    stats = {
        "records": 0,
        "lower_mappings": 0,
        "upper_mappings": 0,
        "non_bmp_skipped": 0,
    }

    with path.open("r", encoding="utf-8") as f:
        for line_no, raw in enumerate(f, start=1):
            line = raw.strip()
            if not line or line.startswith("#"):
                continue
            fields = line.split(";")
            if len(fields) < 15:
                raise ValueError(
                    f"Malformed UnicodeData line {line_no}: {line!r}"
                )
            cp = _parse_code_point(fields[0], line_no, line)
            stats["records"] += 1
            if cp >= BMP_SIZE:
                stats["non_bmp_skipped"] += 1
                continue

            # Upper first, then lower: same order as the fields in the record.
            for field, table, counter in (
                (fields[12], upper, "upper_mappings"),
                (fields[13], lower, "lower_mappings"),
            ):
                if not field:
                    continue
                target = _parse_code_point(field, line_no, line)
                # A target outside the BMP cannot be stored in the table, so
                # the identity entry is kept and nothing is counted.
                if target < BMP_SIZE:
                    table[cp] = target
                    stats[counter] += 1

    return lower, upper, stats
|
|
|
|
|
|
def emit_array(out, c_type: str, name: str, values: list[int]) -> None:
    """Write ``values`` to ``out`` as a C array definition.

    The output has the form ``<c_type> <name>[65536] = { ... };`` with eight
    ``0xXXXX`` entries per tab-indented row. Every row ends with a comment
    giving the index of its first entry; every row except the last carries a
    trailing comma before that comment.

    Args:
        out: Writable text stream; only its ``write`` method is used.
        c_type: C element type placed before the array name.
        name: C identifier of the array.
        values: Exactly ``BMP_SIZE`` integers, in index order.

    Raises:
        ValueError: If ``values`` does not contain exactly ``BMP_SIZE``
            entries (the emitted initializer would not match the declared
            array size).
    """
    if len(values) != BMP_SIZE:
        raise ValueError(
            f"{name}: expected {BMP_SIZE} table entries, got {len(values)}"
        )

    per_row = 8
    last_base = BMP_SIZE - per_row

    # The declared size comes from the same constant that drives the loop.
    out.write(f"{c_type} {name}[{BMP_SIZE}] = {{\n")
    for base in range(0, BMP_SIZE, per_row):
        text = ", ".join(f"0x{value:04X}" for value in values[base:base + per_row])
        separator = "," if base < last_base else ""
        out.write(f"\t{text}{separator}\t/* 0x{base:04X} */\n")
    out.write("};\n")
|
|
|
|
|
|
def generate(ucd_dir: Path, output: Path, unicode_version: str) -> None:
    """Read UnicodeData.txt from ``ucd_dir`` and write the C table source.

    Args:
        ucd_dir: Directory containing ``UnicodeData.txt``.
        output: Path of the C file to write; missing parent directories are
            created.
        unicode_version: Version string quoted in the generated header
            comment. It is written as given and not checked against the data.

    Raises:
        FileNotFoundError: If ``UnicodeData.txt`` is absent from ``ucd_dir``.
    """
    source = ucd_dir / "UnicodeData.txt"
    if not source.exists():
        raise FileNotFoundError(source)

    to_lower, to_upper, counts = parse_unicode_data(source)

    output.parent.mkdir(parents=True, exist_ok=True)
    # newline="\n" keeps the generated file LF-terminated on every platform.
    with output.open("w", encoding="utf-8", newline="\n") as sink:
        sink.writelines(
            [
                "/*\n",
                " * Generated by scripts/gen_unicode_tables.py.\n",
                f" * Source: Unicode Character Database {unicode_version}.\n",
                " * Input: UCD/UnicodeData.txt simple uppercase/lowercase mappings.\n",
                " *\n",
                " * This file intentionally does not copy Novell NSS unitables/*.tab.\n",
                " * It exports NSS-compatible symbol names from Unicode UCD data.\n",
                " *\n",
                " * Only single-code-point BMP mappings fit these NSS tables. Full,\n",
                " * multi-code-point, and locale-sensitive mappings from SpecialCasing.txt\n",
                " * do not fit unicode_t[65536] tables and are not emitted here.\n",
                " */\n",
                "#include <xUnicode.h>\n\n",
            ]
        )
        sink.write(
            f"/* UnicodeData records: {counts['records']}; non-BMP skipped: {counts['non_bmp_skipped']}. */\n"
        )
        sink.write(
            f"/* Simple BMP lower mappings: {counts['lower_mappings']}; upper mappings: {counts['upper_mappings']}. */\n\n"
        )

        # Lower table first, a blank line, then the upper table.
        emit_array(sink, "unicode_t", "NSSUniToLower", to_lower)
        sink.write("\n")
        emit_array(sink, "unicode_t", "NSSUniToUpper", to_upper)
|
|
|
|
|
|
def main() -> None:
    """Parse the command line and run the generator.

    Options: ``--ucd-dir`` (default ``UCD``), ``--output`` (default
    ``TAB/unicodeTables.c``) and ``--unicode-version`` (default ``17.0.0``).
    The module docstring is used as the ``--help`` description.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    option_specs = (
        ("--ucd-dir", {"default": "UCD", "type": Path}),
        ("--output", {"default": "TAB/unicodeTables.c", "type": Path}),
        ("--unicode-version", {"default": "17.0.0"}),
    )
    for flag, settings in option_specs:
        cli.add_argument(flag, **settings)

    opts = cli.parse_args()
    generate(opts.ucd_dir, opts.output, opts.unicode_version)
|
|
|
|
|
|
# Script entry point: generate the tables only when this file is executed
# directly, so importing the module has no side effects.
if __name__ == "__main__":
    main()
|