#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-2.0-only
"""Generate NSS-compatible BMP Unicode case tables from UnicodeData.txt.

The generated C file exports the NSS symbol names expected by the imported
Unicode helpers:

    unicode_t NSSUniToLower[65536]
    unicode_t NSSUniToUpper[65536]

Only single-code-point BMP mappings fit into these NSS tables. Full Unicode
case mappings and locale-sensitive SpecialCasing.txt entries are intentionally
not encoded here.
"""

from __future__ import annotations

import argparse
from pathlib import Path

# Number of code points in the Basic Multilingual Plane (U+0000..U+FFFF).
# Every generated table has exactly this many entries.
BMP_SIZE = 0x10000

# Minimum number of ';'-separated fields a UnicodeData.txt record must have.
_MIN_FIELDS = 15
# Zero-based positions of the simple case-mapping fields read by the parser.
_UPPER_FIELD = 12
_LOWER_FIELD = 13
# Number of table entries written per line of the C initializer.
_VALUES_PER_ROW = 8


def parse_unicode_data(path: Path) -> tuple[list[int], list[int], dict[str, int]]:
    """Read simple BMP case mappings from a UnicodeData.txt file.

    Args:
        path: Location of UnicodeData.txt.

    Returns:
        A ``(lower, upper, stats)`` tuple. ``lower`` and ``upper`` are lists
        of ``BMP_SIZE`` integers indexed by code point; entries without a
        mapping hold the code point itself. ``stats`` counts the records
        seen, the mappings stored in each table, and the records skipped
        because their code point lies outside the BMP.

    Raises:
        ValueError: If a record has fewer than 15 fields or a code point
            field is not valid hexadecimal.
    """
    # Start from identity tables so unmapped code points map to themselves.
    lower = list(range(BMP_SIZE))
    upper = list(range(BMP_SIZE))
    stats = {
        "records": 0,
        "lower_mappings": 0,
        "upper_mappings": 0,
        "non_bmp_skipped": 0,
    }
    # Upper is handled before lower for every record.
    targets = (
        (_UPPER_FIELD, upper, "upper_mappings"),
        (_LOWER_FIELD, lower, "lower_mappings"),
    )

    with path.open("r", encoding="utf-8") as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line or line.startswith("#"):
                continue

            fields = line.split(";")
            if len(fields) < _MIN_FIELDS:
                raise ValueError(f"Malformed UnicodeData line: {line!r}")

            cp = int(fields[0], 16)
            stats["records"] += 1
            if cp >= BMP_SIZE:
                stats["non_bmp_skipped"] += 1
                continue

            for index, table, stat_key in targets:
                field = fields[index]
                if not field:
                    continue
                target = int(field, 16)
                # A mapping whose target is outside the BMP cannot be stored
                # in the table, so the identity entry is left in place.
                if target < BMP_SIZE:
                    table[cp] = target
                    stats[stat_key] += 1

    return lower, upper, stats


def emit_array(out, c_type: str, name: str, values: list[int]) -> None:
    """Write one C array definition holding ``values`` to ``out``.

    Each row carries eight values formatted as four-digit uppercase hex,
    followed by a comment giving the index of the row's first entry.

    Args:
        out: Writable text stream; only its ``write`` method is used.
        c_type: C element type placed before the array name.
        name: C identifier of the array.
        values: Table contents; must contain exactly ``BMP_SIZE`` entries.

    Raises:
        ValueError: If ``values`` does not contain ``BMP_SIZE`` entries. The
            declared array size is fixed, so any other length would produce
            a malformed initializer.
    """
    if len(values) != BMP_SIZE:
        raise ValueError(
            f"{name}: expected {BMP_SIZE} values, got {len(values)}"
        )

    out.write(f"{c_type} {name}[{BMP_SIZE}] = {{\n")
    last_base = BMP_SIZE - _VALUES_PER_ROW
    for base in range(0, BMP_SIZE, _VALUES_PER_ROW):
        row = ", ".join(
            f"0x{value:04X}" for value in values[base:base + _VALUES_PER_ROW]
        )
        # Every row except the last ends with a separating comma.
        separator = "," if base < last_base else ""
        out.write(f"\t{row}{separator}\t/* 0x{base:04X} */\n")
    out.write("};\n")


def generate(ucd_dir: Path, output: Path, unicode_version: str) -> None:
    """Generate the C source file containing both case tables.

    Args:
        ucd_dir: Directory that contains UnicodeData.txt.
        output: Path of the C file to write. Missing parent directories are
            created and an existing file is overwritten.
        unicode_version: Version string recorded in the file header comment.

    Raises:
        FileNotFoundError: If ``ucd_dir`` has no UnicodeData.txt.
    """
    unicode_data = ucd_dir / "UnicodeData.txt"
    if not unicode_data.exists():
        raise FileNotFoundError(unicode_data)

    lower, upper, stats = parse_unicode_data(unicode_data)

    output.parent.mkdir(parents=True, exist_ok=True)
    # newline="\n" keeps the generated file LF-terminated on every platform.
    with output.open("w", encoding="utf-8", newline="\n") as out:
        out.write("/*\n")
        out.write(" * Generated by scripts/gen_unicode_tables.py.\n")
        out.write(f" * Source: Unicode Character Database {unicode_version}.\n")
        out.write(" * Input: UCD/UnicodeData.txt simple uppercase/lowercase mappings.\n")
        out.write(" *\n")
        out.write(" * This file intentionally does not copy Novell NSS unitables/*.tab.\n")
        out.write(" * It exports NSS-compatible symbol names from Unicode UCD data.\n")
        out.write(" *\n")
        out.write(" * Only single-code-point BMP mappings fit these NSS tables. Full,\n")
        out.write(" * multi-code-point, and locale-sensitive mappings from SpecialCasing.txt\n")
        out.write(" * do not fit unicode_t[65536] tables and are not emitted here.\n")
        out.write(" */\n")
        # NOTE(review): this directive is emitted without a header name, which
        # is not valid C. The name of the header that declares unicode_t looks
        # like it was lost; confirm the intended header and restore it here.
        out.write("#include \n\n")
        out.write(f"/* UnicodeData records: {stats['records']}; non-BMP skipped: {stats['non_bmp_skipped']}. */\n")
        out.write(f"/* Simple BMP lower mappings: {stats['lower_mappings']}; upper mappings: {stats['upper_mappings']}. */\n\n")
        emit_array(out, "unicode_t", "NSSUniToLower", lower)
        out.write("\n")
        emit_array(out, "unicode_t", "NSSUniToUpper", upper)


def main() -> None:
    """Parse the command-line options and write the table file."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--ucd-dir", default="UCD", type=Path)
    parser.add_argument("--output", default="TAB/unicodeTables.c", type=Path)
    parser.add_argument("--unicode-version", default="17.0.0")
    args = parser.parse_args()
    generate(args.ucd_dir, args.output, args.unicode_version)


if __name__ == "__main__":
    main()