unicodeTables: generate NSS-compatible BMP case tables

2026-06-12 15:13:27 +00:00
parent 7f127fda65
commit 9a596b06c0
4 changed files with 16555 additions and 0 deletions
--- a/LICENSES/README.md
+++ b/LICENSES/README.md
@@ -0,0 +1,8 @@
+# Licenses
+
+The `UCD/` directory contains Unicode Character Database 17.0.0 data files.
+See `UCD/ReadMe.txt` and the Unicode terms of use referenced there.
+
+Generated files under `TAB/` are derived from those UCD data files and keep the
+same upstream data provenance. MARS-NWE consumes them as generated C data, not
+as copied Novell NSS unitables.
--- a/README.md
+++ b/README.md
@@ -0,0 +1,28 @@
+# mars-unicode-tables
+
+Unicode Character Database input and generated C tables for MARS-NWE.
+
+This repository vendors the complete Unicode Character Database 17.0.0 under
+`UCD/` and generates NSS-compatible BMP case mapping tables under `TAB/`.
+The generated symbols are intended for MARS-NWE `libnwcore`:
+
+- `NSSUniToLower[65536]`
+- `NSSUniToUpper[65536]`
+
+The generated tables are based on Unicode UCD data, not on Novell NSS
+`shared/sdk/unitables/*.tab` files. The Novell files may be consulted only as
+table-layout or compatibility references in the MARS-NWE work tree.
+
+## Regeneration
+
+```sh
+scripts/gen_unicode_tables.py \
+  --ucd-dir UCD \
+  --output TAB/unicodeTables.c \
+  --unicode-version 17.0.0
+```
+
+The NSS table ABI is `unicode_t[65536]`, so the generator emits only simple
+single-code-point BMP mappings from `UnicodeData.txt`. Full, multi-code-point,
+and locale-sensitive mappings from `SpecialCasing.txt` cannot be represented in
+these arrays and are intentionally not emitted.
--- a/TAB/unicodeTables.c
+++ b/TAB/unicodeTables.c
--- a/scripts/gen_unicode_tables.py
+++ b/scripts/gen_unicode_tables.py
@@ -0,0 +1,113 @@
+#!/usr/bin/env python3
+"""Generate NSS-compatible BMP Unicode case tables from UnicodeData.txt.
+
+The generated C file exports the NSS symbol names expected by the imported
+Unicode helpers:
+
+    unicode_t NSSUniToLower[65536]
+    unicode_t NSSUniToUpper[65536]
+
+Only single-code-point BMP mappings fit into these NSS tables. Full Unicode
+case mappings and locale-sensitive SpecialCasing.txt entries are intentionally
+not encoded here.
+"""
+from __future__ import annotations
+
+import argparse
+from pathlib import Path
+
+# Number of code points in the Basic Multilingual Plane (U+0000..U+FFFF).
+# It is the length of each table built by the parser and the exclusive upper
+# bound applied to both code points and mapping targets.
+BMP_SIZE = 0x10000
+
+
+def parse_unicode_data(path: Path) -> tuple[list[int], list[int], dict[str, int]]:
+    """Collect simple BMP case mappings from a UnicodeData.txt-style file.
+
+    Both tables begin as identity maps over ``range(BMP_SIZE)``. A slot is
+    overwritten only when the record's code point and its mapping target are
+    both below ``BMP_SIZE``; records at or above ``BMP_SIZE`` are counted and
+    skipped.
+
+    Args:
+        path: UTF-8 text file of semicolon-separated records. Field 0 is the
+            hexadecimal code point; fields 12 and 13 are read as the
+            uppercase and lowercase mapping targets. Blank lines and lines
+            starting with ``#`` are ignored.
+
+    Returns:
+        ``(lower, upper, stats)``: the two ``BMP_SIZE``-entry tables plus
+        counters keyed ``records``, ``lower_mappings``, ``upper_mappings``
+        and ``non_bmp_skipped``.
+
+    Raises:
+        ValueError: If a record has fewer than 15 fields or a hexadecimal
+            field cannot be parsed.
+    """
+    to_lower = list(range(BMP_SIZE))
+    to_upper = list(range(BMP_SIZE))
+    record_count = 0
+    lower_count = 0
+    upper_count = 0
+    skipped_count = 0
+
+    with path.open("r", encoding="utf-8") as handle:
+        for raw in handle:
+            entry = raw.strip()
+            if entry == "" or entry.startswith("#"):
+                continue
+
+            columns = entry.split(";")
+            if len(columns) < 15:
+                raise ValueError(f"Malformed UnicodeData line: {entry!r}")
+
+            code_point = int(columns[0], 16)
+            record_count += 1
+            if code_point >= BMP_SIZE:
+                # The tables only have slots for BMP code points.
+                skipped_count += 1
+                continue
+
+            upper_hex, lower_hex = columns[12], columns[13]
+
+            # Uppercase is handled before lowercase so a bad hex field fails
+            # in the same order as the fields appear in the record.
+            if upper_hex:
+                mapped = int(upper_hex, 16)
+                if mapped < BMP_SIZE:
+                    to_upper[code_point] = mapped
+                    upper_count += 1
+            if lower_hex:
+                mapped = int(lower_hex, 16)
+                if mapped < BMP_SIZE:
+                    to_lower[code_point] = mapped
+                    lower_count += 1
+
+    summary = {
+        "records": record_count,
+        "lower_mappings": lower_count,
+        "upper_mappings": upper_count,
+        "non_bmp_skipped": skipped_count,
+    }
+    return to_lower, to_upper, summary
+
+
+def emit_array(out, c_type: str, name: str, values: list[int]) -> None:
+    """Write ``values`` to ``out`` as an initialised C array definition.
+
+    The declared element count and the row loop are both derived from
+    ``len(values)``, so the declaration always matches the number of
+    initialisers actually written. For a 65536-entry list the output is
+    identical to a hard-coded ``[65536]`` declaration.
+
+    Args:
+        out: Text sink; only its ``write(str)`` method is used.
+        c_type: C element type placed before the array name.
+        name: C identifier of the array.
+        values: Entries to emit, eight per row, each formatted as ``0x``
+            followed by at least four uppercase hexadecimal digits.
+
+    Raises:
+        ValueError: If ``values`` is empty, because there would be no
+            initialisers to write.
+    """
+    per_row = 8
+    count = len(values)
+    if count == 0:
+        raise ValueError(f"No values to emit for {name}")
+
+    out.write(f"{c_type} {name}[{count}] = {{\n")
+    for base in range(0, count, per_row):
+        row = ", ".join(f"0x{value:04X}" for value in values[base:base + per_row])
+        # Every row except the last needs a separator before its offset comment.
+        separator = "," if base + per_row < count else ""
+        out.write(f"\t{row}{separator}\t/* 0x{base:04X} */\n")
+    out.write("};\n")
+
+
+def generate(ucd_dir: Path, output: Path, unicode_version: str) -> None:
+    """Write the C source file that defines both case tables.
+
+    Args:
+        ucd_dir: Directory expected to contain ``UnicodeData.txt``.
+        output: Destination C file; missing parent directories are created.
+        unicode_version: Version text copied into the generated header
+            comment. It does not influence the table contents.
+
+    Raises:
+        FileNotFoundError: If ``UnicodeData.txt`` does not exist under
+            ``ucd_dir``.
+    """
+    source = ucd_dir / "UnicodeData.txt"
+    if not source.exists():
+        raise FileNotFoundError(source)
+
+    # Parse before touching the output path so a bad input leaves it alone.
+    lower, upper, stats = parse_unicode_data(source)
+
+    output.parent.mkdir(parents=True, exist_ok=True)
+    with output.open("w", encoding="utf-8", newline="\n") as sink:
+        banner = (
+            "/*\n"
+            " * Generated by scripts/gen_unicode_tables.py.\n"
+            f" * Source: Unicode Character Database {unicode_version}.\n"
+            " * Input: UCD/UnicodeData.txt simple uppercase/lowercase mappings.\n"
+            " *\n"
+            " * This file intentionally does not copy Novell NSS unitables/*.tab.\n"
+            " * It exports NSS-compatible symbol names from Unicode UCD data.\n"
+            " *\n"
+            " * Only single-code-point BMP mappings fit these NSS tables. Full,\n"
+            " * multi-code-point, and locale-sensitive mappings from SpecialCasing.txt\n"
+            " * do not fit unicode_t[65536] tables and are not emitted here.\n"
+            " */\n"
+            "#include <xUnicode.h>\n\n"
+            f"/* UnicodeData records: {stats['records']}; non-BMP skipped: {stats['non_bmp_skipped']}. */\n"
+            f"/* Simple BMP lower mappings: {stats['lower_mappings']}; upper mappings: {stats['upper_mappings']}. */\n\n"
+        )
+        sink.write(banner)
+        emit_array(sink, "unicode_t", "NSSUniToLower", lower)
+        sink.write("\n")
+        emit_array(sink, "unicode_t", "NSSUniToUpper", upper)
+
+
+def main() -> None:
+    """Parse the command-line options and run the generator with them."""
+    cli = argparse.ArgumentParser(description=__doc__)
+    option_specs = (
+        ("--ucd-dir", {"default": "UCD", "type": Path}),
+        ("--output", {"default": "TAB/unicodeTables.c", "type": Path}),
+        ("--unicode-version", {"default": "17.0.0"}),
+    )
+    for flag, settings in option_specs:
+        cli.add_argument(flag, **settings)
+
+    opts = cli.parse_args()
+    generate(
+        ucd_dir=opts.ucd_dir,
+        output=opts.output,
+        unicode_version=opts.unicode_version,
+    )
+
+
+# Run the command-line entry point only when this file is executed as a
+# script, not when it is imported as a module.
+if __name__ == "__main__":
+    main()