#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-2.0-only
"""Generate compact MARS-NWE codepage mapping tables from Unicode MAPPINGS.

The input files are Unicode.org mapping files kept under MAPPINGS/.  The
generated output intentionally does not copy Novell NSS unitables directory
files.  It emits a compact source-path-preserving descriptor format that
MARS-NWE can use to build NSS-compatible converter tables.

Only direct one-code-unit mappings are emitted:

    encoded byte/code value -> single BMP Unicode code point

Composite source sequences such as 0xA1+0xE9, directional pseudo-mappings such
as +0x0020, Unicode multi-codepoint targets such as 0x00FC+0xF87F, non-BMP
Unicode targets, and source codes wider than 16 bits are skipped and counted.
"""

from __future__ import annotations

import argparse
import re
from dataclasses import dataclass
from pathlib import Path

# A bare hexadecimal literal such as 0x00E9 (no '+' sequences, no tags).
HEX = re.compile(r"^0x[0-9A-Fa-f]+$")


@dataclass(frozen=True)
class MappingFile:
    """Parsed contents and bookkeeping counters for one mapping file."""

    path: Path  # input file as discovered on disk
    symbol: str  # C identifier prefix used for the generated pair array
    name: str  # lookup name: path relative to the mappings root, no suffix
    source: str  # path relative to the parent of the mappings root
    max_code_bytes: int  # 1 if every source code fits in a byte, else 2
    pairs: tuple[tuple[int, int], ...]  # (source code, Unicode), sorted
    skipped_composite: int  # records with '+' sequences or non-hex fields
    skipped_wide_source: int  # records whose source code exceeds 0xFFFF
    skipped_non_bmp: int  # records whose Unicode value exceeds 0xFFFF
    duplicate_sources: int  # later records repeating an already-seen source


def sanitize_component(text: str) -> str:
    """Turn one path component into an upper-case C identifier fragment.

    Every run of characters other than ASCII letters and digits collapses to
    a single underscore, and leading/trailing underscores are dropped.
    Non-ASCII alphanumerics are treated as separators as well, because the
    result is pasted into generated C identifiers.  Returns "ROOT" when
    nothing usable remains.
    """
    fragment = re.sub(r"[^0-9A-Za-z]+", "_", text).strip("_").upper()
    return fragment or "ROOT"


def symbol_for(path: Path, mappings_root: Path) -> str:
    """Return the C symbol prefix for *path*, derived from its relative path."""
    rel = path.relative_to(mappings_root)
    parts = [sanitize_component(p) for p in rel.parts]
    # The ".TXT"/".txt" suffix sanitizes to "_TXT"; drop it from the symbol.
    if parts[-1].endswith("_TXT"):
        parts[-1] = parts[-1][:-4]
    return "MARS_UNICODE_CODEPAGE_" + "_".join(parts)


def name_for(path: Path, mappings_root: Path) -> str:
    """Return the '/'-joined relative path of *path* without its suffix."""
    rel = path.relative_to(mappings_root)
    stem_parts = list(rel.parts)
    stem_parts[-1] = Path(stem_parts[-1]).stem
    return "/".join(stem_parts)


def parse_mapping_file(path: Path, mappings_root: Path) -> MappingFile | None:
    """Parse one mapping file into a MappingFile, or None if it has no pairs.

    Each line is cut at the first '#', and the first two whitespace-separated
    fields are read as source code and Unicode value.  Records are skipped
    (and counted) when either field contains '+' or is not a plain hex
    literal, when the source exceeds 0xFFFF, or when the Unicode value
    exceeds 0xFFFF.  The first mapping seen for a source code wins; later
    ones are counted as duplicates.

    NOTE(review): only the first two columns are consulted.  An input file
    that carries more than two code columns would have its second column
    taken as the Unicode value -- confirm no such file is under the root.
    """
    pairs: dict[int, int] = {}
    skipped_composite = 0
    skipped_wide_source = 0
    skipped_non_bmp = 0
    duplicate_sources = 0

    # Undecodable bytes are replaced rather than raising on decode.
    text = path.read_text(encoding="utf-8", errors="replace")
    for line in text.splitlines():
        data = line.split("#", 1)[0].strip()
        if not data:
            continue
        fields = data.split()
        if len(fields) < 2:
            # e.g. a source code with no mapping column before the comment.
            continue
        src_text, uni_text = fields[0], fields[1]
        if (
            "+" in src_text
            or "+" in uni_text
            or not HEX.match(src_text)
            or not HEX.match(uni_text)
        ):
            skipped_composite += 1
            continue
        src = int(src_text, 16)
        uni = int(uni_text, 16)
        if src > 0xFFFF:
            skipped_wide_source += 1
            continue
        if uni > 0xFFFF:
            skipped_non_bmp += 1
            continue
        if src in pairs:
            duplicate_sources += 1
            continue
        pairs[src] = uni

    if not pairs:
        return None

    max_code_bytes = 1 if max(pairs) <= 0xFF else 2
    return MappingFile(
        path=path,
        symbol=symbol_for(path, mappings_root),
        name=name_for(path, mappings_root),
        # as_posix() keeps '/' separators on every platform; the value is
        # written into a C string literal, where '\' would start an escape.
        source=path.relative_to(mappings_root.parent).as_posix(),
        max_code_bytes=max_code_bytes,
        pairs=tuple(sorted(pairs.items())),
        skipped_composite=skipped_composite,
        skipped_wide_source=skipped_wide_source,
        skipped_non_bmp=skipped_non_bmp,
        duplicate_sources=duplicate_sources,
    )


def discover(mappings_root: Path) -> list[MappingFile]:
    """Find and parse every usable *.TXT / *.txt file under *mappings_root*.

    Files that yield no pairs are left out.  The result is ordered by path.
    """

    def is_generated_input(path: Path) -> bool:
        parts = {part.lower() for part in path.parts}
        if path.name.lower().startswith("readme"):
            return False
        # WindowsBestFit files are Unicode-to-codepage fallback data, not
        # direct encoded-byte -> Unicode mapping tables. Keep them as source
        # material in the repo, but do not emit byte-to-Unicode descriptors.
        if "windowsbestfit" in parts:
            return False
        # DatedVersions are historical snapshots; keep the current top-level
        # mapping as the generated table input.
        if "datedversions" in parts:
            return False
        return True

    # Collect into a set: where glob matching ignores case, both patterns
    # return the same file, which would otherwise be emitted twice.
    candidates = {
        p
        for pattern in ("*.TXT", "*.txt")
        for p in mappings_root.rglob(pattern)
        if is_generated_input(p)
    }

    mappings: list[MappingFile] = []
    for path in sorted(candidates):
        parsed = parse_mapping_file(path, mappings_root)
        if parsed:
            mappings.append(parsed)
    return mappings


def emit_header(path: Path) -> None:
    """Write the generated C header declaring the table types and lookup."""
    path.write_text(
        "/*\n"
        " * Generated interface for MARS-NWE Unicode codepage mapping tables.\n"
        " *\n"
        " * Source data: Unicode.org Public/MAPPINGS.\n"
        " * Do not replace these generated descriptors with Novell NSS unitables directory files.\n"
        " */\n"
        "#ifndef MARS_UNICODE_CODEPAGE_TABLES_H\n"
        "#define MARS_UNICODE_CODEPAGE_TABLES_H\n"
        "\n"
        # size_t comes from <stddef.h>, the uintN_t types from <stdint.h>.
        "#include <stddef.h>\n"
        "#include <stdint.h>\n"
        "\n"
        "#ifdef __cplusplus\n"
        "extern \"C\" {\n"
        "#endif\n"
        "\n"
        "typedef struct MARSUnicodeCodePagePair_s {\n"
        "\tuint16_t code;\n"
        "\tuint16_t unicode;\n"
        "} MARSUnicodeCodePagePair_t;\n"
        "\n"
        "typedef struct MARSUnicodeCodePage_s {\n"
        "\tconst char *name;\n"
        "\tconst char *source;\n"
        "\tuint8_t max_code_bytes;\n"
        "\tuint32_t pair_count;\n"
        "\tconst MARSUnicodeCodePagePair_t *pairs;\n"
        "} MARSUnicodeCodePage_t;\n"
        "\n"
        "extern const MARSUnicodeCodePage_t MARSUnicodeCodePages[];\n"
        "extern const size_t MARSUnicodeCodePageCount;\n"
        "\n"
        "const MARSUnicodeCodePage_t *MARSUnicodeFindCodePage(const char *name);\n"
        "\n"
        "#ifdef __cplusplus\n"
        "}\n"
        "#endif\n"
        "\n"
        "#endif /* MARS_UNICODE_CODEPAGE_TABLES_H */\n",
        encoding="utf-8",
    )


def emit_pairs(out, mapping: MappingFile) -> None:
    """Write the static C pair array for *mapping* to the text stream *out*."""
    out.write(f"static const MARSUnicodeCodePagePair_t {mapping.symbol}_pairs[] = {{\n")
    rows = [f"\t{{ 0x{src:04X}, 0x{uni:04X} }}" for src, uni in mapping.pairs]
    if rows:
        # Comma after every row except the last, one row per line.
        out.write(",\n".join(rows) + "\n")
    out.write("};\n\n")


def _c_string(text: str) -> str:
    """Return *text* as a double-quoted C string literal."""
    escaped = text.replace("\\", "\\\\").replace('"', '\\"')
    return f'"{escaped}"'


def emit_source(path: Path, header_name: str, mappings: list[MappingFile]) -> None:
    """Write the generated C source: pair arrays, the table, and the lookup."""
    with path.open("w", encoding="utf-8") as out:
        out.write(
            "/*\n"
            " * Generated by scripts/gen_codepage_tables.py.\n"
            " * Source: Unicode.org Public/MAPPINGS.\n"
            " *\n"
            " * This file intentionally does not copy Novell NSS unitables directory files.\n"
            " * It emits compact descriptors from Unicode mapping files; MARS-NWE\n"
            " * can use them to build NSS-compatible converter tables.\n"
            " *\n"
            " * Only direct source-code -> single BMP Unicode mappings are emitted.\n"
            " */\n"
            f"#include \"{header_name}\"\n"
            "\n"
            # strcmp() and NULL, used by the lookup below, need <string.h>.
            "#include <string.h>\n"
            "\n"
        )

        total_pairs = sum(len(m.pairs) for m in mappings)
        total_skipped = sum(
            m.skipped_composite + m.skipped_wide_source + m.skipped_non_bmp
            for m in mappings
        )
        out.write(f"/* Mapping files emitted: {len(mappings)}. */\n")
        out.write(f"/* Mapping pairs emitted: {total_pairs}. */\n")
        out.write(f"/* Composite/wide/non-BMP records skipped: {total_skipped}. */\n\n")

        for mapping in mappings:
            out.write(
                f"/* {mapping.name}: pairs={len(mapping.pairs)}, "
                f"bytes={mapping.max_code_bytes}, "
                f"skipped_composite={mapping.skipped_composite}, "
                f"skipped_wide_source={mapping.skipped_wide_source}, "
                f"skipped_non_bmp={mapping.skipped_non_bmp}, "
                f"duplicate_sources={mapping.duplicate_sources}. */\n"
            )
            emit_pairs(out, mapping)

        out.write("const MARSUnicodeCodePage_t MARSUnicodeCodePages[] = {\n")
        for mapping in mappings:
            out.write(
                f"\t{{ {_c_string(mapping.name)}, {_c_string(mapping.source)}, "
                f"{mapping.max_code_bytes}, {len(mapping.pairs)}, {mapping.symbol}_pairs }},\n"
            )
        out.write("};\n\n")

        out.write(
            "const size_t MARSUnicodeCodePageCount =\n"
            "\tsizeof(MARSUnicodeCodePages) / sizeof(MARSUnicodeCodePages[0]);\n\n"
            "const MARSUnicodeCodePage_t *MARSUnicodeFindCodePage(const char *name)\n"
            "{\n"
            "\tsize_t i;\n"
            "\n"
            "\tif (!name)\n"
            "\t{\n"
            "\t\treturn NULL;\n"
            "\t}\n"
            "\n"
            "\tfor (i = 0; i < MARSUnicodeCodePageCount; i++)\n"
            "\t{\n"
            "\t\tif (strcmp(MARSUnicodeCodePages[i].name, name) == 0)\n"
            "\t\t{\n"
            "\t\t\treturn &MARSUnicodeCodePages[i];\n"
            "\t\t}\n"
            "\t}\n"
            "\n"
            "\treturn NULL;\n"
            "}\n"
        )


def write_summary(path: Path, mappings: list[MappingFile]) -> None:
    """Write a Markdown table summarising every emitted mapping."""
    with path.open("w", encoding="utf-8") as out:
        out.write("# Generated codepage table summary\n\n")
        out.write("Source data: `MAPPINGS/` Unicode.org mapping files.\n\n")
        out.write("| Name | Source | Bytes | Pairs | Skipped composite | Skipped wide source | Skipped non-BMP | Duplicate sources |\n")
        out.write("| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: |\n")
        for m in mappings:
            out.write(
                f"| `{m.name}` | `{m.source}` | {m.max_code_bytes} | {len(m.pairs)} | "
                f"{m.skipped_composite} | {m.skipped_wide_source} | {m.skipped_non_bmp} | {m.duplicate_sources} |\n"
            )


def _find_symbol_collision(
    mappings: list[MappingFile],
) -> tuple[MappingFile, MappingFile] | None:
    """Return the first two mappings sharing a C symbol, or None."""
    seen: dict[str, MappingFile] = {}
    for mapping in mappings:
        earlier = seen.setdefault(mapping.symbol, mapping)
        if earlier is not mapping:
            return earlier, mapping
    return None


def main() -> None:
    """Parse arguments, discover mapping files, and write the three outputs."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--mappings-root", type=Path, default=Path("MAPPINGS"))
    parser.add_argument("--output-dir", type=Path, default=Path("TAB"))
    args = parser.parse_args()

    mappings_root = args.mappings_root
    output_dir = args.output_dir
    output_dir.mkdir(parents=True, exist_ok=True)

    mappings = discover(mappings_root)
    if not mappings:
        raise SystemExit("no mapping files found")

    # Two inputs that sanitize to the same symbol would produce duplicate C
    # definitions; stop here rather than write a source that cannot compile.
    clash = _find_symbol_collision(mappings)
    if clash is not None:
        first, second = clash
        raise SystemExit(
            f"mapping files {first.source} and {second.source} "
            f"both map to C symbol {first.symbol}"
        )

    emit_header(output_dir / "codepageTables.h")
    emit_source(output_dir / "codepageTables.c", "codepageTables.h", mappings)
    write_summary(output_dir / "codepageTables.md", mappings)


if __name__ == "__main__":
    main()