mars-unicode-tables/scripts/gen_codepage_tables.py

#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-2.0-only
"""Generate compact MARS-NWE codepage mapping tables from Unicode MAPPINGS.

The input files are Unicode.org mapping files kept under MAPPINGS/.
The generated output intentionally does not copy Novell NSS unitables directory files.
It emits a compact source-path-preserving descriptor format that MARS-NWE can
use to build NSS-compatible converter tables.

Only direct one-code-unit mappings are emitted:

    encoded byte/code value -> single BMP Unicode code point

Composite source sequences such as 0xA1+0xE9, directional pseudo-mappings such
as <LR>+0x0020, Unicode multi-codepoint targets such as 0x00FC+0xF87F, non-BMP
Unicode targets, and source codes wider than 16 bits are skipped and counted.
"""
from __future__ import annotations

import argparse
import re
from dataclasses import dataclass
from pathlib import Path

# Matches one complete "0x"-prefixed hexadecimal field (for example "0x00E9");
# anything else in a mapping column is treated as not directly convertible.
HEX = re.compile(r"^0x[0-9A-Fa-f]+$")


@dataclass(frozen=True)
class MappingFile:
    """Immutable parse result for one mapping file, built by ``parse_mapping_file``."""
    # Path of the mapping file that was read.
    path: Path
    # C identifier prefix produced by ``symbol_for``; the generated pair array
    # is named ``<symbol>_pairs``.
    symbol: str
    # Slash-separated path below the mappings root without the file extension
    # (see ``name_for``); emitted as the descriptor's lookup name.
    name: str
    # Text form of the file path relative to the parent of the mappings root.
    source: str
    # 1 when every kept source code is <= 0xFF, otherwise 2.
    max_code_bytes: int
    # (source code, Unicode code point) pairs, sorted by source code.
    pairs: tuple[tuple[int, int], ...]
    # Records dropped because a column contained "+" or was not plain 0x-hex.
    skipped_composite: int
    # Records dropped because the source code was larger than 0xFFFF.
    skipped_wide_source: int
    # Records dropped because the Unicode value was larger than 0xFFFF.
    skipped_non_bmp: int
    # Records dropped because their source code had already been mapped by an
    # earlier record in the same file.
    duplicate_sources: int


def sanitize_component(text: str) -> str:
    """Turn one path component into an upper-case C identifier fragment.

    Every run of characters outside ``[0-9A-Za-z]`` collapses into a single
    underscore, leading and trailing underscores are removed, and the
    remaining letters are upper-cased.  Only ASCII letters and digits are
    kept: ``str.isalnum()`` would also accept non-ASCII letters and digits,
    which are not portable in C identifiers, and the result of this function
    is used by ``symbol_for`` to build the names of generated C arrays.

    Args:
        text: A single path component such as ``"CP437.TXT"``.

    Returns:
        The sanitized fragment, or ``"ROOT"`` when nothing usable remains.
    """
    fragment = re.sub(r"[^0-9A-Za-z]+", "_", text).strip("_").upper()
    return fragment or "ROOT"


def symbol_for(path: Path, mappings_root: Path) -> str:
    """Derive the C symbol prefix for a mapping file.

    Each component of *path* relative to *mappings_root* goes through
    ``sanitize_component``.  A trailing ``_TXT`` on the last component (the
    sanitized file extension) is removed, and the components are joined with
    underscores behind the ``MARS_UNICODE_CODEPAGE_`` prefix.

    Raises:
        ValueError: if *path* is not located below *mappings_root*
            (from ``Path.relative_to``).
    """
    components = list(map(sanitize_component, path.relative_to(mappings_root).parts))
    leaf = components[-1]
    if leaf.endswith("_TXT"):
        components[-1] = leaf[: -len("_TXT")]
    return "MARS_UNICODE_CODEPAGE_" + "_".join(components)


def name_for(path: Path, mappings_root: Path) -> str:
    """Return the lookup name of a mapping file.

    The name is the path of *path* relative to *mappings_root*, with the
    final suffix of the file name removed and the components joined by ``/``
    regardless of the host path separator.

    Raises:
        ValueError: if *path* is not located below *mappings_root*
            (from ``Path.relative_to``).
    """
    components = path.relative_to(mappings_root).parts
    leaf_stem = Path(components[-1]).stem
    return "/".join(components[:-1] + (leaf_stem,))


def parse_mapping_file(path: Path, mappings_root: Path) -> MappingFile | None:
    """Parse one mapping file into a ``MappingFile`` descriptor.

    Everything from ``#`` to the end of a line is ignored.  Of the remaining
    whitespace-separated columns, the first is read as the encoded source
    code and the second as the Unicode value; lines with fewer than two
    columns are ignored without being counted.

    A record is kept only when both columns are plain ``0x`` hexadecimal
    numbers, the source code is at most 0xFFFF, the Unicode value is at most
    0xFFFF, and the source code has not already been seen in this file (the
    first record wins).  Every other record is counted in the matching
    ``skipped_*`` / ``duplicate_sources`` field.

    Args:
        path: Mapping file to read; must be located below *mappings_root*.
        mappings_root: Root directory used to derive the symbol, the name and
            the recorded source path.

    Returns:
        The parsed descriptor, or ``None`` when no record was kept.
    """
    pairs: dict[int, int] = {}
    skipped_composite = 0
    skipped_wide_source = 0
    skipped_non_bmp = 0
    duplicate_sources = 0

    # "utf-8-sig" drops a leading byte-order mark, which would otherwise be
    # glued to the first column of the first line and make that record look
    # like a non-hex (composite) entry.
    text = path.read_text(encoding="utf-8-sig", errors="replace")
    for line in text.splitlines():
        data = line.split("#", 1)[0].strip()
        if not data:
            continue
        fields = data.split()
        if len(fields) < 2:
            continue
        # NOTE(review): only the first two columns are read.  Confirm that no
        # input file keeps its Unicode value in a later column (three-column
        # tables would be misread here).
        src_text, uni_text = fields[0], fields[1]

        # Sequences such as 0xA1+0xE9 or <LR>+0x0020 contain characters the
        # anchored HEX pattern rejects, so one test covers composite sources,
        # multi-code-point targets and any other non-hex column.
        if not (HEX.match(src_text) and HEX.match(uni_text)):
            skipped_composite += 1
            continue

        src = int(src_text, 16)
        uni = int(uni_text, 16)
        if src > 0xFFFF:
            skipped_wide_source += 1
            continue
        if uni > 0xFFFF:
            skipped_non_bmp += 1
            continue
        if src in pairs:
            duplicate_sources += 1
            continue
        pairs[src] = uni

    if not pairs:
        return None

    max_code_bytes = 1 if max(pairs) <= 0xFF else 2
    return MappingFile(
        path=path,
        symbol=symbol_for(path, mappings_root),
        name=name_for(path, mappings_root),
        # Always use forward slashes: this text is written into a C string
        # literal, where the backslashes of a Windows path would be read as
        # escape sequences, and the output should not depend on the host OS.
        source=path.relative_to(mappings_root.parent).as_posix(),
        max_code_bytes=max_code_bytes,
        pairs=tuple(sorted(pairs.items())),
        skipped_composite=skipped_composite,
        skipped_wide_source=skipped_wide_source,
        skipped_non_bmp=skipped_non_bmp,
        duplicate_sources=duplicate_sources,
    )


def discover(mappings_root: Path) -> list[MappingFile]:
    """Find and parse every usable mapping file below *mappings_root*.

    A candidate is a regular file whose name ends in ``.txt`` in any letter
    case.  The tree is walked once and filtered by name, so each file is
    visited exactly once even where glob matching is case-insensitive, and
    directories that merely carry a ``.txt`` name are never handed to the
    parser.

    Args:
        mappings_root: Directory to search recursively.

    Returns:
        The parsed descriptors in sorted path order.  Files from which no
        record could be kept are left out.
    """
    # WindowsBestFit files are Unicode-to-codepage fallback data, not direct
    # encoded-byte -> Unicode mapping tables.  Keep them as source material in
    # the repo, but do not emit byte-to-Unicode descriptors.
    # DatedVersions are historical snapshots; keep the current top-level
    # mapping as the generated table input.
    skipped_dirs = {"windowsbestfit", "datedversions"}

    def is_generated_input(path: Path) -> bool:
        name = path.name.lower()
        if not name.endswith(".txt") or name.startswith("readme"):
            return False
        if skipped_dirs & {part.lower() for part in path.parts}:
            return False
        # Checked last because it is the only test that touches the disk.
        return path.is_file()

    files = sorted(p for p in mappings_root.rglob("*") if is_generated_input(p))
    mappings: list[MappingFile] = []
    for path in files:
        parsed = parse_mapping_file(path, mappings_root)
        if parsed is not None:
            mappings.append(parsed)
    return mappings


def emit_header(path: Path) -> None:
    """Write the fixed C header that declares the generated table interface.

    The header declares the pair and code page structures, the exported table
    array with its element count, and the lookup function.  Its content does
    not depend on the parsed mapping files.

    Args:
        path: Destination file; it is created or overwritten as UTF-8 text.
    """
    header_lines = (
        "/*",
        " * Generated interface for MARS-NWE Unicode codepage mapping tables.",
        " *",
        " * Source data: Unicode.org Public/MAPPINGS.",
        " * Do not replace these generated descriptors with Novell NSS unitables directory files.",
        " */",
        "#ifndef MARS_UNICODE_CODEPAGE_TABLES_H",
        "#define MARS_UNICODE_CODEPAGE_TABLES_H",
        "",
        "#include <stddef.h>",
        "#include <stdint.h>",
        "",
        "#ifdef __cplusplus",
        'extern "C" {',
        "#endif",
        "",
        "typedef struct MARSUnicodeCodePagePair_s {",
        "\tuint16_t code;",
        "\tuint16_t unicode;",
        "} MARSUnicodeCodePagePair_t;",
        "",
        "typedef struct MARSUnicodeCodePage_s {",
        "\tconst char *name;",
        "\tconst char *source;",
        "\tuint8_t max_code_bytes;",
        "\tuint32_t pair_count;",
        "\tconst MARSUnicodeCodePagePair_t *pairs;",
        "} MARSUnicodeCodePage_t;",
        "",
        "extern const MARSUnicodeCodePage_t MARSUnicodeCodePages[];",
        "extern const size_t MARSUnicodeCodePageCount;",
        "",
        "const MARSUnicodeCodePage_t *MARSUnicodeFindCodePage(const char *name);",
        "",
        "#ifdef __cplusplus",
        "}",
        "#endif",
        "",
        "#endif /* MARS_UNICODE_CODEPAGE_TABLES_H */",
    )
    # Every line, including the last one, is terminated by a newline.
    content = "\n".join(header_lines) + "\n"
    path.write_text(content, encoding="utf-8")


def emit_pairs(out, mapping: MappingFile) -> None:
    """Write the static C pair array for one mapping.

    The array is named ``<symbol>_pairs``.  Each entry is written on its own
    tab-indented line as two four-digit upper-case hexadecimal values, and
    entries are separated by commas with none after the last entry.

    Args:
        out: Writable text stream that receives the C code.
        mapping: Descriptor that supplies ``symbol`` and ``pairs``.
    """
    out.write(f"static const MARSUnicodeCodePagePair_t {mapping.symbol}_pairs[] = {{\n")
    entries = [
        f"\t{{ 0x{code:04X}, 0x{codepoint:04X} }}"
        for code, codepoint in mapping.pairs
    ]
    if entries:
        # Joining puts the comma between entries only, never after the last.
        out.write(",\n".join(entries) + "\n")
    out.write("};\n\n")


def emit_source(path: Path, header_name: str, mappings: list[MappingFile]) -> None:
    """Write the generated C source file that holds every mapping table.

    The file is written in this order: a banner comment with the includes,
    three summary comments with overall totals, one statistics comment plus
    pair array per mapping, the exported ``MARSUnicodeCodePages`` array, the
    element count, and the ``MARSUnicodeFindCodePage`` lookup function.

    Args:
        path: Destination file; it is created or overwritten as UTF-8 text.
        header_name: File name placed in the quoted ``#include`` directive.
        mappings: Descriptors to emit, in output order.
    """
    banner_lines = (
        "/*",
        " * Generated by scripts/gen_codepage_tables.py.",
        " * Source: Unicode.org Public/MAPPINGS.",
        " *",
        " * This file intentionally does not copy Novell NSS unitables directory files.",
        " * It emits compact descriptors from Unicode mapping files; MARS-NWE",
        " * can use them to build NSS-compatible converter tables.",
        " *",
        " * Only direct source-code -> single BMP Unicode mappings are emitted.",
        " */",
        f'#include "{header_name}"',
        "",
        "#include <string.h>",
        "",
    )
    lookup_lines = (
        "const size_t MARSUnicodeCodePageCount =",
        "\tsizeof(MARSUnicodeCodePages) / sizeof(MARSUnicodeCodePages[0]);",
        "",
        "const MARSUnicodeCodePage_t *MARSUnicodeFindCodePage(const char *name)",
        "{",
        "\tsize_t i;",
        "",
        "\tif (!name)",
        "\t{",
        "\t\treturn NULL;",
        "\t}",
        "",
        "\tfor (i = 0; i < MARSUnicodeCodePageCount; i++)",
        "\t{",
        "\t\tif (strcmp(MARSUnicodeCodePages[i].name, name) == 0)",
        "\t\t{",
        "\t\t\treturn &MARSUnicodeCodePages[i];",
        "\t\t}",
        "\t}",
        "",
        "\treturn NULL;",
        "}",
    )

    with path.open("w", encoding="utf-8") as out:
        out.write("\n".join(banner_lines) + "\n")

        # Overall totals for the summary comments at the top of the file.
        # Duplicate source records are reported per table only.
        pair_total = 0
        skipped_total = 0
        for table in mappings:
            pair_total += len(table.pairs)
            skipped_total += (
                table.skipped_composite
                + table.skipped_wide_source
                + table.skipped_non_bmp
            )
        out.write(f"/* Mapping files emitted: {len(mappings)}. */\n")
        out.write(f"/* Mapping pairs emitted: {pair_total}. */\n")
        out.write(f"/* Composite/wide/non-BMP records skipped: {skipped_total}. */\n\n")

        # One statistics comment followed by the pair array, per table.
        for table in mappings:
            stats = (
                f"pairs={len(table.pairs)}",
                f"bytes={table.max_code_bytes}",
                f"skipped_composite={table.skipped_composite}",
                f"skipped_wide_source={table.skipped_wide_source}",
                f"skipped_non_bmp={table.skipped_non_bmp}",
                f"duplicate_sources={table.duplicate_sources}",
            )
            out.write(f"/* {table.name}: " + ", ".join(stats) + ". */\n")
            emit_pairs(out, table)

        # Descriptor array that refers to the pair arrays written above.
        out.write("const MARSUnicodeCodePage_t MARSUnicodeCodePages[] = {\n")
        for table in mappings:
            columns = (
                f'"{table.name}"',
                f'"{table.source}"',
                str(table.max_code_bytes),
                str(len(table.pairs)),
                f"{table.symbol}_pairs",
            )
            out.write("\t{ " + ", ".join(columns) + " },\n")
        out.write("};\n\n")

        out.write("\n".join(lookup_lines) + "\n")


def write_summary(path: Path, mappings: list[MappingFile]) -> None:
    """Write a Markdown table that summarizes every emitted mapping.

    The report has one row per mapping with its name, source path, code
    width, number of pairs and the four skip/duplicate counters.

    Args:
        path: Destination file; it is created or overwritten as UTF-8 text.
        mappings: Descriptors to list, in row order.
    """
    preamble = (
        "# Generated codepage table summary\n",
        "\n",
        "Source data: `MAPPINGS/` Unicode.org mapping files.\n",
        "\n",
        "| Name | Source | Bytes | Pairs | Skipped composite | Skipped wide source | Skipped non-BMP | Duplicate sources |\n",
        "| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: |\n",
    )
    with path.open("w", encoding="utf-8") as out:
        out.writelines(preamble)
        for table in mappings:
            cells = (
                f"`{table.name}`",
                f"`{table.source}`",
                table.max_code_bytes,
                len(table.pairs),
                table.skipped_composite,
                table.skipped_wide_source,
                table.skipped_non_bmp,
                table.duplicate_sources,
            )
            out.write("| " + " | ".join(str(cell) for cell in cells) + " |\n")


def main() -> None:
    """Command-line entry point: discover the mappings and write all outputs.

    Options:
        --mappings-root: Directory searched for mapping files
            (default ``MAPPINGS``).
        --output-dir: Directory that receives ``codepageTables.h``,
            ``codepageTables.c`` and ``codepageTables.md`` (default ``TAB``).
            It is created, with missing parents, before discovery runs.

    Raises:
        SystemExit: with the message ``no mapping files found`` when
            discovery returns no mapping.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--mappings-root", type=Path, default=Path("MAPPINGS"))
    cli.add_argument("--output-dir", type=Path, default=Path("TAB"))
    options = cli.parse_args()

    target_dir = options.output_dir
    target_dir.mkdir(parents=True, exist_ok=True)

    tables = discover(options.mappings_root)
    if not tables:
        raise SystemExit("no mapping files found")

    emit_header(target_dir / "codepageTables.h")
    emit_source(target_dir / "codepageTables.c", "codepageTables.h", tables)
    write_summary(target_dir / "codepageTables.md", tables)


# Run the generator only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()