Files
mars-unicode-tables/scripts/gen_codepage_tables.py
2026-06-12 20:38:01 +02:00

308 lines
11 KiB
Python
Executable File

#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-2.0-only
"""Generate compact MARS-NWE codepage mapping tables from Unicode MAPPINGS.
The input files are Unicode.org mapping files kept under MAPPINGS/.
The generated output intentionally does not copy Novell NSS unitables directory files.
It emits a compact source-path-preserving descriptor format that MARS-NWE can
use to build NSS-compatible converter tables.
Only direct one-code-unit mappings are emitted:
encoded byte/code value -> single BMP Unicode code point
Composite source sequences such as 0xA1+0xE9, directional pseudo-mappings such
as <LR>+0x0020, Unicode multi-codepoint targets such as 0x00FC+0xF87F, non-BMP
Unicode targets, and source codes wider than 16 bits are skipped and counted.
"""
from __future__ import annotations
import argparse
import re
from dataclasses import dataclass
from pathlib import Path
# Accepts exactly one plain hexadecimal token such as "0x00A1". Tokens that
# carry anything else (for example composite "0xA1+0xE9" or "<LR>+0x0020")
# do not match and are counted as skipped by the parser.
HEX = re.compile(r"^0x[0-9A-Fa-f]+$")
@dataclass(frozen=True)
class MappingFile:
    """Immutable parse result for one mapping file, with skip statistics.

    Field order is part of the constructor interface and must not change.
    """

    # Mapping file that was parsed.
    path: Path
    # Prefix of the generated C array identifier ("<symbol>_pairs").
    symbol: str
    # Lookup name stored in the generated table ("/"-joined, no extension).
    name: str
    # Path of the input relative to the parent of the mappings root.
    source: str
    # 1 when every source code fits in one byte, otherwise 2.
    max_code_bytes: int
    # (source code, Unicode code point) pairs sorted by source code.
    pairs: tuple[tuple[int, int], ...]
    # Records dropped because a field was not a single plain hex token.
    skipped_composite: int
    # Records dropped because the source code exceeded 0xFFFF.
    skipped_wide_source: int
    # Records dropped because the Unicode target exceeded 0xFFFF.
    skipped_non_bmp: int
    # Records dropped because the source code had already been mapped.
    duplicate_sources: int
def sanitize_component(text: str) -> str:
    """Turn one path component into an upper-case C identifier fragment.

    ASCII letters and digits are kept (upper-cased); every other character,
    including non-ASCII letters and digits, becomes an underscore. Runs of
    underscores are collapsed and leading/trailing underscores are removed.

    Args:
        text: A single path component, e.g. ``"8859-1.TXT"``.

    Returns:
        The sanitized fragment, or ``"ROOT"`` when nothing usable remains.
    """
    # The ASCII check matters: str.isalnum() alone also accepts characters
    # such as "é" or "²", which must not end up in generated C identifiers.
    mapped = "".join(
        ch.upper() if ch.isascii() and ch.isalnum() else "_" for ch in text
    )
    # Splitting on "_" and dropping empty pieces both collapses runs of
    # underscores and strips them from the ends in one pass.
    fragment = "_".join(piece for piece in mapped.split("_") if piece)
    return fragment or "ROOT"
def symbol_for(path: Path, mappings_root: Path) -> str:
    """Build the C symbol prefix for a mapping file.

    Each component of *path* relative to *mappings_root* is sanitized, a
    trailing ``_TXT`` (the sanitized file extension) is removed from the last
    component, and the pieces are joined with underscores behind the
    ``MARS_UNICODE_CODEPAGE_`` prefix.
    """
    relative = path.relative_to(mappings_root)
    components = [sanitize_component(piece) for piece in relative.parts]
    leaf = components[-1]
    if leaf.endswith("_TXT"):
        components[-1] = leaf[: -len("_TXT")]
    return "MARS_UNICODE_CODEPAGE_" + "_".join(components)
def name_for(path: Path, mappings_root: Path) -> str:
    """Return the table lookup name for a mapping file.

    The name is the path relative to *mappings_root*, joined with ``/``
    regardless of platform, with the final component's extension removed.
    """
    rel_parts = path.relative_to(mappings_root).parts
    leaf_stem = Path(rel_parts[-1]).stem
    return "/".join(rel_parts[:-1] + (leaf_stem,))
def parse_mapping_file(path: Path, mappings_root: Path) -> MappingFile | None:
    """Parse one Unicode.org mapping file into a ``MappingFile``.

    Only records whose first two fields are each a single plain hex token are
    considered. A record is skipped (and counted) when a field is not such a
    token, when the source code exceeds 0xFFFF, when the Unicode target
    exceeds 0xFFFF, or when its source code was already mapped by an earlier
    record (the first mapping wins).

    Args:
        path: Mapping file to read; must lie under ``mappings_root``.
        mappings_root: Root directory of the mapping tree.

    Returns:
        The parsed descriptor, or ``None`` when the file yields no pairs.
    """
    pairs: dict[int, int] = {}
    skipped_composite = 0
    skipped_wide_source = 0
    skipped_non_bmp = 0
    duplicate_sources = 0

    text = path.read_text(encoding="utf-8", errors="replace")
    for line in text.splitlines():
        # Everything from the first "#" onwards is commentary.
        fields = line.split("#", 1)[0].split()
        if len(fields) < 2:
            # Blank, comment-only, or single-field lines carry no mapping.
            continue
        src_text, uni_text = fields[0], fields[1]
        # HEX rejects anything that is not one plain hex token, which also
        # covers "+"-joined composite sequences and "<LR>"-style markers.
        if not (HEX.match(src_text) and HEX.match(uni_text)):
            skipped_composite += 1
            continue
        src = int(src_text, 16)
        uni = int(uni_text, 16)
        if src > 0xFFFF:
            skipped_wide_source += 1
        elif uni > 0xFFFF:
            skipped_non_bmp += 1
        elif src in pairs:
            duplicate_sources += 1
        else:
            pairs[src] = uni

    if not pairs:
        return None

    return MappingFile(
        path=path,
        symbol=symbol_for(path, mappings_root),
        name=name_for(path, mappings_root),
        # as_posix() keeps the separator "/" on every platform, so the value
        # matches name_for() and never carries backslashes into the generated
        # C string literals.
        source=path.relative_to(mappings_root.parent).as_posix(),
        max_code_bytes=1 if max(pairs) <= 0xFF else 2,
        pairs=tuple(sorted(pairs.items())),
        skipped_composite=skipped_composite,
        skipped_wide_source=skipped_wide_source,
        skipped_non_bmp=skipped_non_bmp,
        duplicate_sources=duplicate_sources,
    )
def discover(mappings_root: Path) -> list[MappingFile]:
    """Find and parse every mapping file below *mappings_root*.

    Files are selected by a case-insensitive ``.txt`` extension, visited in
    sorted path order, and parsed with ``parse_mapping_file``. Files that
    yield no pairs are omitted.

    Args:
        mappings_root: Root directory of the mapping tree.

    Returns:
        Parsed descriptors in sorted path order; empty when nothing is found.
    """
    excluded_dirs = {
        # WindowsBestFit files are Unicode-to-codepage fallback data, not
        # direct encoded-byte -> Unicode mapping tables. Keep them as source
        # material in the repo, but do not emit byte-to-Unicode descriptors.
        "windowsbestfit",
        # DatedVersions are historical snapshots; keep the current top-level
        # mapping as the generated table input.
        "datedversions",
    }

    def is_generated_input(path: Path) -> bool:
        if path.name.lower().startswith("readme"):
            return False
        return excluded_dirs.isdisjoint(part.lower() for part in path.parts)

    # One traversal with a case-insensitive suffix test: globbing "*.TXT" and
    # "*.txt" separately returns each file twice where glob matching ignores
    # case, and misses mixed-case extensions everywhere else. is_file() keeps
    # directories that happen to end in ".txt" away from the parser.
    files = sorted(
        p
        for p in mappings_root.rglob("*")
        if p.name.lower().endswith(".txt") and p.is_file() and is_generated_input(p)
    )

    mappings: list[MappingFile] = []
    for path in files:
        parsed = parse_mapping_file(path, mappings_root)
        if parsed is not None:
            mappings.append(parsed)
    return mappings
def emit_header(path: Path) -> None:
    """Write the fixed C header declaring the codepage table interface.

    The header content does not depend on the parsed mappings; it declares
    the pair and codepage structs, the table, its count, and the lookup
    function.
    """
    header_lines = (
        "/*",
        " * Generated interface for MARS-NWE Unicode codepage mapping tables.",
        " *",
        " * Source data: Unicode.org Public/MAPPINGS.",
        " * Do not replace these generated descriptors with Novell NSS unitables directory files.",
        " */",
        "#ifndef MARS_UNICODE_CODEPAGE_TABLES_H",
        "#define MARS_UNICODE_CODEPAGE_TABLES_H",
        "",
        "#include <stddef.h>",
        "#include <stdint.h>",
        "",
        "#ifdef __cplusplus",
        "extern \"C\" {",
        "#endif",
        "",
        "typedef struct MARSUnicodeCodePagePair_s {",
        "\tuint16_t code;",
        "\tuint16_t unicode;",
        "} MARSUnicodeCodePagePair_t;",
        "",
        "typedef struct MARSUnicodeCodePage_s {",
        "\tconst char *name;",
        "\tconst char *source;",
        "\tuint8_t max_code_bytes;",
        "\tuint32_t pair_count;",
        "\tconst MARSUnicodeCodePagePair_t *pairs;",
        "} MARSUnicodeCodePage_t;",
        "",
        "extern const MARSUnicodeCodePage_t MARSUnicodeCodePages[];",
        "extern const size_t MARSUnicodeCodePageCount;",
        "",
        "const MARSUnicodeCodePage_t *MARSUnicodeFindCodePage(const char *name);",
        "",
        "#ifdef __cplusplus",
        "}",
        "#endif",
        "",
        "#endif /* MARS_UNICODE_CODEPAGE_TABLES_H */",
    )
    # Every line, including the last one, is newline-terminated.
    content = "\n".join(header_lines) + "\n"
    path.write_text(content, encoding="utf-8")
def emit_pairs(out, mapping: MappingFile) -> None:
    """Write the static C pair array for one mapping to *out*.

    Each pair becomes one tab-indented ``{ 0xCODE, 0xUNICODE }`` row with
    four-digit upper-case hex values; rows are comma-separated with no comma
    after the last one.
    """
    out.write(f"static const MARSUnicodeCodePagePair_t {mapping.symbol}_pairs[] = {{\n")
    rows = ",\n".join(
        f"\t{{ 0x{code:04X}, 0x{codepoint:04X} }}" for code, codepoint in mapping.pairs
    )
    if rows:
        out.write(rows + "\n")
    out.write("};\n\n")
def emit_source(path: Path, header_name: str, mappings: list[MappingFile]) -> None:
    """Write the generated C source containing every codepage table.

    The file consists of a banner comment, the include of ``header_name``,
    summary comments, one static pair array per mapping, the
    ``MARSUnicodeCodePages`` descriptor table, its element count, and the
    ``MARSUnicodeFindCodePage`` lookup function.

    Args:
        path: Destination of the C source file.
        header_name: Header file name to ``#include``.
        mappings: Parsed mapping descriptors, emitted in the given order.
    """
    with path.open("w", encoding="utf-8") as out:
        out.write(
            "/*\n"
            " * Generated by scripts/gen_codepage_tables.py.\n"
            " * Source: Unicode.org Public/MAPPINGS.\n"
            " *\n"
            " * This file intentionally does not copy Novell NSS unitables directory files.\n"
            " * It emits compact descriptors from Unicode mapping files; MARS-NWE\n"
            " * can use them to build NSS-compatible converter tables.\n"
            " *\n"
            " * Only direct source-code -> single BMP Unicode mappings are emitted.\n"
            " */\n"
            f"#include \"{header_name}\"\n"
            "\n"
            "#include <string.h>\n"
            "\n"
        )

        total_pairs = sum(len(m.pairs) for m in mappings)
        # Duplicate sources are reported per mapping only; this total covers
        # the three skip categories named in the comment below.
        total_skipped = sum(
            m.skipped_composite + m.skipped_wide_source + m.skipped_non_bmp
            for m in mappings
        )
        out.write(f"/* Mapping files emitted: {len(mappings)}. */\n")
        out.write(f"/* Mapping pairs emitted: {total_pairs}. */\n")
        out.write(f"/* Composite/wide/non-BMP records skipped: {total_skipped}. */\n\n")

        for mapping in mappings:
            out.write(
                f"/* {mapping.name}: pairs={len(mapping.pairs)}, "
                f"bytes={mapping.max_code_bytes}, "
                f"skipped_composite={mapping.skipped_composite}, "
                f"skipped_wide_source={mapping.skipped_wide_source}, "
                f"skipped_non_bmp={mapping.skipped_non_bmp}, "
                f"duplicate_sources={mapping.duplicate_sources}. */\n"
            )
            emit_pairs(out, mapping)

        out.write("const MARSUnicodeCodePage_t MARSUnicodeCodePages[] = {\n")
        for mapping in mappings:
            # name and source come from file-system paths, so they are
            # escaped before being placed inside C string literals.
            out.write(
                f"\t{{ {_c_string_literal(mapping.name)}, "
                f"{_c_string_literal(mapping.source)}, "
                f"{mapping.max_code_bytes}, {len(mapping.pairs)}, {mapping.symbol}_pairs }},\n"
            )
        out.write("};\n\n")

        out.write(
            "const size_t MARSUnicodeCodePageCount =\n"
            "\tsizeof(MARSUnicodeCodePages) / sizeof(MARSUnicodeCodePages[0]);\n\n"
            "const MARSUnicodeCodePage_t *MARSUnicodeFindCodePage(const char *name)\n"
            "{\n"
            "\tsize_t i;\n"
            "\n"
            "\tif (!name)\n"
            "\t{\n"
            "\t\treturn NULL;\n"
            "\t}\n"
            "\n"
            "\tfor (i = 0; i < MARSUnicodeCodePageCount; i++)\n"
            "\t{\n"
            "\t\tif (strcmp(MARSUnicodeCodePages[i].name, name) == 0)\n"
            "\t\t{\n"
            "\t\t\treturn &MARSUnicodeCodePages[i];\n"
            "\t\t}\n"
            "\t}\n"
            "\n"
            "\treturn NULL;\n"
            "}\n"
        )


def _c_string_literal(text: str) -> str:
    """Return *text* as a double-quoted C string literal.

    Backslashes and double quotes are escaped; text without either is
    returned unchanged apart from the surrounding quotes.
    """
    escaped = text.replace("\\", "\\\\").replace('"', '\\"')
    return f'"{escaped}"'
def write_summary(path: Path, mappings: list[MappingFile]) -> None:
    """Write a Markdown table summarising every generated mapping.

    One row per mapping lists its name, source path, code width in bytes,
    pair count, and the four skip/duplicate counters.
    """
    preamble = (
        "# Generated codepage table summary\n\n"
        "Source data: `MAPPINGS/` Unicode.org mapping files.\n\n"
        "| Name | Source | Bytes | Pairs | Skipped composite | Skipped wide source | Skipped non-BMP | Duplicate sources |\n"
        "| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: |\n"
    )
    with path.open("w", encoding="utf-8") as out:
        out.write(preamble)
        out.writelines(
            f"| `{entry.name}` | `{entry.source}` | {entry.max_code_bytes} | {len(entry.pairs)} | "
            f"{entry.skipped_composite} | {entry.skipped_wide_source} | "
            f"{entry.skipped_non_bmp} | {entry.duplicate_sources} |\n"
            for entry in mappings
        )
def main() -> None:
    """Command-line entry point: discover mappings and write all outputs.

    Creates the output directory, parses every mapping file under the
    mappings root, and writes the C header, the C source, and the Markdown
    summary. Exits with an error message when no mapping file produced a
    table.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--mappings-root", type=Path, default=Path("MAPPINGS"))
    cli.add_argument("--output-dir", type=Path, default=Path("TAB"))
    options = cli.parse_args()

    target_dir = options.output_dir
    target_dir.mkdir(parents=True, exist_ok=True)

    tables = discover(options.mappings_root)
    if not tables:
        raise SystemExit("no mapping files found")

    header_name = "codepageTables.h"
    emit_header(target_dir / header_name)
    emit_source(target_dir / "codepageTables.c", header_name, tables)
    write_summary(target_dir / "codepageTables.md", tables)


if __name__ == "__main__":
    main()