308 lines
11 KiB
Python
Executable File
308 lines
11 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
# SPDX-License-Identifier: GPL-2.0-only
|
|
"""Generate compact MARS-NWE codepage mapping tables from Unicode MAPPINGS.

The input files are Unicode.org mapping files kept under MAPPINGS/.
The generated output intentionally does not copy Novell NSS unitables directory files.
It emits a compact source-path-preserving descriptor format that MARS-NWE can
use to build NSS-compatible converter tables.

Only direct one-code-unit mappings are emitted:

    encoded byte/code value -> single BMP Unicode code point

Composite source sequences such as 0xA1+0xE9, directional pseudo-mappings such
as <LR>+0x0020, Unicode multi-codepoint targets such as 0x00FC+0xF87F, non-BMP
Unicode targets, and source codes wider than 16 bits are skipped and counted.
"""
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import re
|
|
from dataclasses import dataclass
|
|
from pathlib import Path
|
|
|
|
# Matches a single hexadecimal token written with a lowercase "0x" prefix,
# e.g. "0x00A1". Tokens containing "+" or "<LR>"-style markers do not match.
HEX = re.compile(r"^0x[0-9A-Fa-f]+$")
|
|
|
|
|
|
@dataclass(frozen=True)
class MappingFile:
    """Immutable result of parsing one mapping file.

    Holds the emitted (code, unicode) pairs together with the identifiers used
    in the generated output and the per-file counters of skipped records.
    """

    # Mapping file that was parsed.
    path: Path
    # Identifier stem used for the generated C pair table.
    symbol: str
    # Lookup name written into the generated descriptor.
    name: str
    # Source path string written into the generated descriptor and summary.
    source: str
    # Code width reported for the table (the parser stores 1 or 2).
    max_code_bytes: int
    # Emitted (source code, Unicode code point) pairs.
    pairs: tuple[tuple[int, int], ...]
    # Records dropped because a field was not a plain hexadecimal token.
    skipped_composite: int
    # Records dropped because the source code exceeded 0xFFFF.
    skipped_wide_source: int
    # Records dropped because the Unicode value exceeded 0xFFFF.
    skipped_non_bmp: int
    # Records dropped because their source code had already been seen.
    duplicate_sources: int
|
|
|
|
|
|
def sanitize_component(text: str) -> str:
    """Turn one path component into an upper-case C identifier fragment.

    Every run of characters outside ``[0-9A-Za-z]`` becomes a single
    underscore, leading and trailing underscores are removed, and the result
    is upper-cased.  An input with no usable characters yields ``"ROOT"``.

    Only ASCII letters and digits are kept.  ``str.isalnum()`` would also
    accept non-ASCII letters and digits, which are not portable in C
    identifiers and may change length when upper-cased (``"ß"`` -> ``"SS"``).
    """
    fragment = re.sub(r"[^0-9A-Za-z]+", "_", text).strip("_")
    return fragment.upper() or "ROOT"
|
|
|
|
|
|
def symbol_for(path: Path, mappings_root: Path) -> str:
    """Build the C symbol stem for *path*, taken relative to *mappings_root*.

    Each relative path component is sanitized, a trailing ``_TXT`` left over
    from the file extension is removed from the last component, and the
    components are joined with underscores behind a fixed prefix.
    """
    suffix = "_TXT"
    components = list(map(sanitize_component, path.relative_to(mappings_root).parts))
    leaf = components[-1]
    if leaf.endswith(suffix):
        components[-1] = leaf[: -len(suffix)]
    return "MARS_UNICODE_CODEPAGE_" + "_".join(components)
|
|
|
|
|
|
def name_for(path: Path, mappings_root: Path) -> str:
    """Return the lookup name for *path*.

    The name is the path relative to *mappings_root*, joined with forward
    slashes, with the final extension removed from the file name.
    """
    relative_parts = path.relative_to(mappings_root).parts
    leaf_stem = Path(relative_parts[-1]).stem
    return "/".join([*relative_parts[:-1], leaf_stem])
|
|
|
|
|
|
def parse_mapping_file(path: Path, mappings_root: Path) -> MappingFile | None:
    """Parse one mapping file into a :class:`MappingFile`.

    Each line is cut at the first ``#`` and split on whitespace.  The first
    two fields are taken as the source code and the Unicode value.  A record
    is counted and skipped when

    * either field is not a plain ``0x...`` token (composite sequences,
      ``<LR>``-style markers, multi-codepoint targets),
    * the source code is above 0xFFFF,
    * the Unicode value is above 0xFFFF, or
    * the source code was already mapped (the first mapping wins).

    Lines with fewer than two fields are ignored without being counted.

    Args:
        path: Mapping file to read (decoded as UTF-8, bad bytes replaced).
        mappings_root: Root directory of the mapping tree; used to derive the
            symbol, the lookup name and the recorded source path.

    Returns:
        The parsed mapping, or ``None`` when the file yields no usable pair.
    """
    pairs: dict[int, int] = {}
    skipped_composite = 0
    skipped_wide_source = 0
    skipped_non_bmp = 0
    duplicate_sources = 0

    text = path.read_text(encoding="utf-8", errors="replace")
    for line in text.splitlines():
        fields = line.split("#", 1)[0].split()
        if len(fields) < 2:
            # Blank line, comment-only line, or an undefined code with no target.
            continue

        # NOTE(review): only the first two columns are read; a file with more
        # data columns would have its second column taken as the Unicode
        # value -- confirm no such inputs exist under the mappings root.
        src_text, uni_text = fields[0], fields[1]

        # HEX cannot match a token containing "+" or "<", so this single test
        # also rejects composite sequences and directional pseudo-mappings.
        if not (HEX.match(src_text) and HEX.match(uni_text)):
            skipped_composite += 1
            continue

        src = int(src_text, 16)
        uni = int(uni_text, 16)
        if src > 0xFFFF:
            skipped_wide_source += 1
            continue
        if uni > 0xFFFF:
            skipped_non_bmp += 1
            continue
        if src in pairs:
            duplicate_sources += 1
            continue
        pairs[src] = uni

    if not pairs:
        return None

    max_code_bytes = 1 if max(pairs) <= 0xFF else 2
    return MappingFile(
        path=path,
        symbol=symbol_for(path, mappings_root),
        name=name_for(path, mappings_root),
        # as_posix() keeps the recorded path identical on every platform and
        # free of backslashes, which would be read as escape sequences once
        # the string is written into a C string literal.
        source=path.relative_to(mappings_root.parent).as_posix(),
        max_code_bytes=max_code_bytes,
        pairs=tuple(sorted(pairs.items())),
        skipped_composite=skipped_composite,
        skipped_wide_source=skipped_wide_source,
        skipped_non_bmp=skipped_non_bmp,
        duplicate_sources=duplicate_sources,
    )
|
|
|
|
|
|
def discover(mappings_root: Path) -> list[MappingFile]:
    """Find and parse every usable mapping file below *mappings_root*.

    A candidate is any regular file whose name ends in ``.txt`` in any letter
    case.  README files and anything on a path containing a ``WindowsBestFit``
    or ``DatedVersions`` component are excluded.  Candidates are parsed in
    sorted path order; files that yield no mapping pairs are dropped.

    The tree is walked once with a case-insensitive suffix test.  Globbing
    ``*.TXT`` and ``*.txt`` separately returns every file twice wherever
    pathlib globbing is case-insensitive (e.g. Windows), which would emit
    duplicate C symbols, and it misses mixed-case suffixes elsewhere.

    Args:
        mappings_root: Directory to search recursively.

    Returns:
        Parsed mappings, ordered by file path.
    """

    def is_generated_input(path: Path) -> bool:
        parts = {part.lower() for part in path.parts}
        if path.name.lower().startswith("readme"):
            return False
        # WindowsBestFit files are Unicode-to-codepage fallback data, not
        # direct encoded-byte -> Unicode mapping tables. Keep them as source
        # material in the repo, but do not emit byte-to-Unicode descriptors.
        if "windowsbestfit" in parts:
            return False
        # DatedVersions are historical snapshots; keep the current top-level
        # mapping as the generated table input.
        if "datedversions" in parts:
            return False
        return True

    files = sorted(
        p
        for p in mappings_root.rglob("*")
        if p.name.lower().endswith(".txt")
        and is_generated_input(p)
        # Skip directories that merely happen to be named like a text file.
        and p.is_file()
    )

    mappings: list[MappingFile] = []
    for path in files:
        parsed = parse_mapping_file(path, mappings_root)
        if parsed is not None:
            mappings.append(parsed)
    return mappings
|
|
|
|
|
|
def emit_header(path: Path) -> None:
    """Write the fixed C header declaring the codepage table interface.

    The header content does not depend on the parsed mappings; it declares the
    pair and codepage structs, the table array with its count, and the lookup
    function.  The file at *path* is written as UTF-8.
    """
    header_lines = (
        "/*",
        " * Generated interface for MARS-NWE Unicode codepage mapping tables.",
        " *",
        " * Source data: Unicode.org Public/MAPPINGS.",
        " * Do not replace these generated descriptors with Novell NSS unitables directory files.",
        " */",
        "#ifndef MARS_UNICODE_CODEPAGE_TABLES_H",
        "#define MARS_UNICODE_CODEPAGE_TABLES_H",
        "",
        "#include <stddef.h>",
        "#include <stdint.h>",
        "",
        "#ifdef __cplusplus",
        'extern "C" {',
        "#endif",
        "",
        "typedef struct MARSUnicodeCodePagePair_s {",
        "\tuint16_t code;",
        "\tuint16_t unicode;",
        "} MARSUnicodeCodePagePair_t;",
        "",
        "typedef struct MARSUnicodeCodePage_s {",
        "\tconst char *name;",
        "\tconst char *source;",
        "\tuint8_t max_code_bytes;",
        "\tuint32_t pair_count;",
        "\tconst MARSUnicodeCodePagePair_t *pairs;",
        "} MARSUnicodeCodePage_t;",
        "",
        "extern const MARSUnicodeCodePage_t MARSUnicodeCodePages[];",
        "extern const size_t MARSUnicodeCodePageCount;",
        "",
        "const MARSUnicodeCodePage_t *MARSUnicodeFindCodePage(const char *name);",
        "",
        "#ifdef __cplusplus",
        "}",
        "#endif",
        "",
        "#endif /* MARS_UNICODE_CODEPAGE_TABLES_H */",
    )
    # Every line, including the last, is newline-terminated.
    content = "".join(line + "\n" for line in header_lines)
    path.write_text(content, encoding="utf-8")
|
|
|
|
|
|
def emit_pairs(out, mapping: MappingFile) -> None:
    """Write the static C pair array for *mapping* to the text stream *out*.

    The array is named ``<symbol>_pairs``.  Each pair is printed as two
    four-digit upper-case hexadecimal values on its own tab-indented line;
    entries are comma-separated with no comma after the last one.  A blank
    line follows the closing brace.
    """
    rows = [f"\t{{ 0x{code:04X}, 0x{codepoint:04X} }}" for code, codepoint in mapping.pairs]
    out.write(f"static const MARSUnicodeCodePagePair_t {mapping.symbol}_pairs[] = {{\n")
    if rows:
        out.write(",\n".join(rows) + "\n")
    out.write("};\n\n")
|
|
|
|
|
|
def emit_source(path: Path, header_name: str, mappings: list[MappingFile]) -> None:
    """Write the generated C source file containing all mapping tables.

    Output order: banner comment and includes, three summary comments with
    overall totals, one statistics comment plus pair array per mapping, the
    ``MARSUnicodeCodePages`` descriptor array, and finally the definitions of
    ``MARSUnicodeCodePageCount`` and ``MARSUnicodeFindCodePage``.

    Args:
        path: Destination file, written as UTF-8.
        header_name: Header file name placed in the ``#include "..."`` line.
        mappings: Parsed mapping files, emitted in the given order.
    """
    banner_lines = (
        "/*",
        " * Generated by scripts/gen_codepage_tables.py.",
        " * Source: Unicode.org Public/MAPPINGS.",
        " *",
        " * This file intentionally does not copy Novell NSS unitables directory files.",
        " * It emits compact descriptors from Unicode mapping files; MARS-NWE",
        " * can use them to build NSS-compatible converter tables.",
        " *",
        " * Only direct source-code -> single BMP Unicode mappings are emitted.",
        " */",
        f'#include "{header_name}"',
        "",
        "#include <string.h>",
        "",
    )
    lookup_lines = (
        "const size_t MARSUnicodeCodePageCount =",
        "\tsizeof(MARSUnicodeCodePages) / sizeof(MARSUnicodeCodePages[0]);",
        "",
        "const MARSUnicodeCodePage_t *MARSUnicodeFindCodePage(const char *name)",
        "{",
        "\tsize_t i;",
        "",
        "\tif (!name)",
        "\t{",
        "\t\treturn NULL;",
        "\t}",
        "",
        "\tfor (i = 0; i < MARSUnicodeCodePageCount; i++)",
        "\t{",
        "\t\tif (strcmp(MARSUnicodeCodePages[i].name, name) == 0)",
        "\t\t{",
        "\t\t\treturn &MARSUnicodeCodePages[i];",
        "\t\t}",
        "\t}",
        "",
        "\treturn NULL;",
        "}",
    )

    with path.open("w", encoding="utf-8") as out:
        out.write("".join(line + "\n" for line in banner_lines))

        # Overall totals across every mapping file.
        pair_total = 0
        skip_total = 0
        for entry in mappings:
            pair_total += len(entry.pairs)
            skip_total += (
                entry.skipped_composite
                + entry.skipped_wide_source
                + entry.skipped_non_bmp
            )
        out.write(
            f"/* Mapping files emitted: {len(mappings)}. */\n"
            f"/* Mapping pairs emitted: {pair_total}. */\n"
            f"/* Composite/wide/non-BMP records skipped: {skip_total}. */\n\n"
        )

        # One statistics comment followed by the pair array, per mapping.
        for entry in mappings:
            stats = ", ".join(
                (
                    f"pairs={len(entry.pairs)}",
                    f"bytes={entry.max_code_bytes}",
                    f"skipped_composite={entry.skipped_composite}",
                    f"skipped_wide_source={entry.skipped_wide_source}",
                    f"skipped_non_bmp={entry.skipped_non_bmp}",
                    f"duplicate_sources={entry.duplicate_sources}",
                )
            )
            out.write(f"/* {entry.name}: {stats}. */\n")
            emit_pairs(out, entry)

        # Descriptor array referencing each pair array by symbol.
        out.write("const MARSUnicodeCodePage_t MARSUnicodeCodePages[] = {\n")
        for entry in mappings:
            out.write(
                f'\t{{ "{entry.name}", "{entry.source}", '
                f"{entry.max_code_bytes}, {len(entry.pairs)}, {entry.symbol}_pairs }},\n"
            )
        out.write("};\n\n")

        out.write("".join(line + "\n" for line in lookup_lines))
|
|
|
|
|
|
def write_summary(path: Path, mappings: list[MappingFile]) -> None:
    """Write a Markdown summary table of the generated mappings to *path*.

    The file holds a title, a source note, and one table row per mapping with
    its name, source path, code width, pair count and the four skip counters.
    It is written as UTF-8.
    """
    preamble = (
        "# Generated codepage table summary\n\n",
        "Source data: `MAPPINGS/` Unicode.org mapping files.\n\n",
        "| Name | Source | Bytes | Pairs | Skipped composite | Skipped wide source | Skipped non-BMP | Duplicate sources |\n",
        "| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: |\n",
    )

    def table_rows():
        # Generated lazily so rows are written in the order they are produced.
        for entry in mappings:
            cells = (
                f"`{entry.name}`",
                f"`{entry.source}`",
                f"{entry.max_code_bytes}",
                f"{len(entry.pairs)}",
                f"{entry.skipped_composite}",
                f"{entry.skipped_wide_source}",
                f"{entry.skipped_non_bmp}",
                f"{entry.duplicate_sources}",
            )
            yield "| " + " | ".join(cells) + " |\n"

    with path.open("w", encoding="utf-8") as out:
        out.writelines(preamble)
        out.writelines(table_rows())
|
|
|
|
|
|
def main() -> None:
    """Command-line entry point: parse the mapping tree and write all outputs.

    Options:
        --mappings-root  directory holding the mapping files (default ``MAPPINGS``)
        --output-dir     directory receiving the generated files (default ``TAB``)

    The output directory is created if missing.  The run exits with an error
    message when no usable mapping file is found; otherwise the C header, the
    C source and the Markdown summary are written into the output directory.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--mappings-root", type=Path, default=Path("MAPPINGS"))
    cli.add_argument("--output-dir", type=Path, default=Path("TAB"))
    options = cli.parse_args()

    target_dir = options.output_dir
    target_dir.mkdir(parents=True, exist_ok=True)

    tables = discover(options.mappings_root)
    if not tables:
        raise SystemExit("no mapping files found")

    header_name = "codepageTables.h"
    emit_header(target_dir / header_name)
    emit_source(target_dir / "codepageTables.c", header_name, tables)
    write_summary(target_dir / "codepageTables.md", tables)


if __name__ == "__main__":
    main()
|