# team/scripts/md_export.py

"""
将 Markdown 文件中的 Mermaid 图表渲染为图片，并通过 Pandoc 导出为 docx/pdf。

用法:
    python scripts/md_export.py docs/论文.md --format docx
    python scripts/md_export.py docs/论文.md --format pdf
"""

import argparse
import os
import re
import shutil
import subprocess
import sys
import tempfile
from pathlib import Path


def _ensure_chrome_env() -> None:
    """Set ``PUPPETEER_EXECUTABLE_PATH`` to a Chrome/Chromium binary if it is unset.

    Lookup order, stopping at the first hit:

    1. An existing non-empty ``PUPPETEER_EXECUTABLE_PATH`` is left untouched.
    2. A system browser found on ``PATH``.
    3. A ``chrome-headless-shell`` build in ``~/.cache/puppeteer``.
    4. A full ``chrome`` build in ``~/.cache/puppeteer``.

    If nothing is found the environment is left unchanged.
    """
    env_key = "PUPPETEER_EXECUTABLE_PATH"
    if os.environ.get(env_key):
        return

    # A browser installed on PATH takes priority over the Puppeteer cache.
    browser_names = (
        "google-chrome",
        "google-chrome-stable",
        "chromium-browser",
        "chromium",
    )
    on_path = next(filter(None, map(shutil.which, browser_names)), None)
    if on_path:
        os.environ[env_key] = on_path
        return

    puppeteer_cache = Path.home() / ".cache" / "puppeteer"
    if not puppeteer_cache.exists():
        return

    # (glob for per-version directories, binary location inside each one);
    # the headless shell is probed before the full browser.
    cache_layouts = (
        (
            "chrome-headless-shell/linux-*",
            Path("chrome-headless-shell-linux64") / "chrome-headless-shell",
        ),
        ("chrome/linux-*", Path("chrome-linux64") / "chrome"),
    )
    for pattern, relative_binary in cache_layouts:
        for version_dir in sorted(puppeteer_cache.glob(pattern)):
            executable = version_dir / relative_binary
            if executable.exists():
                os.environ[env_key] = str(executable)
                return


def find_project_root() -> Path:
    """Locate the project root by walking upwards from this script's directory.

    The root is the nearest directory (starting with the script's own) that
    contains ``AGENTS.md`` or ``CLAUDE.md``. If no ancestor has either file,
    the script's directory is returned.
    """
    script_dir = Path(__file__).resolve().parent
    marker_files = ("AGENTS.md", "CLAUDE.md")
    search_chain = (script_dir, *script_dir.parents)
    return next(
        (
            folder
            for folder in search_chain
            if any((folder / marker).exists() for marker in marker_files)
        ),
        script_dir,
    )


def extract_mermaid_blocks(content: str) -> list[str]:
    """Return the body of every mermaid fenced code block found in ``content``.

    Only the text between the opening and closing fence lines is returned,
    in document order.
    """
    fence = re.compile(r"```mermaid\n(.*?)\n```", re.DOTALL)
    return [found.group(1) for found in fence.finditer(content)]


def render_mermaid_to_png(
    mermaid_code: str,
    output_path: Path,
    mmd_path: Path,
) -> Path:
    """Render Mermaid source to a PNG with ``mmdc``, keeping the ``.mmd`` source.

    Args:
        mermaid_code: Mermaid diagram source.
        output_path: Where ``mmdc`` should write the PNG.
        mmd_path: Where the Mermaid source is saved; it is also the file
            handed to ``mmdc`` as input and is not removed afterwards.

    Returns:
        ``output_path``.

    Raises:
        subprocess.CalledProcessError: If ``mmdc`` exits with a non-zero
            status. Its captured output is printed to stderr first.
    """
    mmd_path.write_text(mermaid_code, encoding="utf-8")
    try:
        subprocess.run(
            ["mmdc", "-i", str(mmd_path), "-o", str(output_path), "-b", "white"],
            check=True,
            capture_output=True,
        )
    except subprocess.CalledProcessError as exc:
        # The output is captured, so without this the user would only see an
        # exit status and never the diagnostics mmdc produced.
        details = (exc.stderr or exc.stdout or b"").decode(
            "utf-8", errors="replace"
        ).strip()
        print(f"错误: mmdc 渲染 {mmd_path} 失败:\n{details}", file=sys.stderr)
        raise
    return output_path


def replace_mermaid_with_images(
    content: str,
    image_dir: Path,
    file_stem: str,
) -> tuple[str, list[Path]]:
    """Render each mermaid block to a PNG and replace it with an image link.

    Blocks are numbered from 1 in document order. Block *n* is rendered to
    ``<file_stem>-mermaid-<n>.png`` inside ``image_dir``, with its source
    saved alongside as ``<file_stem>-mermaid-<n>.mmd``.

    Args:
        content: Markdown text to process.
        image_dir: Directory that receives the PNG and ``.mmd`` files.
        file_stem: Prefix for the generated file names.

    Returns:
        The rewritten Markdown and the list of PNG paths, in document order.

    Raises:
        subprocess.CalledProcessError: Propagated from
            ``render_mermaid_to_png`` when a diagram fails to render.
    """
    image_paths: list[Path] = []
    # Links are relative to image_dir's parent, so derive the prefix from the
    # directory actually used instead of assuming it is called "images".
    link_prefix = image_dir.name

    def _replace(match: re.Match[str]) -> str:
        index = len(image_paths) + 1
        png_name = f"{file_stem}-mermaid-{index}.png"
        png_path = image_dir / png_name
        mmd_path = image_dir / f"{file_stem}-mermaid-{index}.mmd"

        render_mermaid_to_png(match.group(1), png_path, mmd_path)
        image_paths.append(png_path)
        return f"![图{index}]({link_prefix}/{png_name})"

    result = re.sub(r"```mermaid\n(.*?)\n```", _replace, content, flags=re.DOTALL)
    return result, image_paths


def get_resource_path(input_file: Path) -> Path:
    """Return the Pandoc resource path: the directory containing ``input_file``."""
    containing_dir = input_file.parent
    return containing_dir


def build_pandoc_command(
    input_md: Path,
    output_file: Path,
    fmt: str,
    resource_path: Path,
) -> list[str]:
    """Assemble the pandoc argument list for one conversion.

    Args:
        input_md: Markdown file pandoc should read.
        output_file: File pandoc should write.
        fmt: Output format; only ``"pdf"`` adds extra options.
        resource_path: Value passed to ``--resource-path``.

    Returns:
        The command as a list suitable for ``subprocess.run``.
    """
    base_args = [
        "pandoc",
        str(input_md),
        "-o",
        str(output_file),
        "--resource-path",
        str(resource_path),
    ]
    if fmt != "pdf":
        return base_args

    # PDF output goes through xelatex with a CJK main font and fixed margins.
    pdf_args = [
        "--pdf-engine=xelatex",
        "-V",
        "CJKmainfont=Noto Sans CJK SC",
        "-V",
        "geometry:margin=2.5cm",
    ]
    return base_args + pdf_args


def export(input_file: Path, fmt: str) -> Path:
    """Run the whole export: render Mermaid blocks, then convert with Pandoc.

    The output is written next to ``input_file`` with its suffix replaced by
    ``fmt``. When the document contains Mermaid blocks they are rendered into
    an ``images`` directory beside the input, and Pandoc is given a temporary
    copy of the document in which each block is replaced by an image link.
    The temporary copy is removed afterwards; the images are kept.

    Args:
        input_file: Markdown file to export.
        fmt: Output format; used as the output suffix and passed on to
            ``build_pandoc_command``.

    Returns:
        Path of the exported file.

    Raises:
        SystemExit: With code 1 if the input file does not exist, if the
            document has Mermaid blocks and ``mmdc`` is not on PATH, or if
            ``pandoc`` is not on PATH.
        subprocess.CalledProcessError: If ``mmdc`` or ``pandoc`` exits with a
            non-zero status.
    """
    if not input_file.exists():
        print(f"错误: 文件不存在 {input_file}", file=sys.stderr)
        sys.exit(1)

    content = input_file.read_text(encoding="utf-8")
    mermaid_blocks = extract_mermaid_blocks(content)

    # mmdc is only needed when there is something to render; a document
    # without diagrams must still be exportable on a machine without it.
    if mermaid_blocks and not shutil.which("mmdc"):
        print("错误: 未找到 mmdc，请先安装 @mermaid-js/mermaid-cli", file=sys.stderr)
        sys.exit(1)

    if not shutil.which("pandoc"):
        print("错误: 未找到 pandoc，请先安装 pandoc", file=sys.stderr)
        sys.exit(1)

    output_file = input_file.with_suffix(f".{fmt}")
    resource_path = get_resource_path(input_file)

    if not mermaid_blocks:
        print("未发现 mermaid 图表，直接导出...")
        subprocess.run(
            build_pandoc_command(input_file, output_file, fmt, resource_path),
            check=True,
        )
        print(f"导出完成: {output_file}")
        return output_file

    # Prepare the directory that receives the rendered diagrams.
    image_dir = input_file.parent / "images"
    image_dir.mkdir(exist_ok=True)

    replaced_content, image_paths = replace_mermaid_with_images(
        content,
        image_dir,
        input_file.stem,
    )

    # The temporary copy lives beside the input file. delete=False because
    # Pandoc reads it after this process has closed it.
    tmp = tempfile.NamedTemporaryFile(
        mode="w",
        suffix=".md",
        encoding="utf-8",
        dir=input_file.parent,
        delete=False,
    )
    tmp_path = Path(tmp.name)
    try:
        # Writing happens inside the try so a failed write cannot leave the
        # temporary file behind.
        with tmp:
            tmp.write(replaced_content)

        cmd = build_pandoc_command(tmp_path, output_file, fmt, resource_path)
        subprocess.run(cmd, check=True)
        print(f"导出完成: {output_file}")
        print(f"渲染了 {len(image_paths)} 张 mermaid 图片:")
        for p in image_paths:
            print(f"  - {p}")
        return output_file
    finally:
        tmp_path.unlink(missing_ok=True)


def main() -> None:
    """Parse the command line and export the given Markdown file.

    Takes one positional input path and an optional ``--format`` of ``docx``
    (the default) or ``pdf``. The input path is resolved to an absolute path
    before being handed to ``export``.
    """
    cli = argparse.ArgumentParser(
        description="将 Markdown 中的 Mermaid 图表渲染为图片后导出为 docx/pdf",
    )
    cli.add_argument("input", type=Path, help="输入的 Markdown 文件路径")
    cli.add_argument(
        "--format",
        choices=["docx", "pdf"],
        default="docx",
        help="输出格式 (默认: docx)",
    )
    options = cli.parse_args()

    source_file = options.input.resolve()
    export(source_file, options.format)


if __name__ == "__main__":
    # Script entry point only: the Chrome lookup sets PUPPETEER_EXECUTABLE_PATH
    # (when it is unset and a browser is found) before main() runs the export,
    # so the mmdc subprocess inherits it. Importing this module and calling
    # export() directly does not perform the lookup.
    _ensure_chrome_env()
    main()