Files
team/scripts/md_export.py
arno 34346be862
All checks were successful
CI / lint (push) Successful in 6s
配置: 初始化 ISOS Agent Teams 软件研发模板
2026-04-19 21:47:08 +08:00

225 lines
6.6 KiB
Python
Executable File
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
将 Markdown 文件中的 Mermaid 图表渲染为图片,并通过 Pandoc 导出为 docx/pdf。
用法:
python scripts/md_export.py docs/论文.md --format docx
python scripts/md_export.py docs/论文.md --format pdf
"""
import argparse
import os
import re
import shutil
import subprocess
import sys
import tempfile
from pathlib import Path
def _ensure_chrome_env() -> None:
    """Set ``PUPPETEER_EXECUTABLE_PATH`` so mmdc can locate a Chrome/Chromium binary.

    Does nothing when the variable is already set to a non-empty value.
    Otherwise the first hit wins, searching in this order:

    1. A system browser found on ``PATH``.
    2. A ``chrome-headless-shell`` build under ``~/.cache/puppeteer``.
    3. A full ``chrome`` build under ``~/.cache/puppeteer``.

    If nothing is found the environment is left untouched.
    """
    env_key = "PUPPETEER_EXECUTABLE_PATH"
    if os.environ.get(env_key):
        return

    # System-wide browsers take priority over Puppeteer's own downloads.
    system_names = (
        "google-chrome",
        "google-chrome-stable",
        "chromium-browser",
        "chromium",
    )
    for name in system_names:
        located = shutil.which(name)
        if located:
            os.environ[env_key] = located
            return

    puppeteer_cache = Path.home() / ".cache" / "puppeteer"
    if not puppeteer_cache.exists():
        return

    # (glob pattern for version directories, binary path inside each of them)
    layouts = (
        (
            "chrome-headless-shell/linux-*",
            Path("chrome-headless-shell-linux64") / "chrome-headless-shell",
        ),
        ("chrome/linux-*", Path("chrome-linux64") / "chrome"),
    )
    for pattern, relative_binary in layouts:
        for version_dir in sorted(puppeteer_cache.glob(pattern)):
            candidate = version_dir / relative_binary
            if candidate.exists():
                os.environ[env_key] = str(candidate)
                return
def find_project_root() -> Path:
    """Locate the project root by walking upward from this script's directory.

    The root is the nearest directory (starting with the script's own
    directory) that contains ``AGENTS.md`` or ``CLAUDE.md``. When no ancestor
    qualifies, the script's directory itself is returned.
    """
    script_dir = Path(__file__).resolve().parent
    markers = ("AGENTS.md", "CLAUDE.md")
    search_order = (script_dir, *script_dir.parents)
    return next(
        (
            folder
            for folder in search_order
            if any((folder / marker).exists() for marker in markers)
        ),
        script_dir,
    )
def extract_mermaid_blocks(content: str) -> list[str]:
    """Return the body of every ```` ```mermaid ```` fenced block in *content*.

    Only the text between the opening fence line and the closing fence is
    returned (fences excluded), in document order.
    """
    fence = re.compile(r"```mermaid\n(.*?)\n```", re.DOTALL)
    return [found.group(1) for found in fence.finditer(content)]
def render_mermaid_to_png(
    mermaid_code: str,
    output_path: Path,
    mmd_path: Path,
) -> Path:
    """Render Mermaid source to a PNG with ``mmdc``, keeping the ``.mmd`` source.

    Args:
        mermaid_code: Mermaid diagram source text.
        output_path: Destination passed to ``mmdc -o``.
        mmd_path: File the Mermaid source is written to and passed to ``mmdc -i``.

    Returns:
        ``output_path``, unchanged.

    Raises:
        subprocess.CalledProcessError: If ``mmdc`` exits with a non-zero status.
            Its captured stderr is echoed to ``sys.stderr`` first, so the real
            cause (syntax error, missing browser, ...) is visible.
    """
    mmd_path.write_text(mermaid_code, encoding="utf-8")
    try:
        subprocess.run(
            ["mmdc", "-i", str(mmd_path), "-o", str(output_path), "-b", "white"],
            check=True,
            capture_output=True,
        )
    except subprocess.CalledProcessError as exc:
        # Output is captured to keep successful runs quiet; on failure it is
        # the only diagnostic available, so surface it before re-raising.
        diagnostics = (exc.stderr or b"").decode("utf-8", errors="replace").strip()
        if diagnostics:
            print(diagnostics, file=sys.stderr)
        raise
    return output_path
def replace_mermaid_with_images(
    content: str,
    image_dir: Path,
    file_stem: str,
) -> tuple[str, list[Path]]:
    """Swap every Mermaid fenced block in *content* for a Markdown image link.

    Each block is rendered via ``render_mermaid_to_png`` into *image_dir* as
    ``<file_stem>-mermaid-<n>.png`` (with a matching ``.mmd`` source file),
    numbered from 1 in document order.

    Returns:
        A pair of the rewritten Markdown text and the list of PNG paths, in
        the order they were rendered.
    """
    rendered: list[Path] = []

    def _to_image_ref(found: re.Match[str]) -> str:
        # The next diagram number is one past the diagrams rendered so far.
        index = len(rendered) + 1
        base = f"{file_stem}-mermaid-{index}"
        png_file = image_dir / f"{base}.png"
        render_mermaid_to_png(found.group(1), png_file, image_dir / f"{base}.mmd")
        rendered.append(png_file)
        return f"![图{index}](images/{base}.png)"

    fence = re.compile(r"```mermaid\n(.*?)\n```", flags=re.DOTALL)
    return fence.sub(_to_image_ref, content), rendered
def get_resource_path(input_file: Path) -> Path:
    """Return the directory Pandoc should resolve resources against.

    This is the directory containing *input_file*.
    """
    containing_dir = input_file.parent
    return containing_dir
def build_pandoc_command(
    input_md: Path,
    output_file: Path,
    fmt: str,
    resource_path: Path,
) -> list[str]:
    """Assemble the pandoc argument vector for one export.

    Args:
        input_md: Markdown file handed to pandoc.
        output_file: Target file passed to ``-o``.
        fmt: Output format; only ``"pdf"`` adds extra options.
        resource_path: Directory passed to ``--resource-path``.

    Returns:
        The command as a list of strings, starting with ``"pandoc"``.
    """
    base_args = [
        "pandoc",
        str(input_md),
        "-o",
        str(output_file),
        "--resource-path",
        str(resource_path),
    ]
    if fmt != "pdf":
        return base_args

    # PDF output goes through xelatex with a CJK main font and fixed margins.
    pdf_args = [
        "--pdf-engine=xelatex",
        "-V",
        "CJKmainfont=Noto Sans CJK SC",
        "-V",
        "geometry:margin=2.5cm",
    ]
    return base_args + pdf_args
def _run_pandoc_export(source_md: Path, input_file: Path, fmt: str) -> Path:
    """Run pandoc on *source_md*, writing next to *input_file* with suffix *fmt*."""
    output_file = input_file.with_suffix(f".{fmt}")
    resource_path = get_resource_path(input_file)
    subprocess.run(
        build_pandoc_command(source_md, output_file, fmt, resource_path),
        check=True,
    )
    print(f"导出完成: {output_file}")
    return output_file


def export(input_file: Path, fmt: str) -> Path:
    """Export a Markdown file to *fmt*, rendering Mermaid diagrams to PNG first.

    Documents without Mermaid blocks are passed to pandoc directly. Otherwise
    each block is rendered into an ``images`` directory beside the input, a
    temporary Markdown copy with image links is written beside the input, and
    pandoc converts that copy. The temporary copy is always removed.

    Args:
        input_file: Markdown file to export.
        fmt: Output format, used as the output file's extension.

    Returns:
        Path of the generated file (``input_file`` with its suffix replaced).

    Raises:
        SystemExit: With code 1 if the input file or a required tool is missing.
        subprocess.CalledProcessError: If mmdc or pandoc fails.
    """
    if not input_file.exists():
        print(f"错误: 文件不存在 {input_file}", file=sys.stderr)
        sys.exit(1)
    if not shutil.which("pandoc"):
        print("错误: 未找到 pandoc请先安装 pandoc", file=sys.stderr)
        sys.exit(1)

    content = input_file.read_text(encoding="utf-8")
    if not extract_mermaid_blocks(content):
        # No diagrams: mmdc is not needed, so its absence must not block export.
        print("未发现 mermaid 图表,直接导出...")
        return _run_pandoc_export(input_file, input_file, fmt)

    # mmdc is only required once there is at least one diagram to render.
    if not shutil.which("mmdc"):
        print("错误: 未找到 mmdc请先安装 @mermaid-js/mermaid-cli", file=sys.stderr)
        sys.exit(1)

    # Rendered images live in an "images" directory beside the input file.
    image_dir = input_file.parent / "images"
    image_dir.mkdir(exist_ok=True)
    replaced_content, image_paths = replace_mermaid_with_images(
        content,
        image_dir,
        input_file.stem,
    )

    # The temporary Markdown sits beside the input so relative links resolve.
    tmp = tempfile.NamedTemporaryFile(
        mode="w",
        suffix=".md",
        encoding="utf-8",
        dir=input_file.parent,
        delete=False,
    )
    tmp_path = Path(tmp.name)
    try:
        # Writing happens inside the try so a failed write cannot leak the file.
        with tmp:
            tmp.write(replaced_content)
        output_file = _run_pandoc_export(tmp_path, input_file, fmt)
        print(f"渲染了 {len(image_paths)} 张 mermaid 图片:")
        for p in image_paths:
            print(f" - {p}")
        return output_file
    finally:
        tmp_path.unlink(missing_ok=True)
def main() -> None:
    """Parse the command line and export the requested Markdown file.

    Accepts a positional input path and an optional ``--format`` of ``docx``
    (the default) or ``pdf``; the input path is resolved to an absolute path
    before being handed to ``export``.
    """
    cli = argparse.ArgumentParser(
        description="将 Markdown 中的 Mermaid 图表渲染为图片后导出为 docx/pdf",
    )
    cli.add_argument("input", type=Path, help="输入的 Markdown 文件路径")
    cli.add_argument(
        "--format",
        choices=["docx", "pdf"],
        default="docx",
        help="输出格式 (默认: docx)",
    )
    options = cli.parse_args()
    source_path = options.input.resolve()
    export(source_path, options.format)
if __name__ == "__main__":
    # Script entry point: set PUPPETEER_EXECUTABLE_PATH (if a browser can be
    # found) before running the CLI, so mmdc child processes inherit it.
    _ensure_chrome_env()
    main()