225 lines
6.6 KiB
Python
Executable File
225 lines
6.6 KiB
Python
Executable File
"""
|
||
Render the Mermaid diagrams in a Markdown file to images, then export the
document to docx/pdf via Pandoc.

Usage:
    python scripts/md_export.py docs/论文.md --format docx
    python scripts/md_export.py docs/论文.md --format pdf
|
||
"""
|
||
|
||
import argparse
|
||
import os
|
||
import re
|
||
import shutil
|
||
import subprocess
|
||
import sys
|
||
import tempfile
|
||
from pathlib import Path
|
||
|
||
|
||
def _ensure_chrome_env() -> None:
|
||
"""确保 mmdc 能找到 Chrome/Chromium 可执行文件。"""
|
||
if os.environ.get("PUPPETEER_EXECUTABLE_PATH"):
|
||
return
|
||
|
||
# 优先检查系统 Chrome/Chromium
|
||
for candidate in (
|
||
"google-chrome",
|
||
"google-chrome-stable",
|
||
"chromium-browser",
|
||
"chromium",
|
||
):
|
||
path = shutil.which(candidate)
|
||
if path:
|
||
os.environ["PUPPETEER_EXECUTABLE_PATH"] = path
|
||
return
|
||
|
||
# 检查 Puppeteer 缓存目录
|
||
cache_dir = Path.home() / ".cache" / "puppeteer"
|
||
if cache_dir.exists():
|
||
for shell_dir in sorted(cache_dir.glob("chrome-headless-shell/linux-*")):
|
||
binary = (
|
||
shell_dir / "chrome-headless-shell-linux64" / "chrome-headless-shell"
|
||
)
|
||
if binary.exists():
|
||
os.environ["PUPPETEER_EXECUTABLE_PATH"] = str(binary)
|
||
return
|
||
|
||
for chrome_dir in sorted(cache_dir.glob("chrome/linux-*")):
|
||
binary = chrome_dir / "chrome-linux64" / "chrome"
|
||
if binary.exists():
|
||
os.environ["PUPPETEER_EXECUTABLE_PATH"] = str(binary)
|
||
return
|
||
|
||
|
||
def find_project_root() -> Path:
    """Locate the project root by walking upward from this script.

    Returns the nearest directory (starting with the script's own directory)
    that contains ``AGENTS.md`` or ``CLAUDE.md``. Falls back to the script's
    directory when no ancestor has either marker file.
    """
    script_dir = Path(__file__).resolve().parent
    marker_files = ("AGENTS.md", "CLAUDE.md")
    return next(
        (
            folder
            for folder in (script_dir, *script_dir.parents)
            if any((folder / marker).exists() for marker in marker_files)
        ),
        script_dir,
    )
|
||
|
||
|
||
def extract_mermaid_blocks(content: str) -> list[str]:
    """Return the body of every ```mermaid fenced block in *content*.

    Bodies are returned in document order, without the opening and closing
    fence lines. An empty list means no mermaid block was found.
    """
    fence = re.compile(r"```mermaid\n(.*?)\n```", flags=re.DOTALL)
    return [hit.group(1) for hit in fence.finditer(content)]
|
||
|
||
|
||
def render_mermaid_to_png(
    mermaid_code: str,
    output_path: Path,
    mmd_path: Path,
) -> Path:
    """Render mermaid source to a PNG with mmdc, keeping the .mmd source file.

    Args:
        mermaid_code: Mermaid diagram source text.
        output_path: Destination of the rendered PNG.
        mmd_path: Where the diagram source is written (UTF-8); this file is
            passed to mmdc as input and is left on disk afterwards.

    Returns:
        ``output_path``, unchanged.

    Raises:
        subprocess.CalledProcessError: mmdc exited with a non-zero status.
            Its captured stderr is echoed to this process's stderr first so
            the failure is diagnosable.
        FileNotFoundError: the ``mmdc`` executable is not available.
    """
    mmd_path.write_text(mermaid_code, encoding="utf-8")
    command = ["mmdc", "-i", str(mmd_path), "-o", str(output_path), "-b", "white"]
    try:
        subprocess.run(command, check=True, capture_output=True)
    except subprocess.CalledProcessError as err:
        # The output is captured, so without this the user would only see
        # "returned non-zero exit status" and never mmdc's own message.
        detail = (err.stderr or b"").decode("utf-8", errors="replace").strip()
        if detail:
            print(detail, file=sys.stderr)
        raise
    return output_path
|
||
|
||
|
||
def replace_mermaid_with_images(
    content: str,
    image_dir: Path,
    file_stem: str,
) -> tuple[str, list[Path]]:
    """Render every mermaid block in *content* and swap it for an image link.

    Each ```mermaid fenced block is rendered to
    ``<image_dir>/<file_stem>-mermaid-<n>.png`` (``n`` counts from 1 in
    document order; a matching ``.mmd`` source file is written next to it),
    and the block is replaced by a Markdown image reference.

    Args:
        content: Markdown text to process.
        image_dir: Existing directory that receives the PNG/.mmd files.
        file_stem: Prefix used for the generated file names.

    Returns:
        ``(new_content, image_paths)``: the rewritten Markdown and the PNG
        paths in document order. When *content* has no mermaid block it is
        returned unchanged together with an empty list.
    """
    image_paths: list[Path] = []

    def _replace(match: re.Match[str]) -> str:
        index = len(image_paths) + 1
        png_path = image_dir / f"{file_stem}-mermaid-{index}.png"
        mmd_path = image_dir / f"{file_stem}-mermaid-{index}.mmd"

        render_mermaid_to_png(match.group(1), png_path, mmd_path)
        image_paths.append(png_path)
        # Reference the image relative to image_dir's parent; this assumes the
        # caller uses that parent as the Pandoc resource path. Alt text is
        # left empty on purpose so Pandoc does not add a figure caption.
        return f"![]({image_dir.name}/{png_path.name})"

    result = re.sub(r"```mermaid\n(.*?)\n```", _replace, content, flags=re.DOTALL)
    return result, image_paths
|
||
|
||
|
||
def get_resource_path(input_file: Path) -> Path:
    """Return the Pandoc resource path: the directory containing *input_file*."""
    containing_dir = input_file.parent
    return containing_dir
|
||
|
||
|
||
def build_pandoc_command(
    input_md: Path,
    output_file: Path,
    fmt: str,
    resource_path: Path,
) -> list[str]:
    """Assemble the pandoc argument list for one conversion.

    Args:
        input_md: Markdown file handed to pandoc.
        output_file: Path passed to ``-o``.
        fmt: Output format; only ``"pdf"`` adds extra options (XeLaTeX
            engine, CJK main font, page margins).
        resource_path: Value for ``--resource-path``.

    Returns:
        The argv list, starting with ``"pandoc"``.
    """
    base_args = [
        "pandoc",
        str(input_md),
        "-o",
        str(output_file),
        "--resource-path",
        str(resource_path),
    ]
    if fmt != "pdf":
        return base_args

    pdf_args = [
        "--pdf-engine=xelatex",
        "-V",
        "CJKmainfont=Noto Sans CJK SC",
        "-V",
        "geometry:margin=2.5cm",
    ]
    return base_args + pdf_args
|
||
|
||
|
||
def export(input_file: Path, fmt: str) -> Path:
    """Run the full export pipeline for one Markdown file.

    Steps: check that the input and the ``mmdc``/``pandoc`` tools exist,
    render any mermaid blocks to ``<input dir>/images``, then call pandoc.
    When diagrams are present, pandoc reads a temporary copy of the Markdown
    (created next to the input and always removed afterwards) in which the
    mermaid blocks have been replaced.

    Args:
        input_file: Markdown file to export.
        fmt: Output format; used as the output suffix and passed to
            ``build_pandoc_command``.

    Returns:
        Path of the exported file (*input_file* with its suffix replaced).

    Raises:
        SystemExit: exit code 1 when the input file, ``mmdc`` or ``pandoc``
            is missing.
        subprocess.CalledProcessError: a tool exited with a non-zero status.
    """
    if not input_file.exists():
        print(f"错误: 文件不存在 {input_file}", file=sys.stderr)
        sys.exit(1)

    # Both tools are required up front, checked in this order.
    required_tools = (
        ("mmdc", "错误: 未找到 mmdc,请先安装 @mermaid-js/mermaid-cli"),
        ("pandoc", "错误: 未找到 pandoc,请先安装 pandoc"),
    )
    for tool, complaint in required_tools:
        if not shutil.which(tool):
            print(complaint, file=sys.stderr)
            sys.exit(1)

    markdown = input_file.read_text(encoding="utf-8")
    target = input_file.with_suffix(f".{fmt}")
    resources = get_resource_path(input_file)

    if not extract_mermaid_blocks(markdown):
        print("未发现 mermaid 图表,直接导出...")
        subprocess.run(
            build_pandoc_command(input_file, target, fmt, resources),
            check=True,
        )
        print(f"导出完成: {target}")
        return target

    # Rendered diagrams live in an "images" folder beside the input file.
    diagrams_dir = input_file.parent / "images"
    diagrams_dir.mkdir(exist_ok=True)
    rewritten, rendered = replace_mermaid_with_images(
        markdown,
        diagrams_dir,
        input_file.stem,
    )

    # delete=False: the file must outlive this `with` so pandoc can read it;
    # it is removed in the `finally` below.
    with tempfile.NamedTemporaryFile(
        mode="w",
        suffix=".md",
        encoding="utf-8",
        dir=input_file.parent,
        delete=False,
    ) as handle:
        handle.write(rewritten)
        scratch_md = Path(handle.name)

    try:
        subprocess.run(
            build_pandoc_command(scratch_md, target, fmt, resources),
            check=True,
        )
        print(f"导出完成: {target}")
        print(f"渲染了 {len(rendered)} 张 mermaid 图片:")
        for image in rendered:
            print(f" - {image}")
        return target
    finally:
        scratch_md.unlink(missing_ok=True)
|
||
|
||
|
||
def main() -> None:
    """Command-line entry point: parse arguments and export the given file.

    Accepts a positional Markdown path and an optional ``--format``
    (``docx`` or ``pdf``, default ``docx``). The path is resolved to an
    absolute path before being handed to ``export``.
    """
    cli = argparse.ArgumentParser(
        description="将 Markdown 中的 Mermaid 图表渲染为图片后导出为 docx/pdf",
    )
    cli.add_argument("input", type=Path, help="输入的 Markdown 文件路径")
    cli.add_argument(
        "--format",
        choices=["docx", "pdf"],
        default="docx",
        help="输出格式 (默认: docx)",
    )
    options = cli.parse_args()
    source_markdown = options.input.resolve()
    export(source_markdown, options.format)
|
||
|
||
|
||
if __name__ == "__main__":
    # Set PUPPETEER_EXECUTABLE_PATH (when a browser can be found) before the
    # export runs, since the export shells out to mmdc.
    _ensure_chrome_env()
    main()
|