[stage1] fix: strip YAML frontmatter from TOML integration prompts (#2096)

* fix: correct toml integration frontmatter handling

* refactor: reuse frontmatter split in toml integration

* fix: preserve toml integration string semantics

* docs: align toml integration renderer docstring
This commit is contained in:
Hamilton Snow
2026-04-06 21:36:05 +08:00
committed by GitHub
parent 9c0be46006
commit 8b099585c7
2 changed files with 154 additions and 46 deletions

View File

@@ -532,23 +532,83 @@ class TomlIntegration(IntegrationBase):
def _extract_description(content: str) -> str:
    """Extract the ``description`` value from YAML frontmatter.

    The frontmatter is parsed as YAML so block scalar descriptions
    (``|`` and ``>``) keep their YAML semantics instead of being
    treated as raw text.

    Returns the description string, or an empty string when no
    frontmatter text is found, the frontmatter is not valid YAML, it
    does not parse to a mapping, or ``description`` is missing or is
    not a string.
    """
    import yaml

    frontmatter_text, _ = TomlIntegration._split_frontmatter(content)
    if not frontmatter_text:
        return ""

    try:
        # ``safe_load`` yields ``None`` for an empty document; normalise
        # that to an empty mapping so the lookup below stays uniform.
        frontmatter = yaml.safe_load(frontmatter_text) or {}
    except yaml.YAMLError:
        return ""

    if not isinstance(frontmatter, dict):
        return ""

    description = frontmatter.get("description", "")
    # Only plain strings are accepted; other YAML types yield "".
    return description if isinstance(description, str) else ""
@staticmethod
def _split_frontmatter(content: str) -> tuple[str, str]:
"""Split YAML frontmatter from the remaining content.
Returns ``("", content)`` when no complete frontmatter block is
present. The body is preserved exactly as written so prompt text
keeps its intended formatting.
"""
if not content.startswith("---"):
return "", content
lines = content.splitlines(keepends=True)
if not lines or lines[0].rstrip("\r\n") != "---":
return "", content
frontmatter_end = -1
for i, line in enumerate(lines[1:], start=1):
if line.rstrip("\r\n") == "---":
frontmatter_end = i
break
if frontmatter_end == -1:
return "", content
frontmatter = "".join(lines[1:frontmatter_end])
body = "".join(lines[frontmatter_end + 1 :])
return frontmatter, body
@staticmethod
def _render_toml_string(value: str) -> str:
"""Render *value* as a TOML string literal.
Uses a basic string for single-line values, multiline basic
strings for values containing newlines, and falls back to a
literal string or escaped basic string when delimiters appear in
the content.
"""
if "\n" not in value and "\r" not in value:
escaped = value.replace("\\", "\\\\").replace('"', '\\"')
return f'"{escaped}"'
escaped = value.replace("\\", "\\\\")
if '"""' not in escaped:
return '"""\n' + escaped + '"""'
if "'''" not in value:
return "'''\n" + value + "'''"
return '"' + (
value.replace("\\", "\\\\")
.replace('"', '\\"')
.replace("\n", "\\n")
.replace("\r", "\\r")
.replace("\t", "\\t")
) + '"'
@staticmethod
def _render_toml(description: str, body: str) -> str:
"""Render a TOML command file from description and body.
@@ -558,39 +618,19 @@ class TomlIntegration(IntegrationBase):
to multiline literal strings (``'''``) if the body contains
``\"\"\"``, then to an escaped basic string as a last resort.
The body is rstrip'd so the closing delimiter appears on the line
immediately after the last content line — matching the release
script's ``echo "$body"; echo '\"\"\"'`` pattern.
The body is ``rstrip("\\n")``'d before rendering, so the TOML
value preserves content without forcing a trailing newline. As a
result, multiline delimiters appear on their own line only when
the rendered value itself ends with a newline.
"""
toml_lines: list[str] = []
if description:
desc = description.replace('"', '\\"')
toml_lines.append(f'description = "{desc}"')
toml_lines.append(f"description = {TomlIntegration._render_toml_string(description)}")
toml_lines.append("")
body = body.rstrip("\n")
# Escape backslashes for basic multiline strings.
escaped = body.replace("\\", "\\\\")
if '"""' not in escaped:
toml_lines.append('prompt = """')
toml_lines.append(escaped)
toml_lines.append('"""')
elif "'''" not in body:
toml_lines.append("prompt = '''")
toml_lines.append(body)
toml_lines.append("'''")
else:
escaped_body = (
body.replace("\\", "\\\\")
.replace('"', '\\"')
.replace("\n", "\\n")
.replace("\r", "\\r")
.replace("\t", "\\t")
)
toml_lines.append(f'prompt = "{escaped_body}"')
toml_lines.append(f"prompt = {TomlIntegration._render_toml_string(body)}")
return "\n".join(toml_lines) + "\n"
@@ -630,7 +670,8 @@ class TomlIntegration(IntegrationBase):
raw = src_file.read_text(encoding="utf-8")
description = self._extract_description(raw)
processed = self.process_template(raw, self.key, script_type, arg_placeholder)
toml_content = self._render_toml(description, processed)
_, body = self._split_frontmatter(processed)
toml_content = self._render_toml(description, body)
dst_name = self.command_filename(src_file.stem)
dst_file = self.write_file_and_record(
toml_content, dest / dst_name, project_root, manifest

View File

@@ -9,6 +9,9 @@ adapted for TOML output format.
"""
import os
import tomllib
import pytest
from specify_cli.integrations import INTEGRATION_REGISTRY, get_integration
from specify_cli.integrations.base import TomlIntegration
@@ -132,13 +135,77 @@ class TomlIntegrationTests:
has_args = any("{{args}}" in f.read_text(encoding="utf-8") for f in cmd_files)
assert has_args, "No TOML command file contains {{args}} placeholder"
@pytest.mark.parametrize(
    ("frontmatter", "expected"),
    [
        # Literal block scalar, default chomping.
        (
            "---\ndescription: |\n First line\n Second line\n---\nBody\n",
            "First line\nSecond line\n",
        ),
        # Folded block scalar, default chomping.
        (
            "---\ndescription: >\n First line\n Second line\n---\nBody\n",
            "First line Second line\n",
        ),
        # Literal block scalar with the strip indicator.
        (
            "---\ndescription: |-\n First line\n Second line\n---\nBody\n",
            "First line\nSecond line",
        ),
        # Folded block scalar with the strip indicator.
        (
            "---\ndescription: >-\n First line\n Second line\n---\nBody\n",
            "First line Second line",
        ),
    ],
)
def test_toml_extract_description_supports_block_scalars(self, frontmatter, expected):
    """Block scalar descriptions must be extracted as the given expected text."""
    extracted = TomlIntegration._extract_description(frontmatter)
    assert extracted == expected
def test_split_frontmatter_ignores_indented_delimiters(self):
    """An indented ``---`` inside the frontmatter must not end the block."""
    source_lines = [
        "---\n",
        "description: |\n",
        " line one\n",
        " ---\n",
        " line two\n",
        "---\n",
        "Body\n",
    ]
    frontmatter, body = TomlIntegration._split_frontmatter("".join(source_lines))

    # Text after the indented delimiter still belongs to the frontmatter.
    assert "line two" in frontmatter
    assert body == "Body\n"
def test_toml_prompt_excludes_frontmatter(self, tmp_path, monkeypatch):
    """The generated prompt holds only the template body, never frontmatter."""
    integration = get_integration(self.KEY)

    template = tmp_path / "sample.md"
    template_text = "".join(
        [
            "---\n",
            "description: Summary line one\n",
            "scripts:\n",
            " sh: scripts/bash/example.sh\n",
            "---\n",
            "Body line one\n",
            "Body line two\n",
        ]
    )
    template.write_text(template_text, encoding="utf-8")

    # Restrict setup to the single handcrafted template.
    monkeypatch.setattr(integration, "list_command_templates", lambda: [template])
    manifest = IntegrationManifest(self.KEY, tmp_path)
    created = integration.setup(tmp_path, manifest)

    cmd_files = [path for path in created if "scripts" not in path.parts]
    assert len(cmd_files) == 1

    generated = cmd_files[0].read_text(encoding="utf-8")
    parsed = tomllib.loads(generated)
    assert parsed["description"] == "Summary line one"

    prompt = parsed["prompt"]
    assert prompt == "Body line one\nBody line two"
    for leaked in ("description:", "scripts:", "---"):
        assert leaked not in prompt
def test_toml_is_valid(self, tmp_path):
"""Every generated TOML file must parse without errors."""
try:
import tomllib
except ModuleNotFoundError:
import tomli as tomllib # type: ignore[no-redef]
i = get_integration(self.KEY)
m = IntegrationManifest(self.KEY, tmp_path)
created = i.setup(tmp_path, m)