From 442a581358ec9a1f669e7e6b26806831cc9bbfd0 Mon Sep 17 00:00:00 2001 From: Quratulain-bilal Date: Tue, 2 Jun 2026 17:19:11 +0500 Subject: [PATCH] fix(cli): pin UTF-8 encoding on init-options and .extensionignore I/O (#2686) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix(cli): pin UTF-8 encoding on init-options and .extensionignore I/O ``Path.read_text`` / ``Path.write_text`` default to the system locale codec, which is cp1252 / gb2312 / cp932 on Windows. Two user-facing file paths in spec-kit were calling them without an explicit ``encoding=`` argument: - ``src/specify_cli/__init__.py:400,412`` — ``save_init_options`` / ``load_init_options`` for ``.specify/init-options.json``. A peer machine with a different default locale (or a UTF-8 Unix CI runner reading a file written on a cp1252 Windows host) cannot decode the file, raising ``UnicodeDecodeError``. ``UnicodeDecodeError`` is a subclass of ``ValueError`` — not ``OSError`` / ``json.JSONDecodeError`` — so the existing fall-back ``except`` tuple in ``load_init_options`` also misses it and the error propagates raw to the CLI. - ``src/specify_cli/extensions.py:764`` — ``.extensionignore`` pattern reader. The very next line already normalises backslashes "so Windows-authored files work", proving the codebase expects Windows authors to write this file. Multibyte UTF-8 patterns (Chinese filenames, accented directory names) silently mojibake when the host locale is not UTF-8, so the patterns fail to match and unintended files are shipped with the extension. The sibling integration-catalog reader at ``src/specify_cli/integrations/catalog.py:150,156,193,202,374`` already pins ``encoding="utf-8"`` everywhere. PR #2280 fixed the symmetric PowerShell-template BOM bug. This change brings the two remaining drifted paths in line with that precedent. Regression tests: - ``tests/test_presets.py::TestInitOptions`` — parametrized non-ASCII round-trip (CJK, Latin-1, Greek, emoji) plus a corrupted-file case that asserts the existing "fall back to {}" contract still holds when a peer file contains bytes invalid as UTF-8. - ``tests/test_extensions.py::TestExtensionIgnore`` — Japanese (``ドキュメント/``) and Latin-1 (``café/``) ignore patterns correctly exclude their directories during install. * fix(cli): wrap .extensionignore decode error and tighten UTF-8 contract Addresses Copilot review feedback on this PR. Three issues, three fixes: 1. ``save_init_options`` now writes JSON with ``ensure_ascii=False``. Without that flag, ``json.dumps`` emits ASCII-only ``\uXXXX`` escapes, which means the ``encoding="utf-8"`` pin on the surrounding ``Path.write_text`` makes no observable difference for any value we currently write. Flipping ``ensure_ascii`` makes the non-ASCII bytes hit the file directly, so the encoding pin becomes the thing that decides between cp1252 garbage and clean UTF-8 on Windows. The comment above the call now describes the real reason instead of the previously-misleading rationale Copilot flagged. 2. ``test_save_load_round_trip_preserves_non_ascii`` was a no-op under the old ``ensure_ascii=True`` writer (Copilot's second comment). Added ``test_save_writes_real_utf8_bytes`` that asserts the on-disk bytes contain the UTF-8 encoding of ``café`` (``0xC3 0xA9``), not its JSON escape form ``é``. Removing either ``ensure_ascii=False`` or ``encoding="utf-8"`` from the writer now breaks this test — the contract is pinned. 3. ``.extensionignore`` reader wraps ``UnicodeDecodeError`` as ``ValidationError`` with a pointer to the offending byte (Copilot's third comment). Mirrors ``ExtensionManifest._load_yaml``'s existing handler for ``extension.yml``. Adds ``test_extensionignore_invalid_utf8_raises_validation_error`` asserting installation aborts with the wrapped error instead of a raw Python traceback. --- src/specify_cli/__init__.py | 36 ++++++++++++++-- src/specify_cli/extensions.py | 23 +++++++++- tests/test_extensions.py | 75 ++++++++++++++++++++++++++++++++- tests/test_presets.py | 79 +++++++++++++++++++++++++++++++++++ 4 files changed, 207 insertions(+), 6 deletions(-) diff --git a/src/specify_cli/__init__.py b/src/specify_cli/__init__.py index 4bd23622d..4ab4595d3 100644 --- a/src/specify_cli/__init__.py +++ b/src/specify_cli/__init__.py @@ -287,7 +287,28 @@ def save_init_options(project_path: Path, options: dict[str, Any]) -> None: """ dest = project_path / INIT_OPTIONS_FILE dest.parent.mkdir(parents=True, exist_ok=True) - dest.write_text(json.dumps(options, indent=2, sort_keys=True)) + # Write JSON as real UTF-8 instead of ``\uXXXX`` escape sequences + # (``ensure_ascii=False``) and pin the file encoding to match. + # + # The default ``json.dumps`` output is ASCII-only — any non-ASCII + # character is encoded as a ``\uXXXX`` escape — so without the + # ``ensure_ascii=False`` flip below the encoding pin alone would be + # a no-op for any payload we plausibly write today. We pair the two + # so the on-disk bytes match a human's expectation of "this file is + # UTF-8" (greppable, readable in editors that don't decode JSON + # escapes, friendly to peers running ``cat`` or ``Get-Content``) and + # so the encoding pin is a real contract instead of a future hedge. + # + # ``Path.write_text`` without ``encoding=`` falls back to the system + # locale codec (cp1252 / gb2312 / cp932 on Windows), which would + # mis-encode non-ASCII bytes locally and produce a file a peer with + # a different locale couldn't decode. The sibling integration- + # catalog writer in ``integrations/catalog.py`` pins + # ``encoding="utf-8"`` for the same reason. + dest.write_text( + json.dumps(options, indent=2, sort_keys=True, ensure_ascii=False), + encoding="utf-8", + ) def load_init_options(project_path: Path) -> dict[str, Any]: @@ -299,8 +320,17 @@ def load_init_options(project_path: Path) -> dict[str, Any]: if not path.exists(): return {} try: - return json.loads(path.read_text()) - except (json.JSONDecodeError, OSError): + # Match the explicit UTF-8 used by ``save_init_options``; without + # it ``read_text`` falls back to the system codec on Windows and + # raises ``UnicodeDecodeError`` on any file containing the + # multi-byte UTF-8 sequences ``save_init_options`` now writes + # directly. ``UnicodeDecodeError`` is a subclass of + # ``ValueError``, not ``OSError`` / ``json.JSONDecodeError``, so + # it must be listed explicitly here to preserve the existing + # "fall back to empty dict" contract for corrupted / foreign- + # codec files. + return json.loads(path.read_text(encoding="utf-8")) + except (json.JSONDecodeError, OSError, UnicodeDecodeError): return {} diff --git a/src/specify_cli/extensions.py b/src/specify_cli/extensions.py index d36f3d5a5..09191986d 100644 --- a/src/specify_cli/extensions.py +++ b/src/specify_cli/extensions.py @@ -761,7 +761,28 @@ class ExtensionManager: if not ignore_file.exists(): return None - lines: List[str] = ignore_file.read_text().splitlines() + # Pin UTF-8 explicitly: ``Path.read_text`` defaults to the system + # locale codec on Windows (cp1252 / gb2312 / cp932), which silently + # corrupts multibyte patterns when the file is shared across + # machines with different locales. The next line already + # normalises backslashes "so Windows-authored files work" — the + # codebase already expects Windows authors to write this file. + # + # A file that is not valid UTF-8 is a user-authoring mistake, so + # surface it as ``ValidationError`` with a pointer to the offending + # byte — the same pattern ``ExtensionManifest._load_yaml`` uses + # for ``extension.yml`` (see ``UnicodeDecodeError`` handler in + # this module). Without the wrap, the raw ``UnicodeDecodeError`` + # would abort installation with a Python traceback instead of a + # clear message naming the file. + try: + raw = ignore_file.read_text(encoding="utf-8") + except UnicodeDecodeError as e: + raise ValidationError( + f".extensionignore is not valid UTF-8: {ignore_file} " + f"({e.reason} at byte {e.start})" + ) + lines: List[str] = raw.splitlines() # Normalise backslashes in patterns so Windows-authored files work normalised: List[str] = [] diff --git a/tests/test_extensions.py b/tests/test_extensions.py index cb6c3081a..d59c7a7b4 100644 --- a/tests/test_extensions.py +++ b/tests/test_extensions.py @@ -3358,9 +3358,13 @@ class TestExtensionIgnore: else: p.write_text(content) - # Write .extensionignore + # Write .extensionignore. Pinned to UTF-8 so non-ASCII patterns + # in tests (see ``test_extensionignore_utf8_patterns``) survive + # the round-trip on Windows runners with non-UTF-8 default locales. if ignore_content is not None: - (ext_dir / ".extensionignore").write_text(ignore_content) + (ext_dir / ".extensionignore").write_text( + ignore_content, encoding="utf-8" + ) return ext_dir @@ -3590,6 +3594,73 @@ class TestExtensionIgnore: assert (dest / "docs" / "guide.md").exists() assert not (dest / "docs" / "internal" / "draft.md").exists() + def test_extensionignore_utf8_patterns(self, temp_dir, valid_manifest_data): + """Non-ASCII patterns in .extensionignore work on every locale. + + ``Path.read_text`` defaults to the system locale codec on Windows + (cp1252 / gb2312 / cp932). Without an explicit ``encoding="utf-8"``, + a pattern like ``ドキュメント/`` written by a UTF-8 host becomes + mojibake on a cp1252 host and silently fails to match — leaking + files the author intended to exclude. The existing + ``test_extensionignore_windows_backslash_patterns`` already shows + the codebase treats this as a Windows-author-friendly file; UTF-8 + is part of that same contract. + """ + ext_dir = self._make_extension( + temp_dir, + valid_manifest_data, + extra_files={ + "ドキュメント/private.md": "secret", + "ドキュメント/public.md": "public", + "docs/guide.md": "# Guide", + "café/résumé.txt": "draft", + }, + ignore_content="ドキュメント/\ncafé/\n", + ) + + proj_dir = temp_dir / "project" + proj_dir.mkdir() + (proj_dir / ".specify").mkdir() + + manager = ExtensionManager(proj_dir) + manager.install_from_directory(ext_dir, "0.1.0", register_commands=False) + + dest = proj_dir / ".specify" / "extensions" / "test-ext" + # Multibyte patterns excluded. + assert not (dest / "ドキュメント").exists() + assert not (dest / "café").exists() + # ASCII path with no matching pattern is unaffected. + assert (dest / "docs" / "guide.md").exists() + + def test_extensionignore_invalid_utf8_raises_validation_error( + self, temp_dir, valid_manifest_data + ): + """A non-UTF-8 ``.extensionignore`` surfaces as ``ValidationError``. + + Pinning ``encoding="utf-8"`` on the reader means an + ``.extensionignore`` written in some other codec (cp1252, etc.) + now triggers ``UnicodeDecodeError`` instead of silently + mojibake-ing patterns. Wrap that exception as ``ValidationError`` + with a pointer to the offending byte — the same pattern + ``ExtensionManifest._load_yaml`` uses for ``extension.yml`` — + so installation aborts with a user-friendly message instead of a + raw Python traceback. + """ + ext_dir = self._make_extension(temp_dir, valid_manifest_data) + # Write an .extensionignore whose bytes are not valid UTF-8. + # 0xE9 is 'é' in cp1252 but an invalid lead byte in UTF-8. + (ext_dir / ".extensionignore").write_bytes(b"caf\xe9/\n") + + proj_dir = temp_dir / "project" + proj_dir.mkdir() + (proj_dir / ".specify").mkdir() + + manager = ExtensionManager(proj_dir) + with pytest.raises( + ValidationError, match=r"\.extensionignore is not valid UTF-8" + ): + manager.install_from_directory(ext_dir, "0.1.0", register_commands=False) + def test_extensionignore_star_does_not_cross_directories(self, temp_dir, valid_manifest_data): """'*' should NOT match across directory boundaries (gitignore semantics).""" ext_dir = self._make_extension( diff --git a/tests/test_presets.py b/tests/test_presets.py index 29c1a9e5e..de11adbbb 100644 --- a/tests/test_presets.py +++ b/tests/test_presets.py @@ -2269,6 +2269,85 @@ class TestInitOptions: assert load_init_options(project_dir) == {} + @pytest.mark.parametrize( + "value", + ["名前-プロジェクト", "café-résumé", "Ωmega-Δelta", "🚀-launch"], + ) + def test_save_load_round_trip_preserves_non_ascii(self, project_dir, value): + """Non-ASCII values round-trip via explicit UTF-8 encoding. + + ``Path.write_text`` / ``Path.read_text`` default to the system + locale codec on Windows (cp1252 / gb2312 / cp932). Without + ``encoding="utf-8"`` pinned on both ends, a project name like + ``café`` written on a UTF-8 host becomes garbled or unreadable on + a cp1252 host (and vice versa). Pin UTF-8 explicitly so init + options round-trip across machines and CI. + + Note: this test only meaningfully exercises the encoding pin + because ``save_init_options`` now writes JSON with + ``ensure_ascii=False`` — otherwise ``json.dumps`` would output + ASCII-only ``\\uXXXX`` escapes and the encoding pin would be a + no-op for any value here. ``test_save_writes_real_utf8_bytes`` + below asserts that contract directly. + """ + from specify_cli import save_init_options, load_init_options + + save_init_options(project_dir, {"ai": "claude", "project_name": value}) + + loaded = load_init_options(project_dir) + assert loaded["project_name"] == value + + def test_save_writes_real_utf8_bytes(self, project_dir): + """The on-disk file contains real UTF-8 bytes, not ``\\uXXXX`` escapes. + + Pinning ``encoding="utf-8"`` on ``write_text`` only makes a + difference when the serialiser actually emits non-ASCII + characters. With ``ensure_ascii=False`` on ``json.dumps`` the + non-ASCII bytes hit the file, so the encoding pin is the thing + that decides between cp1252 garbage and clean UTF-8 on Windows. + + This test pins that behaviour: the on-disk bytes are valid UTF-8 + and contain the multi-byte encoding of ``café``, not its + ``\\u00e9`` escape form. Reviewers can verify that removing + ``ensure_ascii=False`` or ``encoding="utf-8"`` from the writer + breaks this test, which is what Copilot's review pointed out the + original round-trip test failed to do. + """ + from specify_cli import save_init_options + + save_init_options(project_dir, {"project_name": "café"}) + + opts_file = project_dir / ".specify" / "init-options.json" + raw = opts_file.read_bytes() + # 'café' in UTF-8 ends with bytes 0xC3 0xA9 ('é'). The cp1252 + # encoding of 'é' is the single byte 0xE9. The JSON-escape form + # would be the 6-byte literal '\\u00e9'. We assert the UTF-8 form + # is present so the test pins the actual contract. + assert b"caf\xc3\xa9" in raw, ( + "Expected UTF-8 bytes for 'café' in the on-disk file, " + f"got: {raw!r}" + ) + # And the whole file decodes cleanly as UTF-8. + raw.decode("utf-8") + + def test_load_returns_empty_on_locale_corrupted_file(self, project_dir): + """A file written in a non-UTF-8 codec falls back to {}, not crash. + + Simulates a file produced by an old client (or by a peer machine + with a different default locale) that contains bytes invalid as + UTF-8. ``load_init_options`` should fall back to ``{}`` per the + existing contract — never propagate a raw ``UnicodeDecodeError`` + to the CLI surface. + """ + from specify_cli import load_init_options + + opts_file = project_dir / ".specify" / "init-options.json" + opts_file.parent.mkdir(parents=True, exist_ok=True) + # 0xE9 is 'é' in cp1252 but an invalid lead byte in UTF-8. + opts_file.write_bytes(b'{"project_name": "caf\xe9"}') + + assert load_init_options(project_dir) == {} + class TestPresetSkills: """Tests for preset skill registration and unregistration.