From 442a581358ec9a1f669e7e6b26806831cc9bbfd0 Mon Sep 17 00:00:00 2001
From: Quratulain-bilal <umayaimanshah@gmail.com>
Date: Tue, 2 Jun 2026 17:19:11 +0500
Subject: [PATCH] fix(cli): pin UTF-8 encoding on init-options and
 .extensionignore I/O (#2686)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* fix(cli): pin UTF-8 encoding on init-options and .extensionignore I/O

``Path.read_text`` / ``Path.write_text`` default to the system locale
codec, which is cp1252 / gb2312 / cp932 on Windows. Two user-facing
file paths in spec-kit were calling them without an explicit
``encoding=`` argument:

  - ``src/specify_cli/__init__.py:400,412`` —
    ``save_init_options`` / ``load_init_options`` for
    ``.specify/init-options.json``. A peer machine with a different
    default locale (or a UTF-8 Unix CI runner reading a file written on
    a cp1252 Windows host) cannot decode the file, raising
    ``UnicodeDecodeError``. ``UnicodeDecodeError`` is a subclass of
    ``ValueError`` — not ``OSError`` / ``json.JSONDecodeError`` — so
    the existing fall-back ``except`` tuple in ``load_init_options``
    also misses it and the error propagates raw to the CLI.

  - ``src/specify_cli/extensions.py:764`` — ``.extensionignore``
    pattern reader. The very next line already normalises
    backslashes "so Windows-authored files work", proving the codebase
    expects Windows authors to write this file. Multibyte UTF-8
    patterns (Chinese filenames, accented directory names) silently
    mojibake when the host locale is not UTF-8, so the patterns fail
    to match and unintended files are shipped with the extension.

The sibling integration-catalog reader at
``src/specify_cli/integrations/catalog.py:150,156,193,202,374``
already pins ``encoding="utf-8"`` everywhere. PR #2280 fixed the
symmetric PowerShell-template BOM bug. This change brings the two
remaining drifted paths in line with that precedent.

Regression tests:

  - ``tests/test_presets.py::TestInitOptions`` — parametrized non-ASCII
    round-trip (CJK, Latin-1, Greek, emoji) plus a corrupted-file case
    that asserts the existing "fall back to {}" contract still holds
    when a peer file contains bytes invalid as UTF-8.
  - ``tests/test_extensions.py::TestExtensionIgnore`` — Japanese
    (``ドキュメント/``) and Latin-1 (``café/``) ignore patterns
    correctly exclude their directories during install.

* fix(cli): wrap .extensionignore decode error and tighten UTF-8 contract

Addresses Copilot review feedback on this PR.

Three issues, three fixes:

1. ``save_init_options`` now writes JSON with ``ensure_ascii=False``.
   Without that flag, ``json.dumps`` emits ASCII-only ``\uXXXX``
   escapes, which means the ``encoding="utf-8"`` pin on the
   surrounding ``Path.write_text`` makes no observable difference for
   any value we currently write. Flipping ``ensure_ascii`` makes the
   non-ASCII bytes hit the file directly, so the encoding pin becomes
   the thing that decides between cp1252 garbage and clean UTF-8 on
   Windows. The comment above the call now describes the real reason
   instead of the previously-misleading rationale Copilot flagged.

2. ``test_save_load_round_trip_preserves_non_ascii`` was a no-op under
   the old ``ensure_ascii=True`` writer (Copilot's second comment).
   Added ``test_save_writes_real_utf8_bytes`` that asserts the on-disk
   bytes contain the UTF-8 encoding of ``café`` (``0xC3 0xA9``), not
   its JSON escape form ``é``. Removing either
   ``ensure_ascii=False`` or ``encoding="utf-8"`` from the writer now
   breaks this test — the contract is pinned.

3. ``.extensionignore`` reader wraps ``UnicodeDecodeError`` as
   ``ValidationError`` with a pointer to the offending byte
   (Copilot's third comment). Mirrors
   ``ExtensionManifest._load_yaml``'s existing handler for
   ``extension.yml``. Adds
   ``test_extensionignore_invalid_utf8_raises_validation_error``
   asserting installation aborts with the wrapped error instead of a
   raw Python traceback.
---
 src/specify_cli/__init__.py   | 36 ++++++++++++++--
 src/specify_cli/extensions.py | 23 +++++++++-
 tests/test_extensions.py      | 75 ++++++++++++++++++++++++++++++++-
 tests/test_presets.py         | 79 +++++++++++++++++++++++++++++++++++
 4 files changed, 207 insertions(+), 6 deletions(-)

diff --git a/src/specify_cli/__init__.py b/src/specify_cli/__init__.py
index 4bd23622d..4ab4595d3 100644
--- a/src/specify_cli/__init__.py
+++ b/src/specify_cli/__init__.py
@@ -287,7 +287,28 @@ def save_init_options(project_path: Path, options: dict[str, Any]) -> None:
     """
     dest = project_path / INIT_OPTIONS_FILE
     dest.parent.mkdir(parents=True, exist_ok=True)
-    dest.write_text(json.dumps(options, indent=2, sort_keys=True))
+    # Write JSON as real UTF-8 instead of ``\uXXXX`` escape sequences
+    # (``ensure_ascii=False``) and pin the file encoding to match.
+    #
+    # The default ``json.dumps`` output is ASCII-only — any non-ASCII
+    # character is encoded as a ``\uXXXX`` escape — so without the
+    # ``ensure_ascii=False`` flip below the encoding pin alone would be
+    # a no-op for any payload we plausibly write today. We pair the two
+    # so the on-disk bytes match a human's expectation of "this file is
+    # UTF-8" (greppable, readable in editors that don't decode JSON
+    # escapes, friendly to peers running ``cat`` or ``Get-Content``) and
+    # so the encoding pin is a real contract instead of a future hedge.
+    #
+    # ``Path.write_text`` without ``encoding=`` falls back to the system
+    # locale codec (cp1252 / gb2312 / cp932 on Windows), which would
+    # mis-encode non-ASCII bytes locally and produce a file a peer with
+    # a different locale couldn't decode. The sibling integration-
+    # catalog writer in ``integrations/catalog.py`` pins
+    # ``encoding="utf-8"`` for the same reason.
+    dest.write_text(
+        json.dumps(options, indent=2, sort_keys=True, ensure_ascii=False),
+        encoding="utf-8",
+    )
 
 
 def load_init_options(project_path: Path) -> dict[str, Any]:
@@ -299,8 +320,17 @@ def load_init_options(project_path: Path) -> dict[str, Any]:
     if not path.exists():
         return {}
     try:
-        return json.loads(path.read_text())
-    except (json.JSONDecodeError, OSError):
+        # Match the explicit UTF-8 used by ``save_init_options``; without
+        # it ``read_text`` falls back to the system codec on Windows and
+        # raises ``UnicodeDecodeError`` on any file containing the
+        # multi-byte UTF-8 sequences ``save_init_options`` now writes
+        # directly. ``UnicodeDecodeError`` is a subclass of
+        # ``ValueError``, not ``OSError`` / ``json.JSONDecodeError``, so
+        # it must be listed explicitly here to preserve the existing
+        # "fall back to empty dict" contract for corrupted / foreign-
+        # codec files.
+        return json.loads(path.read_text(encoding="utf-8"))
+    except (json.JSONDecodeError, OSError, UnicodeDecodeError):
         return {}
 
 
diff --git a/src/specify_cli/extensions.py b/src/specify_cli/extensions.py
index d36f3d5a5..09191986d 100644
--- a/src/specify_cli/extensions.py
+++ b/src/specify_cli/extensions.py
@@ -761,7 +761,28 @@ class ExtensionManager:
         if not ignore_file.exists():
             return None
 
-        lines: List[str] = ignore_file.read_text().splitlines()
+        # Pin UTF-8 explicitly: ``Path.read_text`` defaults to the system
+        # locale codec on Windows (cp1252 / gb2312 / cp932), which silently
+        # corrupts multibyte patterns when the file is shared across
+        # machines with different locales. The next line already
+        # normalises backslashes "so Windows-authored files work" — the
+        # codebase already expects Windows authors to write this file.
+        #
+        # A file that is not valid UTF-8 is a user-authoring mistake, so
+        # surface it as ``ValidationError`` with a pointer to the offending
+        # byte — the same pattern ``ExtensionManifest._load_yaml`` uses
+        # for ``extension.yml`` (see ``UnicodeDecodeError`` handler in
+        # this module). Without the wrap, the raw ``UnicodeDecodeError``
+        # would abort installation with a Python traceback instead of a
+        # clear message naming the file.
+        try:
+            raw = ignore_file.read_text(encoding="utf-8")
+        except UnicodeDecodeError as e:
+            raise ValidationError(
+                f".extensionignore is not valid UTF-8: {ignore_file} "
+                f"({e.reason} at byte {e.start})"
+            )
+        lines: List[str] = raw.splitlines()
 
         # Normalise backslashes in patterns so Windows-authored files work
         normalised: List[str] = []
diff --git a/tests/test_extensions.py b/tests/test_extensions.py
index cb6c3081a..d59c7a7b4 100644
--- a/tests/test_extensions.py
+++ b/tests/test_extensions.py
@@ -3358,9 +3358,13 @@ class TestExtensionIgnore:
                 else:
                     p.write_text(content)
 
-        # Write .extensionignore
+        # Write .extensionignore. Pinned to UTF-8 so non-ASCII patterns
+        # in tests (see ``test_extensionignore_utf8_patterns``) survive
+        # the round-trip on Windows runners with non-UTF-8 default locales.
         if ignore_content is not None:
-            (ext_dir / ".extensionignore").write_text(ignore_content)
+            (ext_dir / ".extensionignore").write_text(
+                ignore_content, encoding="utf-8"
+            )
 
         return ext_dir
 
@@ -3590,6 +3594,73 @@ class TestExtensionIgnore:
         assert (dest / "docs" / "guide.md").exists()
         assert not (dest / "docs" / "internal" / "draft.md").exists()
 
+    def test_extensionignore_utf8_patterns(self, temp_dir, valid_manifest_data):
+        """Non-ASCII patterns in .extensionignore work on every locale.
+
+        ``Path.read_text`` defaults to the system locale codec on Windows
+        (cp1252 / gb2312 / cp932). Without an explicit ``encoding="utf-8"``,
+        a pattern like ``ドキュメント/`` written by a UTF-8 host becomes
+        mojibake on a cp1252 host and silently fails to match — leaking
+        files the author intended to exclude. The existing
+        ``test_extensionignore_windows_backslash_patterns`` already shows
+        the codebase treats this as a Windows-author-friendly file; UTF-8
+        is part of that same contract.
+        """
+        ext_dir = self._make_extension(
+            temp_dir,
+            valid_manifest_data,
+            extra_files={
+                "ドキュメント/private.md": "secret",
+                "ドキュメント/public.md": "public",
+                "docs/guide.md": "# Guide",
+                "café/résumé.txt": "draft",
+            },
+            ignore_content="ドキュメント/\ncafé/\n",
+        )
+
+        proj_dir = temp_dir / "project"
+        proj_dir.mkdir()
+        (proj_dir / ".specify").mkdir()
+
+        manager = ExtensionManager(proj_dir)
+        manager.install_from_directory(ext_dir, "0.1.0", register_commands=False)
+
+        dest = proj_dir / ".specify" / "extensions" / "test-ext"
+        # Multibyte patterns excluded.
+        assert not (dest / "ドキュメント").exists()
+        assert not (dest / "café").exists()
+        # ASCII path with no matching pattern is unaffected.
+        assert (dest / "docs" / "guide.md").exists()
+
+    def test_extensionignore_invalid_utf8_raises_validation_error(
+        self, temp_dir, valid_manifest_data
+    ):
+        """A non-UTF-8 ``.extensionignore`` surfaces as ``ValidationError``.
+
+        Pinning ``encoding="utf-8"`` on the reader means an
+        ``.extensionignore`` written in some other codec (cp1252, etc.)
+        now triggers ``UnicodeDecodeError`` instead of silently
+        mojibake-ing patterns. Wrap that exception as ``ValidationError``
+        with a pointer to the offending byte — the same pattern
+        ``ExtensionManifest._load_yaml`` uses for ``extension.yml`` —
+        so installation aborts with a user-friendly message instead of a
+        raw Python traceback.
+        """
+        ext_dir = self._make_extension(temp_dir, valid_manifest_data)
+        # Write an .extensionignore whose bytes are not valid UTF-8.
+        # 0xE9 is 'é' in cp1252 but an invalid lead byte in UTF-8.
+        (ext_dir / ".extensionignore").write_bytes(b"caf\xe9/\n")
+
+        proj_dir = temp_dir / "project"
+        proj_dir.mkdir()
+        (proj_dir / ".specify").mkdir()
+
+        manager = ExtensionManager(proj_dir)
+        with pytest.raises(
+            ValidationError, match=r"\.extensionignore is not valid UTF-8"
+        ):
+            manager.install_from_directory(ext_dir, "0.1.0", register_commands=False)
+
     def test_extensionignore_star_does_not_cross_directories(self, temp_dir, valid_manifest_data):
         """'*' should NOT match across directory boundaries (gitignore semantics)."""
         ext_dir = self._make_extension(
diff --git a/tests/test_presets.py b/tests/test_presets.py
index 29c1a9e5e..de11adbbb 100644
--- a/tests/test_presets.py
+++ b/tests/test_presets.py
@@ -2269,6 +2269,85 @@ class TestInitOptions:
 
         assert load_init_options(project_dir) == {}
 
+    @pytest.mark.parametrize(
+        "value",
+        ["名前-プロジェクト", "café-résumé", "Ωmega-Δelta", "🚀-launch"],
+    )
+    def test_save_load_round_trip_preserves_non_ascii(self, project_dir, value):
+        """Non-ASCII values round-trip via explicit UTF-8 encoding.
+
+        ``Path.write_text`` / ``Path.read_text`` default to the system
+        locale codec on Windows (cp1252 / gb2312 / cp932). Without
+        ``encoding="utf-8"`` pinned on both ends, a project name like
+        ``café`` written on a UTF-8 host becomes garbled or unreadable on
+        a cp1252 host (and vice versa). Pin UTF-8 explicitly so init
+        options round-trip across machines and CI.
+
+        Note: this test only meaningfully exercises the encoding pin
+        because ``save_init_options`` now writes JSON with
+        ``ensure_ascii=False`` — otherwise ``json.dumps`` would output
+        ASCII-only ``\\uXXXX`` escapes and the encoding pin would be a
+        no-op for any value here. ``test_save_writes_real_utf8_bytes``
+        below asserts that contract directly.
+        """
+        from specify_cli import save_init_options, load_init_options
+
+        save_init_options(project_dir, {"ai": "claude", "project_name": value})
+
+        loaded = load_init_options(project_dir)
+        assert loaded["project_name"] == value
+
+    def test_save_writes_real_utf8_bytes(self, project_dir):
+        """The on-disk file contains real UTF-8 bytes, not ``\\uXXXX`` escapes.
+
+        Pinning ``encoding="utf-8"`` on ``write_text`` only makes a
+        difference when the serialiser actually emits non-ASCII
+        characters. With ``ensure_ascii=False`` on ``json.dumps`` the
+        non-ASCII bytes hit the file, so the encoding pin is the thing
+        that decides between cp1252 garbage and clean UTF-8 on Windows.
+
+        This test pins that behaviour: the on-disk bytes are valid UTF-8
+        and contain the multi-byte encoding of ``café``, not its
+        ``\\u00e9`` escape form. Reviewers can verify that removing
+        ``ensure_ascii=False`` or ``encoding="utf-8"`` from the writer
+        breaks this test, which is what Copilot's review pointed out the
+        original round-trip test failed to do.
+        """
+        from specify_cli import save_init_options
+
+        save_init_options(project_dir, {"project_name": "café"})
+
+        opts_file = project_dir / ".specify" / "init-options.json"
+        raw = opts_file.read_bytes()
+        # 'café' in UTF-8 ends with bytes 0xC3 0xA9 ('é'). The cp1252
+        # encoding of 'é' is the single byte 0xE9. The JSON-escape form
+        # would be the 6-byte literal '\\u00e9'. We assert the UTF-8 form
+        # is present so the test pins the actual contract.
+        assert b"caf\xc3\xa9" in raw, (
+            "Expected UTF-8 bytes for 'café' in the on-disk file, "
+            f"got: {raw!r}"
+        )
+        # And the whole file decodes cleanly as UTF-8.
+        raw.decode("utf-8")
+
+    def test_load_returns_empty_on_locale_corrupted_file(self, project_dir):
+        """A file written in a non-UTF-8 codec falls back to {}, not crash.
+
+        Simulates a file produced by an old client (or by a peer machine
+        with a different default locale) that contains bytes invalid as
+        UTF-8. ``load_init_options`` should fall back to ``{}`` per the
+        existing contract — never propagate a raw ``UnicodeDecodeError``
+        to the CLI surface.
+        """
+        from specify_cli import load_init_options
+
+        opts_file = project_dir / ".specify" / "init-options.json"
+        opts_file.parent.mkdir(parents=True, exist_ok=True)
+        # 0xE9 is 'é' in cp1252 but an invalid lead byte in UTF-8.
+        opts_file.write_bytes(b'{"project_name": "caf\xe9"}')
+
+        assert load_init_options(project_dir) == {}
+
 
 class TestPresetSkills:
     """Tests for preset skill registration and unregistration.