fix(extensions): use explicit UTF-8 encoding when reading manifest YAML (#2370)

* fix(extensions): use explicit UTF-8 encoding when reading manifest YAML

On Windows, Python's open() defaults to the system locale encoding
(e.g., GBK on Chinese Windows), which causes UnicodeDecodeError when
extension.yml or preset.yml contains non-ASCII content such as Chinese
characters in description fields.

Add encoding='utf-8' to ExtensionManifest._load_yaml and
PresetManifest._load_yaml so manifests are read consistently across
platforms.

Fixes #2325

* test(extensions,presets): add UTF-8 manifest regression tests for #2325

Positive: extension.yml/preset.yml with non-ASCII (Chinese + emoji)
descriptions load correctly when written as UTF-8 bytes — fails on
Windows without explicit encoding='utf-8'.

Negative: files containing invalid UTF-8 bytes raise a clean error
(ValidationError or UnicodeDecodeError), not a silent crash.

* fix(extensions,presets): wrap I/O and decode errors as ValidationError

Address remaining Copilot concerns on #2370:

- Catch UnicodeDecodeError and OSError in both manifest loaders and
  re-raise as ValidationError / PresetValidationError so callers see a
  consistent error type, not a bare decode/IO traceback.
- Validate that PresetManifest YAML root is a mapping (extensions.py
  already had this; presets.py was missing it). Treat None as {} for
  empty-file compatibility.
- Tighten the negative regression tests to assert the specific message,
  and add a non-mapping-root test for PresetManifest matching the
  existing one for ExtensionManifest.
This commit is contained in:
Quratulain-bilal
2026-04-28 18:47:22 +05:00
committed by GitHub
parent 171b65ac33
commit 3a7f64c8a5
4 changed files with 83 additions and 3 deletions

View File

@@ -139,12 +139,18 @@ class ExtensionManifest:
def _load_yaml(self, path: Path) -> dict:
"""Load YAML file safely."""
try:
with open(path, 'r') as f:
with open(path, 'r', encoding='utf-8') as f:
data = yaml.safe_load(f)
except yaml.YAMLError as e:
raise ValidationError(f"Invalid YAML in {path}: {e}")
except FileNotFoundError:
raise ValidationError(f"Manifest not found: {path}")
except UnicodeDecodeError as e:
raise ValidationError(
f"Manifest is not valid UTF-8: {path} ({e.reason} at byte {e.start})"
)
except OSError as e:
raise ValidationError(f"Could not read manifest {path}: {e}")
if not isinstance(data, dict):
raise ValidationError(
f"Manifest must be a YAML mapping, got {type(data).__name__}: {path}"

View File

@@ -136,12 +136,25 @@ class PresetManifest:
def _load_yaml(self, path: Path) -> dict:
"""Load the manifest YAML at ``path`` and return its top-level mapping.

The file is opened as text with an explicit UTF-8 encoding and parsed
with ``yaml.safe_load``. YAML syntax errors, a missing file, undecodable
bytes and any other ``OSError`` are re-raised as ``PresetValidationError``
so callers see a single error type. A ``None`` parse result is returned
as ``{}``; any other non-dict root raises ``PresetValidationError``.
"""
try:
# NOTE(review): this hunk carries no diff markers; the next two lines look
# like the pre-change read (locale-dependent encoding) that the explicit
# UTF-8 version right below replaces — confirm against the real file.
with open(path, 'r') as f:
return yaml.safe_load(f) or {}
with open(path, 'r', encoding='utf-8') as f:
data = yaml.safe_load(f)
except yaml.YAMLError as e:
raise PresetValidationError(f"Invalid YAML in {path}: {e}")
except FileNotFoundError:
raise PresetValidationError(f"Manifest not found: {path}")
except UnicodeDecodeError as e:
# Report the decoder's reason and the offending byte offset.
raise PresetValidationError(
f"Manifest is not valid UTF-8: {path} ({e.reason} at byte {e.start})"
)
# FileNotFoundError is matched by its own clause above, so this handles the
# remaining OS-level read failures.
except OSError as e:
raise PresetValidationError(f"Could not read manifest {path}: {e}")
# A None parse result is treated as an empty mapping.
if data is None:
return {}
# Scalars and lists at the root are rejected with the manifest error type.
if not isinstance(data, dict):
raise PresetValidationError(
f"Manifest must be a YAML mapping, got {type(data).__name__}: {path}"
)
return data
def _validate(self):
"""Validate manifest structure and required fields."""

View File

@@ -225,6 +225,35 @@ class TestExtensionManifest:
with pytest.raises(ValidationError, match="YAML mapping"):
ExtensionManifest(manifest_path)
def test_utf8_non_ascii_description_loads(self, temp_dir, valid_manifest_data):
    """Regression for #2325: a UTF-8 manifest with a non-ASCII description loads.

    Text-mode ``open()`` on Windows falls back to the locale codepage
    (e.g. cp1252/GBK), which fails with UnicodeDecodeError on UTF-8 bytes
    beyond ASCII, so the loader has to pass encoding='utf-8' itself.
    """
    import yaml

    expected = "中文测试 — émojis 🚀"
    valid_manifest_data["extension"]["description"] = expected

    # Encode to UTF-8 by hand and write raw bytes: only the read path is
    # under test, so the locale-dependent text write path is avoided.
    dumped = yaml.safe_dump(valid_manifest_data, allow_unicode=True)
    target = temp_dir / "extension.yml"
    target.write_bytes(dumped.encode("utf-8"))

    loaded = ExtensionManifest(target)
    assert loaded.description == expected
def test_invalid_utf8_bytes_raises_validation_error(self, temp_dir):
    """Undecodable bytes surface as ValidationError rather than a raw UnicodeDecodeError."""
    # Neither 0xFF nor 0xFE can start a UTF-8 sequence.
    garbage = b"\xff\xfe not valid utf-8 \xff\n"
    target = temp_dir / "extension.yml"
    target.write_bytes(garbage)

    with pytest.raises(ValidationError, match="not valid UTF-8"):
        ExtensionManifest(target)
def test_invalid_extension_id(self, temp_dir, valid_manifest_data):
"""Test manifest with invalid extension ID format."""
import yaml

View File

@@ -160,6 +160,38 @@ class TestPresetManifest:
with pytest.raises(PresetValidationError, match="Invalid YAML"):
PresetManifest(bad_file)
def test_utf8_non_ascii_description_loads(self, temp_dir, valid_pack_data):
    """Regression for #2325: a UTF-8 manifest with a non-ASCII description loads.

    Text-mode ``open()`` on Windows falls back to the locale codepage
    (e.g. cp1252/GBK), which fails with UnicodeDecodeError on UTF-8 bytes
    beyond ASCII, so the loader has to pass encoding='utf-8' itself.
    """
    expected = "中文测试 — émojis 🚀"
    valid_pack_data["preset"]["description"] = expected

    # Raw UTF-8 bytes on disk keep the write side independent of the locale.
    dumped = yaml.safe_dump(valid_pack_data, allow_unicode=True)
    target = temp_dir / "preset.yml"
    target.write_bytes(dumped.encode("utf-8"))

    loaded = PresetManifest(target)
    assert loaded.description == expected
def test_invalid_utf8_bytes_raises_validation_error(self, temp_dir):
    """Undecodable bytes surface as PresetValidationError rather than a raw UnicodeDecodeError."""
    garbage = b"\xff\xfe not valid utf-8 \xff\n"
    target = temp_dir / "preset.yml"
    target.write_bytes(garbage)

    with pytest.raises(PresetValidationError, match="not valid UTF-8"):
        PresetManifest(target)
def test_non_mapping_yaml_raises_validation_error(self, temp_dir):
    """A scalar or list at the YAML root is rejected with PresetValidationError, not TypeError."""
    target = temp_dir / "preset.yml"
    # One scalar document and one sequence document; the file is rewritten
    # in place for each case.
    non_mapping_documents = ("42\n", "[1, 2]\n")
    for document in non_mapping_documents:
        target.write_text(document, encoding="utf-8")
        with pytest.raises(PresetValidationError, match="YAML mapping"):
            PresetManifest(target)
def test_missing_schema_version(self, temp_dir, valid_pack_data):
"""Test missing schema_version field."""
del valid_pack_data["schema_version"]