From eb54540fcc76d214cc0529ac81843ed733bc6a0c Mon Sep 17 00:00:00 2001 From: yzq Date: Thu, 18 Jun 2026 14:47:45 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=20Windows=20=E7=BC=93?= =?UTF-8?q?=E5=AD=98=E5=85=83=E6=95=B0=E6=8D=AE=E7=BC=96=E7=A0=81=E8=AF=BB?= =?UTF-8?q?=E5=8F=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/semble/cache.py | 2 +- tests/test_cache.py | 42 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 1 deletion(-) diff --git a/src/semble/cache.py b/src/semble/cache.py index 9f26c76..31a73bb 100644 --- a/src/semble/cache.py +++ b/src/semble/cache.py @@ -119,7 +119,7 @@ def get_validated_cache(path: str, model_path: str | None, content: Sequence[Con if model_path is None: model_path = resolve_model_name() - with open(persistence_path.metadata) as f: + with open(persistence_path.metadata, encoding="utf-8") as f: metadata = json.load(f) if not _metadata_matches(metadata, model_path, content): return None diff --git a/tests/test_cache.py b/tests/test_cache.py index 54fc9e3..52b52a4 100644 --- a/tests/test_cache.py +++ b/tests/test_cache.py @@ -1,5 +1,6 @@ from __future__ import annotations +import builtins import json import sys from pathlib import Path @@ -186,6 +187,47 @@ def test_get_validated_cache_metadata_mismatch( assert get_validated_cache("/path", req_model, req_content) is None +def test_get_validated_cache_reads_utf8_metadata_with_non_ascii_file_paths(tmp_path: Path) -> None: + """Cache metadata is always UTF-8, even when the system default encoding is not.""" + from semble.chunking.chunking import _DESIRED_CHUNK_LENGTH_CHARS + + index_path = tmp_path / "index" + index_path.mkdir(parents=True) + (index_path / "chunks.json").write_text("[]") + (index_path / "bm25_index").write_text("") + (index_path / "semantic_index").write_text("") + + non_ascii_path = "docs\\测试检查清单.md" + with pytest.raises(UnicodeDecodeError): + non_ascii_path.encode("utf-8").decode("cp936") + + (index_path / "metadata.json").write_text( + json.dumps( + { + "model_path": "my/model", + "content_type": ["docs"], + "time": 0.0, + "file_paths": [non_ascii_path], + "chunk_size": _DESIRED_CHUNK_LENGTH_CHARS, + }, + ensure_ascii=False, + ), + encoding="utf-8", + ) + + real_open = builtins.open + + def open_with_cp936_default(file: object, mode: str = "r", *args: object, **kwargs: object): + if Path(file) == index_path / "metadata.json" and "b" not in mode and "encoding" not in kwargs: + kwargs["encoding"] = "cp936" + return real_open(file, mode, *args, **kwargs) + + with patch("semble.cache.find_index_from_cache_folder", return_value=index_path): + with patch("builtins.open", side_effect=open_with_cp936_default): + result = get_validated_cache("https://github.com/org/repo.git", "my/model", [ContentType.DOCS]) + assert result == index_path + + def test_get_validated_cache_chunk_size_mismatch_returns_none(tmp_path: Path) -> None: """Cache built with a different chunk_size is not reused.""" from semble.chunking.chunking import _DESIRED_CHUNK_LENGTH_CHARS