From 4e0893634b18ef16e178c610c4a2c425c5e7c4fe Mon Sep 17 00:00:00 2001 From: valiantone Date: Mon, 8 Jun 2026 21:22:58 +0200 Subject: [PATCH 1/3] Speed up JSONL hydration --- src/hotmem/db.py | 94 +++++++++++++++++++++++++++++++++++++++++++--- src/hotmem/swap.py | 94 ++++++++++++++++++++++++++++++++++++++-------- tests/test_db.py | 36 +++++++++++++++++- tests/test_swap.py | 91 +++++++++++++++++++++++++++++++++++++++++++- 4 files changed, 292 insertions(+), 23 deletions(-) diff --git a/src/hotmem/db.py b/src/hotmem/db.py index b27600f..5830cfa 100644 --- a/src/hotmem/db.py +++ b/src/hotmem/db.py @@ -8,8 +8,9 @@ Interface: MemoryDB(db_path: str | Path) .insert(id, identifier, fact_text, embedding_blob, ...) -> None + .insert_many_ignore(records) -> int .count() -> int - .all_rows() -> list[Row] + .all_rows(include_embedding=False) -> list[Row] .exists(content_hash: str) -> bool .close() -> None @@ -23,6 +24,8 @@ import re import sqlite3 import struct +from collections.abc import Iterable +from dataclasses import dataclass from pathlib import Path from typing import Any @@ -75,6 +78,24 @@ _FTS_TOKEN_RE = re.compile(r"[\w]+") +@dataclass(frozen=True) +class MemoryRecord: + """Database-ready memory row.""" + + id: str + identifier: str + fact_text: str + embedding: bytes + embedding_dim: int = EMBEDDING_DIM + embedding_model: str = "" + source: str = "" + importance: float = 0.5 + metadata_json: str = "{}" + content_hash: str = "" + ttl_seconds: int | None = None + created_at: str | None = None + + def _cosine_similarity(blob_a: bytes | None, blob_b: bytes | None) -> float | None: """SQLite UDF: cosine similarity between two packed float32 blobs.""" if blob_a is None or blob_b is None: @@ -124,6 +145,19 @@ def _migrate(self) -> None: self._conn.commit() _trace.info("migrate", "added ttl_seconds column") + try: + self._conn.execute( + """CREATE UNIQUE INDEX IF NOT EXISTS idx_memories_content_hash_unique + ON memories(content_hash) + WHERE content_hash != ''""" + ) + self._conn.commit() + except sqlite3.IntegrityError: + _trace.warn( + "migrate", + "skipped unique content_hash index because duplicate hashes exist", + ) + def insert( self, id: str, @@ -167,6 +201,43 @@ def insert( self._conn.commit() _trace.debug("insert", f"stored memory {id[:8]}…", detail={"identifier": identifier}) + def insert_many_ignore(self, records: Iterable[MemoryRecord]) -> int: + """Insert many memory rows in one transaction, ignoring duplicate hashes/ids.""" + rows = [ + ( + record.id, + record.identifier, + record.fact_text, + record.embedding, + record.embedding_dim, + record.embedding_model, + record.source, + record.importance, + record.metadata_json, + record.content_hash, + record.ttl_seconds, + record.created_at, + ) + for record in records + ] + if not rows: + return 0 + + cursor = self._conn.executemany( + """INSERT OR IGNORE INTO memories + (id, identifier, fact_text, embedding, embedding_dim, embedding_model, + source, importance, metadata_json, content_hash, ttl_seconds, created_at) + VALUES ( + ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, + COALESCE(?, strftime('%Y-%m-%dT%H:%M:%SZ', 'now')) + )""", + rows, + ) + self._conn.commit() + inserted = cursor.rowcount if cursor.rowcount != -1 else 0 + _trace.debug("insert_many", f"stored {inserted} memories", detail={"attempted": len(rows)}) + return inserted + def search_with_cosine(self, query_embedding: bytes) -> list[dict[str, Any]]: """Return all memories with their cosine similarity to the query embedding.""" rows = self._conn.execute( @@ -206,15 +277,28 @@ def count(self) -> int: row = self._conn.execute("SELECT COUNT(*) FROM memories").fetchone() return row[0] - def all_rows(self) -> list[dict[str, Any]]: + def all_rows(self, *, include_embedding: bool = False) -> list[dict[str, Any]]: """Return all memory rows as dicts (for snapshot export).""" - rows = self._conn.execute( + query = ( """SELECT id, identifier, fact_text, embedding_dim, embedding_model, - source, importance, metadata_json, content_hash, ttl_seconds, created_at + source, importance, metadata_json, content_hash, ttl_seconds, created_at, + embedding FROM memories""" - ).fetchall() + if include_embedding + else """SELECT id, identifier, fact_text, embedding_dim, embedding_model, + source, importance, metadata_json, content_hash, ttl_seconds, created_at + FROM memories""" + ) + rows = self._conn.execute(query).fetchall() return [dict(r) for r in rows] + def content_hashes(self) -> set[str]: + """Return non-empty content hashes currently stored in the database.""" + rows = self._conn.execute( + "SELECT content_hash FROM memories WHERE content_hash != ''" + ).fetchall() + return {row["content_hash"] for row in rows} + def exists(self, content_hash: str) -> bool: """Check if a memory with this content hash already exists.""" row = self._conn.execute( diff --git a/src/hotmem/swap.py b/src/hotmem/swap.py index ac3ae7d..5ac6109 100644 --- a/src/hotmem/swap.py +++ b/src/hotmem/swap.py @@ -6,7 +6,7 @@ Interface: hydrate(db, swap_path) -> HydrateResult - snapshot(db, swap_path) -> SnapshotResult + snapshot(db, swap_path, include_embeddings=True) -> SnapshotResult compute_content_hash(identifier, fact_text) -> str Deps: hotmem.db, hotmem.embed, hotmem.trace @@ -15,13 +15,15 @@ from __future__ import annotations +import base64 +import binascii import hashlib import json import uuid from dataclasses import dataclass from pathlib import Path -from hotmem.db import MemoryDB +from hotmem.db import MemoryDB, MemoryRecord from hotmem.embed import EMBEDDING_DIM, EMBEDDING_MODEL, embed_text, pack_embedding from hotmem.trace import Timer, get_tracer @@ -45,6 +47,35 @@ class SnapshotResult: path: str +def _metadata_json(record: dict) -> str: + metadata = record.get("metadata") + if metadata is not None: + return json.dumps(metadata) + return record.get("metadata_json", "{}") + + +def _stored_embedding(record: dict) -> bytes | None: + """Return a compatible stored embedding from a snapshot record, if present.""" + if record.get("embedding_dim", EMBEDDING_DIM) != EMBEDDING_DIM: + return None + if record.get("embedding_model", EMBEDDING_MODEL) != EMBEDDING_MODEL: + return None + + encoded = record.get("embedding_b64") + if not encoded: + return None + + try: + blob = base64.b64decode(encoded, validate=True) + except (binascii.Error, TypeError): + return None + + expected_bytes = EMBEDDING_DIM * 4 + if len(blob) != expected_bytes: + return None + return blob + + def hydrate(db: MemoryDB, swap_path: str | Path) -> HydrateResult: """Load memories from a JSONL swap file into the database. @@ -56,31 +87,43 @@ def hydrate(db: MemoryDB, swap_path: str | Path) -> HydrateResult: return HydrateResult(loaded=0, skipped_dupes=0) with Timer() as t: - loaded = 0 + records: list[MemoryRecord] = [] + seen_hashes = db.content_hashes() skipped = 0 + bytes_read = 0 + parsed = 0 + reused_embeddings = 0 + computed_embeddings = 0 with open(swap_path) as f: for line in f: + bytes_read += len(line.encode()) line = line.strip() if not line: continue record = json.loads(line) + parsed += 1 identifier = record.get("identifier", "") fact_text = record.get("fact_text", "") - content_hash = record.get( - "content_hash", compute_content_hash(identifier, fact_text) + content_hash = record.get("content_hash") or compute_content_hash( + identifier, fact_text ) - if db.exists(content_hash): + if content_hash in seen_hashes: skipped += 1 continue + seen_hashes.add(content_hash) - # Compute embedding for the fact - vec = embed_text(fact_text) - blob = pack_embedding(vec) + blob = _stored_embedding(record) + if blob is None: + vec = embed_text(fact_text) + blob = pack_embedding(vec) + computed_embeddings += 1 + else: + reused_embeddings += 1 - db.insert( + records.append(MemoryRecord( id=record.get("id", uuid.uuid4().hex), identifier=identifier, fact_text=fact_text, @@ -89,29 +132,48 @@ def hydrate(db: MemoryDB, swap_path: str | Path) -> HydrateResult: embedding_model=record.get("embedding_model", EMBEDDING_MODEL), source=record.get("source", "swap"), importance=record.get("importance", 0.5), - metadata_json=json.dumps(record.get("metadata", {})), + metadata_json=_metadata_json(record), content_hash=content_hash, ttl_seconds=record.get("ttl_seconds"), created_at=record.get("created_at"), - ) - loaded += 1 + )) + + loaded = db.insert_many_ignore(records) + skipped += len(records) - loaded _trace.info( "hydrate", f"hydrated {loaded} memories, skipped {skipped} dupes", - detail={"path": str(swap_path), "ms": round(t.ms, 2)}, + detail={ + "path": str(swap_path), + "ms": round(t.ms, 2), + "bytes_read": bytes_read, + "parsed": parsed, + "loaded": loaded, + "skipped_dupes": skipped, + "computed_embeddings": computed_embeddings, + "reused_embeddings": reused_embeddings, + }, ) return HydrateResult(loaded=loaded, skipped_dupes=skipped) -def snapshot(db: MemoryDB, swap_path: str | Path) -> SnapshotResult: +def snapshot( + db: MemoryDB, + swap_path: str | Path, + *, + include_embeddings: bool = True, +) -> SnapshotResult: """Export all memories from the database to a JSONL swap file.""" swap_path = Path(swap_path) with Timer() as t: - rows = db.all_rows() + rows = db.all_rows(include_embedding=include_embeddings) with open(swap_path, "w") as f: for row in rows: + embedding = row.pop("embedding", None) + if embedding is not None: + row["embedding_b64"] = base64.b64encode(embedding).decode("ascii") f.write(json.dumps(row, default=str) + "\n") _trace.info( diff --git a/tests/test_db.py b/tests/test_db.py index f936e6e..624eda9 100644 --- a/tests/test_db.py +++ b/tests/test_db.py @@ -4,7 +4,7 @@ import sqlite3 -from hotmem.db import MemoryDB +from hotmem.db import MemoryDB, MemoryRecord from hotmem.embed import embed_text, pack_embedding @@ -31,6 +31,31 @@ def test_exists(tmp_db: MemoryDB): assert not tmp_db.exists("hash2") +def test_insert_many_ignore_deduplicates_content_hash(tmp_db: MemoryDB): + blob = pack_embedding(embed_text("same fact")) + inserted = tmp_db.insert_many_ignore( + [ + MemoryRecord( + id="bulk1", + identifier="x", + fact_text="same fact", + embedding=blob, + content_hash="same-hash", + ), + MemoryRecord( + id="bulk2", + identifier="x", + fact_text="same fact", + embedding=blob, + content_hash="same-hash", + ), + ] + ) + + assert inserted == 1 + assert tmp_db.count() == 1 + + def test_cosine_search(tmp_db: MemoryDB): for text in ["the quick brown fox", "hello world", "machine learning is great"]: vec = embed_text(text) @@ -84,6 +109,15 @@ def test_all_rows(tmp_db: MemoryDB): assert rows[0]["ttl_seconds"] is None +def test_all_rows_can_include_embedding(tmp_db: MemoryDB): + blob = pack_embedding(embed_text("fact")) + tmp_db.insert(id="r1", identifier="x", fact_text="fact", embedding=blob) + + rows = tmp_db.all_rows(include_embedding=True) + + assert rows[0]["embedding"] == blob + + def test_insert_with_ttl(tmp_db: MemoryDB): vec = embed_text("temporary fact") blob = pack_embedding(vec) diff --git a/tests/test_swap.py b/tests/test_swap.py index 8a2dde6..ff40adc 100644 --- a/tests/test_swap.py +++ b/tests/test_swap.py @@ -2,11 +2,14 @@ from __future__ import annotations +import base64 import json from pathlib import Path +import pytest + from hotmem.db import MemoryDB -from hotmem.embed import embed_text, pack_embedding +from hotmem.embed import EMBEDDING_MODEL, embed_text, pack_embedding from hotmem.swap import compute_content_hash, hydrate, snapshot @@ -45,6 +48,20 @@ def test_hydrate_deduplication(tmp_db: MemoryDB, tmp_path: Path): assert result.skipped_dupes == 1 +def test_hydrate_computes_empty_content_hash(tmp_db: MemoryDB, tmp_path: Path): + swap = tmp_path / "swap.jsonl" + records = [ + {"identifier": "a", "fact_text": "first fact", "content_hash": ""}, + {"identifier": "b", "fact_text": "second fact", "content_hash": ""}, + ] + swap.write_text("\n".join(json.dumps(r) for r in records) + "\n") + + result = hydrate(tmp_db, swap) + + assert result.loaded == 2 + assert result.skipped_dupes == 0 + + def test_hydrate_missing_file(tmp_db: MemoryDB, tmp_path: Path): result = hydrate(tmp_db, tmp_path / "nonexistent.jsonl") assert result.loaded == 0 @@ -64,6 +81,7 @@ def test_snapshot_roundtrip(tmp_db: MemoryDB, tmp_path: Path): assert len(lines) == 1 data = json.loads(lines[0]) assert data["fact_text"] == "test fact" + assert data["embedding_b64"] == base64.b64encode(blob).decode("ascii") def test_snapshot_hydrate_preserves_ttl_and_created_at(tmp_db: MemoryDB, tmp_path: Path): @@ -86,3 +104,74 @@ def test_snapshot_hydrate_preserves_ttl_and_created_at(tmp_db: MemoryDB, tmp_pat assert data["ttl_seconds"] == 3600 assert data["created_at"] == "2026-05-31T00:00:00Z" + + +def test_snapshot_can_omit_embeddings(tmp_db: MemoryDB, tmp_path: Path): + blob = pack_embedding(embed_text("test fact")) + tmp_db.insert(id="s1", identifier="snap", fact_text="test fact", embedding=blob) + + swap = tmp_path / "out.jsonl" + snapshot(tmp_db, swap, include_embeddings=False) + + data = json.loads(swap.read_text().strip()) + assert "embedding_b64" not in data + + +def test_hydrate_reuses_compatible_stored_embedding( + tmp_db: MemoryDB, + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +): + swap = tmp_path / "swap.jsonl" + blob = pack_embedding(embed_text("stored embedding fact")) + swap.write_text( + json.dumps( + { + "identifier": "stored", + "fact_text": "stored embedding fact", + "embedding_model": EMBEDDING_MODEL, + "embedding_dim": 64, + "embedding_b64": base64.b64encode(blob).decode("ascii"), + } + ) + + "\n" + ) + + def fail_embed(_text: str): + raise AssertionError("hydrate should reuse the stored embedding") + + monkeypatch.setattr("hotmem.swap.embed_text", fail_embed) + + result = hydrate(tmp_db, swap) + + assert result.loaded == 1 + assert tmp_db.all_rows(include_embedding=True)[0]["embedding"] == blob + + +def test_hydrate_recomputes_incompatible_stored_embedding( + tmp_db: MemoryDB, + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +): + swap = tmp_path / "swap.jsonl" + stored_blob = pack_embedding(embed_text("stored embedding fact")) + recomputed_blob = pack_embedding(embed_text("fallback embedding")) + swap.write_text( + json.dumps( + { + "identifier": "stored", + "fact_text": "stored embedding fact", + "embedding_model": "different-model", + "embedding_dim": 64, + "embedding_b64": base64.b64encode(stored_blob).decode("ascii"), + } + ) + + "\n" + ) + + monkeypatch.setattr("hotmem.swap.embed_text", lambda _text: embed_text("fallback embedding")) + + result = hydrate(tmp_db, swap) + + assert result.loaded == 1 + assert tmp_db.all_rows(include_embedding=True)[0]["embedding"] == recomputed_blob From 82a9edc7079012124550d88cdc67eccfebec5656 Mon Sep 17 00:00:00 2001 From: valiantone Date: Tue, 9 Jun 2026 07:26:53 +0200 Subject: [PATCH 2/3] Format hydration swap code --- src/hotmem/swap.py | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/src/hotmem/swap.py b/src/hotmem/swap.py index 5ac6109..564b9e5 100644 --- a/src/hotmem/swap.py +++ b/src/hotmem/swap.py @@ -123,20 +123,22 @@ def hydrate(db: MemoryDB, swap_path: str | Path) -> HydrateResult: else: reused_embeddings += 1 - records.append(MemoryRecord( - id=record.get("id", uuid.uuid4().hex), - identifier=identifier, - fact_text=fact_text, - embedding=blob, - embedding_dim=record.get("embedding_dim", EMBEDDING_DIM), - embedding_model=record.get("embedding_model", EMBEDDING_MODEL), - source=record.get("source", "swap"), - importance=record.get("importance", 0.5), - metadata_json=_metadata_json(record), - content_hash=content_hash, - ttl_seconds=record.get("ttl_seconds"), - created_at=record.get("created_at"), - )) + records.append( + MemoryRecord( + id=record.get("id", uuid.uuid4().hex), + identifier=identifier, + fact_text=fact_text, + embedding=blob, + embedding_dim=record.get("embedding_dim", EMBEDDING_DIM), + embedding_model=record.get("embedding_model", EMBEDDING_MODEL), + source=record.get("source", "swap"), + importance=record.get("importance", 0.5), + metadata_json=_metadata_json(record), + content_hash=content_hash, + ttl_seconds=record.get("ttl_seconds"), + created_at=record.get("created_at"), + ) + ) loaded = db.insert_many_ignore(records) skipped += len(records) - loaded From 14ba095e3a0c13345b8e2efe3a07b2bc0108a1e3 Mon Sep 17 00:00:00 2001 From: valiantone Date: Tue, 9 Jun 2026 09:21:16 +0200 Subject: [PATCH 3/3] Prepare 0.1.6 release --- .github/workflows/auto-tag.yml | 18 ++++++++++++++++++ .github/workflows/release.yml | 20 ++++++++++++++++++++ CHANGELOG.md | 10 ++++++++++ pyproject.toml | 3 +-- src/hotmem/__init__.py | 2 +- 5 files changed, 50 insertions(+), 3 deletions(-) diff --git a/.github/workflows/auto-tag.yml b/.github/workflows/auto-tag.yml index 385b5ba..785b735 100644 --- a/.github/workflows/auto-tag.yml +++ b/.github/workflows/auto-tag.yml @@ -23,6 +23,24 @@ jobs: echo "version=$VERSION" >> "$GITHUB_OUTPUT" echo "tag=v$VERSION" >> "$GITHUB_OUTPUT" + - name: Check runtime version matches package version + run: | + python - <<'PY' + import pathlib + import re + import tomllib + + pyproject = tomllib.loads(pathlib.Path("pyproject.toml").read_text()) + project_version = pyproject["project"]["version"] + init_text = pathlib.Path("src/hotmem/__init__.py").read_text() + init_version = re.search(r'__version__ = "([^"]+)"', init_text).group(1) + + if init_version != project_version: + raise SystemExit( + f"hotmem.__version__ {init_version} does not match pyproject version {project_version}" + ) + PY + - name: Check if tag exists id: check run: | diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 4254183..78c4045 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -30,13 +30,33 @@ jobs: run: uv build - name: Verify package metadata + env: + RELEASE_TAG: ${{ github.event_name == 'workflow_dispatch' && inputs.tag || github.ref_name }} run: | uvx twine check dist/* python - <<'PY' + import os import pathlib + import re import tarfile + import tomllib readme = pathlib.Path("README.md").read_text() + pyproject = tomllib.loads(pathlib.Path("pyproject.toml").read_text()) + project_version = pyproject["project"]["version"] + init_text = pathlib.Path("src/hotmem/__init__.py").read_text() + init_version = re.search(r'__version__ = "([^"]+)"', init_text).group(1) + release_tag = os.environ["RELEASE_TAG"] + + if release_tag != f"v{project_version}": + raise SystemExit( + f"release tag {release_tag} does not match pyproject version {project_version}" + ) + if init_version != project_version: + raise SystemExit( + f"hotmem.__version__ {init_version} does not match pyproject version {project_version}" + ) + sdist = next(pathlib.Path("dist").glob("*.tar.gz")) with tarfile.open(sdist) as tar: diff --git a/CHANGELOG.md b/CHANGELOG.md index 2efbdbb..bb3c0e3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,16 @@ All notable changes to HotMem will be documented in this file. Format follows [Keep a Changelog](https://keepachangelog.com/). +## [0.1.6] - 2026-06-09 + +### Added +- Batched JSONL hydration with SQLite-native duplicate skipping. +- Snapshot embedding export via `embedding_b64` for faster compatible rehydration. +- Hydration trace counters for parsed rows, loaded rows, duplicate skips, bytes read, and embedding reuse. + +### Fixed +- Package version metadata now matches the runtime `hotmem.__version__`. + ## [0.1.0] - 2025-05-02 ### Added diff --git a/pyproject.toml b/pyproject.toml index 484101b..97e01c5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "hotmem" -version = "0.1.5" +version = "0.1.6" description = "A local-first memory sidecar for agent applications" readme = "README.md" requires-python = ">=3.11" @@ -57,4 +57,3 @@ select = ["E", "F", "I", "UP", "B", "SIM"] [tool.pytest.ini_options] testpaths = ["tests"] - diff --git a/src/hotmem/__init__.py b/src/hotmem/__init__.py index 2652a2c..77ec00f 100644 --- a/src/hotmem/__init__.py +++ b/src/hotmem/__init__.py @@ -1,3 +1,3 @@ """HotMem — A local-first memory sidecar for agent applications.""" -__version__ = "0.1.0" +__version__ = "0.1.6"