From 768ce0aacbf5730700d6ac6d0f4b71e62ef2de22 Mon Sep 17 00:00:00 2001 From: seyeong Date: Fri, 27 Feb 2026 19:23:30 +0900 Subject: [PATCH 01/12] feat(vectorstore): add FAISSVectorStore with file persistence IndexFlatIP + L2 normalization for cosine similarity. Lazy index init on first upsert(). save/load via .faiss + .meta files. --- .../integrations/vectorstore/faiss_.py | 106 ++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 src/lang2sql/integrations/vectorstore/faiss_.py diff --git a/src/lang2sql/integrations/vectorstore/faiss_.py b/src/lang2sql/integrations/vectorstore/faiss_.py new file mode 100644 index 0000000..2ed5de1 --- /dev/null +++ b/src/lang2sql/integrations/vectorstore/faiss_.py @@ -0,0 +1,106 @@ +from __future__ import annotations + +import json +import pathlib + +from ...core.exceptions import IntegrationMissingError + +try: + import faiss as _faiss + import numpy as _np +except ImportError: + _faiss = None # type: ignore[assignment] + _np = None # type: ignore[assignment] + + +class FAISSVectorStore: + """ + FAISS-backed vector store with optional file persistence. + + Uses IndexFlatIP + L2 normalization for exact cosine similarity. + Index is lazy-initialized on the first upsert() call. + + Known limitation (append-only): + Upserting the same chunk_id twice creates duplicate FAISS entries. + To rebuild a clean index, create a new FAISSVectorStore instance + and run from_chunks() again from scratch. + + Args: + index_path: Optional path for save() / load(). Used as default + path when save() is called without an explicit argument. 
+ + Installation: + pip install faiss-cpu # CPU-only + pip install faiss-gpu # GPU variant + """ + + def __init__(self, index_path: str | None = None) -> None: + if _faiss is None or _np is None: + raise IntegrationMissingError("faiss", hint="pip install faiss-cpu") + self._index_path = index_path + self._index: object | None = None # faiss.IndexFlatIP, None until first upsert + self._ids: list[str] = [] + + # ── VectorStorePort ────────────────────────────────────────────── + + def upsert(self, ids: list[str], vectors: list[list[float]]) -> None: + """L2-normalize and add vectors. Lazy-creates index on first call.""" + arr = _np.array(vectors, dtype=_np.float32) + _faiss.normalize_L2(arr) # in-place cosine trick + if self._index is None: + self._index = _faiss.IndexFlatIP(arr.shape[1]) + self._index.add(arr) + self._ids.extend(ids) + + def search(self, vector: list[float], k: int) -> list[tuple[str, float]]: + """Return (chunk_id, cosine_score) for the k nearest vectors.""" + if self._index is None or self._index.ntotal == 0: + return [] + q = _np.array([vector], dtype=_np.float32) + _faiss.normalize_L2(q) + k = min(k, self._index.ntotal) + scores, positions = self._index.search(q, k) + return [ + (self._ids[int(pos)], float(scores[0][j])) + for j, pos in enumerate(positions[0]) + if pos >= 0 + ] + + # ── Persistence ────────────────────────────────────────────────── + + def save(self, path: str | None = None) -> None: + """ + Write index to {path} and id list to {path}.meta. + Falls back to self._index_path when path is None. + Raises ValueError if no path is available. + Raises RuntimeError if called before any upsert(). + """ + path = path or self._index_path + if path is None: + raise ValueError( + "No path provided and index_path was not set at construction." 
+ ) + if self._index is None: + raise RuntimeError("Cannot save before any upsert() call.") + _faiss.write_index(self._index, path) + pathlib.Path(path + ".meta").write_text( + json.dumps(self._ids), encoding="utf-8" + ) + + @classmethod + def load(cls, path: str) -> "FAISSVectorStore": + """ + Load index from {path} and id list from {path}.meta. + Raises FileNotFoundError if either file is missing. + """ + if _faiss is None or _np is None: + raise IntegrationMissingError("faiss", hint="pip install faiss-cpu") + meta_path = pathlib.Path(path + ".meta") + if not pathlib.Path(path).exists() or not meta_path.exists(): + raise FileNotFoundError( + f"Index files not found: {path}, {path}.meta" + ) + store = cls(index_path=path) + store._index = _faiss.read_index(path) + store._ids = json.loads(meta_path.read_text(encoding="utf-8")) + return store From 700c945d362c11bcb308653348a23245b13ace5a Mon Sep 17 00:00:00 2001 From: seyeong Date: Fri, 27 Feb 2026 19:23:47 +0900 Subject: [PATCH 02/12] feat(vectorstore): add PGVectorStore with pgvector backend True upsert via ON CONFLICT DO UPDATE. Automatic table creation on first upsert(). Cosine similarity via pgvector <=> operator. 
--- .../integrations/vectorstore/pgvector_.py | 88 +++++++++++++++++++ 1 file changed, 88 insertions(+) create mode 100644 src/lang2sql/integrations/vectorstore/pgvector_.py diff --git a/src/lang2sql/integrations/vectorstore/pgvector_.py b/src/lang2sql/integrations/vectorstore/pgvector_.py new file mode 100644 index 0000000..4ec7aa5 --- /dev/null +++ b/src/lang2sql/integrations/vectorstore/pgvector_.py @@ -0,0 +1,88 @@ +from __future__ import annotations + +from ...core.exceptions import IntegrationMissingError + +try: + import psycopg2 as _psycopg2 + from pgvector.psycopg2 import register_vector as _register_vector +except ImportError: + _psycopg2 = None # type: ignore[assignment] + _register_vector = None # type: ignore[assignment] + + +class PGVectorStore: + """ + PostgreSQL pgvector-backed vector store. + + True upsert semantics via ON CONFLICT DO UPDATE — idempotent, + no duplicates across multiple from_chunks() runs. + Table is created automatically on first upsert() call. + + Args: + connection: PostgreSQL connection URL. + e.g. "postgresql://user:pass@localhost:5432/mydb" + table_name: Name of the vector table. Default "lang2sql_vectors". 
+ + Installation: + pip install psycopg2-binary pgvector + + Quick start with Docker: + docker run -d -e POSTGRES_PASSWORD=postgres \\ + -p 5432:5432 pgvector/pgvector:pg16 + """ + + def __init__( + self, + *, + connection: str, + table_name: str = "lang2sql_vectors", + ) -> None: + if _psycopg2 is None or _register_vector is None: + raise IntegrationMissingError( + "psycopg2", hint="pip install psycopg2-binary pgvector" + ) + self._conn = _psycopg2.connect(connection) + _register_vector(self._conn) + self._table = table_name + self._ready = False # True after first _ensure_table() + + def _ensure_table(self, dim: int) -> None: + if self._ready: + return + with self._conn.cursor() as cur: + cur.execute("CREATE EXTENSION IF NOT EXISTS vector;") + cur.execute( + f"CREATE TABLE IF NOT EXISTS {self._table} " + f"(id TEXT PRIMARY KEY, embedding vector({dim}));" + ) + self._conn.commit() + self._ready = True + + # ── VectorStorePort ────────────────────────────────────────────── + + def upsert(self, ids: list[str], vectors: list[list[float]]) -> None: + """Create table if needed, then upsert all (id, vector) pairs.""" + self._ensure_table(len(vectors[0])) + with self._conn.cursor() as cur: + for id_, vec in zip(ids, vectors): + cur.execute( + f"INSERT INTO {self._table} (id, embedding) VALUES (%s, %s) " + "ON CONFLICT (id) DO UPDATE SET embedding = EXCLUDED.embedding;", + (id_, vec), + ) + self._conn.commit() + + def search(self, vector: list[float], k: int) -> list[tuple[str, float]]: + """Return (chunk_id, cosine_score) for the k nearest vectors. + Returns [] if the table has not been created yet. 
+ """ + if not self._ready: + return [] + with self._conn.cursor() as cur: + cur.execute( + f"SELECT id, 1 - (embedding <=> %s::vector) AS score " + f"FROM {self._table} " + f"ORDER BY embedding <=> %s::vector LIMIT %s;", + (vector, vector, k), + ) + return [(row[0], float(row[1])) for row in cur.fetchall()] From 22366d72b87fe8a3f435facad0645178cae7d815 Mon Sep 17 00:00:00 2001 From: seyeong Date: Fri, 27 Feb 2026 19:24:15 +0900 Subject: [PATCH 03/12] feat(vectorstore): export FAISSVectorStore and PGVectorStore to public API --- src/lang2sql/__init__.py | 5 +++++ src/lang2sql/integrations/vectorstore/__init__.py | 4 +++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/src/lang2sql/__init__.py b/src/lang2sql/__init__.py index 9ab711e..2dcdc49 100644 --- a/src/lang2sql/__init__.py +++ b/src/lang2sql/__init__.py @@ -1,3 +1,5 @@ +from .integrations.vectorstore.faiss_ import FAISSVectorStore +from .integrations.vectorstore.pgvector_ import PGVectorStore from .components.execution.sql_executor import SQLExecutor from .components.generation.sql_generator import SQLGenerator from .components.loaders.directory_ import DirectoryLoader @@ -59,4 +61,7 @@ "Lang2SQLError", "ComponentError", "IntegrationMissingError", + # Vector store backends + "FAISSVectorStore", + "PGVectorStore", ] diff --git a/src/lang2sql/integrations/vectorstore/__init__.py b/src/lang2sql/integrations/vectorstore/__init__.py index bddace4..eeab249 100644 --- a/src/lang2sql/integrations/vectorstore/__init__.py +++ b/src/lang2sql/integrations/vectorstore/__init__.py @@ -1,3 +1,5 @@ +from .faiss_ import FAISSVectorStore from .inmemory_ import InMemoryVectorStore +from .pgvector_ import PGVectorStore -__all__ = ["InMemoryVectorStore"] +__all__ = ["InMemoryVectorStore", "FAISSVectorStore", "PGVectorStore"] From 417213f4295318ce2f81223483275834cc55c99c Mon Sep 17 00:00:00 2001 From: seyeong Date: Fri, 27 Feb 2026 19:24:29 +0900 Subject: [PATCH 04/12] test(vectorstore): add 8 tests for 
"""
FAISSVectorStore integration tests.

All tests are auto-skipped when faiss-cpu is not installed.
"""
import pytest

faiss = pytest.importorskip("faiss")  # skip entire module if not installed

from lang2sql.integrations.vectorstore.faiss_ import FAISSVectorStore


# ── helpers ──────────────────────────────────────────────────────────────────


def _ortho_vectors() -> list[tuple[str, list[float]]]:
    """4 orthogonal unit vectors for deterministic cosine tests."""
    return [
        ("a", [1.0, 0.0, 0.0, 0.0]),
        ("b", [0.0, 1.0, 0.0, 0.0]),
        ("c", [0.0, 0.0, 1.0, 0.0]),
        ("d", [0.0, 0.0, 0.0, 1.0]),
    ]


@pytest.fixture
def store() -> FAISSVectorStore:
    """A fresh, empty store per test."""
    return FAISSVectorStore()


def _populate(store: FAISSVectorStore) -> None:
    """Insert the four orthogonal unit vectors into *store*."""
    items = _ortho_vectors()
    ids = [item[0] for item in items]
    vecs = [item[1] for item in items]
    store.upsert(ids, vecs)


# ── tests ─────────────────────────────────────────────────────────────────────


def test_faiss_upsert_and_search_returns_closest(store):
    """Query vector returns its own id at rank 1."""
    _populate(store)
    results = store.search([1.0, 0.0, 0.0, 0.0], k=1)
    assert len(results) == 1
    assert results[0][0] == "a"


def test_faiss_cosine_score_of_identical_vector(store):
    """Identical query → score ≈ 1.0."""
    _populate(store)
    results = store.search([1.0, 0.0, 0.0, 0.0], k=1)
    assert abs(results[0][1] - 1.0) < 1e-5


def test_faiss_upsert_merge_preserves_prior_entries(store):
    """Second upsert() call doesn't lose entries from the first."""
    store.upsert(["a"], [[1.0, 0.0, 0.0, 0.0]])
    store.upsert(["b"], [[0.0, 1.0, 0.0, 0.0]])

    # "a" should still be retrievable
    results = store.search([1.0, 0.0, 0.0, 0.0], k=2)
    ids = [r[0] for r in results]
    assert "a" in ids


def test_faiss_search_respects_k(store):
    """len(results) <= k."""
    _populate(store)
    results = store.search([1.0, 0.0, 0.0, 0.0], k=2)
    assert len(results) <= 2


def test_faiss_search_on_empty_store_returns_empty(store):
    """[] before any upsert()."""
    results = store.search([1.0, 0.0, 0.0, 0.0], k=5)
    assert results == []


def test_faiss_save_and_load_roundtrip(store, tmp_path):
    """save() → load() → search() returns same results."""
    _populate(store)
    index_path = str(tmp_path / "catalog.faiss")
    store.save(index_path)

    loaded = FAISSVectorStore.load(index_path)
    original_results = store.search([1.0, 0.0, 0.0, 0.0], k=1)
    loaded_results = loaded.search([1.0, 0.0, 0.0, 0.0], k=1)

    assert loaded_results[0][0] == original_results[0][0]
    assert abs(loaded_results[0][1] - original_results[0][1]) < 1e-5


def test_faiss_save_without_path_raises(store):
    """save() with no path and no index_path → ValueError."""
    _populate(store)
    with pytest.raises(ValueError, match="No path provided"):
        store.save()


def test_faiss_load_nonexistent_path_raises():
    """load("nonexistent") → FileNotFoundError."""
    with pytest.raises(FileNotFoundError):
        FAISSVectorStore.load("nonexistent_path_that_does_not_exist.faiss")
"""
PGVectorStore integration tests.

Requires a live PostgreSQL instance with pgvector installed.
Skipped when the TEST_POSTGRES_URL env variable is not set, and when the
psycopg2 driver is not installed.

Example:
    TEST_POSTGRES_URL="postgresql://postgres:postgres@localhost:5432/test" \\
        pytest tests/test_integrations_pgvector_vectorstore.py -v
"""
import os
import pytest
from uuid import uuid4

pytestmark = pytest.mark.skipif(
    not os.getenv("TEST_POSTGRES_URL"),
    reason="TEST_POSTGRES_URL not set — skipping pgvector integration tests",
)

# Every test constructs a PGVectorStore, which requires the psycopg2 driver.
# Skip the whole module when it is missing instead of erroring at collection
# time (the skipif marker above only covers the env-var case).
pytest.importorskip("psycopg2")

from lang2sql.integrations.vectorstore.pgvector_ import PGVectorStore


# ── helpers ──────────────────────────────────────────────────────────────────


def _unique_table() -> str:
    """Random table name so parallel / repeated runs don't collide."""
    return f"test_{uuid4().hex[:8]}"


def _make_store(table_name: str) -> PGVectorStore:
    url = os.environ["TEST_POSTGRES_URL"]
    return PGVectorStore(connection=url, table_name=table_name)


def _drop_table(store: PGVectorStore, table_name: str) -> None:
    with store._conn.cursor() as cur:
        cur.execute(f"DROP TABLE IF EXISTS {table_name};")
    store._conn.commit()


# ── tests ─────────────────────────────────────────────────────────────────────


def test_pgvector_upsert_and_search():
    """Query vector returns its own id."""
    table = _unique_table()
    store = _make_store(table)
    try:
        store.upsert(["a"], [[1.0, 0.0, 0.0, 0.0]])
        results = store.search([1.0, 0.0, 0.0, 0.0], k=1)
        assert len(results) == 1
        assert results[0][0] == "a"
    finally:
        _drop_table(store, table)
        store._conn.close()


def test_pgvector_upsert_is_idempotent():
    """Same id upserted twice → exactly one row in DB."""
    table = _unique_table()
    store = _make_store(table)
    try:
        store.upsert(["a"], [[1.0, 0.0, 0.0, 0.0]])
        store.upsert(["a"], [[0.5, 0.5, 0.0, 0.0]])  # overwrite same id

        with store._conn.cursor() as cur:
            cur.execute(f"SELECT COUNT(*) FROM {table} WHERE id = 'a';")
            count = cur.fetchone()[0]
        assert count == 1
    finally:
        _drop_table(store, table)
        store._conn.close()


def test_pgvector_search_score_in_range():
    """Score ∈ [-1, 1]."""
    table = _unique_table()
    store = _make_store(table)
    try:
        store.upsert(
            ["a", "b", "c"],
            [
                [1.0, 0.0, 0.0, 0.0],
                [0.0, 1.0, 0.0, 0.0],
                [0.0, 0.0, 1.0, 0.0],
            ],
        )
        results = store.search([1.0, 0.0, 0.0, 0.0], k=3)
        for _, score in results:
            assert -1.0 <= score <= 1.0 + 1e-6
    finally:
        _drop_table(store, table)
        store._conn.close()


def test_pgvector_search_respects_k():
    """len(results) <= k."""
    table = _unique_table()
    store = _make_store(table)
    try:
        store.upsert(
            ["a", "b", "c", "d"],
            [
                [1.0, 0.0, 0.0, 0.0],
                [0.0, 1.0, 0.0, 0.0],
                [0.0, 0.0, 1.0, 0.0],
                [0.0, 0.0, 0.0, 1.0],
            ],
        )
        results = store.search([1.0, 0.0, 0.0, 0.0], k=2)
        assert len(results) <= 2
    finally:
        _drop_table(store, table)
        store._conn.close()


def test_pgvector_table_created_automatically():
    """Table exists in information_schema after first upsert()."""
    table = _unique_table()
    store = _make_store(table)
    try:
        store.upsert(["x"], [[1.0, 0.0]])
        with store._conn.cursor() as cur:
            cur.execute(
                "SELECT COUNT(*) FROM information_schema.tables "
                "WHERE table_name = %s;",
                (table,),
            )
            count = cur.fetchone()[0]
        assert count == 1
    finally:
        _drop_table(store, table)
        store._conn.close()


def test_pgvector_search_empty_store_returns_empty():
    """[] before any upsert()."""
    table = _unique_table()
    store = _make_store(table)
    try:
        results = store.search([1.0, 0.0, 0.0, 0.0], k=5)
        assert results == []
    finally:
        store._conn.close()
+ +| 패키지 | 고정 버전 | 역할 | +|--------|----------|------| +| `numpy` | `<2.0` | InMemoryVectorStore 행렬 연산 | +| `faiss-cpu` | `==1.10.0` | FAISSVectorStore 인덱스 엔진 | +| `psycopg2-binary` | `>=2.9.10,<3.0.0` | PGVectorStore PostgreSQL 연결 | +| `pgvector` | `==0.3.6` | PGVectorStore `vector` 타입 직렬화 | + +> **GPU 가속이 필요한 경우**: `faiss-cpu`를 직접 `faiss-gpu`로 교체할 수 있습니다. +> pyproject.toml의 `faiss-cpu==1.10.0`을 `faiss-gpu==1.10.0`으로 변경 후 `uv sync`. + +--- + +## 3. InMemoryVectorStore — 기본값 + +numpy 기반 브루트 포스 코사인 유사도. `vectorstore=` 를 생략하면 자동으로 사용됩니다. + +**특징:** +- true upsert — 동일 chunk_id를 두 번 넣으면 덮어씀 +- 검색 시 매번 행렬 재구성 (수만 벡터까지 충분히 빠름) +- 프로세스 종료 시 인덱스 소멸 + +```python +from lang2sql import VectorRetriever, CatalogEntry +from lang2sql.integrations.embedding import OpenAIEmbedding + +CATALOG: list[CatalogEntry] = [ + { + "name": "orders", + "description": "고객 주문 정보", + "columns": {"order_id": "PK", "amount": "금액", "status": "상태"}, + }, +] + +# vectorstore= 생략 → InMemoryVectorStore 자동 사용 +retriever = VectorRetriever.from_sources( + catalog=CATALOG, + embedding=OpenAIEmbedding(), +) + +result = retriever("주문 건수") +print(result.schemas) +``` + +--- + +## 4. FAISSVectorStore — 로컬 파일 영속성 + +Facebook AI Research의 벡터 검색 라이브러리. +`IndexFlatIP` + L2 정규화로 정확한 코사인 유사도를 계산합니다. + +### 4-1. 기본 사용법 — from_sources() + +```python +from lang2sql import VectorRetriever +from lang2sql.integrations.vectorstore import FAISSVectorStore +from lang2sql.integrations.embedding import OpenAIEmbedding + +store = FAISSVectorStore(index_path="./index/catalog.faiss") + +retriever = VectorRetriever.from_sources( + catalog=CATALOG, + embedding=OpenAIEmbedding(), + vectorstore=store, # ← FAISSVectorStore 주입 +) + +# 인덱스를 파일로 저장 +store.save() +# → ./index/catalog.faiss (FAISS 바이너리) +# → ./index/catalog.faiss.meta (chunk id 목록 JSON) +``` + +### 4-2. 
명시적 파이프라인 — from_chunks() + +```python +from lang2sql import VectorRetriever, CatalogChunker, RecursiveCharacterChunker +from lang2sql import TextDocument +from lang2sql.integrations.vectorstore import FAISSVectorStore +from lang2sql.integrations.embedding import OpenAIEmbedding + +embedding = OpenAIEmbedding() +store = FAISSVectorStore(index_path="./index/catalog.faiss") + +DOCS: list[TextDocument] = [ + { + "id": "revenue_def", + "title": "매출 정의", + "content": "매출은 취소 주문을 제외한 순매출 기준이다.", + "source": "docs/revenue.md", + }, +] + +chunks = ( + CatalogChunker().split(CATALOG) + + RecursiveCharacterChunker(chunk_size=800, chunk_overlap=80).split(DOCS) +) + +retriever = VectorRetriever.from_chunks( + chunks, + embedding=embedding, + vectorstore=store, +) + +store.save() +``` + +### 4-3. 재시작 시 로드 + +```python +from lang2sql.integrations.vectorstore import FAISSVectorStore +from lang2sql import VectorRetriever +from lang2sql.integrations.embedding import OpenAIEmbedding + +# 파일에서 바로 로드 — 임베딩/인덱싱 없이 즉시 검색 가능 +store = FAISSVectorStore.load("./index/catalog.faiss") + +# registry는 from_chunks()가 자동 복원 불가 → 재인덱싱 필요 +# 실전에서는 프로세스 시작 시 from_sources()를 다시 실행하는 패턴 권장 +retriever = VectorRetriever.from_sources( + catalog=CATALOG, + embedding=OpenAIEmbedding(), + vectorstore=store, # 이미 채워진 store — upsert() 추가로 호출됨 (append) +) +``` + +> **append-only 제한**: `FAISSVectorStore`는 동일 chunk_id를 두 번 upsert하면 +> FAISS 인덱스에 두 개의 항목이 생깁니다. 깨끗한 인덱스가 필요하면 +> 새 `FAISSVectorStore()` 인스턴스로 처음부터 인덱싱하세요. + +### 4-4. save/load 예외 처리 + +```python +# index_path 없이 생성한 경우 save()는 경로 필요 +store = FAISSVectorStore() +store.upsert(["a"], [[1.0, 0.0]]) +store.save("./index/catalog.faiss") # 경로 직접 지정 + +# upsert() 전에 save() 호출 → RuntimeError +store_empty = FAISSVectorStore(index_path="./out.faiss") +store_empty.save() # RuntimeError: Cannot save before any upsert() call. + +# 존재하지 않는 파일 로드 → FileNotFoundError +FAISSVectorStore.load("./nonexistent.faiss") # FileNotFoundError +``` + +--- + +## 5. 
PGVectorStore — PostgreSQL 영속성 + +PostgreSQL의 `pgvector` 확장을 사용합니다. +`ON CONFLICT DO UPDATE` true upsert로 중복 없이 멱등 인덱싱이 가능합니다. + +### 5-1. PostgreSQL 빠른 시작 (Docker) + +```bash +docker run -d \ + --name pgvector \ + -e POSTGRES_PASSWORD=postgres \ + -p 5432:5432 \ + pgvector/pgvector:pg16 +``` + +### 5-2. 기본 사용법 — from_sources() + +```python +from lang2sql import VectorRetriever +from lang2sql.integrations.vectorstore import PGVectorStore +from lang2sql.integrations.embedding import OpenAIEmbedding + +store = PGVectorStore( + connection="postgresql://postgres:postgres@localhost:5432/postgres", + table_name="lang2sql_vectors", # 자동 생성됨 +) + +retriever = VectorRetriever.from_sources( + catalog=CATALOG, + embedding=OpenAIEmbedding(), + vectorstore=store, # ← PGVectorStore 주입 +) +# → upsert() 시점에 테이블이 없으면 자동 생성 +# → 같은 chunk_id를 다시 upsert하면 덮어씀 (true upsert) +``` + +### 5-3. 명시적 파이프라인 — from_chunks() + +```python +from lang2sql import VectorRetriever, CatalogChunker, RecursiveCharacterChunker +from lang2sql.integrations.vectorstore import PGVectorStore +from lang2sql.integrations.embedding import OpenAIEmbedding + +store = PGVectorStore( + connection="postgresql://postgres:postgres@localhost:5432/postgres", + table_name="lang2sql_vectors", +) + +chunks = ( + CatalogChunker().split(CATALOG) + + RecursiveCharacterChunker().split(DOCS) +) + +retriever = VectorRetriever.from_chunks( + chunks, + embedding=OpenAIEmbedding(), + vectorstore=store, +) +# save() 없음 — upsert()마다 DB에 즉시 반영 +``` + +### 5-4. 멱등 재인덱싱 + +같은 카탈로그로 여러 번 인덱싱해도 중복이 생기지 않습니다. + +```python +# 1차 실행 +retriever = VectorRetriever.from_sources( + catalog=CATALOG, embedding=embedding, vectorstore=store +) + +# 2차 실행 (카탈로그 변경 후) — 동일 chunk_id는 embedding이 갱신됨 +retriever = VectorRetriever.from_sources( + catalog=UPDATED_CATALOG, embedding=embedding, vectorstore=store +) +# DB에 중복 없이 덮어써짐 (ON CONFLICT DO UPDATE) +``` + +### 5-5. 
자동 테이블 구조 + +첫 `upsert()` 시 아래 DDL이 실행됩니다: + +```sql +CREATE EXTENSION IF NOT EXISTS vector; +CREATE TABLE IF NOT EXISTS lang2sql_vectors ( + id TEXT PRIMARY KEY, + embedding vector(1536) -- 임베딩 모델 차원에 따라 자동 결정 +); +``` + +--- + +## 6. 백엔드 교체 방법 + +`vectorstore=` 파라미터만 바꾸면 됩니다. 나머지 파이프라인은 변경 없습니다. + +```python +from lang2sql import VectorRetriever +from lang2sql.integrations.vectorstore import ( + InMemoryVectorStore, + FAISSVectorStore, + PGVectorStore, +) +from lang2sql.integrations.embedding import OpenAIEmbedding + +embedding = OpenAIEmbedding() + +# ① InMemory (기본값) +retriever = VectorRetriever.from_sources( + catalog=CATALOG, embedding=embedding +) + +# ② FAISS — vectorstore= 한 줄 교체 +retriever = VectorRetriever.from_sources( + catalog=CATALOG, + embedding=embedding, + vectorstore=FAISSVectorStore(index_path="./index/catalog.faiss"), +) + +# ③ pgvector — vectorstore= 한 줄 교체 +retriever = VectorRetriever.from_sources( + catalog=CATALOG, + embedding=embedding, + vectorstore=PGVectorStore( + connection="postgresql://postgres:postgres@localhost:5432/postgres" + ), +) +``` + +--- + +## 7. 커스텀 벡터 저장소 직접 구현하기 + +`VectorStorePort` Protocol을 만족하는 클래스를 만들면 됩니다. +Chroma, Qdrant, Weaviate 등 어떤 벡터 DB든 연결 가능합니다. 
+ +```python +from lang2sql import VectorStorePort # Protocol + +class ChromaVectorStore: + """Chroma를 lang2sql VectorStorePort에 연결하는 어댑터.""" + + def __init__(self, collection_name: str = "lang2sql"): + import chromadb + self._client = chromadb.Client() + self._col = self._client.get_or_create_collection(collection_name) + + def upsert(self, ids: list[str], vectors: list[list[float]]) -> None: + self._col.upsert(ids=ids, embeddings=vectors) + + def search(self, vector: list[float], k: int) -> list[tuple[str, float]]: + results = self._col.query(query_embeddings=[vector], n_results=k) + ids = results["ids"][0] + dists = results["distances"][0] + return [(id_, 1.0 - dist) for id_, dist in zip(ids, dists)] + + +# 사용 +retriever = VectorRetriever.from_sources( + catalog=CATALOG, + embedding=OpenAIEmbedding(), + vectorstore=ChromaVectorStore("my_catalog"), +) +``` + +구현해야 할 메서드는 두 개뿐입니다: + +| 메서드 | 시그니처 | 역할 | +|--------|---------|------| +| `upsert` | `(ids: list[str], vectors: list[list[float]]) -> None` | 벡터 저장 | +| `search` | `(vector: list[float], k: int) -> list[tuple[str, float]]` | 유사도 검색 → `(chunk_id, score)`, score 높을수록 유사 | + +--- + +## 8. 전체 체크리스트 — API 키 없이 실행 + +아래 코드는 `FakeEmbedding`으로 API 키 없이 세 백엔드를 모두 검증합니다. +pgvector 테스트는 `TEST_POSTGRES_URL` 환경변수가 있을 때만 실행됩니다. + +```python +""" +벡터 저장소 백엔드 전체 체크리스트 +API 키 없이 FakeEmbedding으로 실행 가능합니다. + +실행: + python docs/tutorials/vector-store-backends.md # ← 이 블록만 별도 .py로 저장 후 실행 + +pgvector 테스트 포함: + TEST_POSTGRES_URL="postgresql://postgres:postgres@localhost:5432/postgres" \\ + python check_backends.py +""" + +import os + +# ── 공통 픽스처 ──────────────────────────────────────────────────────────────── + +class FakeEmbedding: + """테스트용 고정 벡터 임베딩. 
4차원 단위벡터를 반환합니다.""" + def embed_query(self, text: str) -> list[float]: + return [1.0, 0.0, 0.0, 0.0] + + def embed_texts(self, texts: list[str]) -> list[list[float]]: + return [[1.0, 0.0, 0.0, 0.0]] * len(texts) + + +from lang2sql import CatalogEntry, TextDocument, VectorRetriever +from lang2sql import CatalogChunker, RecursiveCharacterChunker + +CATALOG: list[CatalogEntry] = [ + { + "name": "orders", + "description": "고객 주문 정보 테이블", + "columns": {"order_id": "PK", "amount": "금액", "status": "상태"}, + }, + { + "name": "customers", + "description": "고객 마스터 데이터", + "columns": {"customer_id": "PK", "name": "이름", "grade": "등급"}, + }, +] + +DOCS: list[TextDocument] = [ + { + "id": "revenue_def", + "title": "매출 정의", + "content": "매출은 취소 주문을 제외한 순매출 기준이다.", + "source": "docs/revenue.md", + }, +] + +embedding = FakeEmbedding() + + +# ── 1. InMemoryVectorStore ───────────────────────────────────────────────────── + +print("=" * 50) +print("1. InMemoryVectorStore") + +from lang2sql.integrations.vectorstore import InMemoryVectorStore + +retriever = VectorRetriever.from_sources( + catalog=CATALOG, + embedding=embedding, + # vectorstore= 생략 → InMemoryVectorStore 자동 사용 +) +result = retriever("주문 건수") +assert isinstance(result.schemas, list) +assert len(result.schemas) > 0 +print(f" schemas: {[s['name'] for s in result.schemas]}") +print(" ✓ InMemoryVectorStore 정상") + + +# ── 2. FAISSVectorStore ──────────────────────────────────────────────────────── + +print("\n2. FAISSVectorStore") + +import tempfile, pathlib + +faiss = __import__("faiss") # 없으면 ImportError → 아래 try/except +try: + from lang2sql.integrations.vectorstore import FAISSVectorStore + + with tempfile.TemporaryDirectory() as tmpdir: + index_path = str(pathlib.Path(tmpdir) / "catalog.faiss") + + # 2-a. 
from_sources + store = FAISSVectorStore(index_path=index_path) + retriever_f = VectorRetriever.from_sources( + catalog=CATALOG, + embedding=embedding, + vectorstore=store, + ) + result_f = retriever_f("주문 건수") + assert len(result_f.schemas) > 0 + print(f" from_sources schemas: {[s['name'] for s in result_f.schemas]}") + + # 2-b. save / load + store.save() + loaded = FAISSVectorStore.load(index_path) + result_loaded = VectorRetriever.from_sources( + catalog=CATALOG, + embedding=embedding, + vectorstore=loaded, + )("주문 건수") + assert len(result_loaded.schemas) > 0 + print(f" save/load schemas: {[s['name'] for s in result_loaded.schemas]}") + + # 2-c. from_chunks (명시적 파이프라인) + chunks = ( + CatalogChunker().split(CATALOG) + + RecursiveCharacterChunker().split(DOCS) + ) + store2 = FAISSVectorStore() + retriever_fc = VectorRetriever.from_chunks( + chunks, embedding=embedding, vectorstore=store2 + ) + result_fc = retriever_fc("매출 정의") + assert len(result_fc.context) > 0 + print(f" from_chunks context: {result_fc.context[0][:30]}...") + + # 2-d. 예외 처리 + try: + FAISSVectorStore().save() + assert False, "ValueError 미발생" + except ValueError: + pass + + try: + FAISSVectorStore.load("no_such_file.faiss") + assert False, "FileNotFoundError 미발생" + except FileNotFoundError: + pass + + print(" ✓ FAISSVectorStore 정상") + +except ImportError: + print(" ⚠ faiss 미설치 — 건너뜀") + + +# ── 3. PGVectorStore ─────────────────────────────────────────────────────────── + +print("\n3. PGVectorStore") + +PG_URL = os.getenv("TEST_POSTGRES_URL") +if not PG_URL: + print(" ⚠ TEST_POSTGRES_URL 미설정 — 건너뜀") + print(" 실행하려면: TEST_POSTGRES_URL=postgresql://... python