From 768ce0aacbf5730700d6ac6d0f4b71e62ef2de22 Mon Sep 17 00:00:00 2001 From: seyeong Date: Fri, 27 Feb 2026 19:23:30 +0900 Subject: [PATCH 01/12] feat(vectorstore): add FAISSVectorStore with file persistence IndexFlatIP + L2 normalization for cosine similarity. Lazy index init on first upsert(). save/load via .faiss + .meta files. --- .../integrations/vectorstore/faiss_.py | 106 ++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 src/lang2sql/integrations/vectorstore/faiss_.py diff --git a/src/lang2sql/integrations/vectorstore/faiss_.py b/src/lang2sql/integrations/vectorstore/faiss_.py new file mode 100644 index 0000000..2ed5de1 --- /dev/null +++ b/src/lang2sql/integrations/vectorstore/faiss_.py @@ -0,0 +1,106 @@ +from __future__ import annotations + +import json +import pathlib + +from ...core.exceptions import IntegrationMissingError + +try: + import faiss as _faiss + import numpy as _np +except ImportError: + _faiss = None # type: ignore[assignment] + _np = None # type: ignore[assignment] + + +class FAISSVectorStore: + """ + FAISS-backed vector store with optional file persistence. + + Uses IndexFlatIP + L2 normalization for exact cosine similarity. + Index is lazy-initialized on the first upsert() call. + + Known limitation (append-only): + Upserting the same chunk_id twice creates duplicate FAISS entries. + To rebuild a clean index, create a new FAISSVectorStore instance + and run from_chunks() again from scratch. + + Args: + index_path: Optional path for save() / load(). Used as default + path when save() is called without an explicit argument. 
+ + Installation: + pip install faiss-cpu # CPU-only + pip install faiss-gpu # GPU variant + """ + + def __init__(self, index_path: str | None = None) -> None: + if _faiss is None or _np is None: + raise IntegrationMissingError("faiss", hint="pip install faiss-cpu") + self._index_path = index_path + self._index: object | None = None # faiss.IndexFlatIP, None until first upsert + self._ids: list[str] = [] + + # ── VectorStorePort ────────────────────────────────────────────── + + def upsert(self, ids: list[str], vectors: list[list[float]]) -> None: + """L2-normalize and add vectors. Lazy-creates index on first call.""" + arr = _np.array(vectors, dtype=_np.float32) + _faiss.normalize_L2(arr) # in-place cosine trick + if self._index is None: + self._index = _faiss.IndexFlatIP(arr.shape[1]) + self._index.add(arr) + self._ids.extend(ids) + + def search(self, vector: list[float], k: int) -> list[tuple[str, float]]: + """Return (chunk_id, cosine_score) for the k nearest vectors.""" + if self._index is None or self._index.ntotal == 0: + return [] + q = _np.array([vector], dtype=_np.float32) + _faiss.normalize_L2(q) + k = min(k, self._index.ntotal) + scores, positions = self._index.search(q, k) + return [ + (self._ids[int(pos)], float(scores[0][j])) + for j, pos in enumerate(positions[0]) + if pos >= 0 + ] + + # ── Persistence ────────────────────────────────────────────────── + + def save(self, path: str | None = None) -> None: + """ + Write index to {path} and id list to {path}.meta. + Falls back to self._index_path when path is None. + Raises ValueError if no path is available. + Raises RuntimeError if called before any upsert(). + """ + path = path or self._index_path + if path is None: + raise ValueError( + "No path provided and index_path was not set at construction." 
+ ) + if self._index is None: + raise RuntimeError("Cannot save before any upsert() call.") + _faiss.write_index(self._index, path) + pathlib.Path(path + ".meta").write_text( + json.dumps(self._ids), encoding="utf-8" + ) + + @classmethod + def load(cls, path: str) -> "FAISSVectorStore": + """ + Load index from {path} and id list from {path}.meta. + Raises FileNotFoundError if either file is missing. + """ + if _faiss is None or _np is None: + raise IntegrationMissingError("faiss", hint="pip install faiss-cpu") + meta_path = pathlib.Path(path + ".meta") + if not pathlib.Path(path).exists() or not meta_path.exists(): + raise FileNotFoundError( + f"Index files not found: {path}, {path}.meta" + ) + store = cls(index_path=path) + store._index = _faiss.read_index(path) + store._ids = json.loads(meta_path.read_text(encoding="utf-8")) + return store From 700c945d362c11bcb308653348a23245b13ace5a Mon Sep 17 00:00:00 2001 From: seyeong Date: Fri, 27 Feb 2026 19:23:47 +0900 Subject: [PATCH 02/12] feat(vectorstore): add PGVectorStore with pgvector backend True upsert via ON CONFLICT DO UPDATE. Automatic table creation on first upsert(). Cosine similarity via pgvector <=> operator. 
--- .../integrations/vectorstore/pgvector_.py | 88 +++++++++++++++++++ 1 file changed, 88 insertions(+) create mode 100644 src/lang2sql/integrations/vectorstore/pgvector_.py diff --git a/src/lang2sql/integrations/vectorstore/pgvector_.py b/src/lang2sql/integrations/vectorstore/pgvector_.py new file mode 100644 index 0000000..4ec7aa5 --- /dev/null +++ b/src/lang2sql/integrations/vectorstore/pgvector_.py @@ -0,0 +1,88 @@ +from __future__ import annotations + +from ...core.exceptions import IntegrationMissingError + +try: + import psycopg2 as _psycopg2 + from pgvector.psycopg2 import register_vector as _register_vector +except ImportError: + _psycopg2 = None # type: ignore[assignment] + _register_vector = None # type: ignore[assignment] + + +class PGVectorStore: + """ + PostgreSQL pgvector-backed vector store. + + True upsert semantics via ON CONFLICT DO UPDATE — idempotent, + no duplicates across multiple from_chunks() runs. + Table is created automatically on first upsert() call. + + Args: + connection: PostgreSQL connection URL. + e.g. "postgresql://user:pass@localhost:5432/mydb" + table_name: Name of the vector table. Default "lang2sql_vectors". 
+ + Installation: + pip install psycopg2-binary pgvector + + Quick start with Docker: + docker run -d -e POSTGRES_PASSWORD=postgres \\ + -p 5432:5432 pgvector/pgvector:pg16 + """ + + def __init__( + self, + *, + connection: str, + table_name: str = "lang2sql_vectors", + ) -> None: + if _psycopg2 is None or _register_vector is None: + raise IntegrationMissingError( + "psycopg2", hint="pip install psycopg2-binary pgvector" + ) + self._conn = _psycopg2.connect(connection) + _register_vector(self._conn) + self._table = table_name + self._ready = False # True after first _ensure_table() + + def _ensure_table(self, dim: int) -> None: + if self._ready: + return + with self._conn.cursor() as cur: + cur.execute("CREATE EXTENSION IF NOT EXISTS vector;") + cur.execute( + f"CREATE TABLE IF NOT EXISTS {self._table} " + f"(id TEXT PRIMARY KEY, embedding vector({dim}));" + ) + self._conn.commit() + self._ready = True + + # ── VectorStorePort ────────────────────────────────────────────── + + def upsert(self, ids: list[str], vectors: list[list[float]]) -> None: + """Create table if needed, then upsert all (id, vector) pairs.""" + self._ensure_table(len(vectors[0])) + with self._conn.cursor() as cur: + for id_, vec in zip(ids, vectors): + cur.execute( + f"INSERT INTO {self._table} (id, embedding) VALUES (%s, %s) " + "ON CONFLICT (id) DO UPDATE SET embedding = EXCLUDED.embedding;", + (id_, vec), + ) + self._conn.commit() + + def search(self, vector: list[float], k: int) -> list[tuple[str, float]]: + """Return (chunk_id, cosine_score) for the k nearest vectors. + Returns [] if the table has not been created yet. 
+ """ + if not self._ready: + return [] + with self._conn.cursor() as cur: + cur.execute( + f"SELECT id, 1 - (embedding <=> %s::vector) AS score " + f"FROM {self._table} " + f"ORDER BY embedding <=> %s::vector LIMIT %s;", + (vector, vector, k), + ) + return [(row[0], float(row[1])) for row in cur.fetchall()] From 22366d72b87fe8a3f435facad0645178cae7d815 Mon Sep 17 00:00:00 2001 From: seyeong Date: Fri, 27 Feb 2026 19:24:15 +0900 Subject: [PATCH 03/12] feat(vectorstore): export FAISSVectorStore and PGVectorStore to public API --- src/lang2sql/__init__.py | 5 +++++ src/lang2sql/integrations/vectorstore/__init__.py | 4 +++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/src/lang2sql/__init__.py b/src/lang2sql/__init__.py index 9ab711e..2dcdc49 100644 --- a/src/lang2sql/__init__.py +++ b/src/lang2sql/__init__.py @@ -1,3 +1,5 @@ +from .integrations.vectorstore.faiss_ import FAISSVectorStore +from .integrations.vectorstore.pgvector_ import PGVectorStore from .components.execution.sql_executor import SQLExecutor from .components.generation.sql_generator import SQLGenerator from .components.loaders.directory_ import DirectoryLoader @@ -59,4 +61,7 @@ "Lang2SQLError", "ComponentError", "IntegrationMissingError", + # Vector store backends + "FAISSVectorStore", + "PGVectorStore", ] diff --git a/src/lang2sql/integrations/vectorstore/__init__.py b/src/lang2sql/integrations/vectorstore/__init__.py index bddace4..eeab249 100644 --- a/src/lang2sql/integrations/vectorstore/__init__.py +++ b/src/lang2sql/integrations/vectorstore/__init__.py @@ -1,3 +1,5 @@ +from .faiss_ import FAISSVectorStore from .inmemory_ import InMemoryVectorStore +from .pgvector_ import PGVectorStore -__all__ = ["InMemoryVectorStore"] +__all__ = ["InMemoryVectorStore", "FAISSVectorStore", "PGVectorStore"] From 417213f4295318ce2f81223483275834cc55c99c Mon Sep 17 00:00:00 2001 From: seyeong Date: Fri, 27 Feb 2026 19:24:29 +0900 Subject: [PATCH 04/12] test(vectorstore): add 8 tests for 
"""
FAISSVectorStore integration tests.

All tests are auto-skipped when faiss-cpu is not installed.
"""
import pytest

faiss = pytest.importorskip("faiss")  # skip entire module if not installed

from lang2sql.integrations.vectorstore.faiss_ import FAISSVectorStore


# ── helpers ──────────────────────────────────────────────────────────────────


def _ortho_vectors() -> list[tuple[str, list[float]]]:
    """4 orthogonal unit vectors for deterministic cosine tests."""
    return [
        ("a", [1.0, 0.0, 0.0, 0.0]),
        ("b", [0.0, 1.0, 0.0, 0.0]),
        ("c", [0.0, 0.0, 1.0, 0.0]),
        ("d", [0.0, 0.0, 0.0, 1.0]),
    ]


@pytest.fixture
def store() -> FAISSVectorStore:
    """A fresh, empty store per test."""
    return FAISSVectorStore()


def _populate(store: FAISSVectorStore) -> None:
    """Insert the four orthogonal unit vectors into *store*."""
    items = _ortho_vectors()
    ids = [item[0] for item in items]
    vecs = [item[1] for item in items]
    store.upsert(ids, vecs)


# ── tests ─────────────────────────────────────────────────────────────────────


def test_faiss_upsert_and_search_returns_closest(store):
    """Query vector returns its own id at rank 1."""
    _populate(store)
    results = store.search([1.0, 0.0, 0.0, 0.0], k=1)
    assert len(results) == 1
    assert results[0][0] == "a"


def test_faiss_cosine_score_of_identical_vector(store):
    """Identical query → score ≈ 1.0."""
    _populate(store)
    results = store.search([1.0, 0.0, 0.0, 0.0], k=1)
    assert abs(results[0][1] - 1.0) < 1e-5


def test_faiss_upsert_merge_preserves_prior_entries(store):
    """Second upsert() call doesn't lose entries from the first."""
    store.upsert(["a"], [[1.0, 0.0, 0.0, 0.0]])
    store.upsert(["b"], [[0.0, 1.0, 0.0, 0.0]])

    # "a" should still be retrievable
    results = store.search([1.0, 0.0, 0.0, 0.0], k=2)
    ids = [r[0] for r in results]
    assert "a" in ids


def test_faiss_search_respects_k(store):
    """len(results) <= k."""
    _populate(store)
    results = store.search([1.0, 0.0, 0.0, 0.0], k=2)
    assert len(results) <= 2


def test_faiss_search_on_empty_store_returns_empty(store):
    """[] before any upsert()."""
    results = store.search([1.0, 0.0, 0.0, 0.0], k=5)
    assert results == []


def test_faiss_save_and_load_roundtrip(store, tmp_path):
    """save() → load() → search() returns same results."""
    _populate(store)
    index_path = str(tmp_path / "catalog.faiss")
    store.save(index_path)

    loaded = FAISSVectorStore.load(index_path)
    original_results = store.search([1.0, 0.0, 0.0, 0.0], k=1)
    loaded_results = loaded.search([1.0, 0.0, 0.0, 0.0], k=1)

    assert loaded_results[0][0] == original_results[0][0]
    assert abs(loaded_results[0][1] - original_results[0][1]) < 1e-5


def test_faiss_save_without_path_raises(store):
    """save() with no path and no index_path → ValueError."""
    _populate(store)
    with pytest.raises(ValueError, match="No path provided"):
        store.save()


def test_faiss_load_nonexistent_path_raises():
    """load("nonexistent") → FileNotFoundError."""
    with pytest.raises(FileNotFoundError):
        FAISSVectorStore.load("nonexistent_path_that_does_not_exist.faiss")
"""
PGVectorStore integration tests.

Requires a live PostgreSQL instance with pgvector installed.
Skipped when the TEST_POSTGRES_URL env variable is not set, and when the
psycopg2 driver is not installed.

Example:
    TEST_POSTGRES_URL="postgresql://postgres:postgres@localhost:5432/test" \\
        pytest tests/test_integrations_pgvector_vectorstore.py -v
"""
import os
import pytest
from uuid import uuid4

pytestmark = pytest.mark.skipif(
    not os.getenv("TEST_POSTGRES_URL"),
    reason="TEST_POSTGRES_URL not set — skipping pgvector integration tests",
)

# Every test constructs a PGVectorStore, which requires the psycopg2 driver.
# Skip the whole module when it is missing instead of erroring at collection
# time (the skipif marker above only covers the env-var case).
pytest.importorskip("psycopg2")

from lang2sql.integrations.vectorstore.pgvector_ import PGVectorStore


# ── helpers ──────────────────────────────────────────────────────────────────


def _unique_table() -> str:
    """Random table name so parallel / repeated runs don't collide."""
    return f"test_{uuid4().hex[:8]}"


def _make_store(table_name: str) -> PGVectorStore:
    url = os.environ["TEST_POSTGRES_URL"]
    return PGVectorStore(connection=url, table_name=table_name)


def _drop_table(store: PGVectorStore, table_name: str) -> None:
    with store._conn.cursor() as cur:
        cur.execute(f"DROP TABLE IF EXISTS {table_name};")
    store._conn.commit()


# ── tests ─────────────────────────────────────────────────────────────────────


def test_pgvector_upsert_and_search():
    """Query vector returns its own id."""
    table = _unique_table()
    store = _make_store(table)
    try:
        store.upsert(["a"], [[1.0, 0.0, 0.0, 0.0]])
        results = store.search([1.0, 0.0, 0.0, 0.0], k=1)
        assert len(results) == 1
        assert results[0][0] == "a"
    finally:
        _drop_table(store, table)
        store._conn.close()


def test_pgvector_upsert_is_idempotent():
    """Same id upserted twice → exactly one row in DB."""
    table = _unique_table()
    store = _make_store(table)
    try:
        store.upsert(["a"], [[1.0, 0.0, 0.0, 0.0]])
        store.upsert(["a"], [[0.5, 0.5, 0.0, 0.0]])  # overwrite same id

        with store._conn.cursor() as cur:
            cur.execute(f"SELECT COUNT(*) FROM {table} WHERE id = 'a';")
            count = cur.fetchone()[0]
        assert count == 1
    finally:
        _drop_table(store, table)
        store._conn.close()


def test_pgvector_search_score_in_range():
    """Score ∈ [-1, 1]."""
    table = _unique_table()
    store = _make_store(table)
    try:
        store.upsert(
            ["a", "b", "c"],
            [
                [1.0, 0.0, 0.0, 0.0],
                [0.0, 1.0, 0.0, 0.0],
                [0.0, 0.0, 1.0, 0.0],
            ],
        )
        results = store.search([1.0, 0.0, 0.0, 0.0], k=3)
        for _, score in results:
            assert -1.0 <= score <= 1.0 + 1e-6
    finally:
        _drop_table(store, table)
        store._conn.close()


def test_pgvector_search_respects_k():
    """len(results) <= k."""
    table = _unique_table()
    store = _make_store(table)
    try:
        store.upsert(
            ["a", "b", "c", "d"],
            [
                [1.0, 0.0, 0.0, 0.0],
                [0.0, 1.0, 0.0, 0.0],
                [0.0, 0.0, 1.0, 0.0],
                [0.0, 0.0, 0.0, 1.0],
            ],
        )
        results = store.search([1.0, 0.0, 0.0, 0.0], k=2)
        assert len(results) <= 2
    finally:
        _drop_table(store, table)
        store._conn.close()


def test_pgvector_table_created_automatically():
    """Table exists in information_schema after first upsert()."""
    table = _unique_table()
    store = _make_store(table)
    try:
        store.upsert(["x"], [[1.0, 0.0]])
        with store._conn.cursor() as cur:
            cur.execute(
                "SELECT COUNT(*) FROM information_schema.tables "
                "WHERE table_name = %s;",
                (table,),
            )
            count = cur.fetchone()[0]
        assert count == 1
    finally:
        _drop_table(store, table)
        store._conn.close()


def test_pgvector_search_empty_store_returns_empty():
    """[] before any upsert()."""
    table = _unique_table()
    store = _make_store(table)
    try:
        results = store.search([1.0, 0.0, 0.0, 0.0], k=5)
        assert results == []
    finally:
        store._conn.close()
+ +| 패키지 | 고정 버전 | 역할 | +|--------|----------|------| +| `numpy` | `<2.0` | InMemoryVectorStore 행렬 연산 | +| `faiss-cpu` | `==1.10.0` | FAISSVectorStore 인덱스 엔진 | +| `psycopg2-binary` | `>=2.9.10,<3.0.0` | PGVectorStore PostgreSQL 연결 | +| `pgvector` | `==0.3.6` | PGVectorStore `vector` 타입 직렬화 | + +> **GPU 가속이 필요한 경우**: `faiss-cpu`를 직접 `faiss-gpu`로 교체할 수 있습니다. +> pyproject.toml의 `faiss-cpu==1.10.0`을 `faiss-gpu==1.10.0`으로 변경 후 `uv sync`. + +--- + +## 3. InMemoryVectorStore — 기본값 + +numpy 기반 브루트 포스 코사인 유사도. `vectorstore=` 를 생략하면 자동으로 사용됩니다. + +**특징:** +- true upsert — 동일 chunk_id를 두 번 넣으면 덮어씀 +- 검색 시 매번 행렬 재구성 (수만 벡터까지 충분히 빠름) +- 프로세스 종료 시 인덱스 소멸 + +```python +from lang2sql import VectorRetriever, CatalogEntry +from lang2sql.integrations.embedding import OpenAIEmbedding + +CATALOG: list[CatalogEntry] = [ + { + "name": "orders", + "description": "고객 주문 정보", + "columns": {"order_id": "PK", "amount": "금액", "status": "상태"}, + }, +] + +# vectorstore= 생략 → InMemoryVectorStore 자동 사용 +retriever = VectorRetriever.from_sources( + catalog=CATALOG, + embedding=OpenAIEmbedding(), +) + +result = retriever("주문 건수") +print(result.schemas) +``` + +--- + +## 4. FAISSVectorStore — 로컬 파일 영속성 + +Facebook AI Research의 벡터 검색 라이브러리. +`IndexFlatIP` + L2 정규화로 정확한 코사인 유사도를 계산합니다. + +### 4-1. 기본 사용법 — from_sources() + +```python +from lang2sql import VectorRetriever +from lang2sql.integrations.vectorstore import FAISSVectorStore +from lang2sql.integrations.embedding import OpenAIEmbedding + +store = FAISSVectorStore(index_path="./index/catalog.faiss") + +retriever = VectorRetriever.from_sources( + catalog=CATALOG, + embedding=OpenAIEmbedding(), + vectorstore=store, # ← FAISSVectorStore 주입 +) + +# 인덱스를 파일로 저장 +store.save() +# → ./index/catalog.faiss (FAISS 바이너리) +# → ./index/catalog.faiss.meta (chunk id 목록 JSON) +``` + +### 4-2. 
명시적 파이프라인 — from_chunks() + +```python +from lang2sql import VectorRetriever, CatalogChunker, RecursiveCharacterChunker +from lang2sql import TextDocument +from lang2sql.integrations.vectorstore import FAISSVectorStore +from lang2sql.integrations.embedding import OpenAIEmbedding + +embedding = OpenAIEmbedding() +store = FAISSVectorStore(index_path="./index/catalog.faiss") + +DOCS: list[TextDocument] = [ + { + "id": "revenue_def", + "title": "매출 정의", + "content": "매출은 취소 주문을 제외한 순매출 기준이다.", + "source": "docs/revenue.md", + }, +] + +chunks = ( + CatalogChunker().split(CATALOG) + + RecursiveCharacterChunker(chunk_size=800, chunk_overlap=80).split(DOCS) +) + +retriever = VectorRetriever.from_chunks( + chunks, + embedding=embedding, + vectorstore=store, +) + +store.save() +``` + +### 4-3. 재시작 시 로드 + +```python +from lang2sql.integrations.vectorstore import FAISSVectorStore +from lang2sql import VectorRetriever +from lang2sql.integrations.embedding import OpenAIEmbedding + +# 파일에서 바로 로드 — 임베딩/인덱싱 없이 즉시 검색 가능 +store = FAISSVectorStore.load("./index/catalog.faiss") + +# registry는 from_chunks()가 자동 복원 불가 → 재인덱싱 필요 +# 실전에서는 프로세스 시작 시 from_sources()를 다시 실행하는 패턴 권장 +retriever = VectorRetriever.from_sources( + catalog=CATALOG, + embedding=OpenAIEmbedding(), + vectorstore=store, # 이미 채워진 store — upsert() 추가로 호출됨 (append) +) +``` + +> **append-only 제한**: `FAISSVectorStore`는 동일 chunk_id를 두 번 upsert하면 +> FAISS 인덱스에 두 개의 항목이 생깁니다. 깨끗한 인덱스가 필요하면 +> 새 `FAISSVectorStore()` 인스턴스로 처음부터 인덱싱하세요. + +### 4-4. save/load 예외 처리 + +```python +# index_path 없이 생성한 경우 save()는 경로 필요 +store = FAISSVectorStore() +store.upsert(["a"], [[1.0, 0.0]]) +store.save("./index/catalog.faiss") # 경로 직접 지정 + +# upsert() 전에 save() 호출 → RuntimeError +store_empty = FAISSVectorStore(index_path="./out.faiss") +store_empty.save() # RuntimeError: Cannot save before any upsert() call. + +# 존재하지 않는 파일 로드 → FileNotFoundError +FAISSVectorStore.load("./nonexistent.faiss") # FileNotFoundError +``` + +--- + +## 5. 
PGVectorStore — PostgreSQL 영속성 + +PostgreSQL의 `pgvector` 확장을 사용합니다. +`ON CONFLICT DO UPDATE` true upsert로 중복 없이 멱등 인덱싱이 가능합니다. + +### 5-1. PostgreSQL 빠른 시작 (Docker) + +```bash +docker run -d \ + --name pgvector \ + -e POSTGRES_PASSWORD=postgres \ + -p 5432:5432 \ + pgvector/pgvector:pg16 +``` + +### 5-2. 기본 사용법 — from_sources() + +```python +from lang2sql import VectorRetriever +from lang2sql.integrations.vectorstore import PGVectorStore +from lang2sql.integrations.embedding import OpenAIEmbedding + +store = PGVectorStore( + connection="postgresql://postgres:postgres@localhost:5432/postgres", + table_name="lang2sql_vectors", # 자동 생성됨 +) + +retriever = VectorRetriever.from_sources( + catalog=CATALOG, + embedding=OpenAIEmbedding(), + vectorstore=store, # ← PGVectorStore 주입 +) +# → upsert() 시점에 테이블이 없으면 자동 생성 +# → 같은 chunk_id를 다시 upsert하면 덮어씀 (true upsert) +``` + +### 5-3. 명시적 파이프라인 — from_chunks() + +```python +from lang2sql import VectorRetriever, CatalogChunker, RecursiveCharacterChunker +from lang2sql.integrations.vectorstore import PGVectorStore +from lang2sql.integrations.embedding import OpenAIEmbedding + +store = PGVectorStore( + connection="postgresql://postgres:postgres@localhost:5432/postgres", + table_name="lang2sql_vectors", +) + +chunks = ( + CatalogChunker().split(CATALOG) + + RecursiveCharacterChunker().split(DOCS) +) + +retriever = VectorRetriever.from_chunks( + chunks, + embedding=OpenAIEmbedding(), + vectorstore=store, +) +# save() 없음 — upsert()마다 DB에 즉시 반영 +``` + +### 5-4. 멱등 재인덱싱 + +같은 카탈로그로 여러 번 인덱싱해도 중복이 생기지 않습니다. + +```python +# 1차 실행 +retriever = VectorRetriever.from_sources( + catalog=CATALOG, embedding=embedding, vectorstore=store +) + +# 2차 실행 (카탈로그 변경 후) — 동일 chunk_id는 embedding이 갱신됨 +retriever = VectorRetriever.from_sources( + catalog=UPDATED_CATALOG, embedding=embedding, vectorstore=store +) +# DB에 중복 없이 덮어써짐 (ON CONFLICT DO UPDATE) +``` + +### 5-5. 
자동 테이블 구조 + +첫 `upsert()` 시 아래 DDL이 실행됩니다: + +```sql +CREATE EXTENSION IF NOT EXISTS vector; +CREATE TABLE IF NOT EXISTS lang2sql_vectors ( + id TEXT PRIMARY KEY, + embedding vector(1536) -- 임베딩 모델 차원에 따라 자동 결정 +); +``` + +--- + +## 6. 백엔드 교체 방법 + +`vectorstore=` 파라미터만 바꾸면 됩니다. 나머지 파이프라인은 변경 없습니다. + +```python +from lang2sql import VectorRetriever +from lang2sql.integrations.vectorstore import ( + InMemoryVectorStore, + FAISSVectorStore, + PGVectorStore, +) +from lang2sql.integrations.embedding import OpenAIEmbedding + +embedding = OpenAIEmbedding() + +# ① InMemory (기본값) +retriever = VectorRetriever.from_sources( + catalog=CATALOG, embedding=embedding +) + +# ② FAISS — vectorstore= 한 줄 교체 +retriever = VectorRetriever.from_sources( + catalog=CATALOG, + embedding=embedding, + vectorstore=FAISSVectorStore(index_path="./index/catalog.faiss"), +) + +# ③ pgvector — vectorstore= 한 줄 교체 +retriever = VectorRetriever.from_sources( + catalog=CATALOG, + embedding=embedding, + vectorstore=PGVectorStore( + connection="postgresql://postgres:postgres@localhost:5432/postgres" + ), +) +``` + +--- + +## 7. 커스텀 벡터 저장소 직접 구현하기 + +`VectorStorePort` Protocol을 만족하는 클래스를 만들면 됩니다. +Chroma, Qdrant, Weaviate 등 어떤 벡터 DB든 연결 가능합니다. 
+ +```python +from lang2sql import VectorStorePort # Protocol + +class ChromaVectorStore: + """Chroma를 lang2sql VectorStorePort에 연결하는 어댑터.""" + + def __init__(self, collection_name: str = "lang2sql"): + import chromadb + self._client = chromadb.Client() + self._col = self._client.get_or_create_collection(collection_name) + + def upsert(self, ids: list[str], vectors: list[list[float]]) -> None: + self._col.upsert(ids=ids, embeddings=vectors) + + def search(self, vector: list[float], k: int) -> list[tuple[str, float]]: + results = self._col.query(query_embeddings=[vector], n_results=k) + ids = results["ids"][0] + dists = results["distances"][0] + return [(id_, 1.0 - dist) for id_, dist in zip(ids, dists)] + + +# 사용 +retriever = VectorRetriever.from_sources( + catalog=CATALOG, + embedding=OpenAIEmbedding(), + vectorstore=ChromaVectorStore("my_catalog"), +) +``` + +구현해야 할 메서드는 두 개뿐입니다: + +| 메서드 | 시그니처 | 역할 | +|--------|---------|------| +| `upsert` | `(ids: list[str], vectors: list[list[float]]) -> None` | 벡터 저장 | +| `search` | `(vector: list[float], k: int) -> list[tuple[str, float]]` | 유사도 검색 → `(chunk_id, score)`, score 높을수록 유사 | + +--- + +## 8. 전체 체크리스트 — API 키 없이 실행 + +아래 코드는 `FakeEmbedding`으로 API 키 없이 세 백엔드를 모두 검증합니다. +pgvector 테스트는 `TEST_POSTGRES_URL` 환경변수가 있을 때만 실행됩니다. + +```python +""" +벡터 저장소 백엔드 전체 체크리스트 +API 키 없이 FakeEmbedding으로 실행 가능합니다. + +실행: + python docs/tutorials/vector-store-backends.md # ← 이 블록만 별도 .py로 저장 후 실행 + +pgvector 테스트 포함: + TEST_POSTGRES_URL="postgresql://postgres:postgres@localhost:5432/postgres" \\ + python check_backends.py +""" + +import os + +# ── 공통 픽스처 ──────────────────────────────────────────────────────────────── + +class FakeEmbedding: + """테스트용 고정 벡터 임베딩. 
4차원 단위벡터를 반환합니다.""" + def embed_query(self, text: str) -> list[float]: + return [1.0, 0.0, 0.0, 0.0] + + def embed_texts(self, texts: list[str]) -> list[list[float]]: + return [[1.0, 0.0, 0.0, 0.0]] * len(texts) + + +from lang2sql import CatalogEntry, TextDocument, VectorRetriever +from lang2sql import CatalogChunker, RecursiveCharacterChunker + +CATALOG: list[CatalogEntry] = [ + { + "name": "orders", + "description": "고객 주문 정보 테이블", + "columns": {"order_id": "PK", "amount": "금액", "status": "상태"}, + }, + { + "name": "customers", + "description": "고객 마스터 데이터", + "columns": {"customer_id": "PK", "name": "이름", "grade": "등급"}, + }, +] + +DOCS: list[TextDocument] = [ + { + "id": "revenue_def", + "title": "매출 정의", + "content": "매출은 취소 주문을 제외한 순매출 기준이다.", + "source": "docs/revenue.md", + }, +] + +embedding = FakeEmbedding() + + +# ── 1. InMemoryVectorStore ───────────────────────────────────────────────────── + +print("=" * 50) +print("1. InMemoryVectorStore") + +from lang2sql.integrations.vectorstore import InMemoryVectorStore + +retriever = VectorRetriever.from_sources( + catalog=CATALOG, + embedding=embedding, + # vectorstore= 생략 → InMemoryVectorStore 자동 사용 +) +result = retriever("주문 건수") +assert isinstance(result.schemas, list) +assert len(result.schemas) > 0 +print(f" schemas: {[s['name'] for s in result.schemas]}") +print(" ✓ InMemoryVectorStore 정상") + + +# ── 2. FAISSVectorStore ──────────────────────────────────────────────────────── + +print("\n2. FAISSVectorStore") + +import tempfile, pathlib + +faiss = __import__("faiss") # 없으면 ImportError → 아래 try/except +try: + from lang2sql.integrations.vectorstore import FAISSVectorStore + + with tempfile.TemporaryDirectory() as tmpdir: + index_path = str(pathlib.Path(tmpdir) / "catalog.faiss") + + # 2-a. 
from_sources + store = FAISSVectorStore(index_path=index_path) + retriever_f = VectorRetriever.from_sources( + catalog=CATALOG, + embedding=embedding, + vectorstore=store, + ) + result_f = retriever_f("주문 건수") + assert len(result_f.schemas) > 0 + print(f" from_sources schemas: {[s['name'] for s in result_f.schemas]}") + + # 2-b. save / load + store.save() + loaded = FAISSVectorStore.load(index_path) + result_loaded = VectorRetriever.from_sources( + catalog=CATALOG, + embedding=embedding, + vectorstore=loaded, + )("주문 건수") + assert len(result_loaded.schemas) > 0 + print(f" save/load schemas: {[s['name'] for s in result_loaded.schemas]}") + + # 2-c. from_chunks (명시적 파이프라인) + chunks = ( + CatalogChunker().split(CATALOG) + + RecursiveCharacterChunker().split(DOCS) + ) + store2 = FAISSVectorStore() + retriever_fc = VectorRetriever.from_chunks( + chunks, embedding=embedding, vectorstore=store2 + ) + result_fc = retriever_fc("매출 정의") + assert len(result_fc.context) > 0 + print(f" from_chunks context: {result_fc.context[0][:30]}...") + + # 2-d. 예외 처리 + try: + FAISSVectorStore().save() + assert False, "ValueError 미발생" + except ValueError: + pass + + try: + FAISSVectorStore.load("no_such_file.faiss") + assert False, "FileNotFoundError 미발생" + except FileNotFoundError: + pass + + print(" ✓ FAISSVectorStore 정상") + +except ImportError: + print(" ⚠ faiss 미설치 — 건너뜀") + + +# ── 3. PGVectorStore ─────────────────────────────────────────────────────────── + +print("\n3. PGVectorStore") + +PG_URL = os.getenv("TEST_POSTGRES_URL") +if not PG_URL: + print(" ⚠ TEST_POSTGRES_URL 미설정 — 건너뜀") + print(" 실행하려면: TEST_POSTGRES_URL=postgresql://... python