Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
598 changes: 598 additions & 0 deletions docs/tutorials/vector-store-backends.md

Large diffs are not rendered by default.

5 changes: 5 additions & 0 deletions src/lang2sql/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from .integrations.vectorstore.faiss_ import FAISSVectorStore
from .integrations.vectorstore.pgvector_ import PGVectorStore
from .components.execution.sql_executor import SQLExecutor
from .components.generation.sql_generator import SQLGenerator
from .components.loaders.directory_ import DirectoryLoader
Expand Down Expand Up @@ -59,4 +61,7 @@
"Lang2SQLError",
"ComponentError",
"IntegrationMissingError",
# Vector store backends
"FAISSVectorStore",
"PGVectorStore",
]
6 changes: 5 additions & 1 deletion src/lang2sql/components/loaders/directory_.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations

import warnings
from pathlib import Path

from ...core.catalog import TextDocument
Expand Down Expand Up @@ -53,5 +54,8 @@ def load(self) -> list[TextDocument]:
loader = self._loaders.get(file.suffix.lower())
if loader is None:
continue
docs.extend(loader.load(str(file)))
try:
docs.extend(loader.load(str(file)))
except Exception as e:
warnings.warn(f"Failed to load {file}: {e}", stacklevel=2)
return docs
3 changes: 2 additions & 1 deletion src/lang2sql/components/loaders/markdown_.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,10 @@
from pathlib import Path

from ...core.catalog import TextDocument
from ...core.ports import DocumentLoaderPort


class MarkdownLoader:
class MarkdownLoader(DocumentLoaderPort):
"""
Markdown file(s) (.md) → list[TextDocument].

Expand Down
3 changes: 2 additions & 1 deletion src/lang2sql/components/loaders/plaintext_.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,10 @@
from pathlib import Path

from ...core.catalog import TextDocument
from ...core.ports import DocumentLoaderPort


class PlainTextLoader:
class PlainTextLoader(DocumentLoaderPort):
"""
Plain text file(s) (.txt, etc.) → list[TextDocument].

Expand Down
4 changes: 3 additions & 1 deletion src/lang2sql/components/retrieval/chunker.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ def chunk(self, entry: CatalogEntry) -> list[IndexedChunk]:
return chunks


class RecursiveCharacterChunker:
class RecursiveCharacterChunker(DocumentChunkerPort):
"""
Hierarchical separator-based document chunker. No external dependencies.

Expand Down Expand Up @@ -161,6 +161,8 @@ def chunk(self, doc: TextDocument) -> list[IndexedChunk]:

def _split(self, text: str, separators: list[str]) -> list[str]:
"""Recursively try separators until all chunks fit within chunk_size."""
if not separators:
return [text] if text else []
chunks: list[str] = []
separator = separators[-1] # fallback: character-level split

Expand Down
4 changes: 2 additions & 2 deletions src/lang2sql/core/hooks.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,12 +27,12 @@ class TraceHook(Protocol):
def on_event(self, event: Event) -> None: ...


class NullHook:
class NullHook(TraceHook):
def on_event(self, event: Event) -> None:
return


class MemoryHook:
class MemoryHook(TraceHook):
def __init__(self) -> None:
self.events: list[Event] = []

Expand Down
3 changes: 2 additions & 1 deletion src/lang2sql/integrations/chunking/semantic_.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,10 @@
from ...core.catalog import IndexedChunk, TextDocument
from ...core.exceptions import IntegrationMissingError
from ...core.ports import EmbeddingPort
from ...components.retrieval.chunker import DocumentChunkerPort


class SemanticChunker:
class SemanticChunker(DocumentChunkerPort):
"""
Embedding-based semantic chunker. Optional — explicit opt-in only.

Expand Down
3 changes: 2 additions & 1 deletion src/lang2sql/integrations/db/sqlalchemy_.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from typing import Any

from ...core.exceptions import IntegrationMissingError
from ...core.ports import DBPort

try:
from sqlalchemy import create_engine, text as sa_text
Expand All @@ -13,7 +14,7 @@
Engine = None # type: ignore[assignment,misc]


class SQLAlchemyDB:
class SQLAlchemyDB(DBPort):
"""DBPort implementation backed by SQLAlchemy 2.x."""

def __init__(self, url: str) -> None:
Expand Down
3 changes: 2 additions & 1 deletion src/lang2sql/integrations/embedding/openai_.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
from __future__ import annotations

from ...core.exceptions import IntegrationMissingError
from ...core.ports import EmbeddingPort

try:
import openai as _openai
except ImportError:
_openai = None # type: ignore[assignment]


class OpenAIEmbedding:
class OpenAIEmbedding(EmbeddingPort):
"""EmbeddingPort implementation backed by OpenAI Embeddings API."""

def __init__(
Expand Down
10 changes: 7 additions & 3 deletions src/lang2sql/integrations/llm/anthropic_.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,34 @@
from __future__ import annotations

from ...core.exceptions import IntegrationMissingError
from ...core.ports import LLMPort

try:
import anthropic as _anthropic
except ImportError:
_anthropic = None # type: ignore[assignment]


class AnthropicLLM:
class AnthropicLLM(LLMPort):
"""LLMPort implementation backed by the Anthropic Messages API."""

def __init__(self, *, model: str, api_key: str | None = None) -> None:
def __init__(
self, *, model: str, api_key: str | None = None, max_tokens: int = 4096
) -> None:
if _anthropic is None:
raise IntegrationMissingError(
"anthropic", hint="pip install anthropic # or: uv sync"
)
self._client = _anthropic.Anthropic(api_key=api_key)
self._model = model
self._max_tokens = max_tokens

def invoke(self, messages: list[dict[str, str]]) -> str:
system = next((m["content"] for m in messages if m["role"] == "system"), None)
user_msgs = [m for m in messages if m["role"] != "system"]
resp = self._client.messages.create(
model=self._model,
max_tokens=1024,
max_tokens=self._max_tokens,
system=system or "",
messages=user_msgs,
)
Expand Down
3 changes: 2 additions & 1 deletion src/lang2sql/integrations/llm/openai_.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
from __future__ import annotations

from ...core.exceptions import IntegrationMissingError
from ...core.ports import LLMPort

try:
import openai as _openai
except ImportError:
_openai = None # type: ignore[assignment]


class OpenAILLM:
class OpenAILLM(LLMPort):
"""LLMPort implementation backed by the OpenAI Chat Completions API."""

def __init__(self, *, model: str, api_key: str | None = None) -> None:
Expand Down
3 changes: 2 additions & 1 deletion src/lang2sql/integrations/loaders/pdf_.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,15 @@

from ...core.catalog import TextDocument
from ...core.exceptions import IntegrationMissingError
from ...core.ports import DocumentLoaderPort

try:
import fitz as _fitz
except ImportError:
_fitz = None # type: ignore[assignment]


class PDFLoader:
class PDFLoader(DocumentLoaderPort):
"""
PDF file → list[TextDocument].

Expand Down
4 changes: 3 additions & 1 deletion src/lang2sql/integrations/vectorstore/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from .faiss_ import FAISSVectorStore
from .inmemory_ import InMemoryVectorStore
from .pgvector_ import PGVectorStore

__all__ = ["InMemoryVectorStore"]
__all__ = ["InMemoryVectorStore", "FAISSVectorStore", "PGVectorStore"]
104 changes: 104 additions & 0 deletions src/lang2sql/integrations/vectorstore/faiss_.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
from __future__ import annotations

import json
import pathlib

from ...core.exceptions import IntegrationMissingError
from ...core.ports import VectorStorePort

try:
import faiss as _faiss
import numpy as _np
except ImportError:
_faiss = None # type: ignore[assignment]
_np = None # type: ignore[assignment]


class FAISSVectorStore(VectorStorePort):
    """
    FAISS-backed vector store with optional file persistence.

    Uses IndexFlatIP + L2 normalization for exact cosine similarity.
    Index is lazy-initialized on the first upsert() call.

    Known limitation (append-only):
        Upserting the same chunk_id twice creates duplicate FAISS entries.
        To rebuild a clean index, create a new FAISSVectorStore instance
        and run from_chunks() again from scratch.

    Args:
        index_path: Optional path for save() / load(). Used as default
            path when save() is called without an explicit argument.

    Installation:
        pip install faiss-cpu  # CPU-only
        pip install faiss-gpu  # GPU variant
    """

    def __init__(self, index_path: str | None = None) -> None:
        if _faiss is None or _np is None:
            raise IntegrationMissingError("faiss", hint="pip install faiss-cpu")
        self._index_path = index_path
        # faiss.IndexFlatIP; stays None until the first upsert() so the
        # embedding dimension can be inferred from the first batch.
        self._index: object | None = None
        # Positional mapping: FAISS row i -> self._ids[i] (chunk_id).
        self._ids: list[str] = []

    # ── VectorStorePort ──────────────────────────────────────────────

    def upsert(self, ids: list[str], vectors: list[list[float]]) -> None:
        """L2-normalize and add vectors. Lazy-creates index on first call.

        Args:
            ids: Chunk identifiers, parallel to ``vectors``.
            vectors: Embedding rows; all rows must share one dimension.

        Raises:
            ValueError: If ``len(ids) != len(vectors)`` (would silently
                desynchronize the id↔row mapping), or if the vector
                dimension disagrees with the already-built index.
        """
        if len(ids) != len(vectors):
            raise ValueError(
                f"ids/vectors length mismatch: {len(ids)} != {len(vectors)}"
            )
        if not vectors:
            # Empty batch is a no-op; without this guard arr.shape[1]
            # raises IndexError on a 1-D empty array.
            return
        arr = _np.array(vectors, dtype=_np.float32)
        _faiss.normalize_L2(arr)  # in-place cosine trick
        if self._index is None:
            self._index = _faiss.IndexFlatIP(arr.shape[1])
        elif arr.shape[1] != self._index.d:
            # Fail clearly instead of letting faiss abort internally.
            raise ValueError(
                f"Vector dimension {arr.shape[1]} does not match "
                f"index dimension {self._index.d}."
            )
        self._index.add(arr)
        self._ids.extend(ids)

    def search(self, vector: list[float], k: int) -> list[tuple[str, float]]:
        """Return (chunk_id, cosine_score) for the k nearest vectors.

        Returns an empty list when the index is empty or ``k <= 0``.
        """
        if k <= 0 or self._index is None or self._index.ntotal == 0:
            return []
        q = _np.array([vector], dtype=_np.float32)
        _faiss.normalize_L2(q)
        k = min(k, self._index.ntotal)
        scores, positions = self._index.search(q, k)
        return [
            (self._ids[int(pos)], float(scores[0][j]))
            for j, pos in enumerate(positions[0])
            if pos >= 0  # faiss pads missing neighbors with -1
        ]

    # ── Persistence ──────────────────────────────────────────────────

    def save(self, path: str | None = None) -> None:
        """
        Write index to {path} and id list to {path}.meta.
        Falls back to self._index_path when path is None.
        Raises ValueError if no path is available.
        Raises RuntimeError if called before any upsert().
        """
        path = path or self._index_path
        if path is None:
            raise ValueError(
                "No path provided and index_path was not set at construction."
            )
        if self._index is None:
            raise RuntimeError("Cannot save before any upsert() call.")
        pathlib.Path(path).parent.mkdir(parents=True, exist_ok=True)
        _faiss.write_index(self._index, path)
        # Sidecar JSON keeps the row→chunk_id mapping next to the index.
        pathlib.Path(path + ".meta").write_text(json.dumps(self._ids), encoding="utf-8")

    @classmethod
    def load(cls, path: str) -> "FAISSVectorStore":
        """
        Load index from {path} and id list from {path}.meta.
        Raises FileNotFoundError if either file is missing.
        """
        if _faiss is None or _np is None:
            raise IntegrationMissingError("faiss", hint="pip install faiss-cpu")
        meta_path = pathlib.Path(path + ".meta")
        if not pathlib.Path(path).exists() or not meta_path.exists():
            raise FileNotFoundError(f"Index files not found: {path}, {path}.meta")
        store = cls(index_path=path)
        store._index = _faiss.read_index(path)
        store._ids = json.loads(meta_path.read_text(encoding="utf-8"))
        return store
3 changes: 2 additions & 1 deletion src/lang2sql/integrations/vectorstore/inmemory_.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
from __future__ import annotations

from ...core.exceptions import IntegrationMissingError
from ...core.ports import VectorStorePort

try:
import numpy as _np
except ImportError:
_np = None # type: ignore[assignment]


class InMemoryVectorStore:
class InMemoryVectorStore(VectorStorePort):
"""
Brute-force cosine similarity vector store backed by numpy.

Expand Down
Loading