diff --git a/backend/app/rag/vision.py b/backend/app/rag/vision.py index aba482b..8699e0e 100644 --- a/backend/app/rag/vision.py +++ b/backend/app/rag/vision.py @@ -1,23 +1,129 @@ """Image captioning / vision helpers for RAG pipeline. -Provides a simple, pluggable interface to generate textual descriptions -for images extracted from PDFs. By default it uses local OCR (pytesseract) -when available as a robust fallback. An external VLM provider (OpenAI) -can be integrated by setting `VISION_PROVIDER` and appropriate API keys -in settings; the provider hook is intentionally small and optional. +Caption resolution order for each image chunk: +1. Bounding-box proximity — nearest text block below/above the image in the PDF + (rich, zero-cost, works offline). +2. OCR (pytesseract) — when proximity yields nothing and tesseract is installed. +3. Placeholder — "Figure on page N (WxH px)" as a guaranteed non-empty fallback. + +An optional OpenAI GPT-4o-mini vision hook is provided for deployments that set +VISION_PROVIDER=openai and OPENAI_API_KEY in settings. """ +import base64 import logging -from typing import List, Dict, Any from io import BytesIO +from typing import Any, Dict, List, Optional + +import fitz # PyMuPDF from app.config import get_settings logger = logging.getLogger(__name__) settings = get_settings() +# Minimum image area (px²) — smaller images are decorative and skipped. +_MIN_IMAGE_AREA = 1_000 + + +# ── 1. Proximity-based caption extraction ──────────────────────────────────── + +def _find_caption_near_image( + page: fitz.Page, + img_bbox: fitz.Rect, + search_margin: float = 60.0, +) -> str: + """Return the closest text block directly below (or above) an image rect.""" + page_dict = page.get_text("dict", flags=fitz.TEXT_PRESERVE_WHITESPACE) + blocks = page_dict.get("blocks", []) + + def _closest(region: fitz.Rect) -> str: + candidates = [] + for block in blocks: + if block.get("type") != 0: # 0 == text block + continue + bx0, by0, bx1, by1 = block["bbox"] + if fitz.Rect(bx0, by0, bx1, by1).intersects(region): + text = " ".join( + span["text"] + for line in block.get("lines", []) + for span in line.get("spans", []) + ).strip() + if text: + candidates.append((abs(by0 - img_bbox.y1), text)) + if candidates: + return min(candidates, key=lambda t: t[0])[1] + return "" + + # Search below first, fall back to above + below = fitz.Rect(img_bbox.x0, img_bbox.y1, img_bbox.x1, img_bbox.y1 + search_margin) + caption = _closest(below) + if caption: + return caption + + above = fitz.Rect(img_bbox.x0, img_bbox.y0 - search_margin, img_bbox.x1, img_bbox.y0) + return _closest(above) + + +def extract_captions_from_pdf(filepath: str) -> List[Dict[str, Any]]: + """Extract proximity-based image captions from a PDF. + + Returns a list of dicts ordered by (page, figure_index): + { + "page": int, # 1-based + "figure_index": int, # 0-based within the page + "caption": str, # may be empty string + "bbox": list[float], # [x0, y0, x1, y1] normalised to [0, 1] + } + """ + results: List[Dict[str, Any]] = [] + doc = fitz.open(filepath) + + try: + for page_num, page in enumerate(doc): + W, H = float(page.rect.width), float(page.rect.height) + figure_index = 0 + + for img_info in page.get_images(full=True): + xref = img_info[0] + try: + rects = page.get_image_rects(xref) + if not rects: + continue + img_rect = rects[0] + + if img_rect.width * img_rect.height < _MIN_IMAGE_AREA: + continue # skip decorative images + + caption = _find_caption_near_image(page, img_rect) + results.append( + { + "page": page_num + 1, + "figure_index": figure_index, + "caption": caption, + "bbox": [ + round(img_rect.x0 / W, 4), + round(img_rect.y0 / H, 4), + round(img_rect.x1 / W, 4), + round(img_rect.y1 / H, 4), + ], + } + ) + figure_index += 1 + + except Exception as exc: + logger.warning( + "Skipping image xref=%s on page %s: %s", xref, page_num + 1, exc + ) + finally: + doc.close() + + return results + + +# ── 2. OCR fallback ────────────────────────────────────────────────────────── def _ocr_caption(image_bytes: bytes) -> str: - """Try to produce a caption using pytesseract OCR; returns empty string if not available.""" + """Attempt OCR via pytesseract; returns empty string if unavailable.""" try: from PIL import Image import pytesseract @@ -26,14 +132,66 @@ def _ocr_caption(image_bytes: bytes) -> str: try: img = Image.open(BytesIO(image_bytes)).convert("RGB") - text = pytesseract.image_to_string(img) - text = text.strip() - return text - except Exception as e: - logger.debug(f"OCR failed: {e}") + text = pytesseract.image_to_string(img).strip() + return (text[:500] + "...") if len(text) > 500 else text + except Exception as exc: + logger.debug("OCR failed: %s", exc) return "" +# ── 3. Optional OpenAI GPT-4o-mini vision hook ─────────────────────────────── + +def _openai_caption(image_bytes: bytes) -> str: + """Call OpenAI Chat Completions vision API; returns empty string on any failure.""" + api_key = getattr(settings, "OPENAI_API_KEY", None) + if not api_key: + return "" + + try: + from openai import OpenAI + + client = OpenAI(api_key=api_key) + b64 = base64.b64encode(image_bytes).decode("utf-8") + + response = client.chat.completions.create( + model="gpt-4o-mini", + max_tokens=120, + messages=[ + { + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": f"data:image/png;base64,{b64}", + "detail": "low", + }, + }, + { + "type": "text", + "text": ( + "Describe this figure or diagram in one concise sentence " + "suitable for use as a search index caption." + ), + }, + ], + } + ], + ) + return response.choices[0].message.content.strip() + + except Exception as exc: + logger.debug("OpenAI vision caption failed: %s", exc) + return "" + + +# ── Public API ─────────────────────────────────────────────────────────────── + +def caption_image(image_bytes: bytes, page: Optional[int] = None) -> str: + """Generate a caption for a single image (bytes). + + Resolution order: OpenAI (if configured) → OCR → placeholder. + """ def caption_image(image_bytes: bytes | List[bytes], page: int | List[int] | None = None) -> str | List[str]: """Generate a caption for a single image or a batch of images. @@ -49,56 +207,54 @@ def caption_image(image_bytes: bytes | List[bytes], page: int | List[int] | None # Placeholder for provider-based captioning (e.g., OpenAI / LLaVA hooks) provider = getattr(settings, "VISION_PROVIDER", None) + if provider == "openai": - try: - import openai - # Minimal integration: attempt a text-only caption via responses if available. - # This is a best-effort hook; users should adapt to their provider's API. - api_key = getattr(settings, "OPENAI_API_KEY", None) - if api_key: - openai.api_key = api_key - # Use a generic prompt: "Describe the following image" - # Note: concrete multimodal API usage may vary across SDK versions. - resp = openai.Image.create( - prompt="Describe this image in one concise sentence.", - n=1, - # We do not re-upload image bytes here; this is a placeholder to show - # where provider code would be invoked. For production, follow - # provider docs for sending image data. - ) - # openai.Image.create returns generated images, not captions — so skip. - except Exception: - # If provider integration fails, fall back to OCR below - logger.debug("OpenAI vision provider failed, falling back to OCR") - - # Try OCR caption + caption = _openai_caption(image_bytes) + if caption: + return caption + ocr = _ocr_caption(image_bytes) if ocr: - # Keep it short if very long - return (ocr[:500] + "...") if len(ocr) > 500 else ocr + return ocr - # Last-resort caption - if page: - return f"Image on page {page}." - return "Image." + # Derive dimensions for the placeholder + try: + pix = fitz.Pixmap(image_bytes) + dims = f"{pix.width}x{pix.height} px" + except Exception: + dims = "unknown size" + + return f"Figure on page {page} ({dims})." if page else f"Figure ({dims})." def generate_captions_for_chunks(chunks: List[Dict[str, Any]]) -> None: - """Mutate chunks in-place: for any chunk containing `image_bytes` but empty `text`, - generate a caption and set `text`. + """Mutate image chunks in-place: fill empty ``text`` with a caption. + + Called by vectorstore.store_chunks() before embedding. + Proximity-based captions should already be written into chunk["image_caption"] + by document_ingestion.ingest_document() before this point. + This function handles the OCR / placeholder fallback for any remaining gaps. """ for chunk in chunks: - if chunk.get("image_bytes") and not chunk.get("text"): - try: - caption = caption_image(chunk["image_bytes"], page=chunk.get("page")) - chunk["text"] = caption - # Remove raw bytes to avoid accidentally serializing them later - chunk.pop("image_bytes", None) - chunk["is_image"] = True - chunk["image_caption"] = caption - except Exception as e: - logger.debug(f"Failed to caption image chunk: {e}") - # ensure we still mark it as image to avoid losing it - chunk.pop("image_bytes", None) - chunk["is_image"] = True - chunk.setdefault("text", f"Image on page {chunk.get('page')}") + if not chunk.get("image_bytes"): + continue + if chunk.get("text", "").strip(): + continue # already captioned by proximity pass + + try: + # Use pre-extracted proximity caption if available + caption = chunk.get("image_caption") or caption_image( + chunk["image_bytes"], page=chunk.get("page") + ) + chunk["text"] = caption + chunk["is_image"] = True + chunk["image_caption"] = caption + except Exception as exc: + logger.debug("Failed to caption image chunk: %s", exc) + chunk["is_image"] = True + fallback = f"Image on page {chunk.get('page', '?')}" + chunk.setdefault("text", fallback) + chunk["image_caption"] = chunk["text"] + finally: + # Always strip raw bytes — never serialise them into ChromaDB + chunk.pop("image_bytes", None) diff --git a/backend/app/services/document_ingestion.py b/backend/app/services/document_ingestion.py index 6a92e7a..6e76d79 100644 --- a/backend/app/services/document_ingestion.py +++ b/backend/app/services/document_ingestion.py @@ -74,6 +74,38 @@ def ingest_document(document_id: str, filepath: str, original_name: str, user_id except TypeError: chunks = chunk_document(filepath) + # ── Proximity caption pass (PDF only) ──────────────────────────────── + # Write bounding-box-derived captions into image chunks BEFORE store_chunks() + # so generate_captions_for_chunks() in vectorstore.py only needs to handle + # the OCR / placeholder fallback for any images without adjacent text. + ext = filepath.rsplit(".", 1)[-1].lower() + if ext == "pdf": + try: + from app.rag.vision import extract_captions_from_pdf + + pdf_captions = extract_captions_from_pdf(filepath) + # Build lookup: page -> [captions in figure_index order] + caption_map: dict = {} + for cap in pdf_captions: + caption_map.setdefault(cap["page"], []).append(cap) + + fig_counters: dict = {} + for chunk in chunks: + if not chunk.get("image_bytes"): + continue + page = chunk.get("page", 1) + idx = fig_counters.get(page, 0) + page_caps = caption_map.get(page, []) + if idx < len(page_caps) and page_caps[idx]["caption"]: + chunk["image_caption"] = page_caps[idx]["caption"] + chunk["bbox"] = str(page_caps[idx]["bbox"]) + fig_counters[page] = idx + 1 + except Exception as exc: + logger.warning( + "Proximity caption extraction failed for %s: %s", document_id, exc + ) + # ── End proximity caption pass ──────────────────────────────────────── + if not chunks: doc.status = "failed" doc.processing_progress = 0