diff --git a/CLAUDE.md b/CLAUDE.md
index de0732c..b8184e0 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -183,6 +183,23 @@ GitHub Actions (`.github/workflows/ci.yml`) builds and pushes all four images on
6. Add platform-specific router in `apps/api/src/routers/` (if API endpoints are needed)
7. Add any new pip dependencies to `packages/shared/pyproject.toml` under `[project.optional-dependencies] scrapers`
+### Exception Handling
+- **Custom exceptions** are defined in `packages/shared/fastfetchbot_shared/exceptions.py`:
+ - `FastFetchBotError` — base for all domain errors
+ - `ScraperError` / `ScraperNetworkError` / `ScraperParseError` — scraper failures
+ - `TelegraphPublishError` — Telegraph publishing failures
+ - `FileExportError` — file export (PDF, video, audio) failures
+ - `ExternalServiceError` — external service call failures (OpenAI, Firecrawl, Zyte, XHS sign server, etc.)
+- **Always use typed exceptions** instead of generic `RuntimeError`, `ValueError`, or `Exception` for domain errors. Pick the most specific subclass that fits.
+- **Use `from e` chaining** when wrapping exceptions: `raise ScraperError("message") from e`
+- **Boundary-level handlers** catch exceptions at service boundaries:
+ - FastAPI: global `@app.exception_handler(FastFetchBotError)` returns 502, generic `Exception` returns 500
+ - Telegram bot: `error_process` handler catches handler exceptions; webhook server protects endpoints
+ - Celery/ARQ workers: existing task-level try/except with outbox error push
+- **Never use `print()` or `traceback.print_exc()`** — always use `logger.exception()` (includes traceback) or `logger.error()` (message only)
+- **Never silently swallow exceptions** — if catching an exception, either re-raise it or handle it explicitly with logging. Do not return `None` or empty data on failure.
+- **Fail fast after fallback chains** — scrapers may try multiple methods/APIs, but must raise a typed error when all fallbacks are exhausted
+
### Key Conventions
- **`packages/shared/` (`fastfetchbot-shared`)** is for shared async logic — scrapers, templates, Telegraph, and async Celery task wrappers (file_export). Most code here is async and reusable across apps
- **`packages/file-export/` (`fastfetchbot-file-export`)** is exclusively for synchronous Celery worker jobs — the heavy I/O operations that run inside the Celery worker process (yt-dlp video download, WeasyPrint PDF generation, OpenAI audio transcription). Apps never import this package directly; they use the async wrappers in `fastfetchbot_shared.services.file_export` which submit tasks to the Celery worker
diff --git a/apps/api/src/main.py b/apps/api/src/main.py
index dea43d9..70c28f5 100644
--- a/apps/api/src/main.py
+++ b/apps/api/src/main.py
@@ -1,6 +1,7 @@
import sentry_sdk
from fastapi import FastAPI, Request
+from fastapi.responses import JSONResponse
from contextlib import asynccontextmanager
from starlette.middleware.base import BaseHTTPMiddleware
@@ -8,6 +9,7 @@
from src.routers import inoreader, scraper_routers, scraper
from src.config import settings
from fastfetchbot_shared.utils.logger import logger
+from fastfetchbot_shared.exceptions import FastFetchBotError
SENTRY_DSN = ""
@@ -38,12 +40,27 @@ def __init__(self, app):
async def dispatch(self, request: Request, call_next):
logger.info(f"{request.method} {request.url}")
- response = await call_next(request)
- return response
+ try:
+ response = await call_next(request)
+ return response
+ except Exception:
+ logger.exception(f"Unhandled error during {request.method} {request.url}")
+ raise
def create_app():
fastapi_app = FastAPI(lifespan=lifespan)
+
+ @fastapi_app.exception_handler(FastFetchBotError)
+ async def fastfetchbot_error_handler(request: Request, exc: FastFetchBotError):
+ logger.error(f"Domain error on {request.method} {request.url}: {exc}")
+ return JSONResponse(status_code=502, content={"error": str(exc)})
+
+ @fastapi_app.exception_handler(Exception)
+ async def generic_error_handler(request: Request, exc: Exception):
+ logger.exception(f"Unhandled error on {request.method} {request.url}")
+ return JSONResponse(status_code=500, content={"error": "Internal server error"})
+
fastapi_app.add_middleware(LogMiddleware)
fastapi_app.include_router(inoreader.router)
fastapi_app.include_router(scraper.router)
diff --git a/apps/telegram-bot/core/services/message_sender.py b/apps/telegram-bot/core/services/message_sender.py
index 666b568..1e6480d 100644
--- a/apps/telegram-bot/core/services/message_sender.py
+++ b/apps/telegram-bot/core/services/message_sender.py
@@ -163,18 +163,20 @@ async def send_item_message(
else False,
disable_notification=True,
)
- except Exception as e:
- logger.error(e)
- traceback.print_exc()
+ except Exception:
+ logger.exception("Failed to send item message")
await send_debug_channel(traceback.format_exc())
async def send_debug_channel(message: str) -> None:
+ import html as html_module
from core.config import TELEBOT_DEBUG_CHANNEL
application = _get_application()
if TELEBOT_DEBUG_CHANNEL is not None:
await application.bot.send_message(
- chat_id=TELEBOT_DEBUG_CHANNEL, text=message, parse_mode=ParseMode.HTML
+ chat_id=TELEBOT_DEBUG_CHANNEL,
+ text=html_module.escape(message),
+ parse_mode=ParseMode.HTML,
)
@@ -234,9 +236,13 @@ async def media_files_packaging(media_files: list, data: dict) -> tuple:
"https",
]: # if the url is a http url, download the file
file_format = "mp4" if media_item["media_type"] == "video" else None
- io_object = await download_file_by_metadata_item(
- media_item["url"], data=data, file_format=file_format
- )
+ try:
+ io_object = await download_file_by_metadata_item(
+ media_item["url"], data=data, file_format=file_format
+ )
+ except Exception:
+ logger.warning(f"Skipping media download: {media_item['url']}")
+ continue
filename = io_object.name
file_size = io_object.size
else: # if the url is a local file path, just add it to the media group
@@ -300,9 +306,13 @@ async def media_files_packaging(media_files: list, data: dict) -> tuple:
or img_width > settings.TELEGRAM_IMAGE_DIMENSION_LIMIT
or img_height > settings.TELEGRAM_IMAGE_DIMENSION_LIMIT
) and data["category"] not in ["xiaohongshu"]:
- io_object = await download_file_by_metadata_item(
- url=image_url, data=data
- )
+ try:
+ io_object = await download_file_by_metadata_item(
+ url=image_url, data=data
+ )
+ except Exception:
+ logger.warning(f"Skipping document download: {image_url}")
+ continue
if not io_object.name.endswith(".gif"):
if not io_object.name.endswith(ext.lower()):
io_object.name = io_object.name + "." + ext.lower()
@@ -312,11 +322,15 @@ async def media_files_packaging(media_files: list, data: dict) -> tuple:
)
file_counter += 1
elif media_item["media_type"] == "gif":
- io_object = await download_file_by_metadata_item(
- url=media_item["url"],
- data=data,
- file_name="gif_image-" + str(media_counter) + ".gif",
- )
+ try:
+ io_object = await download_file_by_metadata_item(
+ url=media_item["url"],
+ data=data,
+ file_name="gif_image-" + str(media_counter) + ".gif",
+ )
+ except Exception:
+ logger.warning(f"Skipping gif download: {media_item['url']}")
+ continue
io_object.name = io_object.name + ".gif"
media_group.append(InputMediaAnimation(io_object))
elif media_item["media_type"] == "video":
diff --git a/apps/telegram-bot/core/services/outbox_consumer.py b/apps/telegram-bot/core/services/outbox_consumer.py
index 6415235..1ec3991 100644
--- a/apps/telegram-bot/core/services/outbox_consumer.py
+++ b/apps/telegram-bot/core/services/outbox_consumer.py
@@ -4,7 +4,7 @@
import redis.asyncio as aioredis
from core.config import settings
-from core.services.message_sender import send_item_message
+from core.services.message_sender import send_item_message, send_debug_channel
from fastfetchbot_shared.utils.logger import logger
_redis: aioredis.Redis | None = None
@@ -42,7 +42,9 @@ async def _consume_loop() -> None:
if error:
logger.warning(f"[{job_id}] Scrape failed: {error}")
- await _send_error_to_chat(chat_id, error)
+ await send_debug_channel(
+ f"[Scrape Error] job_id={job_id}\nchat_id: {chat_id}\n\n{error}"
+ )
else:
metadata_item = payload.get("metadata_item")
if metadata_item and chat_id:
@@ -63,19 +65,6 @@ async def _consume_loop() -> None:
await asyncio.sleep(1)
-async def _send_error_to_chat(chat_id: int | str, error: str) -> None:
- """Send an error notification to the user's chat."""
- try:
- from core.services.bot_app import application
-
- await application.bot.send_message(
- chat_id=chat_id,
- text=f"Sorry, an error occurred while processing your request:\n\n{error}",
- )
- except Exception as e:
- logger.error(f"Failed to send error message to chat {chat_id}: {e}")
-
-
async def start(bot_id: int) -> None:
"""Start the outbox consumer as a background asyncio task.
diff --git a/apps/telegram-bot/core/webhook/server.py b/apps/telegram-bot/core/webhook/server.py
index 96ae377..8deb01a 100644
--- a/apps/telegram-bot/core/webhook/server.py
+++ b/apps/telegram-bot/core/webhook/server.py
@@ -51,24 +51,39 @@ async def lifespan(app):
await database.shutdown()
+def _log_task_exception(task: asyncio.Task):
+ """Callback for fire-and-forget tasks to ensure exceptions are logged."""
+ if not task.cancelled() and task.exception():
+        logger.opt(exception=task.exception()).error("Unhandled error in background task")
+
+
async def telegram_webhook(request: Request):
secret = request.headers.get("X-Telegram-Bot-Api-Secret-Token")
if secret != settings.TELEGRAM_BOT_SECRET_TOKEN:
return JSONResponse({"error": "unauthorized"}, status_code=401)
- data = await request.json()
+ try:
+ data = await request.json()
+ except Exception:
+ logger.exception("Failed to parse webhook request body")
+ return JSONResponse({"error": "invalid JSON"}, status_code=400)
logger.debug(f"Telegram webhook update received: {data.get('update_id', 'unknown')}")
- asyncio.create_task(process_telegram_update(data))
+ task = asyncio.create_task(process_telegram_update(data))
+ task.add_done_callback(_log_task_exception)
return JSONResponse({"status": "ok"})
async def send_message_endpoint(request: Request):
- data = await request.json()
- metadata_item = data["data"]
- chat_id = data.get("chat_id")
- if isinstance(chat_id, str) and chat_id.startswith("-"):
- chat_id = int(chat_id)
- await send_item_message(metadata_item, chat_id=chat_id)
- return JSONResponse({"status": "ok"})
+ try:
+ data = await request.json()
+ metadata_item = data["data"]
+ chat_id = data.get("chat_id")
+ if isinstance(chat_id, str) and chat_id.startswith("-"):
+ chat_id = int(chat_id)
+ await send_item_message(metadata_item, chat_id=chat_id)
+ return JSONResponse({"status": "ok"})
+ except Exception:
+ logger.exception("Failed to handle send_message request")
+ return JSONResponse({"error": "Internal server error"}, status_code=500)
async def health(request: Request):
diff --git a/packages/file-export/fastfetchbot_file_export/pdf_export.py b/packages/file-export/fastfetchbot_file_export/pdf_export.py
index e9ec16c..f4a60d6 100644
--- a/packages/file-export/fastfetchbot_file_export/pdf_export.py
+++ b/packages/file-export/fastfetchbot_file_export/pdf_export.py
@@ -2,6 +2,8 @@
from weasyprint import HTML, CSS
from weasyprint.text.fonts import FontConfiguration
+from loguru import logger
+from fastfetchbot_shared.exceptions import FileExportError
CSS_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), "pdf_export.css")
@@ -19,7 +21,7 @@ def convert_html_to_pdf(
elif html_string:
html_item = HTML(string=html_string)
else:
- raise ValueError("Either html_string or html_file must be provided")
+ raise FileExportError("Either html_string or html_file must be provided")
html_item.write_pdf(output_filename, stylesheets=[css_item])
@@ -30,10 +32,14 @@ def export_pdf(
download_dir: str = "/tmp",
) -> str:
"""Export HTML to PDF and return the output file path."""
- output_path = os.path.join(download_dir, output_filename)
- convert_html_to_pdf(
- output_filename=output_path,
- html_string=html_string,
- html_file=html_file,
- )
- return output_path
+ try:
+ output_path = os.path.join(download_dir, output_filename)
+ convert_html_to_pdf(
+ output_filename=output_path,
+ html_string=html_string,
+ html_file=html_file,
+ )
+ return output_path
+ except Exception as e:
+ logger.exception(f"PDF export failed for {output_filename}")
+ raise FileExportError(f"PDF export failed for {output_filename}") from e
diff --git a/packages/file-export/fastfetchbot_file_export/transcribe.py b/packages/file-export/fastfetchbot_file_export/transcribe.py
index 54c81f4..94b6cae 100644
--- a/packages/file-export/fastfetchbot_file_export/transcribe.py
+++ b/packages/file-export/fastfetchbot_file_export/transcribe.py
@@ -3,6 +3,7 @@
from pydub import AudioSegment
from openai import OpenAI
from loguru import logger
+from fastfetchbot_shared.exceptions import FileExportError
TRANSCRIBE_MODEL = "whisper-1"
SEGMENT_LENGTH = 5 * 60 # 5 minutes in seconds
@@ -74,40 +75,44 @@ def get_audio_text(audio_file: str, openai_api_key: str) -> str:
Returns formatted string with summary and full transcript.
"""
- client = OpenAI(api_key=openai_api_key)
- transcript = ""
- AudioSegment.converter = "ffmpeg"
- audio_file_non_ext, audio_file_ext = os.path.splitext(audio_file)
- ext = audio_file_ext.lstrip(".")
- audio_item = AudioSegment.from_file(audio_file, ext)
- start_trim = milliseconds_until_sound(audio_item)
- audio_item = audio_item[start_trim:]
- audio_length = int(audio_item.duration_seconds) + 1
-
- for index, i in enumerate(range(0, audio_length * 1000, SEGMENT_LENGTH * 1000)):
- start_time = i
- end_time = i + SEGMENT_LENGTH * 1000
- if end_time >= audio_length * 1000:
- audio_segment = audio_item[start_time:]
- else:
- audio_segment = audio_item[start_time:end_time]
-
- segment_path = f"{audio_file_non_ext}-{index + 1}{audio_file_ext}"
- audio_segment.export(segment_path)
- logger.info(f"audio_segment_path: {segment_path}")
-
- with open(segment_path, "rb") as f:
- result = client.audio.transcriptions.create(
- model=TRANSCRIBE_MODEL, file=f
- )
- transcript += result.text
-
- os.remove(segment_path)
-
- transcript = punctuation_assistant(client, transcript)
- transcript = (
- f"全文总结:\n{summary_assistant(client, transcript)}\n原文:\n{transcript}"
- )
- logger.info(f"transcript: {transcript}")
- os.remove(audio_file)
- return transcript
+ try:
+ client = OpenAI(api_key=openai_api_key)
+ transcript = ""
+ AudioSegment.converter = "ffmpeg"
+ audio_file_non_ext, audio_file_ext = os.path.splitext(audio_file)
+ ext = audio_file_ext.lstrip(".")
+ audio_item = AudioSegment.from_file(audio_file, ext)
+ start_trim = milliseconds_until_sound(audio_item)
+ audio_item = audio_item[start_trim:]
+ audio_length = int(audio_item.duration_seconds) + 1
+
+ for index, i in enumerate(range(0, audio_length * 1000, SEGMENT_LENGTH * 1000)):
+ start_time = i
+ end_time = i + SEGMENT_LENGTH * 1000
+ if end_time >= audio_length * 1000:
+ audio_segment = audio_item[start_time:]
+ else:
+ audio_segment = audio_item[start_time:end_time]
+
+ segment_path = f"{audio_file_non_ext}-{index + 1}{audio_file_ext}"
+ audio_segment.export(segment_path)
+ logger.info(f"audio_segment_path: {segment_path}")
+
+ with open(segment_path, "rb") as f:
+ result = client.audio.transcriptions.create(
+ model=TRANSCRIBE_MODEL, file=f
+ )
+ transcript += result.text
+
+ os.remove(segment_path)
+
+ transcript = punctuation_assistant(client, transcript)
+ transcript = (
+ f"全文总结:\n{summary_assistant(client, transcript)}\n原文:\n{transcript}"
+ )
+ logger.info(f"transcript: {transcript}")
+ os.remove(audio_file)
+ return transcript
+ except Exception as e:
+ logger.exception(f"Audio transcription failed for {audio_file}")
+ raise FileExportError(f"Audio transcription failed for {audio_file}") from e
diff --git a/packages/file-export/fastfetchbot_file_export/video_download.py b/packages/file-export/fastfetchbot_file_export/video_download.py
index e70cb73..45b4b4f 100644
--- a/packages/file-export/fastfetchbot_file_export/video_download.py
+++ b/packages/file-export/fastfetchbot_file_export/video_download.py
@@ -1,8 +1,8 @@
import os
-import traceback
from loguru import logger
from yt_dlp import YoutubeDL
+from fastfetchbot_shared.exceptions import FileExportError
def get_video_orientation(content_info: dict, extractor: str) -> str:
@@ -39,7 +39,7 @@ def get_format_for_orientation(
if hd and bilibili_cookie:
return "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best"
return "bv*[height<=480]+ba/b[height<=480] / wv*+ba/w"
- raise ValueError("no available extractor found")
+ raise FileExportError("no available extractor found")
def init_yt_downloader(
@@ -201,6 +201,6 @@ def download_video(
"orientation": orientation,
"file_path": file_path_output,
}
- except Exception:
- logger.exception(f"download_video failed: url={url}\n{traceback.format_exc()}")
- raise
+ except Exception as e:
+ logger.exception(f"download_video failed: url={url}")
+ raise FileExportError(f"download_video failed: url={url}") from e
diff --git a/packages/shared/fastfetchbot_shared/exceptions.py b/packages/shared/fastfetchbot_shared/exceptions.py
new file mode 100644
index 0000000..72777be
--- /dev/null
+++ b/packages/shared/fastfetchbot_shared/exceptions.py
@@ -0,0 +1,26 @@
+class FastFetchBotError(Exception):
+ """Base exception for all FastFetchBot domain errors."""
+
+
+class ScraperError(FastFetchBotError):
+ """Error during scraping."""
+
+
+class ScraperNetworkError(ScraperError):
+ """Network/connection error during scraping."""
+
+
+class ScraperParseError(ScraperError):
+ """Failed to parse scraped content."""
+
+
+class TelegraphPublishError(FastFetchBotError):
+ """Telegraph publishing failed."""
+
+
+class FileExportError(FastFetchBotError):
+ """File export (PDF, video, audio) failed."""
+
+
+class ExternalServiceError(FastFetchBotError):
+ """External service call failed (OpenAI, Inoreader, etc.)."""
diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/bluesky/scraper.py b/packages/shared/fastfetchbot_shared/services/scrapers/bluesky/scraper.py
index 2874fe1..8c3d163 100644
--- a/packages/shared/fastfetchbot_shared/services/scrapers/bluesky/scraper.py
+++ b/packages/shared/fastfetchbot_shared/services/scrapers/bluesky/scraper.py
@@ -189,3 +189,4 @@ async def _request_post_data(self, bluesky_post: BlueskyPost) -> ThreadViewPost:
return post_thread_data.thread
except Exception as e:
logger.error(f"Error while getting post data: {e}")
+ raise
diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/common.py b/packages/shared/fastfetchbot_shared/services/scrapers/common.py
index 0e94809..f4e6e4f 100644
--- a/packages/shared/fastfetchbot_shared/services/scrapers/common.py
+++ b/packages/shared/fastfetchbot_shared/services/scrapers/common.py
@@ -1,6 +1,7 @@
from typing import Optional, Any
from fastfetchbot_shared.models.url_metadata import UrlMetadata
+from fastfetchbot_shared.exceptions import ScraperError
from fastfetchbot_shared.services.scrapers import (
twitter,
wechat,
@@ -75,7 +76,7 @@ def _resolve_scraper_class(self, category: str):
return self.service_classes[category]
if category in ("youtube", "bilibili"):
return self._get_video_downloader()
- raise KeyError(f"No scraper registered for category: {category}")
+ raise ScraperError(f"No scraper registered for category: {category}")
async def get_item(self, metadata_item: Optional[dict] = None) -> dict:
if not metadata_item:
@@ -92,7 +93,7 @@ async def get_item(self, metadata_item: Optional[dict] = None) -> dict:
metadata_item = await scraper_item.get_item()
except Exception as e:
logger.error(f"Error while getting item: {e}")
- raise e
+ raise
logger.info(f"Got metadata item")
logger.debug(metadata_item)
metadata_item = await self.process_item(metadata_item)
diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/douban/__init__.py b/packages/shared/fastfetchbot_shared/services/scrapers/douban/__init__.py
index a273294..ee9a083 100644
--- a/packages/shared/fastfetchbot_shared/services/scrapers/douban/__init__.py
+++ b/packages/shared/fastfetchbot_shared/services/scrapers/douban/__init__.py
@@ -8,6 +8,7 @@
from fastfetchbot_shared.utils.parse import get_html_text_length, wrap_text_into_html
from fastfetchbot_shared.utils.network import get_selector, HEADERS
+from fastfetchbot_shared.utils.logger import logger
from fastfetchbot_shared.models.metadata_item import MetadataItem, MediaFile, MessageType
from fastfetchbot_shared.services.scrapers.config import JINJA2_ENV
@@ -220,10 +221,10 @@ def _douban_short_text_process(self) -> str:
@staticmethod
def raw_content_to_html(raw_content: str) -> str:
# Split the text into paragraphs based on double newlines
- print(raw_content)
+ logger.debug(raw_content)
paragraphs = raw_content.split('
\n')
# Wrap each paragraph with
tags - print(paragraphs) + logger.debug(paragraphs) html_paragraphs = [f'
{paragraph.strip()}
' for paragraph in paragraphs] # Join the paragraphs to form the final HTML string html_string = ''.join(html_paragraphs) diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/general/firecrawl_client.py b/packages/shared/fastfetchbot_shared/services/scrapers/general/firecrawl_client.py index 3c7e8ea..467fe9a 100644 --- a/packages/shared/fastfetchbot_shared/services/scrapers/general/firecrawl_client.py +++ b/packages/shared/fastfetchbot_shared/services/scrapers/general/firecrawl_client.py @@ -7,6 +7,7 @@ from firecrawl import AsyncFirecrawl from fastfetchbot_shared.services.scrapers.config import settings +from fastfetchbot_shared.exceptions import ExternalServiceError @dataclass(frozen=True) @@ -93,4 +94,4 @@ async def scrape_url( ) return result.model_dump(exclude_none=True) except Exception as e: - raise RuntimeError(f"Firecrawl scrape_url failed: url={url}") from e + raise ExternalServiceError(f"Firecrawl scrape_url failed: url={url}") from e diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/general/zyte.py b/packages/shared/fastfetchbot_shared/services/scrapers/general/zyte.py index cabf2c3..d011d6f 100644 --- a/packages/shared/fastfetchbot_shared/services/scrapers/general/zyte.py +++ b/packages/shared/fastfetchbot_shared/services/scrapers/general/zyte.py @@ -4,6 +4,7 @@ from fastfetchbot_shared.services.scrapers.general.base import BaseGeneralDataProcessor, BaseGeneralScraper from fastfetchbot_shared.services.scrapers.scraper import DataProcessor from fastfetchbot_shared.utils.logger import logger +from fastfetchbot_shared.exceptions import ExternalServiceError class ZyteDataProcessor(BaseGeneralDataProcessor): @@ -17,7 +18,7 @@ def __init__(self, url: str): async def _get_page_content(self) -> None: if not settings.ZYTE_API_KEY: - raise RuntimeError("ZYTE_API_KEY is not configured") + raise ExternalServiceError("ZYTE_API_KEY is not configured") try: client = AsyncZyteAPI(api_key=settings.ZYTE_API_KEY) diff --git 
a/packages/shared/fastfetchbot_shared/services/scrapers/instagram/__init__.py b/packages/shared/fastfetchbot_shared/services/scrapers/instagram/__init__.py index 52bf256..ae99e0c 100644 --- a/packages/shared/fastfetchbot_shared/services/scrapers/instagram/__init__.py +++ b/packages/shared/fastfetchbot_shared/services/scrapers/instagram/__init__.py @@ -81,10 +81,10 @@ async def _get_post_info(self) -> dict: and "status" in ins_data and ins_data["status"] is False ): - print("get_ins_post_item error: ", self.scraper) + logger.warning(f"get_ins_post_item error: {self.scraper}") continue elif type(ins_data) == str and "400" in ins_data: - print("get_ins_post_item error: ", self.scraper, ins_data) + logger.warning(f"get_ins_post_item error: {self.scraper} {ins_data}") continue if ( self.scraper == "looter2" diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/scraper_manager.py b/packages/shared/fastfetchbot_shared/services/scrapers/scraper_manager.py index b1abed5..fbc5183 100644 --- a/packages/shared/fastfetchbot_shared/services/scrapers/scraper_manager.py +++ b/packages/shared/fastfetchbot_shared/services/scrapers/scraper_manager.py @@ -1,6 +1,7 @@ from typing import Optional from fastfetchbot_shared.utils.logger import logger +from fastfetchbot_shared.exceptions import ScraperError from fastfetchbot_shared.services.scrapers.bluesky.scraper import BlueskyScraper from fastfetchbot_shared.services.scrapers.weibo.scraper import WeiboScraper from fastfetchbot_shared.services.scrapers.general.scraper import GeneralScraper @@ -40,7 +41,7 @@ async def init_scraper(cls, category: str) -> None: cls.scrapers["unknown"] = scraper else: logger.error(f"Scraper {category} is not supported") - raise ValueError(f"Scraper {category} is not supported") + raise ScraperError(f"Scraper {category} is not supported") @classmethod async def init_bluesky_scraper(cls) -> BlueskyScraper: diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/threads/__init__.py 
b/packages/shared/fastfetchbot_shared/services/scrapers/threads/__init__.py index a56b562..69be60a 100644 --- a/packages/shared/fastfetchbot_shared/services/scrapers/threads/__init__.py +++ b/packages/shared/fastfetchbot_shared/services/scrapers/threads/__init__.py @@ -97,7 +97,7 @@ async def intercept_response(response): for xhr in gql_calls: text = await xhr.text() data = json.loads(text) - print(json.dumps(data, indent=4, ensure_ascii=False)) + logger.debug(json.dumps(data, indent=4, ensure_ascii=False)) threads = data["data"]["data"]["containing_thread"]["thread_items"] for thread in threads: thread_data["threads"].append(self.parse_single_threads_data(thread["post"])) diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/twitter/__init__.py b/packages/shared/fastfetchbot_shared/services/scrapers/twitter/__init__.py index 9a38090..4c5b280 100644 --- a/packages/shared/fastfetchbot_shared/services/scrapers/twitter/__init__.py +++ b/packages/shared/fastfetchbot_shared/services/scrapers/twitter/__init__.py @@ -1,6 +1,5 @@ # TODO: https://rapidapi.com/Glavier/api/twitter135 import asyncio -import traceback from urllib.parse import urlparse from typing import Dict, List, Optional, Any, Tuple @@ -9,6 +8,7 @@ from fastfetchbot_shared.models.metadata_item import MetadataItem, MediaFile, MessageType from fastfetchbot_shared.utils.parse import get_html_text_length, wrap_text_into_html +from fastfetchbot_shared.exceptions import ScraperError, ScraperParseError from twitter.scraper import Scraper from .config import ( ALL_SCRAPER, @@ -73,11 +73,10 @@ async def _get_response_tweet_data(self) -> Dict: elif self.scraper == "api-client": tweet_data = await self._api_client_get_response_tweet_data() return tweet_data - except Exception as e: - logger.error(e) - traceback.print_exc() + except Exception: + logger.exception(f"Twitter scraper {self.scraper} failed") continue - raise Exception("No valid response from all Twitter scrapers") + raise ScraperError("No valid 
response from all Twitter scrapers") async def _rapidapi_get_response_tweet_data(self) -> Dict: async with httpx.AsyncClient() as client: @@ -94,11 +93,11 @@ async def _rapidapi_get_response_tweet_data(self) -> Dict: type(tweet_data) == str and ("400" in tweet_data or "429" in tweet_data) ): - raise Exception("Invalid response from Twitter API") + raise ScraperParseError("Invalid response from Twitter API") else: return tweet_data else: - raise Exception("Invalid response from Twitter API") + raise ScraperParseError("Invalid response from Twitter API") async def _api_client_get_response_tweet_data(self) -> Dict: scraper = Scraper( diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/weibo/scraper.py b/packages/shared/fastfetchbot_shared/services/scrapers/weibo/scraper.py index a0dd89e..acbf414 100644 --- a/packages/shared/fastfetchbot_shared/services/scrapers/weibo/scraper.py +++ b/packages/shared/fastfetchbot_shared/services/scrapers/weibo/scraper.py @@ -8,6 +8,7 @@ from lxml import html from fastfetchbot_shared.models.metadata_item import MetadataItem, MediaFile, MessageType +from fastfetchbot_shared.exceptions import ScraperError, ScraperNetworkError, ScraperParseError from fastfetchbot_shared.services.scrapers.scraper import Scraper, DataProcessor from fastfetchbot_shared.services.scrapers.weibo import Weibo from fastfetchbot_shared.utils.network import get_response_json, get_random_user_agent @@ -61,19 +62,20 @@ async def _get_weibo(self) -> None: # Priority 1: API with SUB cookie and isGetLongText=true try: weibo_info = await self._get_weibo_info(method="api") - except ConnectionError as e: + except ScraperNetworkError as e: logger.warning(f"Failed to get weibo info by api: {e}") # Priority 2: webpage scraping if weibo_info is None: try: weibo_info = await self._get_weibo_info(method="webpage") - except ConnectionError as e: + except ScraperNetworkError as e: logger.error(f"All weibo fetch methods failed. 
Last error: {e}") raise try: await self._process_weibo_item(weibo_info) except Exception as e: logger.error(f"Failed to process weibo item: {e}") + raise async def _get_weibo_info(self, method=None) -> dict: try: @@ -84,11 +86,11 @@ async def _get_weibo_info(self, method=None) -> dict: elif method == "api": weibo_info = await self._get_weibo_info_api() else: - raise ValueError("method must be webpage or api") + raise ScraperError("method must be webpage or api") weibo_info = self._parse_weibo_info(weibo_info) return weibo_info - except ConnectionError as e: - raise ConnectionError(f"There are some network issues: {e}") + except (ConnectionError, ScraperNetworkError) as e: + raise ScraperNetworkError(f"There are some network issues: {e}") from e async def _get_weibo_info_webpage(self) -> dict: url = WEIBO_WEB_HOST + self.id @@ -125,11 +127,13 @@ async def _get_weibo_info_api(self) -> dict: response.raise_for_status() ajax_json = response.json() if not ajax_json or ajax_json.get("ok") == 0: - raise ConnectionError("API returned ok=0 or empty response") + raise ScraperParseError("Weibo API returned ok=0 or empty response") logger.debug(f"weibo info by api: {ajax_json}") return ajax_json + except (ScraperParseError, ScraperNetworkError): + raise except Exception as e: - raise ConnectionError(f"Failed to get weibo info by api: {e}") + raise ScraperNetworkError(f"Failed to get weibo info by api: {e}") from e async def _get_long_weibo_info_api(self) -> dict: ajax_json = await get_response_json( @@ -173,7 +177,7 @@ async def _process_weibo_item(self, weibo_info: dict) -> None: # raise Exception("Still a long text weibo, should go long text api.") text = longtext_info.get("text") if not text: - raise Exception( + raise ScraperParseError( "Failed to get longtext of weibo by webpage scraping." 
) except Exception as e: @@ -466,7 +470,7 @@ def _weibo_html_text_clean(text, method="bs4"): elif method == "lxml": return WeiboDataProcessor._weibo_html_text_clean_lxml(text) else: - raise ValueError("method must be bs4 or lxml") + raise ScraperError("method must be bs4 or lxml") @staticmethod def _weibo_html_text_clean_bs4(text): diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/xiaohongshu/adaptar.py b/packages/shared/fastfetchbot_shared/services/scrapers/xiaohongshu/adaptar.py index 900bc52..3a28e61 100644 --- a/packages/shared/fastfetchbot_shared/services/scrapers/xiaohongshu/adaptar.py +++ b/packages/shared/fastfetchbot_shared/services/scrapers/xiaohongshu/adaptar.py @@ -9,6 +9,7 @@ from fastfetchbot_shared.config import settings as shared_settings from fastfetchbot_shared.utils.logger import logger +from fastfetchbot_shared.exceptions import ScraperError, ScraperParseError, ExternalServiceError XHS_API_URL = "https://edith.xiaohongshu.com" XHS_WEB_URL = "https://www.xiaohongshu.com" @@ -22,10 +23,10 @@ def parse_xhs_note_url(note_url: str) -> Dict[str, str]: query = dict(parse_qsl(parsed.query, keep_blank_values=True)) path_parts = [part for part in parsed.path.split("/") if part] if not path_parts: - raise ValueError(f"Invalid XHS note URL: {note_url}") + raise ScraperParseError(f"Invalid XHS note URL: {note_url}") note_id = path_parts[-1] if note_id in {"explore", "discovery", "item"}: - raise ValueError(f"Invalid XHS note URL path: {note_url}") + raise ScraperParseError(f"Invalid XHS note URL path: {note_url}") return { "note_id": note_id, "xsec_token": query.get("xsec_token", ""), @@ -52,7 +53,7 @@ def __init__( self.cookies = cookies.strip() self.sign_server_endpoint = (sign_server_endpoint or shared_settings.SIGN_SERVER_URL).rstrip("/") if not self.sign_server_endpoint: - raise ValueError( + raise ExternalServiceError( "XhsSinglePostAdapter requires a sign server URL. 
" "Set shared_settings.SIGN_SERVER_URL in the environment or pass sign_server_endpoint explicitly." ) @@ -100,12 +101,12 @@ async def _sign_headers(self, uri: str, data: Optional[Any] = None) -> Dict[str, resp.raise_for_status() body = resp.json() if not body.get("isok"): - raise RuntimeError(f"XHS sign server returned error: {body}") + raise ExternalServiceError(f"XHS sign server returned error: {body}") sign = body.get("data", {}) or {} required = ["x_s", "x_t", "x_s_common", "x_b3_traceid"] missing = [key for key in required if key not in sign] if missing: - raise RuntimeError(f"XHS sign response missing fields: {missing}") + raise ExternalServiceError(f"XHS sign response missing fields: {missing}") headers = self._base_headers() headers.update( { @@ -147,7 +148,7 @@ async def fetch_post( xsec_source=url_info["xsec_source"], ) if note is None: - raise RuntimeError(f"Cannot fetch note detail from API or HTML: {url_info['note_id']}") + raise ScraperError(f"Cannot fetch note detail from API or HTML: {url_info['note_id']}") result: Dict[str, Any] = {"platform": "xhs", "note": note, "comments": [], "url": get_pure_url(note_url)} if with_comments: @@ -191,18 +192,18 @@ def _parse_api_response(resp: httpx.Response) -> Dict[str, Any]: try: body = resp.json() except json.JSONDecodeError as exc: - raise RuntimeError(f"XHS API returned non-JSON: status={resp.status_code}") from exc + raise ScraperParseError(f"XHS API returned non-JSON: status={resp.status_code}") from exc if resp.status_code in (461, 471): verify_type = resp.headers.get("Verifytype", "") verify_uuid = resp.headers.get("Verifyuuid", "") - raise RuntimeError( + raise ScraperError( f"XHS blocked by captcha: verify_type={verify_type}, verify_uuid={verify_uuid}" ) if body.get("success"): return body.get("data", {}) or {} - raise RuntimeError(f"XHS API error: status={resp.status_code}, body={body}") + raise ScraperParseError(f"XHS API error: status={resp.status_code}, body={body}") async def _fetch_note_by_api( 
self, @@ -536,7 +537,7 @@ async def _get_redirection_url(self, note_url: str) -> str: final_url = str(resp.url) if XHS_WEB_URL not in final_url and "xiaohongshu.com" not in final_url: - raise RuntimeError( + raise ScraperError( f"Short URL did not redirect to xiaohongshu.com: {note_url} -> {final_url}" ) return final_url diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/zhihu/__init__.py b/packages/shared/fastfetchbot_shared/services/scrapers/zhihu/__init__.py index 96c3da5..8f8923d 100644 --- a/packages/shared/fastfetchbot_shared/services/scrapers/zhihu/__init__.py +++ b/packages/shared/fastfetchbot_shared/services/scrapers/zhihu/__init__.py @@ -1,6 +1,5 @@ import json import re -import traceback from typing import Dict, Optional, Any from urllib.parse import urlparse @@ -18,6 +17,7 @@ from fastfetchbot_shared.utils.network import get_selector, get_redirect_url, get_response_json, get_random_user_agent, \ get_content_async, get_response from fastfetchbot_shared.models.metadata_item import MetadataItem, MediaFile, MessageType +from fastfetchbot_shared.exceptions import ScraperError, ScraperParseError from fastfetchbot_shared.services.scrapers.config import settings, JINJA2_ENV from .config import ( SHORT_LIMIT, @@ -176,12 +176,12 @@ async def _get_zhihu_item(self) -> None: if self.title != "": break except Exception as e: - traceback.print_exc() + logger.exception(f"Zhihu scraper method {method} failed") if method == ALL_METHODS[-1]: - print("all methods failed") - raise e + logger.error("All Zhihu scraper methods failed") + raise else: - print( + logger.warning( f"zhihu {self.zhihu_type} {self.method} failed, try the next method" ) continue @@ -299,7 +299,7 @@ async def _get_zhihu_answer(self) -> None: answer_data = _parse_answer_api_json_data(json_data) logger.debug(f"answer data: {answer_data}") except Exception as e: - raise Exception("Cannot get the answer by API") + raise ScraperParseError("Cannot get the answer by API") from e elif 
self.method == "fxzhihu": try: resp = await get_response(url=self.request_url, headers=self.headers, client=self.httpx_client) @@ -308,7 +308,7 @@ async def _get_zhihu_answer(self) -> None: answer_data = _parse_answer_api_json_data(json_data) logger.debug(f"answer data: {answer_data}") except Exception as e: - raise Exception("Cannot get the answer by fxzhihu, error: " + str(e)) + raise ScraperError("Cannot get the answer by fxzhihu, error: " + str(e)) from e elif self.method == "json": try: selector = await get_selector(self.request_url, headers=self.headers) @@ -317,9 +317,9 @@ async def _get_zhihu_answer(self) -> None: json_data = json_data["initialState"]["entities"] answer_data = self._parse_answer_json_data(json_data) except Exception as e: - raise Exception("Cannot get the selector") + raise ScraperParseError("Cannot get the selector") from e if answer_data == {}: - raise Exception("Cannot get the answer") + raise ScraperError("Cannot get the answer") self._resolve_answer_json_data(answer_data) # Apply FxZhihu-style content processing for api method if self.method == "api": @@ -350,11 +350,11 @@ async def _get_zhihu_answer(self) -> None: if self.author_url == "https://www.zhihu.com/people/": self.author_url = "" except Exception as e: - raise Exception("Cannot get the answer") + raise ScraperError("Cannot get the answer") from e if ( self.title == "" ): # TODO: this is not a good way to check if the scraping is successful. To be improved. 
- raise Exception("Cannot get the answer") + raise ScraperError("Cannot get the answer") async def _get_zhihu_status(self): """ @@ -390,8 +390,8 @@ async def _get_zhihu_status(self): else: try: selector = await get_selector(self.request_url, headers=self.headers) - except: - raise Exception("zhihu request failed") + except Exception as e: + raise ScraperError("zhihu request failed") from e if self.method == "json": def _process_picture(pictures, content_attr): if not hasattr(self, content_attr): @@ -516,8 +516,8 @@ def _process_picture(pictures, content_attr): self.retweet_html = str( html.tostring(pic_html, pretty_print=True) ).replace("b'test
", output_filename="test.pdf") + + def test_preserves_original_cause(self): + from fastfetchbot_file_export.pdf_export import export_pdf + + original = OSError("disk full") + with patch( + "fastfetchbot_file_export.pdf_export.convert_html_to_pdf", + side_effect=original, + ): + with pytest.raises(FileExportError) as exc_info: + export_pdf(html_string="x
", output_filename="out.pdf") + assert exc_info.value.__cause__ is original + + +class TestConvertHtmlToPdf: + def test_no_input_raises_file_export_error(self): + from fastfetchbot_file_export.pdf_export import convert_html_to_pdf + + with pytest.raises(FileExportError, match="Either html_string or html_file"): + convert_html_to_pdf("output.pdf") diff --git a/tests/unit/file_export/test_transcribe_exceptions.py b/tests/unit/file_export/test_transcribe_exceptions.py new file mode 100644 index 0000000..4461752 --- /dev/null +++ b/tests/unit/file_export/test_transcribe_exceptions.py @@ -0,0 +1,101 @@ +"""Tests for exception handling in packages/file-export/fastfetchbot_file_export/transcribe.py""" + +from unittest.mock import patch, MagicMock, mock_open + +import pytest + +from fastfetchbot_shared.exceptions import FileExportError + + +class TestGetAudioTextExceptionHandling: + def test_raises_file_export_error_on_failure(self): + from fastfetchbot_file_export.transcribe import get_audio_text + + with patch( + "fastfetchbot_file_export.transcribe.OpenAI", + side_effect=RuntimeError("API key invalid"), + ): + with pytest.raises(FileExportError, match="Audio transcription failed"): + get_audio_text("/tmp/nonexistent.mp3", "fake-key") + + def test_preserves_original_cause(self): + from fastfetchbot_file_export.transcribe import get_audio_text + + original = RuntimeError("connection refused") + with patch( + "fastfetchbot_file_export.transcribe.OpenAI", + side_effect=original, + ): + with pytest.raises(FileExportError) as exc_info: + get_audio_text("/tmp/test.mp3", "key") + assert exc_info.value.__cause__ is original + + +class TestGetAudioTextHappyPath: + def test_successful_transcription(self): + from fastfetchbot_file_export.transcribe import get_audio_text + + # Mock OpenAI client + mock_client = MagicMock() + mock_transcription = MagicMock() + mock_transcription.text = "hello world" + mock_client.audio.transcriptions.create.return_value = mock_transcription + + 
mock_punctuation_response = MagicMock() + mock_punctuation_response.choices = [MagicMock()] + mock_punctuation_response.choices[0].message.content = "Hello, world." + + mock_summary_response = MagicMock() + mock_summary_response.choices = [MagicMock()] + mock_summary_response.choices[0].message.content = "A greeting." + + # Track which call we're on + call_count = [0] + def chat_side_effect(**kwargs): + call_count[0] += 1 + if call_count[0] == 1: + return mock_punctuation_response + return mock_summary_response + + mock_client.chat.completions.create.side_effect = chat_side_effect + + # Mock AudioSegment + mock_audio = MagicMock() + mock_audio.__getitem__ = MagicMock(return_value=mock_audio) + mock_audio.duration_seconds = 10 # short audio, single segment + mock_audio.dBFS = -10 # above silence threshold + mock_audio.export = MagicMock() + + mock_segment = MagicMock() + mock_segment.dBFS = -10 + mock_audio.__getitem__.return_value = mock_segment + mock_segment.export = MagicMock() + + with patch("fastfetchbot_file_export.transcribe.OpenAI", return_value=mock_client), \ + patch("fastfetchbot_file_export.transcribe.AudioSegment") as MockAudioSegment, \ + patch("fastfetchbot_file_export.transcribe.os.remove") as mock_remove, \ + patch("fastfetchbot_file_export.transcribe.os.path.splitext", return_value=("/tmp/audio", ".mp3")), \ + patch("builtins.open", mock_open(read_data=b"audio data")), \ + patch("fastfetchbot_file_export.transcribe.milliseconds_until_sound", return_value=0): + + MockAudioSegment.from_file.return_value = mock_audio + + result = get_audio_text("/tmp/audio.mp3", "test-api-key") + + assert "Hello, world." in result + assert "A greeting." 
in result + # Should have cleaned up the original file + mock_remove.assert_called() + + def test_audio_segment_failure_wraps_in_file_export_error(self): + from fastfetchbot_file_export.transcribe import get_audio_text + + mock_client = MagicMock() + + with patch("fastfetchbot_file_export.transcribe.OpenAI", return_value=mock_client), \ + patch("fastfetchbot_file_export.transcribe.AudioSegment") as MockAudioSegment: + + MockAudioSegment.from_file.side_effect = FileNotFoundError("no such file") + + with pytest.raises(FileExportError, match="Audio transcription failed"): + get_audio_text("/tmp/missing.mp3", "key") diff --git a/tests/unit/file_export/test_video_download.py b/tests/unit/file_export/test_video_download.py index 92099f3..b4c0842 100644 --- a/tests/unit/file_export/test_video_download.py +++ b/tests/unit/file_export/test_video_download.py @@ -426,3 +426,51 @@ def test_no_media_file_when_download_false(self, mock_celery): } vd._video_info_formatting(meta_info) assert vd.media_files == [] + + +# --------------------------------------------------------------------------- +# Exception handling in fastfetchbot_file_export.video_download +# --------------------------------------------------------------------------- + + +class TestVideoDownloadExceptions: + def test_unsupported_extractor_raises_file_export_error(self): + from fastfetchbot_file_export.video_download import get_format_for_orientation + from fastfetchbot_shared.exceptions import FileExportError + + with pytest.raises(FileExportError, match="no available extractor found"): + get_format_for_orientation("tiktok", "horizontal", False) + + def test_download_video_wraps_errors_in_file_export_error(self): + from fastfetchbot_file_export.video_download import download_video + from fastfetchbot_shared.exceptions import FileExportError + + with patch( + "fastfetchbot_file_export.video_download.init_yt_downloader" + ) as mock_init: + mock_ydl = MagicMock() + mock_ydl.__enter__ = 
MagicMock(return_value=mock_ydl) + mock_ydl.__exit__ = MagicMock(return_value=False) + mock_ydl.extract_info.side_effect = RuntimeError("network error") + mock_init.return_value = mock_ydl + + with pytest.raises(FileExportError, match="download_video failed"): + download_video("https://youtube.com/watch?v=test", download=True) + + def test_download_video_preserves_original_cause(self): + from fastfetchbot_file_export.video_download import download_video + from fastfetchbot_shared.exceptions import FileExportError + + original = RuntimeError("connection reset") + with patch( + "fastfetchbot_file_export.video_download.init_yt_downloader" + ) as mock_init: + mock_ydl = MagicMock() + mock_ydl.__enter__ = MagicMock(return_value=mock_ydl) + mock_ydl.__exit__ = MagicMock(return_value=False) + mock_ydl.extract_info.side_effect = original + mock_init.return_value = mock_ydl + + with pytest.raises(FileExportError) as exc_info: + download_video("https://youtube.com/watch?v=x", download=True) + assert exc_info.value.__cause__ is original diff --git a/tests/unit/scrapers/test_bluesky.py b/tests/unit/scrapers/test_bluesky.py index a5cbf1c..f0081d7 100644 --- a/tests/unit/scrapers/test_bluesky.py +++ b/tests/unit/scrapers/test_bluesky.py @@ -635,7 +635,7 @@ async def test_request_post_data_uses_handle_when_no_did(self): @pytest.mark.asyncio async def test_request_post_data_exception_handling(self): - """_request_post_data should log error and return None on exception.""" + """_request_post_data should log error and re-raise on exception.""" from fastfetchbot_shared.services.scrapers.bluesky.scraper import BlueskyScraper with patch( @@ -652,5 +652,5 @@ async def test_request_post_data_exception_handling(self): bluesky_post.handle = "alice" bluesky_post.post_rkey = "rkey123" - result = await scraper._request_post_data(bluesky_post) - assert result is None + with pytest.raises(Exception, match="network error"): + await scraper._request_post_data(bluesky_post) diff --git 
a/tests/unit/scrapers/test_common.py b/tests/unit/scrapers/test_common.py index 051f270..4f613c7 100644 --- a/tests/unit/scrapers/test_common.py +++ b/tests/unit/scrapers/test_common.py @@ -6,6 +6,7 @@ from fastfetchbot_shared.models.url_metadata import UrlMetadata from fastfetchbot_shared.services.scrapers.common import InfoExtractService +from fastfetchbot_shared.exceptions import ScraperError # --------------------------------------------------------------------------- @@ -215,3 +216,19 @@ async def test_process_item_no_strip_needed(self, make_url_metadata): svc = InfoExtractService(url_metadata=make_url_metadata()) result = await svc.process_item({"title": "clean"}) assert result["title"] == "clean" + + +# --------------------------------------------------------------------------- +# _resolve_scraper_class +# --------------------------------------------------------------------------- + +class TestResolveScraperClass: + def test_known_category_returns_class(self, make_url_metadata): + svc = InfoExtractService(url_metadata=make_url_metadata(source="twitter")) + cls = svc._resolve_scraper_class("twitter") + assert cls is not None + + def test_unknown_category_raises_scraper_error(self, make_url_metadata): + svc = InfoExtractService(url_metadata=make_url_metadata()) + with pytest.raises(ScraperError, match="No scraper registered"): + svc._resolve_scraper_class("tiktok") diff --git a/tests/unit/scrapers/test_general_firecrawl.py b/tests/unit/scrapers/test_general_firecrawl.py index d095519..a38d46a 100644 --- a/tests/unit/scrapers/test_general_firecrawl.py +++ b/tests/unit/scrapers/test_general_firecrawl.py @@ -9,6 +9,7 @@ FirecrawlClient, FirecrawlSettings, ) +from fastfetchbot_shared.exceptions import ExternalServiceError # --------------------------------------------------------------------------- @@ -166,7 +167,7 @@ async def test_scrape_url_exception(self, mock_fc_cls): mock_app.scrape.side_effect = Exception("network error") client = 
FirecrawlClient.get_instance() - with pytest.raises(RuntimeError, match="Firecrawl scrape_url failed"): + with pytest.raises(ExternalServiceError, match="Firecrawl scrape_url failed"): await client.scrape_url(url="https://fail.com") diff --git a/tests/unit/scrapers/test_general_zyte.py b/tests/unit/scrapers/test_general_zyte.py index 8695986..33f5861 100644 --- a/tests/unit/scrapers/test_general_zyte.py +++ b/tests/unit/scrapers/test_general_zyte.py @@ -8,6 +8,7 @@ ZyteDataProcessor, ZyteScraper, ) +from fastfetchbot_shared.exceptions import ExternalServiceError # --------------------------------------------------------------------------- @@ -36,7 +37,7 @@ class TestZyteGetPageContent: ) async def test_no_api_key_raises(self): proc = ZyteDataProcessor("https://example.com") - with pytest.raises(RuntimeError, match="ZYTE_API_KEY is not configured"): + with pytest.raises(ExternalServiceError, match="ZYTE_API_KEY is not configured"): await proc._get_page_content() @pytest.mark.asyncio diff --git a/tests/unit/scrapers/test_scraper_manager.py b/tests/unit/scrapers/test_scraper_manager.py index 24b0cc2..ec6d34c 100644 --- a/tests/unit/scrapers/test_scraper_manager.py +++ b/tests/unit/scrapers/test_scraper_manager.py @@ -5,6 +5,7 @@ import pytest from fastfetchbot_shared.services.scrapers.scraper_manager import ScraperManager +from fastfetchbot_shared.exceptions import ScraperError # --------------------------------------------------------------------------- @@ -124,8 +125,8 @@ async def test_init_unknown_when_already_initialized(self): class TestInitScraperUnsupported: @pytest.mark.asyncio - async def test_unsupported_category_raises_value_error(self): - with pytest.raises(ValueError, match="not supported"): + async def test_unsupported_category_raises_scraper_error(self): + with pytest.raises(ScraperError, match="not supported"): await ScraperManager.init_scraper("tiktok") diff --git a/tests/unit/scrapers/test_twitter_exceptions.py 
b/tests/unit/scrapers/test_twitter_exceptions.py new file mode 100644 index 0000000..6826730 --- /dev/null +++ b/tests/unit/scrapers/test_twitter_exceptions.py @@ -0,0 +1,97 @@ +"""Tests for exception handling in Twitter scraper.""" + +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +from fastfetchbot_shared.exceptions import ScraperError, ScraperParseError + + +class TestGetResponseTweetData: + @pytest.mark.asyncio + async def test_all_scrapers_fail_raises_scraper_error(self): + from fastfetchbot_shared.services.scrapers.twitter import Twitter + + tw = Twitter(url="https://twitter.com/user/status/123") + + with patch.object( + tw, "_rapidapi_get_response_tweet_data", + new_callable=AsyncMock, + side_effect=ScraperParseError("API error"), + ), patch.object( + tw, "_api_client_get_response_tweet_data", + new_callable=AsyncMock, + side_effect=RuntimeError("client error"), + ): + with pytest.raises(ScraperError, match="No valid response from all Twitter scrapers"): + await tw._get_response_tweet_data() + + @pytest.mark.asyncio + async def test_first_scraper_fails_falls_through(self): + from fastfetchbot_shared.services.scrapers.twitter import Twitter + + tw = Twitter(url="https://twitter.com/user/status/456") + fake_data = { + "data": { + "threaded_conversation_with_injections_v2": { + "instructions": [{"entries": []}] + } + } + } + + with patch.object( + tw, "_rapidapi_get_response_tweet_data", + new_callable=AsyncMock, + side_effect=RuntimeError("first failed"), + ), patch.object( + tw, "_api_client_get_response_tweet_data", + new_callable=AsyncMock, + return_value=fake_data, + ): + # Should fall through to api-client and succeed + result = await tw._get_response_tweet_data() + assert result == fake_data + + +class TestRapidapiResponseValidation: + @pytest.mark.asyncio + async def test_error_response_raises_scraper_parse_error(self): + from fastfetchbot_shared.services.scrapers.twitter import Twitter + import httpx + + tw = 
Twitter(url="https://twitter.com/user/status/789") + tw.scraper = "Twitter135" + + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.json.return_value = {"errors": [{"message": "rate limited"}]} + + with patch("httpx.AsyncClient") as MockClient: + mock_client = AsyncMock() + mock_client.get = AsyncMock(return_value=mock_response) + MockClient.return_value.__aenter__ = AsyncMock(return_value=mock_client) + MockClient.return_value.__aexit__ = AsyncMock(return_value=False) + + with patch.object(tw, "_get_request_headers"): + with pytest.raises(ScraperParseError, match="Invalid response"): + await tw._rapidapi_get_response_tweet_data() + + @pytest.mark.asyncio + async def test_non_200_raises_scraper_parse_error(self): + from fastfetchbot_shared.services.scrapers.twitter import Twitter + + tw = Twitter(url="https://twitter.com/user/status/000") + tw.scraper = "Twitter135" + + mock_response = MagicMock() + mock_response.status_code = 500 + + with patch("httpx.AsyncClient") as MockClient: + mock_client = AsyncMock() + mock_client.get = AsyncMock(return_value=mock_response) + MockClient.return_value.__aenter__ = AsyncMock(return_value=mock_client) + MockClient.return_value.__aexit__ = AsyncMock(return_value=False) + + with patch.object(tw, "_get_request_headers"): + with pytest.raises(ScraperParseError, match="Invalid response"): + await tw._rapidapi_get_response_tweet_data() diff --git a/tests/unit/scrapers/test_weibo.py b/tests/unit/scrapers/test_weibo.py index 8b2230e..d19461c 100644 --- a/tests/unit/scrapers/test_weibo.py +++ b/tests/unit/scrapers/test_weibo.py @@ -24,6 +24,7 @@ from typing import Dict from fastfetchbot_shared.models.metadata_item import MediaFile, MessageType +from fastfetchbot_shared.exceptions import ScraperError, ScraperNetworkError, ScraperParseError # --------------------------------------------------------------------------- @@ -218,7 +219,7 @@ def test_lxml_method(self): def test_invalid_method_raises(self): from 
fastfetchbot_shared.services.scrapers.weibo.scraper import WeiboDataProcessor - with pytest.raises(ValueError, match="method must be bs4 or lxml"): + with pytest.raises(ScraperError, match="method must be bs4 or lxml"): WeiboDataProcessor._weibo_html_text_clean("test
", method="invalid") @@ -803,7 +804,7 @@ async def test_invalid_method_raises(self): mock_env.get_template.return_value = mock_template from fastfetchbot_shared.services.scrapers.weibo.scraper import WeiboDataProcessor wp = WeiboDataProcessor(url="https://m.weibo.cn/detail/123") - with pytest.raises(ValueError, match="method must be webpage or api"): + with pytest.raises(ScraperError, match="method must be webpage or api"): await wp._get_weibo_info(method="invalid") @pytest.mark.asyncio @@ -827,7 +828,7 @@ async def test_connection_error_propagated(self): from fastfetchbot_shared.services.scrapers.weibo.scraper import WeiboDataProcessor wp = WeiboDataProcessor(url="https://m.weibo.cn/detail/123") with patch.object(wp, "_get_weibo_info_api", new_callable=AsyncMock, side_effect=ConnectionError("net fail")): - with pytest.raises(ConnectionError, match="network issues"): + with pytest.raises(ScraperNetworkError, match="network issues"): await wp._get_weibo_info(method="api") @@ -960,7 +961,7 @@ async def test_ok_zero_raises(self): mock_client.__aexit__ = AsyncMock(return_value=False) with patch("fastfetchbot_shared.services.scrapers.weibo.scraper.httpx.AsyncClient", return_value=mock_client): - with pytest.raises(ConnectionError): + with pytest.raises(ScraperParseError): await wp._get_weibo_info_api() @pytest.mark.asyncio @@ -983,7 +984,7 @@ async def test_empty_response_raises(self): mock_client.__aexit__ = AsyncMock(return_value=False) with patch("fastfetchbot_shared.services.scrapers.weibo.scraper.httpx.AsyncClient", return_value=mock_client): - with pytest.raises(ConnectionError): + with pytest.raises(ScraperParseError): await wp._get_weibo_info_api() @pytest.mark.asyncio @@ -1004,7 +1005,7 @@ async def test_http_error_raises(self): mock_client.__aexit__ = AsyncMock(return_value=False) with patch("fastfetchbot_shared.services.scrapers.weibo.scraper.httpx.AsyncClient", return_value=mock_client): - with pytest.raises(ConnectionError): + with 
pytest.raises(ScraperNetworkError): await wp._get_weibo_info_api() @@ -1083,7 +1084,7 @@ async def side_effect(method=None): nonlocal call_count call_count += 1 if call_count == 1: - raise ConnectionError("api fail") + raise ScraperNetworkError("api fail") return weibo_info with patch.object(wp, "_get_weibo_info", new_callable=AsyncMock, side_effect=side_effect): @@ -1100,12 +1101,12 @@ async def test_both_fail_raises(self): from fastfetchbot_shared.services.scrapers.weibo.scraper import WeiboDataProcessor wp = WeiboDataProcessor(url="https://m.weibo.cn/detail/123") - with patch.object(wp, "_get_weibo_info", new_callable=AsyncMock, side_effect=ConnectionError("fail")): - with pytest.raises(ConnectionError): + with patch.object(wp, "_get_weibo_info", new_callable=AsyncMock, side_effect=ScraperNetworkError("fail")): + with pytest.raises(ScraperNetworkError): await wp._get_weibo() @pytest.mark.asyncio - async def test_process_item_exception_caught(self): + async def test_process_item_exception_reraised(self): with patch("fastfetchbot_shared.services.scrapers.weibo.scraper.JINJA2_ENV") as mock_env: mock_template = MagicMock() mock_template.render.return_value = "rendered
" @@ -1115,8 +1116,8 @@ async def test_process_item_exception_caught(self): with patch.object(wp, "_get_weibo_info", new_callable=AsyncMock, return_value={"id": "123"}): with patch.object(wp, "_process_weibo_item", new_callable=AsyncMock, side_effect=Exception("process fail")): - # Should not raise, exception is caught and logged - await wp._get_weibo() + with pytest.raises(Exception, match="process fail"): + await wp._get_weibo() # --------------------------------------------------------------------------- diff --git a/tests/unit/scrapers/test_xiaohongshu.py b/tests/unit/scrapers/test_xiaohongshu.py index eca8e50..6fadc8e 100644 --- a/tests/unit/scrapers/test_xiaohongshu.py +++ b/tests/unit/scrapers/test_xiaohongshu.py @@ -19,6 +19,7 @@ XHS_WEB_URL, ) from fastfetchbot_shared.models.metadata_item import MessageType, MediaFile +from fastfetchbot_shared.exceptions import ScraperError, ScraperParseError, ExternalServiceError # --------------------------------------------------------------------------- @@ -48,19 +49,19 @@ def test_nested_path(self): assert result["note_id"] == "note456" def test_empty_path_raises(self): - with pytest.raises(ValueError, match="Invalid XHS note URL"): + with pytest.raises(ScraperParseError, match="Invalid XHS note URL"): parse_xhs_note_url("https://www.xiaohongshu.com/") def test_explore_only_raises(self): - with pytest.raises(ValueError, match="Invalid XHS note URL path"): + with pytest.raises(ScraperParseError, match="Invalid XHS note URL path"): parse_xhs_note_url("https://www.xiaohongshu.com/explore") def test_discovery_only_raises(self): - with pytest.raises(ValueError, match="Invalid XHS note URL path"): + with pytest.raises(ScraperParseError, match="Invalid XHS note URL path"): parse_xhs_note_url("https://www.xiaohongshu.com/discovery") def test_item_only_raises(self): - with pytest.raises(ValueError, match="Invalid XHS note URL path"): + with pytest.raises(ScraperParseError, match="Invalid XHS note URL path"): 
parse_xhs_note_url("https://www.xiaohongshu.com/item") @@ -101,7 +102,7 @@ def test_strips_trailing_slash(self): @patch("fastfetchbot_shared.config.settings.SIGN_SERVER_URL", "") def test_no_sign_server_raises(self): - with pytest.raises(ValueError, match="sign server URL"): + with pytest.raises(ExternalServiceError, match="sign server URL"): XhsSinglePostAdapter(cookies="c=1", sign_server_endpoint="") @patch("fastfetchbot_shared.config.settings.SIGN_SERVER_URL", "http://fallback:8989") @@ -193,7 +194,7 @@ async def test_not_ok_raises(self): adapter._http = AsyncMock() adapter._http.post = AsyncMock(return_value=mock_resp) - with pytest.raises(RuntimeError, match="sign server returned error"): + with pytest.raises(ExternalServiceError, match="sign server returned error"): await adapter._sign_headers("/api/test") @pytest.mark.asyncio @@ -210,7 +211,7 @@ async def test_missing_fields_raises(self): adapter._http = AsyncMock() adapter._http.post = AsyncMock(return_value=mock_resp) - with pytest.raises(RuntimeError, match="missing fields"): + with pytest.raises(ExternalServiceError, match="missing fields"): await adapter._sign_headers("/api/test") @pytest.mark.asyncio @@ -224,7 +225,7 @@ async def test_none_data_uses_empty_dict(self): adapter._http = AsyncMock() adapter._http.post = AsyncMock(return_value=mock_resp) - with pytest.raises(RuntimeError, match="missing fields"): + with pytest.raises(ExternalServiceError, match="missing fields"): await adapter._sign_headers("/api/test") @pytest.mark.asyncio @@ -273,7 +274,7 @@ def test_non_json_raises(self): resp = MagicMock(spec=httpx.Response) resp.status_code = 200 resp.json.side_effect = json.JSONDecodeError("err", "", 0) - with pytest.raises(RuntimeError, match="non-JSON"): + with pytest.raises(ScraperParseError, match="non-JSON"): XhsSinglePostAdapter._parse_api_response(resp) def test_captcha_461(self): @@ -281,7 +282,7 @@ def test_captcha_461(self): 461, {"success": False}, headers={"Verifytype": "captcha", 
"Verifyuuid": "uuid1"}, ) - with pytest.raises(RuntimeError, match="captcha"): + with pytest.raises(ScraperError, match="captcha"): XhsSinglePostAdapter._parse_api_response(resp) def test_captcha_471(self): @@ -289,12 +290,12 @@ def test_captcha_471(self): 471, {"success": False}, headers={"Verifytype": "sms", "Verifyuuid": "uuid2"}, ) - with pytest.raises(RuntimeError, match="captcha"): + with pytest.raises(ScraperError, match="captcha"): XhsSinglePostAdapter._parse_api_response(resp) def test_api_error_not_success(self): resp = self._make_response(200, {"success": False, "msg": "error"}) - with pytest.raises(RuntimeError, match="XHS API error"): + with pytest.raises(ScraperParseError, match="XHS API error"): XhsSinglePostAdapter._parse_api_response(resp) @@ -951,7 +952,7 @@ async def test_both_fail_raises(self): adapter._fetch_note_by_html = AsyncMock(return_value=None) url = f"{XHS_WEB_URL}/explore/n1" - with pytest.raises(RuntimeError, match="Cannot fetch note"): + with pytest.raises(ScraperError, match="Cannot fetch note"): await adapter.fetch_post(note_url=url) @pytest.mark.asyncio @@ -1292,7 +1293,7 @@ async def test_not_xhs_raises(self): mock_client.__aexit__ = AsyncMock(return_value=None) MockClient.return_value = mock_client - with pytest.raises(RuntimeError, match="did not redirect to xiaohongshu.com"): + with pytest.raises(ScraperError, match="did not redirect to xiaohongshu.com"): await adapter._get_redirection_url("https://xhslink.com/abc") diff --git a/tests/unit/telegram_bot/test_message_sender.py b/tests/unit/telegram_bot/test_message_sender.py new file mode 100644 index 0000000..962777b --- /dev/null +++ b/tests/unit/telegram_bot/test_message_sender.py @@ -0,0 +1,154 @@ +"""Tests for exception handling in apps/telegram-bot/core/services/message_sender.py""" + +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + + +class TestMediaFilesPackagingDownloadFailure: + """Test that download failures in media_files_packaging are caught 
and skipped.""" + + @pytest.mark.asyncio + async def test_http_download_failure_skips_media(self): + from core.services.message_sender import media_files_packaging + + media_files = [ + {"media_type": "image", "url": "https://example.com/img.jpg", "caption": ""}, + ] + data = { + "url": "https://example.com", + "category": "twitter", + "message_type": "short", + } + + with patch( + "core.services.message_sender.download_file_by_metadata_item", + new_callable=AsyncMock, + side_effect=RuntimeError("network error"), + ): + media_group, file_group = await media_files_packaging(media_files, data) + + # Media item should be skipped, not crash + assert media_group == [] + assert file_group == [] + + @pytest.mark.asyncio + async def test_gif_download_failure_skips_media(self): + from core.services.message_sender import media_files_packaging + + media_files = [ + {"media_type": "gif", "url": "https://example.com/anim.gif", "caption": ""}, + ] + data = { + "url": "https://example.com", + "category": "twitter", + "message_type": "short", + } + + with patch( + "core.services.message_sender.download_file_by_metadata_item", + new_callable=AsyncMock, + side_effect=ConnectionError("timeout"), + ): + media_group, file_group = await media_files_packaging(media_files, data) + + assert media_group == [] + assert file_group == [] + + @pytest.mark.asyncio + async def test_image_document_download_failure_skips(self): + """Test the second download_file_by_metadata_item call for large image documents.""" + from core.services.message_sender import media_files_packaging + + mock_io = MagicMock() + mock_io.name = "media-abc.jpg" + mock_io.size = 15 * 1024 * 1024 # 15MB, over image size limit to trigger document path + + mock_image = MagicMock() + mock_image.size = (5000, 5000) # large image to trigger document download + mock_image.width = 5000 + mock_image.height = 5000 + + call_count = [0] + + async def download_side_effect(*args, **kwargs): + call_count[0] += 1 + if call_count[0] == 1: + 
return mock_io + raise RuntimeError("second download failed") + + media_files = [ + {"media_type": "image", "url": "https://example.com/big.jpg", "caption": ""}, + ] + data = { + "url": "https://example.com", + "category": "twitter", + "message_type": "short", + } + + with patch( + "core.services.message_sender.download_file_by_metadata_item", + new_callable=AsyncMock, + side_effect=download_side_effect, + ), patch( + "core.services.message_sender.check_image_type", + new_callable=AsyncMock, + return_value="jpg", + ), patch( + "core.services.message_sender.Image" + ) as MockImage, patch( + "core.services.message_sender.image_compressing", + return_value=mock_image, + ), patch( + "core.services.message_sender.settings" + ) as mock_settings: + MockImage.open.return_value = mock_image + mock_settings.TELEBOT_API_SERVER = None + mock_settings.TELEGRAM_IMAGE_DIMENSION_LIMIT = 2000 + mock_settings.TELEGRAM_IMAGE_SIZE_LIMIT = 10 * 1024 * 1024 # 10MB + + media_group, file_group = await media_files_packaging(media_files, data) + + # The image media_group should have the photo, but file_group should be empty + # because the document download failed and was skipped + assert file_group == [] + + +class TestSendItemMessageExceptionHandling: + @pytest.mark.asyncio + async def test_exception_logged_and_sent_to_debug_channel(self): + from core.services.message_sender import send_item_message + + mock_app = MagicMock() + mock_bot = AsyncMock() + mock_app.bot = mock_bot + + mock_chat = MagicMock() + mock_chat.type = "private" + mock_chat.linked_chat_id = None + mock_bot.get_chat = AsyncMock(return_value=mock_chat) + mock_bot.send_message = AsyncMock(side_effect=RuntimeError("telegram API down")) + + with patch("core.services.message_sender._get_application", return_value=mock_app), \ + patch("core.services.message_sender.send_debug_channel", new_callable=AsyncMock) as mock_debug: + + # Should not raise — the exception is caught and logged + await send_item_message( + data={ + 
"media_files": [], + "message_type": "short", + "text": "test", + "title": "Test", + "author": "a", + "author_url": "", + "url": "https://example.com", + "telegraph_url": "", + "category": "twitter", + "content": "", + }, + chat_id=12345, + ) + + # Debug channel should have been called with the traceback + mock_debug.assert_awaited_once() + assert "telegram API down" in mock_debug.call_args[0][0] diff --git a/tests/unit/telegram_bot/test_outbox_consumer.py b/tests/unit/telegram_bot/test_outbox_consumer.py index d3d1fb4..67f6cd8 100644 --- a/tests/unit/telegram_bot/test_outbox_consumer.py +++ b/tests/unit/telegram_bot/test_outbox_consumer.py @@ -136,7 +136,39 @@ async def brpop_side_effect(*args, **kwargs): class TestConsumeLoopError: @pytest.mark.asyncio - async def test_sends_error_to_chat(self, mock_redis): + async def test_sends_error_to_debug_channel(self, mock_redis): + payload = _make_payload(error="scraper failed", chat_id=99, job_id="j42") + call_count = 0 + + async def brpop_side_effect(*args, **kwargs): + nonlocal call_count + call_count += 1 + if call_count == 1: + return ("scrape:outbox", payload) + raise asyncio.CancelledError() + + mock_redis.brpop = AsyncMock(side_effect=brpop_side_effect) + + with patch( + "core.services.outbox_consumer.aioredis.from_url", + return_value=mock_redis, + ), patch( + "core.services.outbox_consumer.send_debug_channel", + new_callable=AsyncMock, + ) as mock_debug: + from core.services.outbox_consumer import _consume_loop + + await _consume_loop() + + mock_debug.assert_awaited_once() + msg = mock_debug.call_args[0][0] + assert "scraper failed" in msg + assert "j42" in msg + assert "99" in msg + + @pytest.mark.asyncio + async def test_does_not_send_error_to_user_chat(self, mock_redis): + """Scrape errors should NOT notify the user.""" payload = _make_payload(error="scraper failed", chat_id=99) call_count = 0 @@ -153,14 +185,18 @@ async def brpop_side_effect(*args, **kwargs): 
"core.services.outbox_consumer.aioredis.from_url", return_value=mock_redis, ), patch( - "core.services.outbox_consumer._send_error_to_chat", + "core.services.outbox_consumer.send_debug_channel", new_callable=AsyncMock, - ) as mock_err: + ), patch( + "core.services.outbox_consumer.send_item_message", + new_callable=AsyncMock, + ) as mock_send: from core.services.outbox_consumer import _consume_loop await _consume_loop() - mock_err.assert_awaited_once_with(99, "scraper failed") + # User should receive no message at all + mock_send.assert_not_awaited() # --------------------------------------------------------------------------- @@ -222,43 +258,6 @@ async def brpop_side_effect(*args, **kwargs): assert call_count == 3 # 2 None returns, then cancel -# --------------------------------------------------------------------------- -# _send_error_to_chat -# --------------------------------------------------------------------------- - - -class TestSendErrorToChat: - @pytest.mark.asyncio - async def test_sends_error_message(self): - mock_bot = AsyncMock() - mock_app = MagicMock() - mock_app.bot = mock_bot - - # _send_error_to_chat uses: from core.services.bot_app import application - with patch("core.services.bot_app.application", mock_app): - from core.services.outbox_consumer import _send_error_to_chat - - await _send_error_to_chat(42, "something went wrong") - - mock_bot.send_message.assert_awaited_once() - call_kwargs = mock_bot.send_message.call_args.kwargs - assert call_kwargs["chat_id"] == 42 - assert "something went wrong" in call_kwargs["text"] - - @pytest.mark.asyncio - async def test_handles_send_failure_gracefully(self): - mock_bot = AsyncMock() - mock_bot.send_message = AsyncMock(side_effect=RuntimeError("telegram down")) - mock_app = MagicMock() - mock_app.bot = mock_bot - - with patch("core.services.bot_app.application", mock_app): - from core.services.outbox_consumer import _send_error_to_chat - - # Should not raise - await _send_error_to_chat(42, "error") - - # 
--------------------------------------------------------------------------- # start / stop # --------------------------------------------------------------------------- diff --git a/tests/unit/telegram_bot/test_webhook_server.py b/tests/unit/telegram_bot/test_webhook_server.py new file mode 100644 index 0000000..db16adc --- /dev/null +++ b/tests/unit/telegram_bot/test_webhook_server.py @@ -0,0 +1,94 @@ +"""Tests for apps/telegram-bot/core/webhook/server.py""" + +import asyncio +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + + +class TestLogTaskException: + def test_logs_exception_from_failed_task(self): + from core.webhook.server import _log_task_exception + + mock_task = MagicMock(spec=asyncio.Task) + mock_task.cancelled.return_value = False + test_exc = RuntimeError("task boom") + mock_task.exception.return_value = test_exc + + with patch("core.webhook.server.logger") as mock_logger: + _log_task_exception(mock_task) + mock_logger.exception.assert_called_once() + + def test_does_not_log_cancelled_task(self): + from core.webhook.server import _log_task_exception + + mock_task = MagicMock(spec=asyncio.Task) + mock_task.cancelled.return_value = True + + with patch("core.webhook.server.logger") as mock_logger: + _log_task_exception(mock_task) + mock_logger.exception.assert_not_called() + + def test_does_not_log_successful_task(self): + from core.webhook.server import _log_task_exception + + mock_task = MagicMock(spec=asyncio.Task) + mock_task.cancelled.return_value = False + mock_task.exception.return_value = None + + with patch("core.webhook.server.logger") as mock_logger: + _log_task_exception(mock_task) + mock_logger.exception.assert_not_called() + + +class TestTelegramWebhook: + @pytest.mark.asyncio + async def test_invalid_json_returns_400(self): + from core.webhook.server import telegram_webhook + + mock_request = MagicMock() + mock_request.headers = {"X-Telegram-Bot-Api-Secret-Token": ""} + mock_request.json = 
AsyncMock(side_effect=ValueError("bad json")) + + with patch("core.webhook.server.settings") as mock_settings: + mock_settings.TELEGRAM_BOT_SECRET_TOKEN = "" + response = await telegram_webhook(mock_request) + assert response.status_code == 400 + + @pytest.mark.asyncio + async def test_unauthorized_returns_401(self): + from core.webhook.server import telegram_webhook + + mock_request = MagicMock() + mock_request.headers = {"X-Telegram-Bot-Api-Secret-Token": "wrong"} + + with patch("core.webhook.server.settings") as mock_settings: + mock_settings.TELEGRAM_BOT_SECRET_TOKEN = "correct" + response = await telegram_webhook(mock_request) + assert response.status_code == 401 + + +class TestSendMessageEndpoint: + @pytest.mark.asyncio + async def test_exception_returns_500(self): + from core.webhook.server import send_message_endpoint + + mock_request = MagicMock() + mock_request.json = AsyncMock(side_effect=RuntimeError("db error")) + + response = await send_message_endpoint(mock_request) + assert response.status_code == 500 + + @pytest.mark.asyncio + async def test_success_returns_ok(self): + from core.webhook.server import send_message_endpoint + + mock_request = MagicMock() + mock_request.json = AsyncMock(return_value={ + "data": {"title": "test"}, + "chat_id": "123", + }) + + with patch("core.webhook.server.send_item_message", new_callable=AsyncMock): + response = await send_message_endpoint(mock_request) + assert response.status_code == 200 diff --git a/tests/unit/test_api_exception_handlers.py b/tests/unit/test_api_exception_handlers.py new file mode 100644 index 0000000..ef1953c --- /dev/null +++ b/tests/unit/test_api_exception_handlers.py @@ -0,0 +1,95 @@ +"""Tests for FastAPI global exception handlers in apps/api/src/main.py + +Uses a minimal FastAPI app with the same exception handlers to avoid +needing to fully configure the real app's dependencies. 
+""" + +import pytest +from fastapi import FastAPI +from fastapi.testclient import TestClient + +from fastfetchbot_shared.exceptions import ( + FastFetchBotError, + ScraperError, + ScraperParseError, + FileExportError, +) + + +def _create_test_app(): + """Create a minimal FastAPI app with the same exception handlers as main.py.""" + from fastapi import Request + from fastapi.responses import JSONResponse + from fastfetchbot_shared.utils.logger import logger + + app = FastAPI() + + @app.exception_handler(FastFetchBotError) + async def fastfetchbot_error_handler(request: Request, exc: FastFetchBotError): + logger.error(f"Domain error on {request.method} {request.url}: {exc}") + return JSONResponse(status_code=502, content={"error": str(exc)}) + + @app.exception_handler(Exception) + async def generic_error_handler(request: Request, exc: Exception): + logger.exception(f"Unhandled error on {request.method} {request.url}") + return JSONResponse(status_code=500, content={"error": "Internal server error"}) + + return app + + +@pytest.fixture +def test_app(): + return _create_test_app() + + +@pytest.fixture +def client(test_app): + return TestClient(test_app, raise_server_exceptions=False) + + +class TestFastFetchBotErrorHandler: + def test_scraper_error_returns_502(self, test_app, client): + @test_app.get("/test") + async def _(): + raise ScraperError("Twitter API failed") + + response = client.get("/test") + assert response.status_code == 502 + assert "Twitter API failed" in response.json()["error"] + + def test_scraper_parse_error_returns_502(self, test_app, client): + @test_app.get("/test-parse") + async def _(): + raise ScraperParseError("Invalid response") + + response = client.get("/test-parse") + assert response.status_code == 502 + + def test_file_export_error_returns_502(self, test_app, client): + @test_app.get("/test-export") + async def _(): + raise FileExportError("PDF failed") + + response = client.get("/test-export") + assert response.status_code == 502 + 
assert "PDF failed" in response.json()["error"] + + +class TestGenericExceptionHandler: + def test_unhandled_exception_returns_500(self, test_app, client): + @test_app.get("/test-crash") + async def _(): + raise RuntimeError("unexpected crash") + + response = client.get("/test-crash") + assert response.status_code == 500 + assert response.json()["error"] == "Internal server error" + + def test_generic_does_not_leak_details(self, test_app, client): + @test_app.get("/test-sensitive") + async def _(): + raise ValueError("secret database password") + + response = client.get("/test-sensitive") + assert response.status_code == 500 + assert "secret" not in response.json()["error"] diff --git a/tests/unit/test_exceptions.py b/tests/unit/test_exceptions.py new file mode 100644 index 0000000..5c685f3 --- /dev/null +++ b/tests/unit/test_exceptions.py @@ -0,0 +1,82 @@ +"""Tests for packages/shared/fastfetchbot_shared/exceptions.py""" + +import pytest + +from fastfetchbot_shared.exceptions import ( + FastFetchBotError, + ScraperError, + ScraperNetworkError, + ScraperParseError, + TelegraphPublishError, + FileExportError, + ExternalServiceError, +) + + +class TestExceptionHierarchy: + def test_scraper_error_is_fastfetchbot_error(self): + assert issubclass(ScraperError, FastFetchBotError) + + def test_scraper_network_error_is_scraper_error(self): + assert issubclass(ScraperNetworkError, ScraperError) + + def test_scraper_parse_error_is_scraper_error(self): + assert issubclass(ScraperParseError, ScraperError) + + def test_telegraph_publish_error_is_fastfetchbot_error(self): + assert issubclass(TelegraphPublishError, FastFetchBotError) + + def test_file_export_error_is_fastfetchbot_error(self): + assert issubclass(FileExportError, FastFetchBotError) + + def test_external_service_error_is_fastfetchbot_error(self): + assert issubclass(ExternalServiceError, FastFetchBotError) + + def test_all_are_exceptions(self): + for cls in ( + FastFetchBotError, + ScraperError, + 
ScraperNetworkError, + ScraperParseError, + TelegraphPublishError, + FileExportError, + ExternalServiceError, + ): + assert issubclass(cls, Exception) + + +class TestExceptionCatching: + """Verify that catching a parent also catches children.""" + + def test_catch_fastfetchbot_catches_scraper_error(self): + with pytest.raises(FastFetchBotError): + raise ScraperError("test") + + def test_catch_scraper_catches_network_error(self): + with pytest.raises(ScraperError): + raise ScraperNetworkError("network fail") + + def test_catch_scraper_catches_parse_error(self): + with pytest.raises(ScraperError): + raise ScraperParseError("parse fail") + + def test_catch_fastfetchbot_catches_file_export_error(self): + with pytest.raises(FastFetchBotError): + raise FileExportError("export fail") + + def test_catch_fastfetchbot_catches_external_service_error(self): + with pytest.raises(FastFetchBotError): + raise ExternalServiceError("service fail") + + +class TestExceptionChaining: + def test_from_preserves_original_cause(self): + original = ValueError("original") + try: + raise ScraperError("wrapped") from original + except ScraperError as e: + assert e.__cause__ is original + + def test_message_preserved(self): + exc = ScraperParseError("bad data") + assert str(exc) == "bad data" diff --git a/tests/unit/test_network.py b/tests/unit/test_network.py new file mode 100644 index 0000000..4cd264f --- /dev/null +++ b/tests/unit/test_network.py @@ -0,0 +1,103 @@ +"""Tests for exception handling in packages/shared/fastfetchbot_shared/utils/network.py""" + +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +from fastfetchbot_shared.models.classes import NamedBytesIO + + +class TestDownloadFileByMetadataItem: + @pytest.mark.asyncio + async def test_success_returns_named_bytes_io(self): + from fastfetchbot_shared.utils.network import download_file_by_metadata_item + + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.content = b"fake image 
data" + + mock_client = AsyncMock() + mock_client.get = AsyncMock(return_value=mock_response) + mock_client.__aenter__ = AsyncMock(return_value=mock_client) + mock_client.__aexit__ = AsyncMock(return_value=False) + + with patch("fastfetchbot_shared.utils.network.httpx.AsyncClient", return_value=mock_client), \ + patch("fastfetchbot_shared.utils.network.get_random_user_agent", return_value="TestAgent"): + result = await download_file_by_metadata_item( + url="https://example.com/image.jpg", + data={"url": "https://example.com", "category": "twitter"}, + ) + + assert isinstance(result, NamedBytesIO) + assert result.name.endswith(".jpg") + + @pytest.mark.asyncio + async def test_network_error_raises(self): + from fastfetchbot_shared.utils.network import download_file_by_metadata_item + + mock_client = AsyncMock() + mock_client.get = AsyncMock(side_effect=ConnectionError("connection refused")) + mock_client.__aenter__ = AsyncMock(return_value=mock_client) + mock_client.__aexit__ = AsyncMock(return_value=False) + + with patch("fastfetchbot_shared.utils.network.httpx.AsyncClient", return_value=mock_client), \ + patch("fastfetchbot_shared.utils.network.get_random_user_agent", return_value="TestAgent"): + with pytest.raises(ConnectionError, match="connection refused"): + await download_file_by_metadata_item( + url="https://example.com/image.jpg", + data={"url": "https://example.com", "category": "twitter"}, + ) + + @pytest.mark.asyncio + async def test_redirect_follows_location(self): + from fastfetchbot_shared.utils.network import download_file_by_metadata_item + + mock_response = MagicMock() + mock_response.status_code = 302 + mock_response.headers = {"Location": "https://cdn.example.com/real.jpg"} + mock_response.content = b"redirected content" + + mock_client = AsyncMock() + mock_client.get = AsyncMock(return_value=mock_response) + mock_client.__aenter__ = AsyncMock(return_value=mock_client) + mock_client.__aexit__ = AsyncMock(return_value=False) + + with 
patch("fastfetchbot_shared.utils.network.httpx.AsyncClient", return_value=mock_client), \ + patch("fastfetchbot_shared.utils.network.get_random_user_agent", return_value="TestAgent"): + result = await download_file_by_metadata_item( + url="https://example.com/image.jpg", + data={"url": "https://example.com", "category": "twitter"}, + ) + + assert isinstance(result, NamedBytesIO) + + +class TestGetResponseJson: + @pytest.mark.asyncio + async def test_exception_returns_none_and_logs(self): + from fastfetchbot_shared.utils.network import get_response_json + + with patch( + "fastfetchbot_shared.utils.network.get_response", + new_callable=AsyncMock, + side_effect=ConnectionError("timeout"), + ): + result = await get_response_json("https://example.com/api") + + assert result is None + + @pytest.mark.asyncio + async def test_json_decode_error_returns_none(self): + from fastfetchbot_shared.utils.network import get_response_json + + mock_response = MagicMock() + mock_response.json.side_effect = ValueError("invalid json") + + with patch( + "fastfetchbot_shared.utils.network.get_response", + new_callable=AsyncMock, + return_value=mock_response, + ): + result = await get_response_json("https://example.com/api") + + assert result is None