From 82d413165f1cc34f7c1263df4baf78db2090e142 Mon Sep 17 00:00:00 2001 From: aturret Date: Fri, 27 Mar 2026 23:30:18 -0500 Subject: [PATCH 1/6] feat: send bot error message to dev channel instead --- .../core/services/outbox_consumer.py | 19 +---- .../unit/telegram_bot/test_outbox_consumer.py | 81 +++++++++---------- 2 files changed, 44 insertions(+), 56 deletions(-) diff --git a/apps/telegram-bot/core/services/outbox_consumer.py b/apps/telegram-bot/core/services/outbox_consumer.py index 6415235..1ec3991 100644 --- a/apps/telegram-bot/core/services/outbox_consumer.py +++ b/apps/telegram-bot/core/services/outbox_consumer.py @@ -4,7 +4,7 @@ import redis.asyncio as aioredis from core.config import settings -from core.services.message_sender import send_item_message +from core.services.message_sender import send_item_message, send_debug_channel from fastfetchbot_shared.utils.logger import logger _redis: aioredis.Redis | None = None @@ -42,7 +42,9 @@ async def _consume_loop() -> None: if error: logger.warning(f"[{job_id}] Scrape failed: {error}") - await _send_error_to_chat(chat_id, error) + await send_debug_channel( + f"[Scrape Error] job_id={job_id}\nchat_id: {chat_id}\n\n{error}" + ) else: metadata_item = payload.get("metadata_item") if metadata_item and chat_id: @@ -63,19 +65,6 @@ async def _consume_loop() -> None: await asyncio.sleep(1) -async def _send_error_to_chat(chat_id: int | str, error: str) -> None: - """Send an error notification to the user's chat.""" - try: - from core.services.bot_app import application - - await application.bot.send_message( - chat_id=chat_id, - text=f"Sorry, an error occurred while processing your request:\n\n{error}", - ) - except Exception as e: - logger.error(f"Failed to send error message to chat {chat_id}: {e}") - - async def start(bot_id: int) -> None: """Start the outbox consumer as a background asyncio task. diff --git a/tests/unit/telegram_bot/test_outbox_consumer.py b/tests/unit/telegram_bot/test_outbox_consumer.py index d3d1fb4..67f6cd8 100644 --- a/tests/unit/telegram_bot/test_outbox_consumer.py +++ b/tests/unit/telegram_bot/test_outbox_consumer.py @@ -136,7 +136,39 @@ async def brpop_side_effect(*args, **kwargs): class TestConsumeLoopError: @pytest.mark.asyncio - async def test_sends_error_to_chat(self, mock_redis): + async def test_sends_error_to_debug_channel(self, mock_redis): + payload = _make_payload(error="scraper failed", chat_id=99, job_id="j42") + call_count = 0 + + async def brpop_side_effect(*args, **kwargs): + nonlocal call_count + call_count += 1 + if call_count == 1: + return ("scrape:outbox", payload) + raise asyncio.CancelledError() + + mock_redis.brpop = AsyncMock(side_effect=brpop_side_effect) + + with patch( + "core.services.outbox_consumer.aioredis.from_url", + return_value=mock_redis, + ), patch( + "core.services.outbox_consumer.send_debug_channel", + new_callable=AsyncMock, + ) as mock_debug: + from core.services.outbox_consumer import _consume_loop + + await _consume_loop() + + mock_debug.assert_awaited_once() + msg = mock_debug.call_args[0][0] + assert "scraper failed" in msg + assert "j42" in msg + assert "99" in msg + + @pytest.mark.asyncio + async def test_does_not_send_error_to_user_chat(self, mock_redis): + """Scrape errors should NOT notify the user.""" payload = _make_payload(error="scraper failed", chat_id=99) call_count = 0 @@ -153,14 +185,18 @@ async def brpop_side_effect(*args, **kwargs): "core.services.outbox_consumer.aioredis.from_url", return_value=mock_redis, ), patch( - "core.services.outbox_consumer._send_error_to_chat", + "core.services.outbox_consumer.send_debug_channel", new_callable=AsyncMock, - ) as mock_err: + ), patch( + "core.services.outbox_consumer.send_item_message", + new_callable=AsyncMock, + ) as mock_send: from core.services.outbox_consumer import _consume_loop await _consume_loop() - mock_err.assert_awaited_once_with(99, "scraper failed") + # User should receive no message at all + mock_send.assert_not_awaited() # --------------------------------------------------------------------------- @@ -222,43 +258,6 @@ async def brpop_side_effect(*args, **kwargs): assert call_count == 3 # 2 None returns, then cancel -# --------------------------------------------------------------------------- -# _send_error_to_chat -# --------------------------------------------------------------------------- - - -class TestSendErrorToChat: - @pytest.mark.asyncio - async def test_sends_error_message(self): - mock_bot = AsyncMock() - mock_app = MagicMock() - mock_app.bot = mock_bot - - # _send_error_to_chat uses: from core.services.bot_app import application - with patch("core.services.bot_app.application", mock_app): - from core.services.outbox_consumer import _send_error_to_chat - - await _send_error_to_chat(42, "something went wrong") - - mock_bot.send_message.assert_awaited_once() - call_kwargs = mock_bot.send_message.call_args.kwargs - assert call_kwargs["chat_id"] == 42 - assert "something went wrong" in call_kwargs["text"] - - @pytest.mark.asyncio - async def test_handles_send_failure_gracefully(self): - mock_bot = AsyncMock() - mock_bot.send_message = AsyncMock(side_effect=RuntimeError("telegram down")) - mock_app = MagicMock() - mock_app.bot = mock_bot - - with patch("core.services.bot_app.application", mock_app): - from core.services.outbox_consumer import _send_error_to_chat - - # Should not raise - await _send_error_to_chat(42, "error") - - # --------------------------------------------------------------------------- # start / stop # --------------------------------------------------------------------------- From b542a68a29276c4f760883baf282c6ac48603fbc Mon Sep 17 00:00:00 2001 From: aturret Date: Sat, 28 Mar 2026 01:16:32 -0500 Subject: [PATCH 2/6] feat: add exception handling --- apps/api/src/main.py | 21 ++++- .../core/services/message_sender.py | 39 +++++---- apps/telegram-bot/core/webhook/server.py | 33 +++++--- .../fastfetchbot_file_export/pdf_export.py | 22 ++++-- .../fastfetchbot_file_export/transcribe.py | 79 ++++++++++--------- .../video_download.py | 10 +-- .../shared/fastfetchbot_shared/exceptions.py | 26 ++++++ .../services/scrapers/bluesky/scraper.py | 1 + .../services/scrapers/common.py | 5 +- .../services/scrapers/douban/__init__.py | 5 +- .../scrapers/general/firecrawl_client.py | 3 +- .../services/scrapers/general/zyte.py | 3 +- .../services/scrapers/instagram/__init__.py | 4 +- .../services/scrapers/scraper_manager.py | 3 +- .../services/scrapers/threads/__init__.py | 2 +- .../services/scrapers/twitter/__init__.py | 13 ++- .../services/scrapers/weibo/scraper.py | 22 +++--- .../services/scrapers/xiaohongshu/adaptar.py | 21 ++--- .../services/scrapers/zhihu/__init__.py | 38 ++++----- .../services/telegraph/__init__.py | 2 +- .../fastfetchbot_shared/utils/network.py | 19 +++-- 21 files changed, 230 insertions(+), 141 deletions(-) create mode 100644 packages/shared/fastfetchbot_shared/exceptions.py diff --git a/apps/api/src/main.py b/apps/api/src/main.py index dea43d9..70c28f5 100644 --- a/apps/api/src/main.py +++ b/apps/api/src/main.py @@ -1,6 +1,7 @@ import sentry_sdk from fastapi import FastAPI, Request +from fastapi.responses import JSONResponse from contextlib import asynccontextmanager from starlette.middleware.base import BaseHTTPMiddleware @@ -8,6 +9,7 @@ from src.routers import inoreader, scraper_routers, scraper from src.config import settings from fastfetchbot_shared.utils.logger import logger +from fastfetchbot_shared.exceptions import FastFetchBotError SENTRY_DSN = "" @@ -38,12 +40,27 @@ def __init__(self, app): async def dispatch(self, request: Request, call_next): logger.info(f"{request.method} {request.url}") - response = await call_next(request) - return response + try: + response = await call_next(request) + return response + except Exception: + logger.exception(f"Unhandled error during {request.method} {request.url}") + raise def create_app(): fastapi_app = FastAPI(lifespan=lifespan) + + @fastapi_app.exception_handler(FastFetchBotError) + async def fastfetchbot_error_handler(request: Request, exc: FastFetchBotError): + logger.error(f"Domain error on {request.method} {request.url}: {exc}") + return JSONResponse(status_code=502, content={"error": str(exc)}) + + @fastapi_app.exception_handler(Exception) + async def generic_error_handler(request: Request, exc: Exception): + logger.exception(f"Unhandled error on {request.method} {request.url}") + return JSONResponse(status_code=500, content={"error": "Internal server error"}) + fastapi_app.add_middleware(LogMiddleware) fastapi_app.include_router(inoreader.router) fastapi_app.include_router(scraper.router) diff --git a/apps/telegram-bot/core/services/message_sender.py b/apps/telegram-bot/core/services/message_sender.py index 666b568..a2ccc76 100644 --- a/apps/telegram-bot/core/services/message_sender.py +++ b/apps/telegram-bot/core/services/message_sender.py @@ -163,9 +163,8 @@ async def send_item_message( else False, disable_notification=True, ) - except Exception as e: - logger.error(e) - traceback.print_exc() + except Exception: + logger.exception("Failed to send item message") await send_debug_channel(traceback.format_exc()) @@ -234,9 +233,13 @@ async def media_files_packaging(media_files: list, data: dict) -> tuple: "https", ]: # if the url is a http url, download the file file_format = "mp4" if media_item["media_type"] == "video" else None - io_object = await download_file_by_metadata_item( - media_item["url"], data=data, file_format=file_format - ) + try: + io_object = await download_file_by_metadata_item( + media_item["url"], data=data, file_format=file_format + ) + except Exception: + logger.warning(f"Skipping media download: {media_item['url']}") + continue filename = io_object.name file_size = io_object.size else: # if the url is a local file path, just add it to the media group @@ -300,9 +303,13 @@ async def media_files_packaging(media_files: list, data: dict) -> tuple: or img_width > settings.TELEGRAM_IMAGE_DIMENSION_LIMIT or img_height > settings.TELEGRAM_IMAGE_DIMENSION_LIMIT ) and data["category"] not in ["xiaohongshu"]: - io_object = await download_file_by_metadata_item( - url=image_url, data=data - ) + try: + io_object = await download_file_by_metadata_item( + url=image_url, data=data + ) + except Exception: + logger.warning(f"Skipping document download: {image_url}") + continue if not io_object.name.endswith(".gif"): if not io_object.name.endswith(ext.lower()): io_object.name = io_object.name + "." + ext.lower() @@ -312,11 +319,15 @@ async def media_files_packaging(media_files: list, data: dict) -> tuple: ) file_counter += 1 elif media_item["media_type"] == "gif": - io_object = await download_file_by_metadata_item( - url=media_item["url"], - data=data, - file_name="gif_image-" + str(media_counter) + ".gif", - ) + try: + io_object = await download_file_by_metadata_item( + url=media_item["url"], + data=data, + file_name="gif_image-" + str(media_counter) + ".gif", + ) + except Exception: + logger.warning(f"Skipping gif download: {media_item['url']}") + continue io_object.name = io_object.name + ".gif" media_group.append(InputMediaAnimation(io_object)) elif media_item["media_type"] == "video": diff --git a/apps/telegram-bot/core/webhook/server.py b/apps/telegram-bot/core/webhook/server.py index 96ae377..8deb01a 100644 --- a/apps/telegram-bot/core/webhook/server.py +++ b/apps/telegram-bot/core/webhook/server.py @@ -51,24 +51,39 @@ async def lifespan(app): await database.shutdown() +def _log_task_exception(task: asyncio.Task): + """Callback for fire-and-forget tasks to ensure exceptions are logged.""" + if not task.cancelled() and task.exception(): + logger.exception("Unhandled error in background task", exc_info=task.exception()) + + async def telegram_webhook(request: Request): secret = request.headers.get("X-Telegram-Bot-Api-Secret-Token") if secret != settings.TELEGRAM_BOT_SECRET_TOKEN: return JSONResponse({"error": "unauthorized"}, status_code=401) - data = await request.json() + try: + data = await request.json() + except Exception: + logger.exception("Failed to parse webhook request body") + return JSONResponse({"error": "invalid JSON"}, status_code=400) logger.debug(f"Telegram webhook update received: {data.get('update_id', 'unknown')}") - asyncio.create_task(process_telegram_update(data)) + task = asyncio.create_task(process_telegram_update(data)) + task.add_done_callback(_log_task_exception) return JSONResponse({"status": "ok"}) async def send_message_endpoint(request: Request): - data = await request.json() - metadata_item = data["data"] - chat_id = data.get("chat_id") - if isinstance(chat_id, str) and chat_id.startswith("-"): - chat_id = int(chat_id) - await send_item_message(metadata_item, chat_id=chat_id) - return JSONResponse({"status": "ok"}) + try: + data = await request.json() + metadata_item = data["data"] + chat_id = data.get("chat_id") + if isinstance(chat_id, str) and chat_id.startswith("-"): + chat_id = int(chat_id) + await send_item_message(metadata_item, chat_id=chat_id) + return JSONResponse({"status": "ok"}) + except Exception: + logger.exception("Failed to handle send_message request") + return JSONResponse({"error": "Internal server error"}, status_code=500) async def health(request: Request): diff --git a/packages/file-export/fastfetchbot_file_export/pdf_export.py b/packages/file-export/fastfetchbot_file_export/pdf_export.py index e9ec16c..f4a60d6 100644 --- a/packages/file-export/fastfetchbot_file_export/pdf_export.py +++ b/packages/file-export/fastfetchbot_file_export/pdf_export.py @@ -2,6 +2,8 @@ from weasyprint import HTML, CSS from weasyprint.text.fonts import FontConfiguration +from loguru import logger +from fastfetchbot_shared.exceptions import FileExportError CSS_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), "pdf_export.css") @@ -19,7 +21,7 @@ def convert_html_to_pdf( elif html_string: html_item = HTML(string=html_string) else: - raise ValueError("Either html_string or html_file must be provided") + raise FileExportError("Either html_string or html_file must be provided") html_item.write_pdf(output_filename, stylesheets=[css_item]) @@ -30,10 +32,14 @@ def export_pdf( download_dir: str = "/tmp", ) -> str: """Export HTML to PDF and return the output file path.""" - output_path = os.path.join(download_dir, output_filename) - convert_html_to_pdf( - output_filename=output_path, - html_string=html_string, - html_file=html_file, - ) - return output_path + try: + output_path = os.path.join(download_dir, output_filename) + convert_html_to_pdf( + output_filename=output_path, + html_string=html_string, + html_file=html_file, + ) + return output_path + except Exception as e: + logger.exception(f"PDF export failed for {output_filename}") + raise FileExportError(f"PDF export failed for {output_filename}") from e diff --git a/packages/file-export/fastfetchbot_file_export/transcribe.py b/packages/file-export/fastfetchbot_file_export/transcribe.py index 54c81f4..94b6cae 100644 --- a/packages/file-export/fastfetchbot_file_export/transcribe.py +++ b/packages/file-export/fastfetchbot_file_export/transcribe.py @@ -3,6 +3,7 @@ from pydub import AudioSegment from openai import OpenAI from loguru import logger +from fastfetchbot_shared.exceptions import FileExportError TRANSCRIBE_MODEL = "whisper-1" SEGMENT_LENGTH = 5 * 60 # 5 minutes in seconds @@ -74,40 +75,44 @@ def get_audio_text(audio_file: str, openai_api_key: str) -> str: Returns formatted string with summary and full transcript. """ - client = OpenAI(api_key=openai_api_key) - transcript = "" - AudioSegment.converter = "ffmpeg" - audio_file_non_ext, audio_file_ext = os.path.splitext(audio_file) - ext = audio_file_ext.lstrip(".") - audio_item = AudioSegment.from_file(audio_file, ext) - start_trim = milliseconds_until_sound(audio_item) - audio_item = audio_item[start_trim:] - audio_length = int(audio_item.duration_seconds) + 1 - - for index, i in enumerate(range(0, audio_length * 1000, SEGMENT_LENGTH * 1000)): - start_time = i - end_time = i + SEGMENT_LENGTH * 1000 - if end_time >= audio_length * 1000: - audio_segment = audio_item[start_time:] - else: - audio_segment = audio_item[start_time:end_time] - - segment_path = f"{audio_file_non_ext}-{index + 1}{audio_file_ext}" - audio_segment.export(segment_path) - logger.info(f"audio_segment_path: {segment_path}") - - with open(segment_path, "rb") as f: - result = client.audio.transcriptions.create( - model=TRANSCRIBE_MODEL, file=f - ) - transcript += result.text - - os.remove(segment_path) - - transcript = punctuation_assistant(client, transcript) - transcript = ( - f"全文总结:\n{summary_assistant(client, transcript)}\n原文:\n{transcript}" - ) - logger.info(f"transcript: {transcript}") - os.remove(audio_file) - return transcript + try: + client = OpenAI(api_key=openai_api_key) + transcript = "" + AudioSegment.converter = "ffmpeg" + audio_file_non_ext, audio_file_ext = os.path.splitext(audio_file) + ext = audio_file_ext.lstrip(".") + audio_item = AudioSegment.from_file(audio_file, ext) + start_trim = milliseconds_until_sound(audio_item) + audio_item = audio_item[start_trim:] + audio_length = int(audio_item.duration_seconds) + 1 + + for index, i in enumerate(range(0, audio_length * 1000, SEGMENT_LENGTH * 1000)): + start_time = i + end_time = i + SEGMENT_LENGTH * 1000 + if end_time >= audio_length * 1000: + audio_segment = audio_item[start_time:] + else: + audio_segment = audio_item[start_time:end_time] + + segment_path = f"{audio_file_non_ext}-{index + 1}{audio_file_ext}" + audio_segment.export(segment_path) + logger.info(f"audio_segment_path: {segment_path}") + + with open(segment_path, "rb") as f: + result = client.audio.transcriptions.create( + model=TRANSCRIBE_MODEL, file=f + ) + transcript += result.text + + os.remove(segment_path) + + transcript = punctuation_assistant(client, transcript) + transcript = ( + f"全文总结:\n{summary_assistant(client, transcript)}\n原文:\n{transcript}" + ) + logger.info(f"transcript: {transcript}") + os.remove(audio_file) + return transcript + except Exception as e: + logger.exception(f"Audio transcription failed for {audio_file}") + raise FileExportError(f"Audio transcription failed for {audio_file}") from e diff --git a/packages/file-export/fastfetchbot_file_export/video_download.py b/packages/file-export/fastfetchbot_file_export/video_download.py index e70cb73..45b4b4f 100644 --- a/packages/file-export/fastfetchbot_file_export/video_download.py +++ b/packages/file-export/fastfetchbot_file_export/video_download.py @@ -1,8 +1,8 @@ import os -import traceback from loguru import logger from yt_dlp import YoutubeDL +from fastfetchbot_shared.exceptions import FileExportError def get_video_orientation(content_info: dict, extractor: str) -> str: @@ -39,7 +39,7 @@ def get_format_for_orientation( if hd and bilibili_cookie: return "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best" return "bv*[height<=480]+ba/b[height<=480] / wv*+ba/w" - raise ValueError("no available extractor found") + raise FileExportError("no available extractor found") def init_yt_downloader( @@ -201,6 +201,6 @@ def download_video( "orientation": orientation, "file_path": file_path_output, } - except Exception: - logger.exception(f"download_video failed: url={url}\n{traceback.format_exc()}") - raise + except Exception as e: + logger.exception(f"download_video failed: url={url}") + raise FileExportError(f"download_video failed: url={url}") from e diff --git a/packages/shared/fastfetchbot_shared/exceptions.py b/packages/shared/fastfetchbot_shared/exceptions.py new file mode 100644 index 0000000..72777be --- /dev/null +++ b/packages/shared/fastfetchbot_shared/exceptions.py @@ -0,0 +1,26 @@ +class FastFetchBotError(Exception): + """Base exception for all FastFetchBot domain errors.""" + + +class ScraperError(FastFetchBotError): + """Error during scraping.""" + + +class ScraperNetworkError(ScraperError): + """Network/connection error during scraping.""" + + +class ScraperParseError(ScraperError): + """Failed to parse scraped content.""" + + +class TelegraphPublishError(FastFetchBotError): + """Telegraph publishing failed.""" + + +class FileExportError(FastFetchBotError): + """File export (PDF, video, audio) failed.""" + + +class ExternalServiceError(FastFetchBotError): + """External service call failed (OpenAI, Inoreader, etc.).""" diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/bluesky/scraper.py b/packages/shared/fastfetchbot_shared/services/scrapers/bluesky/scraper.py index 2874fe1..8c3d163 100644 --- a/packages/shared/fastfetchbot_shared/services/scrapers/bluesky/scraper.py +++ b/packages/shared/fastfetchbot_shared/services/scrapers/bluesky/scraper.py @@ -189,3 +189,4 @@ async def _request_post_data(self, bluesky_post: BlueskyPost) -> ThreadViewPost: return post_thread_data.thread except Exception as e: logger.error(f"Error while getting post data: {e}") + raise diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/common.py b/packages/shared/fastfetchbot_shared/services/scrapers/common.py index 0e94809..f4e6e4f 100644 --- a/packages/shared/fastfetchbot_shared/services/scrapers/common.py +++ b/packages/shared/fastfetchbot_shared/services/scrapers/common.py @@ -1,6 +1,7 @@ from typing import Optional, Any from fastfetchbot_shared.models.url_metadata import UrlMetadata +from fastfetchbot_shared.exceptions import ScraperError from fastfetchbot_shared.services.scrapers import ( twitter, wechat, @@ -75,7 +76,7 @@ def _resolve_scraper_class(self, category: str): return self.service_classes[category] if category in ("youtube", "bilibili"): return self._get_video_downloader() - raise KeyError(f"No scraper registered for category: {category}") + raise ScraperError(f"No scraper registered for category: {category}") async def get_item(self, metadata_item: Optional[dict] = None) -> dict: if not metadata_item: @@ -92,7 +93,7 @@ async def get_item(self, metadata_item: Optional[dict] = None) -> dict: metadata_item = await scraper_item.get_item() except Exception as e: logger.error(f"Error while getting item: {e}") - raise e + raise logger.info(f"Got metadata item") logger.debug(metadata_item) metadata_item = await self.process_item(metadata_item) diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/douban/__init__.py b/packages/shared/fastfetchbot_shared/services/scrapers/douban/__init__.py index a273294..ee9a083 100644 --- a/packages/shared/fastfetchbot_shared/services/scrapers/douban/__init__.py +++ b/packages/shared/fastfetchbot_shared/services/scrapers/douban/__init__.py @@ -8,6 +8,7 @@ from fastfetchbot_shared.utils.parse import get_html_text_length, wrap_text_into_html from fastfetchbot_shared.utils.network import get_selector, HEADERS +from fastfetchbot_shared.utils.logger import logger from fastfetchbot_shared.models.metadata_item import MetadataItem, MediaFile, MessageType from fastfetchbot_shared.services.scrapers.config import JINJA2_ENV @@ -220,10 +221,10 @@ def _douban_short_text_process(self) -> str: @staticmethod def raw_content_to_html(raw_content: str) -> str: # Split the text into paragraphs based on double newlines - print(raw_content) + logger.debug(raw_content) paragraphs = raw_content.split('
\n') # Wrap each paragraph with

tags - print(paragraphs) + logger.debug(paragraphs) html_paragraphs = [f'

{paragraph.strip()}

' for paragraph in paragraphs] # Join the paragraphs to form the final HTML string html_string = ''.join(html_paragraphs) diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/general/firecrawl_client.py b/packages/shared/fastfetchbot_shared/services/scrapers/general/firecrawl_client.py index 3c7e8ea..467fe9a 100644 --- a/packages/shared/fastfetchbot_shared/services/scrapers/general/firecrawl_client.py +++ b/packages/shared/fastfetchbot_shared/services/scrapers/general/firecrawl_client.py @@ -7,6 +7,7 @@ from firecrawl import AsyncFirecrawl from fastfetchbot_shared.services.scrapers.config import settings +from fastfetchbot_shared.exceptions import ExternalServiceError @dataclass(frozen=True) @@ -93,4 +94,4 @@ async def scrape_url( ) return result.model_dump(exclude_none=True) except Exception as e: - raise RuntimeError(f"Firecrawl scrape_url failed: url={url}") from e + raise ExternalServiceError(f"Firecrawl scrape_url failed: url={url}") from e diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/general/zyte.py b/packages/shared/fastfetchbot_shared/services/scrapers/general/zyte.py index cabf2c3..d011d6f 100644 --- a/packages/shared/fastfetchbot_shared/services/scrapers/general/zyte.py +++ b/packages/shared/fastfetchbot_shared/services/scrapers/general/zyte.py @@ -4,6 +4,7 @@ from fastfetchbot_shared.services.scrapers.general.base import BaseGeneralDataProcessor, BaseGeneralScraper from fastfetchbot_shared.services.scrapers.scraper import DataProcessor from fastfetchbot_shared.utils.logger import logger +from fastfetchbot_shared.exceptions import ExternalServiceError class ZyteDataProcessor(BaseGeneralDataProcessor): @@ -17,7 +18,7 @@ def __init__(self, url: str): async def _get_page_content(self) -> None: if not settings.ZYTE_API_KEY: - raise RuntimeError("ZYTE_API_KEY is not configured") + raise ExternalServiceError("ZYTE_API_KEY is not configured") try: client = AsyncZyteAPI(api_key=settings.ZYTE_API_KEY) diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/instagram/__init__.py b/packages/shared/fastfetchbot_shared/services/scrapers/instagram/__init__.py index 52bf256..ae99e0c 100644 --- a/packages/shared/fastfetchbot_shared/services/scrapers/instagram/__init__.py +++ b/packages/shared/fastfetchbot_shared/services/scrapers/instagram/__init__.py @@ -81,10 +81,10 @@ async def _get_post_info(self) -> dict: and "status" in ins_data and ins_data["status"] is False ): - print("get_ins_post_item error: ", self.scraper) + logger.warning(f"get_ins_post_item error: {self.scraper}") continue elif type(ins_data) == str and "400" in ins_data: - print("get_ins_post_item error: ", self.scraper, ins_data) + logger.warning(f"get_ins_post_item error: {self.scraper} {ins_data}") continue if ( self.scraper == "looter2" diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/scraper_manager.py b/packages/shared/fastfetchbot_shared/services/scrapers/scraper_manager.py index b1abed5..fbc5183 100644 --- a/packages/shared/fastfetchbot_shared/services/scrapers/scraper_manager.py +++ b/packages/shared/fastfetchbot_shared/services/scrapers/scraper_manager.py @@ -1,6 +1,7 @@ from typing import Optional from fastfetchbot_shared.utils.logger import logger +from fastfetchbot_shared.exceptions import ScraperError from fastfetchbot_shared.services.scrapers.bluesky.scraper import BlueskyScraper from fastfetchbot_shared.services.scrapers.weibo.scraper import WeiboScraper from fastfetchbot_shared.services.scrapers.general.scraper import GeneralScraper @@ -40,7 +41,7 @@ async def init_scraper(cls, category: str) -> None: cls.scrapers["unknown"] = scraper else: logger.error(f"Scraper {category} is not supported") - raise ValueError(f"Scraper {category} is not supported") + raise ScraperError(f"Scraper {category} is not supported") @classmethod async def init_bluesky_scraper(cls) -> BlueskyScraper: diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/threads/__init__.py b/packages/shared/fastfetchbot_shared/services/scrapers/threads/__init__.py index a56b562..69be60a 100644 --- a/packages/shared/fastfetchbot_shared/services/scrapers/threads/__init__.py +++ b/packages/shared/fastfetchbot_shared/services/scrapers/threads/__init__.py @@ -97,7 +97,7 @@ async def intercept_response(response): for xhr in gql_calls: text = await xhr.text() data = json.loads(text) - print(json.dumps(data, indent=4, ensure_ascii=False)) + logger.debug(json.dumps(data, indent=4, ensure_ascii=False)) threads = data["data"]["data"]["containing_thread"]["thread_items"] for thread in threads: thread_data["threads"].append(self.parse_single_threads_data(thread["post"])) diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/twitter/__init__.py b/packages/shared/fastfetchbot_shared/services/scrapers/twitter/__init__.py index 9a38090..4c5b280 100644 --- a/packages/shared/fastfetchbot_shared/services/scrapers/twitter/__init__.py +++ b/packages/shared/fastfetchbot_shared/services/scrapers/twitter/__init__.py @@ -1,6 +1,5 @@ # TODO: https://rapidapi.com/Glavier/api/twitter135 import asyncio -import traceback from urllib.parse import urlparse from typing import Dict, List, Optional, Any, Tuple @@ -9,6 +8,7 @@ from fastfetchbot_shared.models.metadata_item import MetadataItem, MediaFile, MessageType from fastfetchbot_shared.utils.parse import get_html_text_length, wrap_text_into_html +from fastfetchbot_shared.exceptions import ScraperError, ScraperParseError from twitter.scraper import Scraper from .config import ( ALL_SCRAPER, @@ -73,11 +73,10 @@ async def _get_response_tweet_data(self) -> Dict: elif self.scraper == "api-client": tweet_data = await self._api_client_get_response_tweet_data() return tweet_data - except Exception as e: - logger.error(e) - traceback.print_exc() + except Exception: + logger.exception(f"Twitter scraper {self.scraper} failed") continue - raise Exception("No valid response from all Twitter scrapers") + raise ScraperError("No valid response from all Twitter scrapers") async def _rapidapi_get_response_tweet_data(self) -> Dict: async with httpx.AsyncClient() as client: @@ -94,11 +93,11 @@ async def _rapidapi_get_response_tweet_data(self) -> Dict: type(tweet_data) == str and ("400" in tweet_data or "429" in tweet_data) ): - raise Exception("Invalid response from Twitter API") + raise ScraperParseError("Invalid response from Twitter API") else: return tweet_data else: - raise Exception("Invalid response from Twitter API") + raise ScraperParseError("Invalid response from Twitter API") async def _api_client_get_response_tweet_data(self) -> Dict: scraper = Scraper( diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/weibo/scraper.py b/packages/shared/fastfetchbot_shared/services/scrapers/weibo/scraper.py index a0dd89e..acbf414 100644 --- a/packages/shared/fastfetchbot_shared/services/scrapers/weibo/scraper.py +++ b/packages/shared/fastfetchbot_shared/services/scrapers/weibo/scraper.py @@ -8,6 +8,7 @@ from lxml import html from fastfetchbot_shared.models.metadata_item import MetadataItem, MediaFile, MessageType +from fastfetchbot_shared.exceptions import ScraperError, ScraperNetworkError, ScraperParseError from fastfetchbot_shared.services.scrapers.scraper import Scraper, DataProcessor from fastfetchbot_shared.services.scrapers.weibo import Weibo from fastfetchbot_shared.utils.network import get_response_json, get_random_user_agent @@ -61,19 +62,20 @@ async def _get_weibo(self) -> None: # Priority 1: API with SUB cookie and isGetLongText=true try: weibo_info = await self._get_weibo_info(method="api") - except ConnectionError as e: + except ScraperNetworkError as e: logger.warning(f"Failed to get weibo info by api: {e}") # Priority 2: webpage scraping if weibo_info is None: try: weibo_info = await self._get_weibo_info(method="webpage") - except ConnectionError as e: + except ScraperNetworkError as e: logger.error(f"All weibo fetch methods failed. Last error: {e}") raise try: await self._process_weibo_item(weibo_info) except Exception as e: logger.error(f"Failed to process weibo item: {e}") + raise async def _get_weibo_info(self, method=None) -> dict: try: @@ -84,11 +86,11 @@ async def _get_weibo_info(self, method=None) -> dict: elif method == "api": weibo_info = await self._get_weibo_info_api() else: - raise ValueError("method must be webpage or api") + raise ScraperError("method must be webpage or api") weibo_info = self._parse_weibo_info(weibo_info) return weibo_info - except ConnectionError as e: - raise ConnectionError(f"There are some network issues: {e}") + except (ConnectionError, ScraperNetworkError) as e: + raise ScraperNetworkError(f"There are some network issues: {e}") from e async def _get_weibo_info_webpage(self) -> dict: url = WEIBO_WEB_HOST + self.id @@ -125,11 +127,13 @@ async def _get_weibo_info_api(self) -> dict: response.raise_for_status() ajax_json = response.json() if not ajax_json or ajax_json.get("ok") == 0: - raise ConnectionError("API returned ok=0 or empty response") + raise ScraperParseError("Weibo API returned ok=0 or empty response") logger.debug(f"weibo info by api: {ajax_json}") return ajax_json + except (ScraperParseError, ScraperNetworkError): + raise except Exception as e: - raise ConnectionError(f"Failed to get weibo info by api: {e}") + raise ScraperNetworkError(f"Failed to get weibo info by api: {e}") from e async def _get_long_weibo_info_api(self) -> dict: ajax_json = await get_response_json( @@ -173,7 +177,7 @@ async def _process_weibo_item(self, weibo_info: dict) -> None: # raise Exception("Still a long text weibo, should go long text api.") text = longtext_info.get("text") if not text: - raise Exception( + raise ScraperParseError( "Failed to get longtext of weibo by webpage scraping." ) except Exception as e: @@ -466,7 +470,7 @@ def _weibo_html_text_clean(text, method="bs4"): elif method == "lxml": return WeiboDataProcessor._weibo_html_text_clean_lxml(text) else: - raise ValueError("method must be bs4 or lxml") + raise ScraperError("method must be bs4 or lxml") @staticmethod def _weibo_html_text_clean_bs4(text): diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/xiaohongshu/adaptar.py b/packages/shared/fastfetchbot_shared/services/scrapers/xiaohongshu/adaptar.py index 900bc52..3a28e61 100644 --- a/packages/shared/fastfetchbot_shared/services/scrapers/xiaohongshu/adaptar.py +++ b/packages/shared/fastfetchbot_shared/services/scrapers/xiaohongshu/adaptar.py @@ -9,6 +9,7 @@ from fastfetchbot_shared.config import settings as shared_settings from fastfetchbot_shared.utils.logger import logger +from fastfetchbot_shared.exceptions import ScraperError, ScraperParseError, ExternalServiceError XHS_API_URL = "https://edith.xiaohongshu.com" XHS_WEB_URL = "https://www.xiaohongshu.com" @@ -22,10 +23,10 @@ def parse_xhs_note_url(note_url: str) -> Dict[str, str]: query = dict(parse_qsl(parsed.query, keep_blank_values=True)) path_parts = [part for part in parsed.path.split("/") if part] if not path_parts: - raise ValueError(f"Invalid XHS note URL: {note_url}") + raise ScraperParseError(f"Invalid XHS note URL: {note_url}") note_id = path_parts[-1] if note_id in {"explore", "discovery", "item"}: - raise ValueError(f"Invalid XHS note URL path: {note_url}") + raise ScraperParseError(f"Invalid XHS note URL path: {note_url}") return { "note_id": note_id, "xsec_token": query.get("xsec_token", ""), @@ -52,7 +53,7 @@ def __init__( self.cookies = cookies.strip() self.sign_server_endpoint = (sign_server_endpoint or shared_settings.SIGN_SERVER_URL).rstrip("/") if not self.sign_server_endpoint: - raise ValueError( + raise ExternalServiceError( "XhsSinglePostAdapter requires a sign server URL. " "Set shared_settings.SIGN_SERVER_URL in the environment or pass sign_server_endpoint explicitly." ) @@ -100,12 +101,12 @@ async def _sign_headers(self, uri: str, data: Optional[Any] = None) -> Dict[str, resp.raise_for_status() body = resp.json() if not body.get("isok"): - raise RuntimeError(f"XHS sign server returned error: {body}") + raise ExternalServiceError(f"XHS sign server returned error: {body}") sign = body.get("data", {}) or {} required = ["x_s", "x_t", "x_s_common", "x_b3_traceid"] missing = [key for key in required if key not in sign] if missing: - raise RuntimeError(f"XHS sign response missing fields: {missing}") + raise ExternalServiceError(f"XHS sign response missing fields: {missing}") headers = self._base_headers() headers.update( { @@ -147,7 +148,7 @@ async def fetch_post( xsec_source=url_info["xsec_source"], ) if note is None: - raise RuntimeError(f"Cannot fetch note detail from API or HTML: {url_info['note_id']}") + raise ScraperError(f"Cannot fetch note detail from API or HTML: {url_info['note_id']}") result: Dict[str, Any] = {"platform": "xhs", "note": note, "comments": [], "url": get_pure_url(note_url)} if with_comments: @@ -191,18 +192,18 @@ def _parse_api_response(resp: httpx.Response) -> Dict[str, Any]: try: body = resp.json() except json.JSONDecodeError as exc: - raise RuntimeError(f"XHS API returned non-JSON: status={resp.status_code}") from exc + raise ScraperParseError(f"XHS API returned non-JSON: status={resp.status_code}") from exc if resp.status_code in (461, 471): verify_type = resp.headers.get("Verifytype", "") verify_uuid = resp.headers.get("Verifyuuid", "") - raise RuntimeError( + raise ScraperError( f"XHS blocked by captcha: verify_type={verify_type}, verify_uuid={verify_uuid}" ) if body.get("success"): return body.get("data", {}) or {} - raise RuntimeError(f"XHS API error: status={resp.status_code}, body={body}") + raise ScraperParseError(f"XHS API error: status={resp.status_code}, body={body}") async def _fetch_note_by_api( self, @@ -536,7 +537,7 @@ async def _get_redirection_url(self, note_url: str) -> str: final_url = str(resp.url) if XHS_WEB_URL not in final_url and "xiaohongshu.com" not in final_url: - raise RuntimeError( + raise ScraperError( f"Short URL did not redirect to xiaohongshu.com: {note_url} -> {final_url}" ) return final_url diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/zhihu/__init__.py b/packages/shared/fastfetchbot_shared/services/scrapers/zhihu/__init__.py index 96c3da5..f616499 100644 --- a/packages/shared/fastfetchbot_shared/services/scrapers/zhihu/__init__.py +++ b/packages/shared/fastfetchbot_shared/services/scrapers/zhihu/__init__.py @@ -1,6 +1,5 @@ import json import re -import traceback from typing import Dict, Optional, Any from urllib.parse import urlparse @@ -18,6 +17,7 @@ from fastfetchbot_shared.utils.network import get_selector, get_redirect_url, get_response_json, get_random_user_agent, \ get_content_async, get_response from fastfetchbot_shared.models.metadata_item import MetadataItem, MediaFile, MessageType +from fastfetchbot_shared.exceptions import ScraperError, ScraperParseError from fastfetchbot_shared.services.scrapers.config import settings, JINJA2_ENV from .config import ( SHORT_LIMIT, @@ -176,12 +176,12 @@ async def _get_zhihu_item(self) -> None: if self.title != "": break except Exception as e: - traceback.print_exc() + logger.exception(f"Zhihu scraper method {method} failed") if method == ALL_METHODS[-1]: - print("all methods failed") - raise e + logger.error("All Zhihu scraper methods failed") + raise else: - print( + logger.warning( f"zhihu {self.zhihu_type} {self.method} failed, try the next method" ) continue @@ -299,7 +299,7 @@ async def _get_zhihu_answer(self) -> None: answer_data = _parse_answer_api_json_data(json_data) logger.debug(f"answer data: {answer_data}") except Exception as e: - raise Exception("Cannot get the answer by API") + raise ScraperParseError("Cannot get the answer by API") from e elif self.method == "fxzhihu": try: resp = await get_response(url=self.request_url, headers=self.headers, client=self.httpx_client) @@ -308,7 +308,7 @@ async def _get_zhihu_answer(self) -> None: answer_data = _parse_answer_api_json_data(json_data) logger.debug(f"answer data: {answer_data}") except Exception as e: - raise Exception("Cannot get the answer by fxzhihu, error: " + str(e)) + raise ScraperError("Cannot get the answer by fxzhihu, error: " + str(e)) from e elif self.method == "json": try: selector = await get_selector(self.request_url, headers=self.headers) @@ -317,9 +317,9 @@ async def _get_zhihu_answer(self) -> None: json_data = json_data["initialState"]["entities"] answer_data = self._parse_answer_json_data(json_data) except Exception as e: - raise Exception("Cannot get the selector") + raise ScraperParseError("Cannot get the selector") from e if answer_data == {}: - raise Exception("Cannot get the answer") + raise ScraperError("Cannot get the answer") self._resolve_answer_json_data(answer_data) # Apply FxZhihu-style content processing for api method if self.method == "api": @@ -350,11 +350,11 @@ async def _get_zhihu_answer(self) -> None: if self.author_url == "https://www.zhihu.com/people/": self.author_url = "" except Exception as e: - raise Exception("Cannot get the answer") + raise ScraperError("Cannot get the answer") from e if ( self.title == "" ): # TODO: this is not a good way to check if the scraping is successful. To be improved. - raise Exception("Cannot get the answer") + raise ScraperError("Cannot get the answer") async def _get_zhihu_status(self): """ @@ -390,8 +390,8 @@ async def _get_zhihu_status(self): else: try: selector = await get_selector(self.request_url, headers=self.headers) - except: - raise Exception("zhihu request failed") + except Exception as e: + raise ScraperError("zhihu request failed") from e if self.method == "json": def _process_picture(pictures, content_attr): if not hasattr(self, content_attr): @@ -516,8 +516,8 @@ def _process_picture(pictures, content_attr): self.retweet_html = str( html.tostring(pic_html, pretty_print=True) ).replace("b' Dict: "origin_pin_comment_count": pins."{self.status_id}".originPin.commentCount }}""" result = jmespath.search(expression, data) - print(result) + logger.debug(result) author_url_token = result["author_url_token"] expression = f"""{{ "author": users."{author_url_token}".name diff --git a/packages/shared/fastfetchbot_shared/services/telegraph/__init__.py b/packages/shared/fastfetchbot_shared/services/telegraph/__init__.py index 6921465..be4eb12 100644 --- a/packages/shared/fastfetchbot_shared/services/telegraph/__init__.py +++ b/packages/shared/fastfetchbot_shared/services/telegraph/__init__.py @@ -70,5 +70,5 @@ async def get_telegraph(self, upload_images: bool = True) -> str: telegraph_url = telegraph_post["url"] return telegraph_url except Exception as e: - logger.error("Telegraph upload failed", exc_info=e) + logger.error("Telegraph upload failed", exc_info=True) return "" diff --git a/packages/shared/fastfetchbot_shared/utils/network.py b/packages/shared/fastfetchbot_shared/utils/network.py index e8b5e48..1e3e0e0 100644 --- a/packages/shared/fastfetchbot_shared/utils/network.py +++ b/packages/shared/fastfetchbot_shared/utils/network.py @@ -5,7 +5,6 @@ import aiofiles import httpx -import traceback from lxml import etree from fake_useragent import UserAgent @@ -39,8 +38,8 @@ async def get_response_json(url: str, headers=None, client: httpx.AsyncClient = try: response = await get_response(url, headers=headers, client=client) json_result = response.json() - except Exception as e: - print(e, traceback.format_exc()) + except Exception: + logger.exception(f"Failed to get JSON response from {url}") json_result = None return json_result @@ -67,16 +66,16 @@ async def get_selector( if ( resp.history ): # if there is a redirect, the request will have a response chain - print("Request was redirected") + logger.debug("Request was redirected") for h in resp.history: - print(h.status_code, h.url) + logger.debug(f"Redirect: {h.status_code} {h.url}") # if code is 302, do not follow the redirect if h.status_code == 302: selector = await get_selector( h.url, headers=headers, follow_redirects=False ) return selector - print("Final destination:", resp.status_code, resp.url) + logger.debug(f"Final destination: {resp.status_code} {resp.url}") selector = etree.HTML(resp.text) # the content of the final destination return selector @@ -114,7 +113,7 @@ async def wait_for_network_idle(): async with page.expect_response("**/api/content") as response_info: response = await response_info.value if response.status == 200: - print("Content loaded") + logger.debug("Content loaded") await page.goto(url) await wait_for_network_idle() @@ -160,9 +159,9 @@ async def download_file_by_metadata_item( file_name = "media-" + str(uuid.uuid1())[:8] + "." + file_format io_object = NamedBytesIO(file_data, name=file_name) return io_object - except Exception as e: - await asyncio.sleep(2) - logger.error(f"Failed to download {url}, {e}") + except Exception: + logger.exception(f"Failed to download {url}") + raise async def download_file_to_local( From a694ac3e49c8f24766d7eb13858f22b2bda8e05a Mon Sep 17 00:00:00 2001 From: aturret Date: Sat, 28 Mar 2026 01:16:41 -0500 Subject: [PATCH 3/6] feat: add tests for exception handlings --- .../file_export/test_pdf_export_exceptions.py | 52 ++++++++++ .../file_export/test_transcribe_exceptions.py | 31 ++++++ tests/unit/file_export/test_video_download.py | 48 +++++++++ tests/unit/scrapers/test_bluesky.py | 6 +- tests/unit/scrapers/test_common.py | 17 ++++ tests/unit/scrapers/test_general_firecrawl.py | 3 +- tests/unit/scrapers/test_general_zyte.py | 3 +- tests/unit/scrapers/test_scraper_manager.py | 5 +- .../unit/scrapers/test_twitter_exceptions.py | 97 +++++++++++++++++++ tests/unit/scrapers/test_weibo.py | 25 ++--- tests/unit/scrapers/test_xiaohongshu.py | 29 +++--- .../unit/telegram_bot/test_webhook_server.py | 94 ++++++++++++++++++ tests/unit/test_api_exception_handlers.py | 95 ++++++++++++++++++ tests/unit/test_exceptions.py | 82 ++++++++++++++++ 14 files changed, 554 insertions(+), 33 deletions(-) create mode 100644 tests/unit/file_export/test_pdf_export_exceptions.py create mode 100644 tests/unit/file_export/test_transcribe_exceptions.py create mode 100644 tests/unit/scrapers/test_twitter_exceptions.py create mode 100644 tests/unit/telegram_bot/test_webhook_server.py create mode 100644 tests/unit/test_api_exception_handlers.py create mode 100644 tests/unit/test_exceptions.py diff --git a/tests/unit/file_export/test_pdf_export_exceptions.py b/tests/unit/file_export/test_pdf_export_exceptions.py new file mode 100644 index 0000000..878e582 --- /dev/null +++ b/tests/unit/file_export/test_pdf_export_exceptions.py @@ -0,0 +1,52 @@ +"""Tests for exception handling in packages/file-export/fastfetchbot_file_export/pdf_export.py""" + +import sys +from unittest.mock import patch, MagicMock + +import pytest + +from fastfetchbot_shared.exceptions import FileExportError + + +@pytest.fixture(autouse=True) +def mock_weasyprint(monkeypatch): + """Mock weasyprint to avoid native library dependency.""" + mock_module = MagicMock() + monkeypatch.setitem(sys.modules, "weasyprint", mock_module) + monkeypatch.setitem(sys.modules, "weasyprint.text", MagicMock()) + monkeypatch.setitem(sys.modules, "weasyprint.text.fonts", MagicMock()) + # Force reimport of pdf_export with mocked weasyprint + if "fastfetchbot_file_export.pdf_export" in sys.modules: + del sys.modules["fastfetchbot_file_export.pdf_export"] + + +class TestExportPdfExceptionHandling: + def test_raises_file_export_error_on_failure(self): + from fastfetchbot_file_export.pdf_export import export_pdf + + with patch( + "fastfetchbot_file_export.pdf_export.convert_html_to_pdf", + side_effect=RuntimeError("WeasyPrint crashed"), + ): + with pytest.raises(FileExportError, match="PDF export failed"): + export_pdf(html_string="

test

", output_filename="test.pdf") + + def test_preserves_original_cause(self): + from fastfetchbot_file_export.pdf_export import export_pdf + + original = OSError("disk full") + with patch( + "fastfetchbot_file_export.pdf_export.convert_html_to_pdf", + side_effect=original, + ): + with pytest.raises(FileExportError) as exc_info: + export_pdf(html_string="

x

", output_filename="out.pdf") + assert exc_info.value.__cause__ is original + + +class TestConvertHtmlToPdf: + def test_no_input_raises_file_export_error(self): + from fastfetchbot_file_export.pdf_export import convert_html_to_pdf + + with pytest.raises(FileExportError, match="Either html_string or html_file"): + convert_html_to_pdf("output.pdf") diff --git a/tests/unit/file_export/test_transcribe_exceptions.py b/tests/unit/file_export/test_transcribe_exceptions.py new file mode 100644 index 0000000..533be53 --- /dev/null +++ b/tests/unit/file_export/test_transcribe_exceptions.py @@ -0,0 +1,31 @@ +"""Tests for exception handling in packages/file-export/fastfetchbot_file_export/transcribe.py""" + +from unittest.mock import patch, MagicMock + +import pytest + +from fastfetchbot_shared.exceptions import FileExportError + + +class TestGetAudioTextExceptionHandling: + def test_raises_file_export_error_on_failure(self): + from fastfetchbot_file_export.transcribe import get_audio_text + + with patch( + "fastfetchbot_file_export.transcribe.OpenAI", + side_effect=RuntimeError("API key invalid"), + ): + with pytest.raises(FileExportError, match="Audio transcription failed"): + get_audio_text("/tmp/nonexistent.mp3", "fake-key") + + def test_preserves_original_cause(self): + from fastfetchbot_file_export.transcribe import get_audio_text + + original = RuntimeError("connection refused") + with patch( + "fastfetchbot_file_export.transcribe.OpenAI", + side_effect=original, + ): + with pytest.raises(FileExportError) as exc_info: + get_audio_text("/tmp/test.mp3", "key") + assert exc_info.value.__cause__ is original diff --git a/tests/unit/file_export/test_video_download.py b/tests/unit/file_export/test_video_download.py index 92099f3..b4c0842 100644 --- a/tests/unit/file_export/test_video_download.py +++ b/tests/unit/file_export/test_video_download.py @@ -426,3 +426,51 @@ def test_no_media_file_when_download_false(self, mock_celery): } vd._video_info_formatting(meta_info) assert vd.media_files == [] + + +# --------------------------------------------------------------------------- +# Exception handling in fastfetchbot_file_export.video_download +# --------------------------------------------------------------------------- + + +class TestVideoDownloadExceptions: + def test_unsupported_extractor_raises_file_export_error(self): + from fastfetchbot_file_export.video_download import get_format_for_orientation + from fastfetchbot_shared.exceptions import FileExportError + + with pytest.raises(FileExportError, match="no available extractor found"): + get_format_for_orientation("tiktok", "horizontal", False) + + def test_download_video_wraps_errors_in_file_export_error(self): + from fastfetchbot_file_export.video_download import download_video + from fastfetchbot_shared.exceptions import FileExportError + + with patch( + "fastfetchbot_file_export.video_download.init_yt_downloader" + ) as mock_init: + mock_ydl = MagicMock() + mock_ydl.__enter__ = MagicMock(return_value=mock_ydl) + mock_ydl.__exit__ = MagicMock(return_value=False) + mock_ydl.extract_info.side_effect = RuntimeError("network error") + mock_init.return_value = mock_ydl + + with pytest.raises(FileExportError, match="download_video failed"): + download_video("https://youtube.com/watch?v=test", download=True) + + def test_download_video_preserves_original_cause(self): + from fastfetchbot_file_export.video_download import download_video + from fastfetchbot_shared.exceptions import FileExportError + + original = RuntimeError("connection reset") + with patch( + "fastfetchbot_file_export.video_download.init_yt_downloader" + ) as mock_init: + mock_ydl = MagicMock() + mock_ydl.__enter__ = MagicMock(return_value=mock_ydl) + mock_ydl.__exit__ = MagicMock(return_value=False) + mock_ydl.extract_info.side_effect = original + mock_init.return_value = mock_ydl + + with pytest.raises(FileExportError) as exc_info: + download_video("https://youtube.com/watch?v=x", download=True) + assert exc_info.value.__cause__ is original diff --git a/tests/unit/scrapers/test_bluesky.py b/tests/unit/scrapers/test_bluesky.py index a5cbf1c..f0081d7 100644 --- a/tests/unit/scrapers/test_bluesky.py +++ b/tests/unit/scrapers/test_bluesky.py @@ -635,7 +635,7 @@ async def test_request_post_data_uses_handle_when_no_did(self): @pytest.mark.asyncio async def test_request_post_data_exception_handling(self): - """_request_post_data should log error and return None on exception.""" + """_request_post_data should log error and re-raise on exception.""" from fastfetchbot_shared.services.scrapers.bluesky.scraper import BlueskyScraper with patch( @@ -652,5 +652,5 @@ async def test_request_post_data_exception_handling(self): bluesky_post.handle = "alice" bluesky_post.post_rkey = "rkey123" - result = await scraper._request_post_data(bluesky_post) - assert result is None + with pytest.raises(Exception, match="network error"): + await scraper._request_post_data(bluesky_post) diff --git a/tests/unit/scrapers/test_common.py b/tests/unit/scrapers/test_common.py index 051f270..4f613c7 100644 --- a/tests/unit/scrapers/test_common.py +++ b/tests/unit/scrapers/test_common.py @@ -6,6 +6,7 @@ from fastfetchbot_shared.models.url_metadata import UrlMetadata from fastfetchbot_shared.services.scrapers.common import InfoExtractService +from fastfetchbot_shared.exceptions import ScraperError # --------------------------------------------------------------------------- @@ -215,3 +216,19 @@ async def test_process_item_no_strip_needed(self, make_url_metadata): svc = InfoExtractService(url_metadata=make_url_metadata()) result = await svc.process_item({"title": "clean"}) assert result["title"] == "clean" + + +# --------------------------------------------------------------------------- +# _resolve_scraper_class +# --------------------------------------------------------------------------- + +class TestResolveScraperClass: + def test_known_category_returns_class(self, make_url_metadata): + svc = InfoExtractService(url_metadata=make_url_metadata(source="twitter")) + cls = svc._resolve_scraper_class("twitter") + assert cls is not None + + def test_unknown_category_raises_scraper_error(self, make_url_metadata): + svc = InfoExtractService(url_metadata=make_url_metadata()) + with pytest.raises(ScraperError, match="No scraper registered"): + svc._resolve_scraper_class("tiktok") diff --git a/tests/unit/scrapers/test_general_firecrawl.py b/tests/unit/scrapers/test_general_firecrawl.py index d095519..a38d46a 100644 --- a/tests/unit/scrapers/test_general_firecrawl.py +++ b/tests/unit/scrapers/test_general_firecrawl.py @@ -9,6 +9,7 @@ FirecrawlClient, FirecrawlSettings, ) +from fastfetchbot_shared.exceptions import ExternalServiceError # --------------------------------------------------------------------------- @@ -166,7 +167,7 @@ async def test_scrape_url_exception(self, mock_fc_cls): mock_app.scrape.side_effect = Exception("network error") client = FirecrawlClient.get_instance() - with pytest.raises(RuntimeError, match="Firecrawl scrape_url failed"): + with pytest.raises(ExternalServiceError, match="Firecrawl scrape_url failed"): await client.scrape_url(url="https://fail.com") diff --git a/tests/unit/scrapers/test_general_zyte.py b/tests/unit/scrapers/test_general_zyte.py index 8695986..33f5861 100644 --- a/tests/unit/scrapers/test_general_zyte.py +++ b/tests/unit/scrapers/test_general_zyte.py @@ -8,6 +8,7 @@ ZyteDataProcessor, ZyteScraper, ) +from fastfetchbot_shared.exceptions import ExternalServiceError # --------------------------------------------------------------------------- @@ -36,7 +37,7 @@ class TestZyteGetPageContent: ) async def test_no_api_key_raises(self): proc = ZyteDataProcessor("https://example.com") - with pytest.raises(RuntimeError, match="ZYTE_API_KEY is not configured"): + with pytest.raises(ExternalServiceError, match="ZYTE_API_KEY is not configured"): await proc._get_page_content() @pytest.mark.asyncio diff --git a/tests/unit/scrapers/test_scraper_manager.py b/tests/unit/scrapers/test_scraper_manager.py index 24b0cc2..ec6d34c 100644 --- a/tests/unit/scrapers/test_scraper_manager.py +++ b/tests/unit/scrapers/test_scraper_manager.py @@ -5,6 +5,7 @@ import pytest from fastfetchbot_shared.services.scrapers.scraper_manager import ScraperManager +from fastfetchbot_shared.exceptions import ScraperError # --------------------------------------------------------------------------- @@ -124,8 +125,8 @@ async def test_init_unknown_when_already_initialized(self): class TestInitScraperUnsupported: @pytest.mark.asyncio - async def test_unsupported_category_raises_value_error(self): - with pytest.raises(ValueError, match="not supported"): + async def test_unsupported_category_raises_scraper_error(self): + with pytest.raises(ScraperError, match="not supported"): await ScraperManager.init_scraper("tiktok") diff --git a/tests/unit/scrapers/test_twitter_exceptions.py b/tests/unit/scrapers/test_twitter_exceptions.py new file mode 100644 index 0000000..6826730 --- /dev/null +++ b/tests/unit/scrapers/test_twitter_exceptions.py @@ -0,0 +1,97 @@ +"""Tests for exception handling in Twitter scraper.""" + +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +from fastfetchbot_shared.exceptions import ScraperError, ScraperParseError + + +class TestGetResponseTweetData: + @pytest.mark.asyncio + async def test_all_scrapers_fail_raises_scraper_error(self): + from fastfetchbot_shared.services.scrapers.twitter import Twitter + + tw = Twitter(url="https://twitter.com/user/status/123") + + with patch.object( + tw, "_rapidapi_get_response_tweet_data", + new_callable=AsyncMock, + side_effect=ScraperParseError("API error"), + ), patch.object( + tw, "_api_client_get_response_tweet_data", + new_callable=AsyncMock, + side_effect=RuntimeError("client error"), + ): + with pytest.raises(ScraperError, match="No valid response from all Twitter scrapers"): + await tw._get_response_tweet_data() + + @pytest.mark.asyncio + async def test_first_scraper_fails_falls_through(self): + from fastfetchbot_shared.services.scrapers.twitter import Twitter + + tw = Twitter(url="https://twitter.com/user/status/456") + fake_data = { + "data": { + "threaded_conversation_with_injections_v2": { + "instructions": [{"entries": []}] + } + } + } + + with patch.object( + tw, "_rapidapi_get_response_tweet_data", + new_callable=AsyncMock, + side_effect=RuntimeError("first failed"), + ), patch.object( + tw, "_api_client_get_response_tweet_data", + new_callable=AsyncMock, + return_value=fake_data, + ): + # Should fall through to api-client and succeed + result = await tw._get_response_tweet_data() + assert result == fake_data + + +class TestRapidapiResponseValidation: + @pytest.mark.asyncio + async def test_error_response_raises_scraper_parse_error(self): + from fastfetchbot_shared.services.scrapers.twitter import Twitter + import httpx + + tw = Twitter(url="https://twitter.com/user/status/789") + tw.scraper = "Twitter135" + + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.json.return_value = {"errors": [{"message": "rate limited"}]} + + with patch("httpx.AsyncClient") as MockClient: + mock_client = AsyncMock() + mock_client.get = AsyncMock(return_value=mock_response) + MockClient.return_value.__aenter__ = AsyncMock(return_value=mock_client) + MockClient.return_value.__aexit__ = AsyncMock(return_value=False) + + with patch.object(tw, "_get_request_headers"): + with pytest.raises(ScraperParseError, match="Invalid response"): + await tw._rapidapi_get_response_tweet_data() + + @pytest.mark.asyncio + async def test_non_200_raises_scraper_parse_error(self): + from fastfetchbot_shared.services.scrapers.twitter import Twitter + + tw = Twitter(url="https://twitter.com/user/status/000") + tw.scraper = "Twitter135" + + mock_response = MagicMock() + mock_response.status_code = 500 + + with patch("httpx.AsyncClient") as MockClient: + mock_client = AsyncMock() + mock_client.get = AsyncMock(return_value=mock_response) + MockClient.return_value.__aenter__ = AsyncMock(return_value=mock_client) + MockClient.return_value.__aexit__ = AsyncMock(return_value=False) + + with patch.object(tw, "_get_request_headers"): + with pytest.raises(ScraperParseError, match="Invalid response"): + await tw._rapidapi_get_response_tweet_data() diff --git a/tests/unit/scrapers/test_weibo.py b/tests/unit/scrapers/test_weibo.py index 8b2230e..d19461c 100644 --- a/tests/unit/scrapers/test_weibo.py +++ b/tests/unit/scrapers/test_weibo.py @@ -24,6 +24,7 @@ from typing import Dict from fastfetchbot_shared.models.metadata_item import MediaFile, MessageType +from fastfetchbot_shared.exceptions import ScraperError, ScraperNetworkError, ScraperParseError # --------------------------------------------------------------------------- @@ -218,7 +219,7 @@ def test_lxml_method(self): def test_invalid_method_raises(self): from fastfetchbot_shared.services.scrapers.weibo.scraper import WeiboDataProcessor - with pytest.raises(ValueError, match="method must be bs4 or lxml"): + with pytest.raises(ScraperError, match="method must be bs4 or lxml"): WeiboDataProcessor._weibo_html_text_clean("

test

", method="invalid") @@ -803,7 +804,7 @@ async def test_invalid_method_raises(self): mock_env.get_template.return_value = mock_template from fastfetchbot_shared.services.scrapers.weibo.scraper import WeiboDataProcessor wp = WeiboDataProcessor(url="https://m.weibo.cn/detail/123") - with pytest.raises(ValueError, match="method must be webpage or api"): + with pytest.raises(ScraperError, match="method must be webpage or api"): await wp._get_weibo_info(method="invalid") @pytest.mark.asyncio @@ -827,7 +828,7 @@ async def test_connection_error_propagated(self): from fastfetchbot_shared.services.scrapers.weibo.scraper import WeiboDataProcessor wp = WeiboDataProcessor(url="https://m.weibo.cn/detail/123") with patch.object(wp, "_get_weibo_info_api", new_callable=AsyncMock, side_effect=ConnectionError("net fail")): - with pytest.raises(ConnectionError, match="network issues"): + with pytest.raises(ScraperNetworkError, match="network issues"): await wp._get_weibo_info(method="api") @@ -960,7 +961,7 @@ async def test_ok_zero_raises(self): mock_client.__aexit__ = AsyncMock(return_value=False) with patch("fastfetchbot_shared.services.scrapers.weibo.scraper.httpx.AsyncClient", return_value=mock_client): - with pytest.raises(ConnectionError): + with pytest.raises(ScraperParseError): await wp._get_weibo_info_api() @pytest.mark.asyncio @@ -983,7 +984,7 @@ async def test_empty_response_raises(self): mock_client.__aexit__ = AsyncMock(return_value=False) with patch("fastfetchbot_shared.services.scrapers.weibo.scraper.httpx.AsyncClient", return_value=mock_client): - with pytest.raises(ConnectionError): + with pytest.raises(ScraperParseError): await wp._get_weibo_info_api() @pytest.mark.asyncio @@ -1004,7 +1005,7 @@ async def test_http_error_raises(self): mock_client.__aexit__ = AsyncMock(return_value=False) with patch("fastfetchbot_shared.services.scrapers.weibo.scraper.httpx.AsyncClient", return_value=mock_client): - with pytest.raises(ConnectionError): + with pytest.raises(ScraperNetworkError): await wp._get_weibo_info_api() @@ -1083,7 +1084,7 @@ async def side_effect(method=None): nonlocal call_count call_count += 1 if call_count == 1: - raise ConnectionError("api fail") + raise ScraperNetworkError("api fail") return weibo_info with patch.object(wp, "_get_weibo_info", new_callable=AsyncMock, side_effect=side_effect): @@ -1100,12 +1101,12 @@ async def test_both_fail_raises(self): from fastfetchbot_shared.services.scrapers.weibo.scraper import WeiboDataProcessor wp = WeiboDataProcessor(url="https://m.weibo.cn/detail/123") - with patch.object(wp, "_get_weibo_info", new_callable=AsyncMock, side_effect=ConnectionError("fail")): - with pytest.raises(ConnectionError): + with patch.object(wp, "_get_weibo_info", new_callable=AsyncMock, side_effect=ScraperNetworkError("fail")): + with pytest.raises(ScraperNetworkError): await wp._get_weibo() @pytest.mark.asyncio - async def test_process_item_exception_caught(self): + async def test_process_item_exception_reraised(self): with patch("fastfetchbot_shared.services.scrapers.weibo.scraper.JINJA2_ENV") as mock_env: mock_template = MagicMock() mock_template.render.return_value = "

rendered

" @@ -1115,8 +1116,8 @@ async def test_process_item_exception_caught(self): with patch.object(wp, "_get_weibo_info", new_callable=AsyncMock, return_value={"id": "123"}): with patch.object(wp, "_process_weibo_item", new_callable=AsyncMock, side_effect=Exception("process fail")): - # Should not raise, exception is caught and logged - await wp._get_weibo() + with pytest.raises(Exception, match="process fail"): + await wp._get_weibo() # --------------------------------------------------------------------------- diff --git a/tests/unit/scrapers/test_xiaohongshu.py b/tests/unit/scrapers/test_xiaohongshu.py index eca8e50..6fadc8e 100644 --- a/tests/unit/scrapers/test_xiaohongshu.py +++ b/tests/unit/scrapers/test_xiaohongshu.py @@ -19,6 +19,7 @@ XHS_WEB_URL, ) from fastfetchbot_shared.models.metadata_item import MessageType, MediaFile +from fastfetchbot_shared.exceptions import ScraperError, ScraperParseError, ExternalServiceError # --------------------------------------------------------------------------- @@ -48,19 +49,19 @@ def test_nested_path(self): assert result["note_id"] == "note456" def test_empty_path_raises(self): - with pytest.raises(ValueError, match="Invalid XHS note URL"): + with pytest.raises(ScraperParseError, match="Invalid XHS note URL"): parse_xhs_note_url("https://www.xiaohongshu.com/") def test_explore_only_raises(self): - with pytest.raises(ValueError, match="Invalid XHS note URL path"): + with pytest.raises(ScraperParseError, match="Invalid XHS note URL path"): parse_xhs_note_url("https://www.xiaohongshu.com/explore") def test_discovery_only_raises(self): - with pytest.raises(ValueError, match="Invalid XHS note URL path"): + with pytest.raises(ScraperParseError, match="Invalid XHS note URL path"): parse_xhs_note_url("https://www.xiaohongshu.com/discovery") def test_item_only_raises(self): - with pytest.raises(ValueError, match="Invalid XHS note URL path"): + with pytest.raises(ScraperParseError, match="Invalid XHS note URL path"): parse_xhs_note_url("https://www.xiaohongshu.com/item") @@ -101,7 +102,7 @@ def test_strips_trailing_slash(self): @patch("fastfetchbot_shared.config.settings.SIGN_SERVER_URL", "") def test_no_sign_server_raises(self): - with pytest.raises(ValueError, match="sign server URL"): + with pytest.raises(ExternalServiceError, match="sign server URL"): XhsSinglePostAdapter(cookies="c=1", sign_server_endpoint="") @patch("fastfetchbot_shared.config.settings.SIGN_SERVER_URL", "http://fallback:8989") @@ -193,7 +194,7 @@ async def test_not_ok_raises(self): adapter._http = AsyncMock() adapter._http.post = AsyncMock(return_value=mock_resp) - with pytest.raises(RuntimeError, match="sign server returned error"): + with pytest.raises(ExternalServiceError, match="sign server returned error"): await adapter._sign_headers("/api/test") @pytest.mark.asyncio @@ -210,7 +211,7 @@ async def test_missing_fields_raises(self): adapter._http = AsyncMock() adapter._http.post = AsyncMock(return_value=mock_resp) - with pytest.raises(RuntimeError, match="missing fields"): + with pytest.raises(ExternalServiceError, match="missing fields"): await adapter._sign_headers("/api/test") @pytest.mark.asyncio @@ -224,7 +225,7 @@ async def test_none_data_uses_empty_dict(self): adapter._http = AsyncMock() adapter._http.post = AsyncMock(return_value=mock_resp) - with pytest.raises(RuntimeError, match="missing fields"): + with pytest.raises(ExternalServiceError, match="missing fields"): await adapter._sign_headers("/api/test") @pytest.mark.asyncio @@ -273,7 +274,7 @@ def test_non_json_raises(self): resp = MagicMock(spec=httpx.Response) resp.status_code = 200 resp.json.side_effect = json.JSONDecodeError("err", "", 0) - with pytest.raises(RuntimeError, match="non-JSON"): + with pytest.raises(ScraperParseError, match="non-JSON"): XhsSinglePostAdapter._parse_api_response(resp) def test_captcha_461(self): @@ -281,7 +282,7 @@ def test_captcha_461(self): 461, {"success": False}, headers={"Verifytype": "captcha", "Verifyuuid": "uuid1"}, ) - with pytest.raises(RuntimeError, match="captcha"): + with pytest.raises(ScraperError, match="captcha"): XhsSinglePostAdapter._parse_api_response(resp) def test_captcha_471(self): @@ -289,12 +290,12 @@ def test_captcha_471(self): 471, {"success": False}, headers={"Verifytype": "sms", "Verifyuuid": "uuid2"}, ) - with pytest.raises(RuntimeError, match="captcha"): + with pytest.raises(ScraperError, match="captcha"): XhsSinglePostAdapter._parse_api_response(resp) def test_api_error_not_success(self): resp = self._make_response(200, {"success": False, "msg": "error"}) - with pytest.raises(RuntimeError, match="XHS API error"): + with pytest.raises(ScraperParseError, match="XHS API error"): XhsSinglePostAdapter._parse_api_response(resp) @@ -951,7 +952,7 @@ async def test_both_fail_raises(self): adapter._fetch_note_by_html = AsyncMock(return_value=None) url = f"{XHS_WEB_URL}/explore/n1" - with pytest.raises(RuntimeError, match="Cannot fetch note"): + with pytest.raises(ScraperError, match="Cannot fetch note"): await adapter.fetch_post(note_url=url) @pytest.mark.asyncio @@ -1292,7 +1293,7 @@ async def test_not_xhs_raises(self): mock_client.__aexit__ = AsyncMock(return_value=None) MockClient.return_value = mock_client - with pytest.raises(RuntimeError, match="did not redirect to xiaohongshu.com"): + with pytest.raises(ScraperError, match="did not redirect to xiaohongshu.com"): await adapter._get_redirection_url("https://xhslink.com/abc") diff --git a/tests/unit/telegram_bot/test_webhook_server.py b/tests/unit/telegram_bot/test_webhook_server.py new file mode 100644 index 0000000..db16adc --- /dev/null +++ b/tests/unit/telegram_bot/test_webhook_server.py @@ -0,0 +1,94 @@ +"""Tests for apps/telegram-bot/core/webhook/server.py""" + +import asyncio +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + + +class TestLogTaskException: + def test_logs_exception_from_failed_task(self): + from core.webhook.server import _log_task_exception + + mock_task = MagicMock(spec=asyncio.Task) + mock_task.cancelled.return_value = False + test_exc = RuntimeError("task boom") + mock_task.exception.return_value = test_exc + + with patch("core.webhook.server.logger") as mock_logger: + _log_task_exception(mock_task) + mock_logger.exception.assert_called_once() + + def test_does_not_log_cancelled_task(self): + from core.webhook.server import _log_task_exception + + mock_task = MagicMock(spec=asyncio.Task) + mock_task.cancelled.return_value = True + + with patch("core.webhook.server.logger") as mock_logger: + _log_task_exception(mock_task) + mock_logger.exception.assert_not_called() + + def test_does_not_log_successful_task(self): + from core.webhook.server import _log_task_exception + + mock_task = MagicMock(spec=asyncio.Task) + mock_task.cancelled.return_value = False + mock_task.exception.return_value = None + + with patch("core.webhook.server.logger") as mock_logger: + _log_task_exception(mock_task) + mock_logger.exception.assert_not_called() + + +class TestTelegramWebhook: + @pytest.mark.asyncio + async def test_invalid_json_returns_400(self): + from core.webhook.server import telegram_webhook + + mock_request = MagicMock() + mock_request.headers = {"X-Telegram-Bot-Api-Secret-Token": ""} + mock_request.json = AsyncMock(side_effect=ValueError("bad json")) + + with patch("core.webhook.server.settings") as mock_settings: + mock_settings.TELEGRAM_BOT_SECRET_TOKEN = "" + response = await telegram_webhook(mock_request) + assert response.status_code == 400 + + @pytest.mark.asyncio + async def test_unauthorized_returns_401(self): + from core.webhook.server import telegram_webhook + + mock_request = MagicMock() + mock_request.headers = {"X-Telegram-Bot-Api-Secret-Token": "wrong"} + + with patch("core.webhook.server.settings") as mock_settings: + mock_settings.TELEGRAM_BOT_SECRET_TOKEN = "correct" + response = await telegram_webhook(mock_request) + assert response.status_code == 401 + + +class TestSendMessageEndpoint: + @pytest.mark.asyncio + async def test_exception_returns_500(self): + from core.webhook.server import send_message_endpoint + + mock_request = MagicMock() + mock_request.json = AsyncMock(side_effect=RuntimeError("db error")) + + response = await send_message_endpoint(mock_request) + assert response.status_code == 500 + + @pytest.mark.asyncio + async def test_success_returns_ok(self): + from core.webhook.server import send_message_endpoint + + mock_request = MagicMock() + mock_request.json = AsyncMock(return_value={ + "data": {"title": "test"}, + "chat_id": "123", + }) + + with patch("core.webhook.server.send_item_message", new_callable=AsyncMock): + response = await send_message_endpoint(mock_request) + assert response.status_code == 200 diff --git a/tests/unit/test_api_exception_handlers.py b/tests/unit/test_api_exception_handlers.py new file mode 100644 index 0000000..ef1953c --- /dev/null +++ b/tests/unit/test_api_exception_handlers.py @@ -0,0 +1,95 @@ +"""Tests for FastAPI global exception handlers in apps/api/src/main.py + +Uses a minimal FastAPI app with the same exception handlers to avoid +needing to fully configure the real app's dependencies. +""" + +import pytest +from fastapi import FastAPI +from fastapi.testclient import TestClient + +from fastfetchbot_shared.exceptions import ( + FastFetchBotError, + ScraperError, + ScraperParseError, + FileExportError, +) + + +def _create_test_app(): + """Create a minimal FastAPI app with the same exception handlers as main.py.""" + from fastapi import Request + from fastapi.responses import JSONResponse + from fastfetchbot_shared.utils.logger import logger + + app = FastAPI() + + @app.exception_handler(FastFetchBotError) + async def fastfetchbot_error_handler(request: Request, exc: FastFetchBotError): + logger.error(f"Domain error on {request.method} {request.url}: {exc}") + return JSONResponse(status_code=502, content={"error": str(exc)}) + + @app.exception_handler(Exception) + async def generic_error_handler(request: Request, exc: Exception): + logger.exception(f"Unhandled error on {request.method} {request.url}") + return JSONResponse(status_code=500, content={"error": "Internal server error"}) + + return app + + +@pytest.fixture +def test_app(): + return _create_test_app() + + +@pytest.fixture +def client(test_app): + return TestClient(test_app, raise_server_exceptions=False) + + +class TestFastFetchBotErrorHandler: + def test_scraper_error_returns_502(self, test_app, client): + @test_app.get("/test") + async def _(): + raise ScraperError("Twitter API failed") + + response = client.get("/test") + assert response.status_code == 502 + assert "Twitter API failed" in response.json()["error"] + + def test_scraper_parse_error_returns_502(self, test_app, client): + @test_app.get("/test-parse") + async def _(): + raise ScraperParseError("Invalid response") + + response = client.get("/test-parse") + assert response.status_code == 502 + + def test_file_export_error_returns_502(self, test_app, client): + @test_app.get("/test-export") + async def _(): + raise FileExportError("PDF failed") + + response = client.get("/test-export") + assert response.status_code == 502 + assert "PDF failed" in response.json()["error"] + + +class TestGenericExceptionHandler: + def test_unhandled_exception_returns_500(self, test_app, client): + @test_app.get("/test-crash") + async def _(): + raise RuntimeError("unexpected crash") + + response = client.get("/test-crash") + assert response.status_code == 500 + assert response.json()["error"] == "Internal server error" + + def test_generic_does_not_leak_details(self, test_app, client): + @test_app.get("/test-sensitive") + async def _(): + raise ValueError("secret database password") + + response = client.get("/test-sensitive") + assert response.status_code == 500 + assert "secret" not in response.json()["error"] diff --git a/tests/unit/test_exceptions.py b/tests/unit/test_exceptions.py new file mode 100644 index 0000000..5c685f3 --- /dev/null +++ b/tests/unit/test_exceptions.py @@ -0,0 +1,82 @@ +"""Tests for packages/shared/fastfetchbot_shared/exceptions.py""" + +import pytest + +from fastfetchbot_shared.exceptions import ( + FastFetchBotError, + ScraperError, + ScraperNetworkError, + ScraperParseError, + TelegraphPublishError, + FileExportError, + ExternalServiceError, +) + + +class TestExceptionHierarchy: + def test_scraper_error_is_fastfetchbot_error(self): + assert issubclass(ScraperError, FastFetchBotError) + + def test_scraper_network_error_is_scraper_error(self): + assert issubclass(ScraperNetworkError, ScraperError) + + def test_scraper_parse_error_is_scraper_error(self): + assert issubclass(ScraperParseError, ScraperError) + + def test_telegraph_publish_error_is_fastfetchbot_error(self): + assert issubclass(TelegraphPublishError, FastFetchBotError) + + def test_file_export_error_is_fastfetchbot_error(self): + assert issubclass(FileExportError, FastFetchBotError) + + def test_external_service_error_is_fastfetchbot_error(self): + assert issubclass(ExternalServiceError, FastFetchBotError) + + def test_all_are_exceptions(self): + for cls in ( + FastFetchBotError, + ScraperError, + ScraperNetworkError, + ScraperParseError, + TelegraphPublishError, + FileExportError, + ExternalServiceError, + ): + assert issubclass(cls, Exception) + + +class TestExceptionCatching: + """Verify that catching a parent also catches children.""" + + def test_catch_fastfetchbot_catches_scraper_error(self): + with pytest.raises(FastFetchBotError): + raise ScraperError("test") + + def test_catch_scraper_catches_network_error(self): + with pytest.raises(ScraperError): + raise ScraperNetworkError("network fail") + + def test_catch_scraper_catches_parse_error(self): + with pytest.raises(ScraperError): + raise ScraperParseError("parse fail") + + def test_catch_fastfetchbot_catches_file_export_error(self): + with pytest.raises(FastFetchBotError): + raise FileExportError("export fail") + + def test_catch_fastfetchbot_catches_external_service_error(self): + with pytest.raises(FastFetchBotError): + raise ExternalServiceError("service fail") + + +class TestExceptionChaining: + def test_from_preserves_original_cause(self): + original = ValueError("original") + try: + raise ScraperError("wrapped") from original + except ScraperError as e: + assert e.__cause__ is original + + def test_message_preserved(self): + exc = ScraperParseError("bad data") + assert str(exc) == "bad data" From dcbeb3418f6b69f548a71ccd0883fc1638ea500b Mon Sep 17 00:00:00 2001 From: aturret Date: Sat, 28 Mar 2026 01:23:03 -0500 Subject: [PATCH 4/6] feat: update CLAUDE.md --- CLAUDE.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/CLAUDE.md b/CLAUDE.md index de0732c..b8184e0 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -183,6 +183,23 @@ GitHub Actions (`.github/workflows/ci.yml`) builds and pushes all four images on 6. Add platform-specific router in `apps/api/src/routers/` (if API endpoints are needed) 7. Add any new pip dependencies to `packages/shared/pyproject.toml` under `[project.optional-dependencies] scrapers` +### Exception Handling +- **Custom exceptions** are defined in `packages/shared/fastfetchbot_shared/exceptions.py`: + - `FastFetchBotError` — base for all domain errors + - `ScraperError` / `ScraperNetworkError` / `ScraperParseError` — scraper failures + - `TelegraphPublishError` — Telegraph publishing failures + - `FileExportError` — file export (PDF, video, audio) failures + - `ExternalServiceError` — external service call failures (OpenAI, Firecrawl, Zyte, XHS sign server, etc.) +- **Always use typed exceptions** instead of generic `RuntimeError`, `ValueError`, or `Exception` for domain errors. Pick the most specific subclass that fits. +- **Use `from e` chaining** when wrapping exceptions: `raise ScraperError("message") from e` +- **Boundary-level handlers** catch exceptions at service boundaries: + - FastAPI: global `@app.exception_handler(FastFetchBotError)` returns 502, generic `Exception` returns 500 + - Telegram bot: `error_process` handler catches handler exceptions; webhook server protects endpoints + - Celery/ARQ workers: existing task-level try/catch with outbox error push +- **Never use `print()` or `traceback.print_exc()`** — always use `logger.exception()` (includes traceback) or `logger.error()` (message only) +- **Never silently swallow exceptions** — if catching an exception, either re-raise it or handle it explicitly with logging. Do not return `None` or empty data on failure. +- **Fail fast after fallback chains** — scrapers may try multiple methods/APIs, but must raise a typed error when all fallbacks are exhausted + ### Key Conventions - **`packages/shared/` (`fastfetchbot-shared`)** is for shared async logic — scrapers, templates, Telegraph, and async Celery task wrappers (file_export). Most code here is async and reusable across apps - **`packages/file-export/` (`fastfetchbot-file-export`)** is exclusively for synchronous Celery worker jobs — the heavy I/O operations that run inside the Celery worker process (yt-dlp video download, WeasyPrint PDF generation, OpenAI audio transcription). Apps never import this package directly; they use the async wrappers in `fastfetchbot_shared.services.file_export` which submit tasks to the Celery worker From 6389f0e1ad48eac9b99188d2b3d5f60d8e1bb356 Mon Sep 17 00:00:00 2001 From: aturret Date: Sat, 28 Mar 2026 01:25:42 -0500 Subject: [PATCH 5/6] feat: add unit tests for new code --- .../file_export/test_transcribe_exceptions.py | 72 +++++++- .../unit/telegram_bot/test_message_sender.py | 154 ++++++++++++++++++ tests/unit/test_network.py | 103 ++++++++++++ 3 files changed, 328 insertions(+), 1 deletion(-) create mode 100644 tests/unit/telegram_bot/test_message_sender.py create mode 100644 tests/unit/test_network.py diff --git a/tests/unit/file_export/test_transcribe_exceptions.py b/tests/unit/file_export/test_transcribe_exceptions.py index 533be53..4461752 100644 --- a/tests/unit/file_export/test_transcribe_exceptions.py +++ b/tests/unit/file_export/test_transcribe_exceptions.py @@ -1,6 +1,6 @@ """Tests for exception handling in packages/file-export/fastfetchbot_file_export/transcribe.py""" -from unittest.mock import patch, MagicMock +from unittest.mock import patch, MagicMock, mock_open import pytest @@ -29,3 +29,73 @@ def test_preserves_original_cause(self): with pytest.raises(FileExportError) as exc_info: get_audio_text("/tmp/test.mp3", "key") assert exc_info.value.__cause__ is original + + +class TestGetAudioTextHappyPath: + def test_successful_transcription(self): + from fastfetchbot_file_export.transcribe import get_audio_text + + # Mock OpenAI client + mock_client = MagicMock() + mock_transcription = MagicMock() + mock_transcription.text = "hello world" + mock_client.audio.transcriptions.create.return_value = mock_transcription + + mock_punctuation_response = MagicMock() + mock_punctuation_response.choices = [MagicMock()] + mock_punctuation_response.choices[0].message.content = "Hello, world." + + mock_summary_response = MagicMock() + mock_summary_response.choices = [MagicMock()] + mock_summary_response.choices[0].message.content = "A greeting." + + # Track which call we're on + call_count = [0] + def chat_side_effect(**kwargs): + call_count[0] += 1 + if call_count[0] == 1: + return mock_punctuation_response + return mock_summary_response + + mock_client.chat.completions.create.side_effect = chat_side_effect + + # Mock AudioSegment + mock_audio = MagicMock() + mock_audio.__getitem__ = MagicMock(return_value=mock_audio) + mock_audio.duration_seconds = 10 # short audio, single segment + mock_audio.dBFS = -10 # above silence threshold + mock_audio.export = MagicMock() + + mock_segment = MagicMock() + mock_segment.dBFS = -10 + mock_audio.__getitem__.return_value = mock_segment + mock_segment.export = MagicMock() + + with patch("fastfetchbot_file_export.transcribe.OpenAI", return_value=mock_client), \ + patch("fastfetchbot_file_export.transcribe.AudioSegment") as MockAudioSegment, \ + patch("fastfetchbot_file_export.transcribe.os.remove") as mock_remove, \ + patch("fastfetchbot_file_export.transcribe.os.path.splitext", return_value=("/tmp/audio", ".mp3")), \ + patch("builtins.open", mock_open(read_data=b"audio data")), \ + patch("fastfetchbot_file_export.transcribe.milliseconds_until_sound", return_value=0): + + MockAudioSegment.from_file.return_value = mock_audio + + result = get_audio_text("/tmp/audio.mp3", "test-api-key") + + assert "Hello, world." in result + assert "A greeting." in result + # Should have cleaned up the original file + mock_remove.assert_called() + + def test_audio_segment_failure_wraps_in_file_export_error(self): + from fastfetchbot_file_export.transcribe import get_audio_text + + mock_client = MagicMock() + + with patch("fastfetchbot_file_export.transcribe.OpenAI", return_value=mock_client), \ + patch("fastfetchbot_file_export.transcribe.AudioSegment") as MockAudioSegment: + + MockAudioSegment.from_file.side_effect = FileNotFoundError("no such file") + + with pytest.raises(FileExportError, match="Audio transcription failed"): + get_audio_text("/tmp/missing.mp3", "key") diff --git a/tests/unit/telegram_bot/test_message_sender.py b/tests/unit/telegram_bot/test_message_sender.py new file mode 100644 index 0000000..962777b --- /dev/null +++ b/tests/unit/telegram_bot/test_message_sender.py @@ -0,0 +1,154 @@ +"""Tests for exception handling in apps/telegram-bot/core/services/message_sender.py""" + +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + + +class TestMediaFilesPackagingDownloadFailure: + """Test that download failures in media_files_packaging are caught and skipped.""" + + @pytest.mark.asyncio + async def test_http_download_failure_skips_media(self): + from core.services.message_sender import media_files_packaging + + media_files = [ + {"media_type": "image", "url": "https://example.com/img.jpg", "caption": ""}, + ] + data = { + "url": "https://example.com", + "category": "twitter", + "message_type": "short", + } + + with patch( + "core.services.message_sender.download_file_by_metadata_item", + new_callable=AsyncMock, + side_effect=RuntimeError("network error"), + ): + media_group, file_group = await media_files_packaging(media_files, data) + + # Media item should be skipped, not crash + assert media_group == [] + assert file_group == [] + + @pytest.mark.asyncio + async def test_gif_download_failure_skips_media(self): + from core.services.message_sender import media_files_packaging + + media_files = [ + {"media_type": "gif", "url": "https://example.com/anim.gif", "caption": ""}, + ] + data = { + "url": "https://example.com", + "category": "twitter", + "message_type": "short", + } + + with patch( + "core.services.message_sender.download_file_by_metadata_item", + new_callable=AsyncMock, + side_effect=ConnectionError("timeout"), + ): + media_group, file_group = await media_files_packaging(media_files, data) + + assert media_group == [] + assert file_group == [] + + @pytest.mark.asyncio + async def test_image_document_download_failure_skips(self): + """Test the second download_file_by_metadata_item call for large image documents.""" + from core.services.message_sender import media_files_packaging + + mock_io = MagicMock() + mock_io.name = "media-abc.jpg" + mock_io.size = 15 * 1024 * 1024 # 15MB, over image size limit to trigger document path + + mock_image = MagicMock() + mock_image.size = (5000, 5000) # large image to trigger document download + mock_image.width = 5000 + mock_image.height = 5000 + + call_count = [0] + + async def download_side_effect(*args, **kwargs): + call_count[0] += 1 + if call_count[0] == 1: + return mock_io + raise RuntimeError("second download failed") + + media_files = [ + {"media_type": "image", "url": "https://example.com/big.jpg", "caption": ""}, + ] + data = { + "url": "https://example.com", + "category": "twitter", + "message_type": "short", + } + + with patch( + "core.services.message_sender.download_file_by_metadata_item", + new_callable=AsyncMock, + side_effect=download_side_effect, + ), patch( + "core.services.message_sender.check_image_type", + new_callable=AsyncMock, + return_value="jpg", + ), patch( + "core.services.message_sender.Image" + ) as MockImage, patch( + "core.services.message_sender.image_compressing", + return_value=mock_image, + ), patch( + "core.services.message_sender.settings" + ) as mock_settings: + MockImage.open.return_value = mock_image + mock_settings.TELEBOT_API_SERVER = None + mock_settings.TELEGRAM_IMAGE_DIMENSION_LIMIT = 2000 + mock_settings.TELEGRAM_IMAGE_SIZE_LIMIT = 10 * 1024 * 1024 # 10MB + + media_group, file_group = await media_files_packaging(media_files, data) + + # The image media_group should have the photo, but file_group should be empty + # because the document download failed and was skipped + assert file_group == [] + + +class TestSendItemMessageExceptionHandling: + @pytest.mark.asyncio + async def test_exception_logged_and_sent_to_debug_channel(self): + from core.services.message_sender import send_item_message + + mock_app = MagicMock() + mock_bot = AsyncMock() + mock_app.bot = mock_bot + + mock_chat = MagicMock() + mock_chat.type = "private" + mock_chat.linked_chat_id = None + mock_bot.get_chat = AsyncMock(return_value=mock_chat) + mock_bot.send_message = AsyncMock(side_effect=RuntimeError("telegram API down")) + + with patch("core.services.message_sender._get_application", return_value=mock_app), \ + patch("core.services.message_sender.send_debug_channel", new_callable=AsyncMock) as mock_debug: + + # Should not raise — the exception is caught and logged + await send_item_message( + data={ + "media_files": [], + "message_type": "short", + "text": "test", + "title": "Test", + "author": "a", + "author_url": "", + "url": "https://example.com", + "telegraph_url": "", + "category": "twitter", + "content": "", + }, + chat_id=12345, + ) + + # Debug channel should have been called with the traceback + mock_debug.assert_awaited_once() + assert "telegram API down" in mock_debug.call_args[0][0] diff --git a/tests/unit/test_network.py b/tests/unit/test_network.py new file mode 100644 index 0000000..4cd264f --- /dev/null +++ b/tests/unit/test_network.py @@ -0,0 +1,103 @@ +"""Tests for exception handling in packages/shared/fastfetchbot_shared/utils/network.py""" + +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +from fastfetchbot_shared.models.classes import NamedBytesIO + + +class TestDownloadFileByMetadataItem: + @pytest.mark.asyncio + async def test_success_returns_named_bytes_io(self): + from fastfetchbot_shared.utils.network import download_file_by_metadata_item + + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.content = b"fake image data" + + mock_client = AsyncMock() + mock_client.get = AsyncMock(return_value=mock_response) + mock_client.__aenter__ = AsyncMock(return_value=mock_client) + mock_client.__aexit__ = AsyncMock(return_value=False) + + with patch("fastfetchbot_shared.utils.network.httpx.AsyncClient", return_value=mock_client), \ + patch("fastfetchbot_shared.utils.network.get_random_user_agent", return_value="TestAgent"): + result = await download_file_by_metadata_item( + url="https://example.com/image.jpg", + data={"url": "https://example.com", "category": "twitter"}, + ) + + assert isinstance(result, NamedBytesIO) + assert result.name.endswith(".jpg") + + @pytest.mark.asyncio + async def test_network_error_raises(self): + from fastfetchbot_shared.utils.network import download_file_by_metadata_item + + mock_client = AsyncMock() + mock_client.get = AsyncMock(side_effect=ConnectionError("connection refused")) + mock_client.__aenter__ = AsyncMock(return_value=mock_client) + mock_client.__aexit__ = AsyncMock(return_value=False) + + with patch("fastfetchbot_shared.utils.network.httpx.AsyncClient", return_value=mock_client), \ + patch("fastfetchbot_shared.utils.network.get_random_user_agent", return_value="TestAgent"): + with pytest.raises(ConnectionError, match="connection refused"): + await download_file_by_metadata_item( + url="https://example.com/image.jpg", + data={"url": "https://example.com", "category": "twitter"}, + ) + + @pytest.mark.asyncio + async def test_redirect_follows_location(self): + from fastfetchbot_shared.utils.network import download_file_by_metadata_item + + mock_response = MagicMock() + mock_response.status_code = 302 + mock_response.headers = {"Location": "https://cdn.example.com/real.jpg"} + mock_response.content = b"redirected content" + + mock_client = AsyncMock() + mock_client.get = AsyncMock(return_value=mock_response) + mock_client.__aenter__ = AsyncMock(return_value=mock_client) + mock_client.__aexit__ = AsyncMock(return_value=False) + + with patch("fastfetchbot_shared.utils.network.httpx.AsyncClient", return_value=mock_client), \ + patch("fastfetchbot_shared.utils.network.get_random_user_agent", return_value="TestAgent"): + result = await download_file_by_metadata_item( + url="https://example.com/image.jpg", + data={"url": "https://example.com", "category": "twitter"}, + ) + + assert isinstance(result, NamedBytesIO) + + +class TestGetResponseJson: + @pytest.mark.asyncio + async def test_exception_returns_none_and_logs(self): + from fastfetchbot_shared.utils.network import get_response_json + + with patch( + "fastfetchbot_shared.utils.network.get_response", + new_callable=AsyncMock, + side_effect=ConnectionError("timeout"), + ): + result = await get_response_json("https://example.com/api") + + assert result is None + + @pytest.mark.asyncio + async def test_json_decode_error_returns_none(self): + from fastfetchbot_shared.utils.network import get_response_json + + mock_response = MagicMock() + mock_response.json.side_effect = ValueError("invalid json") + + with patch( + "fastfetchbot_shared.utils.network.get_response", + new_callable=AsyncMock, + return_value=mock_response, + ): + result = await get_response_json("https://example.com/api") + + assert result is None From 0697a131d2b28bc62626dd8183a1536da17941cb Mon Sep 17 00:00:00 2001 From: aturret Date: Sat, 28 Mar 2026 01:29:49 -0500 Subject: [PATCH 6/6] fix: code glitches --- apps/telegram-bot/core/services/message_sender.py | 5 ++++- .../fastfetchbot_shared/services/scrapers/zhihu/__init__.py | 4 ++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/apps/telegram-bot/core/services/message_sender.py b/apps/telegram-bot/core/services/message_sender.py index a2ccc76..1e6480d 100644 --- a/apps/telegram-bot/core/services/message_sender.py +++ b/apps/telegram-bot/core/services/message_sender.py @@ -169,11 +169,14 @@ async def send_item_message( async def send_debug_channel(message: str) -> None: + import html as html_module from core.config import TELEBOT_DEBUG_CHANNEL application = _get_application() if TELEBOT_DEBUG_CHANNEL is not None: await application.bot.send_message( - chat_id=TELEBOT_DEBUG_CHANNEL, text=message, parse_mode=ParseMode.HTML + chat_id=TELEBOT_DEBUG_CHANNEL, + text=html_module.escape(message), + parse_mode=ParseMode.HTML, ) diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/zhihu/__init__.py b/packages/shared/fastfetchbot_shared/services/scrapers/zhihu/__init__.py index f616499..8f8923d 100644 --- a/packages/shared/fastfetchbot_shared/services/scrapers/zhihu/__init__.py +++ b/packages/shared/fastfetchbot_shared/services/scrapers/zhihu/__init__.py @@ -559,12 +559,12 @@ async def _get_zhihu_article(self): self.raw_content = fix_images_and_links(self.raw_content) self.raw_content = unmask_zhihu_links(self.raw_content) except Exception as e: - raise ScraperError("zhihu request failed") + raise ScraperError("zhihu request failed") from e else: try: selector = await get_selector(self.request_url, headers=self.headers) except Exception as e: - raise ScraperError("zhihu request failed") + raise ScraperError("zhihu request failed") from e if self.method == "json": json_data = selector.xpath('string(//script[@id="js-initialData"])') json_data = json.loads(json_data)