From 6aa483fcd81fd9cefdb3d320a25508456cab6f98 Mon Sep 17 00:00:00 2001 From: Nicholas Thompson Date: Fri, 18 Jul 2025 00:01:34 -0700 Subject: [PATCH 1/6] feat: Add reading progress support and improve type safety --- requirements.txt | 1 + storygraph_api/books_client.py | 6 + storygraph_api/parse/books_parser.py | 170 +++++++++++++++--------- storygraph_api/request/books_request.py | 17 ++- 4 files changed, 132 insertions(+), 62 deletions(-) diff --git a/requirements.txt b/requirements.txt index 8dea80e..56e2dbe 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,6 +7,7 @@ h11==0.14.0 idna==3.7 outcome==1.3.0.post0 PySocks==1.7.1 +python-dotenv==1.1.1 requests==2.32.3 selenium==4.23.1 sniffio==1.3.1 diff --git a/storygraph_api/books_client.py b/storygraph_api/books_client.py index 01b5a36..5c147ee 100644 --- a/storygraph_api/books_client.py +++ b/storygraph_api/books_client.py @@ -8,6 +8,12 @@ def book_info(self,book_id): data = BooksParser.book_page(book_id) return json.dumps(data,indent=4) + @handle_exceptions + def reading_progress(self, book_id, cookies): + progress = BooksParser.reading_progress(book_id, cookies) + data = {"progress": progress} + return json.dumps(data, indent=4) + @handle_exceptions def search(self,query): data = BooksParser.search(query) diff --git a/storygraph_api/parse/books_parser.py b/storygraph_api/parse/books_parser.py index 20014eb..34b27c6 100644 --- a/storygraph_api/parse/books_parser.py +++ b/storygraph_api/parse/books_parser.py @@ -1,95 +1,143 @@ from storygraph_api.request.books_request import BooksScraper from storygraph_api.exception_handler import parsing_exception -from bs4 import BeautifulSoup +from bs4 import BeautifulSoup, Tag, NavigableString import re +from typing import Dict, Any, List class BooksParser: @staticmethod @parsing_exception - def book_page(book_id): + def book_page(book_id: str) -> Dict[str, Any]: content = BooksScraper.main(book_id) soup = BeautifulSoup(content, 'html.parser') - h3_tag = 
soup.find('h3',class_="font-serif font-bold text-2xl md:w-11/12") - title = h3_tag.contents[0].strip() + + h3_tag = soup.find('h3', class_="font-serif font-bold text-2xl md:w-11/12") + if not isinstance(h3_tag, Tag): + raise Exception("Could not find the main title header.") + + title = "" + if h3_tag.contents and isinstance(h3_tag.contents[0], NavigableString): + title = h3_tag.contents[0].strip() + authors = [] for a in h3_tag.find_all('a'): - if a["href"].startswith("/authors"): - authors.append(a.text) - p_tag = soup.find('p',class_="text-sm font-light text-darkestGrey dark:text-grey mt-1") - pages = p_tag.contents[0].strip().split()[0] - first_pub = p_tag.contents[1].find_all('span')[1].text.split()[2] - tags = [] - tag_div = soup.find('div',class_="book-page-tag-section").find_all('span') - for tag in tag_div: - tags.append(tag.text) - desc = soup.find_all('script')[5].text - pattern = re.compile(r"Description<\/h4>
(.*?)<\/div>", re.DOTALL) + if isinstance(a, Tag): + href = a.get("href") + if isinstance(href, str) and href.startswith("/authors"): + authors.append(a.text) + + p_tag = soup.find('p', class_="text-sm font-light text-darkestGrey dark:text-grey mt-1") + if not isinstance(p_tag, Tag) or not p_tag.contents: + raise Exception("Could not find book metadata paragraph.") + + pages_text = p_tag.contents[0] + pages = pages_text.strip().split()[0] if isinstance(pages_text, NavigableString) else "N/A" + + pub_info_spans = p_tag.find_all('span') + first_pub = pub_info_spans[1].text.split()[-1] if len(pub_info_spans) > 1 else "N/A" + + tag_div = soup.find('div', class_="book-page-tag-section") + tags = [tag.text for tag in tag_div.find_all('span')] if isinstance(tag_div, Tag) else [] + + script_tags = soup.find_all('script') + desc = "" + for s in script_tags: + if 'Description' in s.text: + desc = s.text + break + + pattern = re.compile(r"Description
(.*?)
", re.DOTALL) match = pattern.search(desc) - description = match.group(1).strip() + description = match.group(1).strip() if match else "Description not found." + review_content = BooksScraper.community_reviews(book_id) - rev_soup = BeautifulSoup(review_content,'html.parser') - avg_rating = rev_soup.find('span',class_="average-star-rating").text.strip() + rev_soup = BeautifulSoup(review_content, 'html.parser') + avg_rating_span = rev_soup.find('span', class_="average-star-rating") + avg_rating = avg_rating_span.text.strip() if avg_rating_span else "N/A" + warnings = BooksParser.content_warnings(book_id) + data = { - 'title':title, - 'authors': authors, - 'pages': pages, - 'first_pub': first_pub, - 'tags': tags, - 'average_rating': avg_rating, - 'description':description, - 'warnings': warnings - } + 'title': title, 'authors': authors, 'pages': pages, + 'first_pub': first_pub, 'tags': tags, 'average_rating': avg_rating, + 'description': description, 'warnings': warnings + } return data @staticmethod @parsing_exception - def content_warnings(book_id): + def reading_progress(book_id: str, cookies: Dict[str, str]) -> str: + content = BooksScraper.book_page_authenticated(book_id, cookies) + soup = BeautifulSoup(content, 'html.parser') + + progress_bar_div = soup.find('div', class_='progress-bar') + if not isinstance(progress_bar_div, Tag): + raise Exception("Could not find progress bar. 
Ensure book is 'currently-reading'.") + + progress_span = progress_bar_div.find('span') + if isinstance(progress_span, Tag) and progress_span.string: + return progress_span.string.strip() + + inner_div = progress_bar_div.find('div', style=lambda v: 'width: 0%' in v if v else False) + if inner_div is not None: + return "0%" + + raise Exception("Could not extract percentage from progress bar.") + + @staticmethod + @parsing_exception + def content_warnings(book_id: str) -> Dict[str, List[str]]: warnings_content = BooksScraper.content_warnings(book_id) - warnings_soup = BeautifulSoup(warnings_content,'html.parser') - user_warnings_pane = warnings_soup.find_all('div',class_='standard-pane')[1] - warnings_graphic = [] - warnings_moderate = [] - warnings_minor = [] - warnings_list = warnings_graphic + warnings_soup = BeautifulSoup(warnings_content, 'html.parser') + + standard_panes = warnings_soup.find_all('div', class_='standard-pane') + if len(standard_panes) < 2: + return {'graphic': [], 'moderate': [], 'minor': []} + + user_warnings_pane = standard_panes[1] + warnings: Dict[str, List[str]] = {'graphic': [], 'moderate': [], 'minor': []} + current_list_key = 'graphic' tag_re = re.compile(r'^(.*) \((\d+)\)$') + for tag in user_warnings_pane.children: - if tag == '\n': - continue + if not isinstance(tag, Tag): continue + if tag.name == 'p': - if tag.text == 'Graphic': - warnings_list = warnings_graphic - elif tag.text == 'Moderate': - warnings_list = warnings_moderate - elif tag.text == 'Minor': - warnings_list = warnings_minor + if tag.text == 'Graphic': current_list_key = 'graphic' + elif tag.text == 'Moderate': current_list_key = 'moderate' + elif tag.text == 'Minor': current_list_key = 'minor' elif tag.name == 'div': match = tag_re.match(tag.text) - warnings_list.append(match[1]) - warnings = { - 'graphic': warnings_graphic, - 'moderate': warnings_moderate, - 'minor': warnings_minor - } + if match: warnings[current_list_key].append(match.group(1)) return warnings 
@staticmethod @parsing_exception - def search(query): + def search(query: str) -> List[Dict[str, str]]: content = BooksScraper.search(query) soup = BeautifulSoup(content, 'html.parser') - search_results = [] + search_results: List[Dict[str, str]] = [] + books = soup.find_all('div', class_="book-title-author-and-series w-11/12") for book in books: - title = book.find('a').text.strip() - for a in book.find_all('a'): - if a["href"].startswith('/author'): - author = a.text.strip() - break - book_id = book.find('a')['href'].split('/')[-1] - search_results.append({ - 'title': title, - 'author': author, - 'book_id': book_id - }) + if not isinstance(book, Tag): continue + + title_tag = book.find('a') + title = title_tag.text.strip() if isinstance(title_tag, Tag) else "N/A" + + href_val = title_tag.get('href') if isinstance(title_tag, Tag) else None + + href = href_val[0] if isinstance(href_val, list) else href_val + book_id = href.split('/')[-1] if isinstance(href, str) else "N/A" + + author = "N/A" + for a_tag in book.find_all('a'): + if isinstance(a_tag, Tag): + href = a_tag.get("href") + if isinstance(href, str) and href.startswith('/author'): + author = a_tag.text.strip() + break + + search_results.append({'title': title, 'author': author, 'book_id': book_id}) + return search_results diff --git a/storygraph_api/request/books_request.py b/storygraph_api/request/books_request.py index b6f22f8..6ca9ef9 100644 --- a/storygraph_api/request/books_request.py +++ b/storygraph_api/request/books_request.py @@ -9,11 +9,26 @@ def fetch_url(url): response.raise_for_status() return response.content + @staticmethod + @request_exception + def fetch_url_authenticated(url, cookies): + headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' + } + response = requests.get(url, cookies=cookies, headers=headers) + response.raise_for_status() + return response.content + @staticmethod def main(book_id): 
url = f"https://app.thestorygraph.com/books/{book_id}" return BooksScraper.fetch_url(url) + @staticmethod + def book_page_authenticated(book_id, cookies): + url = f"https://app.thestorygraph.com/books/{book_id}" + return BooksScraper.fetch_url_authenticated(url, cookies) + @staticmethod def community_reviews(book_id): url = f"https://app.thestorygraph.com/books/{book_id}/community_reviews" @@ -28,4 +43,4 @@ def content_warnings(book_id): def search(query): formatted_query = query.replace(' ', '%20') url = f"https://app.thestorygraph.com/browse?search_term={formatted_query}" - return BooksScraper.fetch_url(url) + return BooksScraper.fetch_url(url) \ No newline at end of file From 2124e56d1c5cd9e138784cffc6f08ad2e6d96d00 Mon Sep 17 00:00:00 2001 From: Nicholas Thompson Date: Fri, 18 Jul 2025 00:12:33 -0700 Subject: [PATCH 2/6] docs: Document reading_progress feature in README reading_progress to the features list; new code example for fetching progress; cookie requirements for authenticated endpoints --- README.md | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/README.md b/README.md index 75f6adf..f27855e 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,7 @@ A python package to interact with and fetch data from the [StoryGraph](https://a ## Features - **Book Details**: Fetch detailed information about a book using its unique ID. +- **Reading Progress**: Get your current reading progress for a book on your "currently reading" shelf. - **Search**: Perform a book search on StoryGraph and retrieve the results. 
- **Fetch User lists**: - currently reading @@ -67,6 +68,42 @@ print(result) } ``` +----- + +### Reading Progress + +```python +# Fetch your reading progress for a book +# Requires authentication cookies +from storygraph_api import Book +from dotenv import load_dotenv +import os + +load_dotenv() + +# Get cookies from your .env file +# _STORYGRAPH_SESSION=your_session_cookie +# REMEMBER_USER_TOKEN=your_remember_token +cookies = { + "_storygraph_session": os.getenv('_STORYGRAPH_SESSION'), + "remember_user_token": os.getenv('REMEMBER_USER_TOKEN'), +} + +book_id = "1c023e31-637b-41d9-ba64-260c3c1b0f3d" +book = Book() +result = book.reading_progress(book_id, cookies) +print(result) +``` + +#### Result: + +```json +{ + "progress": "70%" +} +``` + +----- ### User List: From df16bde25586676663c2bf1a37a32f8b06ca31c5 Mon Sep 17 00:00:00 2001 From: Nicholas Thompson Date: Fri, 18 Jul 2025 00:56:41 -0700 Subject: [PATCH 3/6] fix: Add User-Agent to book requests to prevent 403 errors --- storygraph_api/request/books_request.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/storygraph_api/request/books_request.py b/storygraph_api/request/books_request.py index 6ca9ef9..d208559 100644 --- a/storygraph_api/request/books_request.py +++ b/storygraph_api/request/books_request.py @@ -5,7 +5,10 @@ class BooksScraper: @staticmethod @request_exception def fetch_url(url): - response = requests.get(url) + headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' + } + response = requests.get(url, headers=headers) response.raise_for_status() return response.content From b22a55f4460caafeb1a95b61d14449c0b6863f88 Mon Sep 17 00:00:00 2001 From: Nicholas Thompson Date: Thu, 31 Jul 2025 15:15:14 -0700 Subject: [PATCH 4/6] feat: Overhaul API, remove Selenium, and add features This commit marks a significant evolution of the unofficial StoryGraph API wrapper, focusing on performance, 
reliability, and expanded functionality.

- Replaced Selenium with `requests`: The core scraping logic has been refactored to use the `requests` library directly, removing the heavy dependency on Selenium and a headless browser. This results in a much faster, lighter, and more stable client.
- Expanded API Coverage: Added numerous new methods to interact with the StoryGraph API:
  - **User:** Fetch `currently-reading`, `to-read`, and `books-read` shelves, and retrieve all journal entries.
  - **Book:** Get reading progress, read dates, AI summaries, and journal entries for specific books.
- Enhanced Authentication: Implemented a more reliable cookie-based authentication flow, requiring `_storygraph_session` and `remember_user_token`.
- Added Comprehensive Tests: A new `test.py` script provides a full suite of tests for all client methods, ensuring reliability.
- New README: Generated a completely new `README.md` that reflects all the new features, including detailed setup instructions and up-to-date usage examples.
--- .vscode/ltex.dictionary.en-US.txt | 2 + README.md | 212 ++++++++----------- manual_tests.py | 39 ---- requirements.txt | 12 -- storygraph_api/books_client.py | 26 ++- storygraph_api/exception_handler.py | 20 +- storygraph_api/exceptions.py | 4 - storygraph_api/parse/books_parser.py | 264 ++++++++++++++++++++++-- storygraph_api/parse/user_parser.py | 105 +++++++++- storygraph_api/request/books_request.py | 55 +++-- storygraph_api/request/user_request.py | 70 +++---- storygraph_api/users_client.py | 49 ++++- test.py | 169 +++++++++++++++ 13 files changed, 745 insertions(+), 282 deletions(-) create mode 100644 .vscode/ltex.dictionary.en-US.txt delete mode 100644 manual_tests.py create mode 100644 test.py diff --git a/.vscode/ltex.dictionary.en-US.txt b/.vscode/ltex.dictionary.en-US.txt new file mode 100644 index 0000000..f2dba1c --- /dev/null +++ b/.vscode/ltex.dictionary.en-US.txt @@ -0,0 +1,2 @@ +StoryGraph +storygraph-api diff --git a/README.md b/README.md index f27855e..f6c555a 100644 --- a/README.md +++ b/README.md @@ -1,153 +1,121 @@ -# Storygraph API -A python package to interact with and fetch data from the [StoryGraph](https://app.thestorygraph.com/) website. - -## Features -- **Book Details**: Fetch detailed information about a book using its unique ID. -- **Reading Progress**: Get your current reading progress for a book on your "currently reading" shelf. -- **Search**: Perform a book search on StoryGraph and retrieve the results. -- **Fetch User lists**: - - currently reading - - planning to read - - books read +# Unofficial StoryGraph API for Python + +An unofficial Python wrapper for The StoryGraph API, forked from [ym496/storygraph-api](https://github.com/ym496/storygraph-api). + +This fork has been significantly refactored and enhanced to be more efficient, reliable, and feature-rich. + +## Key Enhancements in This Fork + +* **No More Selenium**: The original dependency on Selenium and a headless browser has been completely removed. 
This version uses the `requests` library directly for all API communication, resulting in a much lighter, faster, and more stable experience. +* **Expanded API Coverage**: Many new features have been added, including methods to: + * Fetch your reading progress. + * Get your read dates for a book. + * Retrieve all your journal entries or entries for a specific book. + * Get a book's AI-generated summary. + * Fetch a user's ID. +* **Modernized Codebase**: The code has been updated with type hints and a more robust project structure. +* **Cookie-Based Authentication**: Authentication is now handled by passing your browser's session cookies, which is a more reliable method than the previous implementation. ## Installation -``` -pip install storygraph-api + +```bash +pip install -r requirements.txt ``` -## Getting Started +## Configuration -The API is divided into two components, `Books Client` and `User Client`. +This wrapper requires authentication for most features. You'll need to provide your StoryGraph session cookies and username. -### Book Details: +1. **Create a `.env` file** in the root of the project. +2. **Find your cookies**: + * Open your web browser and log in to [The StoryGraph](https://app.thestorygraph.com/). + * Open your browser's developer tools (usually by pressing F12). + * Go to the "Application" (in Chrome) or "Storage" (in Firefox) tab. + * Under the "Cookies" section for `app.thestorygraph.com`, find the values for `_storygraph_session` and `remember_user_token`. +3. 
**Add your credentials to the `.env` file**: -```python -# Books Client -# Fetch details of a book using its ID - -from storygraph_api import Book -id = "fbdd6b7c-f512-47f2-aa94-d8bf0d5f5175" -book = Book() -result = book.book_info(id) -print(result) -``` -#### Result: -```json -{ - "title": "Hagakure: The Book of the Samurai", - "authors": [ - "Yamamoto Tsunetomo", - "William Scott Wilson" - ], - "pages": "179", - "first_pub": "1716", - "tags": [ - "nonfiction", - "history", - "philosophy", - "informative", - "reflective", - "slow-paced" - ], - "average_rating": "3.65", - "description": "
Hagakure<\\/em> (\\\"In the Shadow of Leaves\\\") is a manual for the samurai classes consisting of a series of short anecdotes and reflections that give both insight and instruction-in the philosophy and code of behavior that foster the true spirit of Bushido-the Way of the Warrior. It is not a book of philosophy as most would understand the word: it is a collection of thoughts and sayings recorded over a period of seven years, and as such covers a wide variety of subjects, often in no particular sequence.

The work represents an attitude far removed from our modern pragmatism and materialism, and possesses an intuitive rather than rational appeal in its assertion that Bushido is a Way of Dying, and that only a samurai retainer prepared and willing to die at any moment can be totally true to his lord. While Hagakure<\\/em> was for many years a secret text known only to the warrior vassals of the Hizen fief to which the author belonged, it later came to be recognized as a classic exposition of samurai thought and came to influence many subsequent generations, including Yukio Mishima.

This translation offers 300 selections that constitute the core texts of the 1,300 present in the original.
Hagakure<\\/em> was featured prominently in the film Ghost Dog<\\/em>, by Jim Jarmusch.<\\/div>", - "warnings": { - "graphic": [ - "Suicide", - "Violence" - ], - "moderate": [ - "Suicide", - "Suicide attempt", - "War" - ], - "minor": [ - "Gore" - ] - } -} -``` + ```dotenv + _STORYGRAPH_SESSION=your_session_cookie_value + REMEMBER_USER_TOKEN=your_remember_token_value + STORYGRAPH_USERNAME=your_storygraph_username + ``` ------ +## Usage -### Reading Progress +Here's a basic example of how to use the `Book` and `User` clients. ```python -# Fetch your reading progress for a book -# Requires authentication cookies -from storygraph_api import Book -from dotenv import load_dotenv import os +import json +from dotenv import load_dotenv +from storygraph_api import Book, User +# Load environment variables from .env file load_dotenv() -# Get cookies from your .env file -# _STORYGRAPH_SESSION=your_session_cookie -# REMEMBER_USER_TOKEN=your_remember_token -cookies = { - "_storygraph_session": os.getenv('_STORYGRAPH_SESSION'), - "remember_user_token": os.getenv('REMEMBER_USER_TOKEN'), +# --- Authentication --- +username = os.getenv("STORYGRAPH_USERNAME") +session_cookie = os.getenv("_STORYGRAPH_SESSION") +remember_token = os.getenv("REMEMBER_USER_TOKEN") + +auth_cookies = { + "_storygraph_session": session_cookie, + "remember_user_token": remember_token } -book_id = "1c023e31-637b-41d9-ba64-260c3c1b0f3d" -book = Book() -result = book.reading_progress(book_id, cookies) -print(result) -``` +# --- Initialize Clients --- +book_client = Book() +user_client = User() -#### Result: +# --- User Client Examples --- -```json -{ - "progress": "70%" -} -``` +# Get user ID +user_id_json = user_client.get_user_id(username) +user_id = json.loads(user_id_json).get("user_id") +print(f"User ID: {user_id}") ------ +# Get 'Currently Reading' list +currently_reading = user_client.currently_reading(username, auth_cookies) +print(currently_reading) -### User List: +# Get 'To-Read' list +to_read = 
user_client.to_read(username, auth_cookies) +print(to_read) -```python -# User Client -# works only for public profiles -# fetch user's currently reading list +# Get 'Read' list +books_read = user_client.books_read(username, auth_cookies) +print(books_read) -from storygraph_api import User -from dotenv import load_dotenv -load_dotenv() -cookie = os.getenv('COOKIE') # retrieve cookie from .env file -uname = 'sampleuname' #some username -user = User() -result = user.currently_reading(uname,cookie=cookie) -print(result) +# --- Book Client Examples --- -``` +book_id = "1c023e31-637b-41d9-ba64-260c3c1b0f3d" # Example book ID -#### Result: - - ```json - [ - { - "title": "The Murder After the Night Before", - "book_id": "38cb5b56-23f1-48fd-b4b3-a80e07a19775" - }, - { - "title": "The Graces", - "book_id": "653b54b3-a79d-4c2e-ae40-eae281a91315" - } -] +# Search for a book +search_results = book_client.search("Dune Frank Herbert") +print(search_results) - ``` +# Get book info +book_info = book_client.book_info(book_id) +print(book_info) -## Further Information -* Refer to [books_client.py](https://github.com/ym496/storygraph-api/tree/main/storygraph_api/books_client.py) and [users_client.py](https://github.com/ym496/storygraph-api/tree/main/storygraph_api/users_client.py) files to know more functionalities. -* All the user related tasks require the `remember_user_token` cookie. It can be found in the `Application` section of your browser’s developer tools for the StoryGraph website. +# Get your reading progress for a book +progress = book_client.reading_progress(book_id, auth_cookies) +print(progress) -## Contributing -Contributions are welcome! Fork the repository, make your changes, and submit a pull request. +# Get your read dates for a book +read_dates = book_client.get_read_dates(book_id, auth_cookies) +print(read_dates) -For bugs or feature requests, please open an issue on [GitHub](https://github.com/ym496/storygraph-api/issues). 
+# Get your journal entries for a book +journal_entries = book_client.get_journal_entries(book_id, auth_cookies) +print(journal_entries) + +# Get the AI summary for a book +if user_id: + ai_summary = book_client.get_ai_summary(book_id, user_id) + print(ai_summary) +``` -## License +## Disclaimer -This project is licensed under the MIT License. +This is an unofficial wrapper. It is not affiliated with or endorsed by The StoryGraph. Use it at your own risk. The StoryGraph's website structure could change at any time, which might break this wrapper. diff --git a/manual_tests.py b/manual_tests.py deleted file mode 100644 index 94f6dc9..0000000 --- a/manual_tests.py +++ /dev/null @@ -1,39 +0,0 @@ -""" -Basic Manual Testing of Components. -""" -import os -from dotenv import load_dotenv -# from storygraph_api.request.books_request import BooksScraper -# from storygraph_api.parse.books_parser import BooksParser -# from storygraph_api.request.user_request import UserScraper -# from storygraph_api.parse.user_parser import UserParser -# from storygraph_api.users_client import User -load_dotenv() - -id = "a5da6127-beb2-44b9-aba6-f63de432777" -query = "pride and prejudice" -# testing book page info -# print(BooksScraper.main(id)) -# print(BooksParser.book_page(id)) - -id = "e5a59ed0-31f0-46af-849e-cd8e624b68ff" -from storygraph_api import Book -book = Book() -print(book.book_info(id)) -# print(book.search(query)) - -# cookie = os.getenv('COOKIE') -# # print(UserScraper.currently_reading(uname,session_cookie=cookie)) -# # print(UserParser.books_read(uname,cookie=cookie)) -# user = User() -# print(user.books_read(uname,cookie=cookie)) - -# -# from storygraph_api.users_client import User -# from dotenv import load_dotenv -# load_dotenv() -# cookie = os.getenv('COOKIE') -# uname = 'clyrmze' -# user = User() -# result = user.books_read(uname,cookie=cookie) -# print(result) diff --git a/requirements.txt b/requirements.txt index 56e2dbe..6ac3d6a 100644 --- a/requirements.txt +++ 
b/requirements.txt @@ -1,21 +1,9 @@ -attrs==24.2.0 beautifulsoup4==4.12.3 certifi==2024.7.4 charset-normalizer==3.3.2 -exceptiongroup==1.2.2 -h11==0.14.0 idna==3.7 -outcome==1.3.0.post0 -PySocks==1.7.1 python-dotenv==1.1.1 requests==2.32.3 -selenium==4.23.1 -sniffio==1.3.1 -sortedcontainers==2.4.0 soupsieve==2.6 -trio==0.26.2 -trio-websocket==0.11.1 typing_extensions==4.12.2 urllib3==2.2.2 -websocket-client==1.8.0 -wsproto==1.2.0 diff --git a/storygraph_api/books_client.py b/storygraph_api/books_client.py index 5c147ee..f3da804 100644 --- a/storygraph_api/books_client.py +++ b/storygraph_api/books_client.py @@ -1,20 +1,36 @@ from storygraph_api.parse.books_parser import BooksParser from storygraph_api.exception_handler import handle_exceptions import json +from typing import Dict class Book: @handle_exceptions - def book_info(self,book_id): + def book_info(self, book_id: str) -> str: data = BooksParser.book_page(book_id) - return json.dumps(data,indent=4) + return json.dumps(data, indent=4) @handle_exceptions - def reading_progress(self, book_id, cookies): + def reading_progress(self, book_id: str, cookies: Dict[str, str]) -> str: progress = BooksParser.reading_progress(book_id, cookies) data = {"progress": progress} return json.dumps(data, indent=4) @handle_exceptions - def search(self,query): + def get_read_dates(self, book_id: str, cookies: Dict[str, str]) -> str: + data = BooksParser.get_read_dates(book_id, cookies) + return json.dumps(data, indent=4) + + @handle_exceptions + def get_ai_summary(self, book_id: str, user_id: str) -> str: + data = BooksParser.get_ai_summary(book_id, user_id) + return json.dumps(data, indent=4) + + @handle_exceptions + def get_journal_entries(self, book_id: str, cookies: Dict[str, str]) -> str: + data = BooksParser.journal_entries(book_id, cookies) + return json.dumps(data, indent=4) + + @handle_exceptions + def search(self, query: str) -> str: data = BooksParser.search(query) - return json.dumps(data,indent=4) + return 
json.dumps(data, indent=4) diff --git a/storygraph_api/exception_handler.py b/storygraph_api/exception_handler.py index c84f9a8..df90bba 100644 --- a/storygraph_api/exception_handler.py +++ b/storygraph_api/exception_handler.py @@ -2,18 +2,18 @@ import requests from functools import wraps from storygraph_api.exceptions import RequestError, ParsingError, UnexpectedError +from selenium.common.exceptions import WebDriverException def handle_exceptions(func): @wraps(func) def wrapper(*args, **kwargs): try: return func(*args, **kwargs) - except RequestError as e: - return json.dumps({"error": e.message}, indent=4) - except ParsingError as e: + except (RequestError, ParsingError) as e: return json.dumps({"error": e.message}, indent=4) except Exception as e: - raise UnexpectedError(f"Unexpected error: {str(e)}") + unexpected_error = UnexpectedError(f"An unexpected error occurred: {str(e)}") + return json.dumps({"error": unexpected_error.message}, indent=4) return wrapper def request_exception(func): @@ -22,9 +22,9 @@ def wrapper(*args, **kwargs): try: return func(*args, **kwargs) except requests.RequestException as e: - return json.dumps({"error": f"Scraping Error: {str(e)}"}, indent=4) - except Exception as e: - return json.dumps({"error": f"Scraping Error: {str(e)}"}, indent=4) + raise RequestError(f"A network error occurred: {str(e)}") from e + except WebDriverException as e: + raise RequestError(f"A browser automation error occurred: {str(e)}") from e return wrapper def parsing_exception(func): @@ -32,8 +32,6 @@ def parsing_exception(func): def wrapper(*args, **kwargs): try: return func(*args, **kwargs) - except ParsingError as e: - return json.dumps({"error": e.message}, indent=4) - except Exception as e: - return json.dumps({"error": f"Parsing Error: {str(e)}"}, indent=4) + except (AttributeError, IndexError, TypeError, ValueError) as e: + raise ParsingError(f"Failed to parse page content. The website structure may have changed. 
Details: {str(e)}") from e return wrapper diff --git a/storygraph_api/exceptions.py b/storygraph_api/exceptions.py index 3a6f046..ce548bd 100644 --- a/storygraph_api/exceptions.py +++ b/storygraph_api/exceptions.py @@ -1,21 +1,17 @@ class StoryGraphAPIError(Exception): - """Base class for exceptions in StoryGraphAPI""" pass class RequestError(StoryGraphAPIError): - """Exception raised for errors during the request.""" def __init__(self, message="An error occurred during the request."): self.message = message super().__init__(self.message) class ParsingError(StoryGraphAPIError): - """Exception raised for errors during parsing responses.""" def __init__(self, message="An error occurred while parsing the response."): self.message = message super().__init__(self.message) class UnexpectedError(StoryGraphAPIError): - """Exception raised for unexpected errors.""" def __init__(self, message="An unexpected error occurred."): self.message = message diff --git a/storygraph_api/parse/books_parser.py b/storygraph_api/parse/books_parser.py index 34b27c6..8df03bb 100644 --- a/storygraph_api/parse/books_parser.py +++ b/storygraph_api/parse/books_parser.py @@ -3,6 +3,7 @@ from bs4 import BeautifulSoup, Tag, NavigableString import re from typing import Dict, Any, List +from urllib.parse import parse_qs, urlparse class BooksParser: @staticmethod @@ -33,22 +34,32 @@ def book_page(book_id: str) -> Dict[str, Any]: pages_text = p_tag.contents[0] pages = pages_text.strip().split()[0] if isinstance(pages_text, NavigableString) else "N/A" - pub_info_spans = p_tag.find_all('span') - first_pub = pub_info_spans[1].text.split()[-1] if len(pub_info_spans) > 1 else "N/A" + pub_info_span = p_tag.find('span', string=re.compile(r'first pub')) + first_pub = pub_info_span.text.split()[-1] if pub_info_span else "N/A" tag_div = soup.find('div', class_="book-page-tag-section") tags = [tag.text for tag in tag_div.find_all('span')] if isinstance(tag_div, Tag) else [] - script_tags = soup.find_all('script') 
- desc = "" - for s in script_tags: - if 'Description' in s.text: - desc = s.text - break + cover_url = None + cover_div = soup.find('div', class_="book-cover") + if isinstance(cover_div, Tag): + img_tag = cover_div.find('img') + if isinstance(img_tag, Tag): + cover_url = img_tag.get('src') - pattern = re.compile(r"Description
(.*?)
", re.DOTALL) - match = pattern.search(desc) - description = match.group(1).strip() if match else "Description not found." + description = "Description not found." + script_tag = soup.find('script', string=re.compile(r"\$\('\.read-more-btn'\)")) + if isinstance(script_tag, Tag): + script_content = script_tag.string + if script_content: + pattern = re.compile(r"\.html\('(.*)'\)", re.DOTALL) + match = pattern.search(str(script_content)) + if match: + html_str = match.group(1).replace(r'\/', r'/') + desc_soup = BeautifulSoup(html_str, 'html.parser') + desc_div = desc_soup.find('div', class_='trix-content') + if desc_div: + description = desc_div.get_text(separator="\n", strip=True) review_content = BooksScraper.community_reviews(book_id) rev_soup = BeautifulSoup(review_content, 'html.parser') @@ -60,7 +71,8 @@ def book_page(book_id: str) -> Dict[str, Any]: data = { 'title': title, 'authors': authors, 'pages': pages, 'first_pub': first_pub, 'tags': tags, 'average_rating': avg_rating, - 'description': description, 'warnings': warnings + 'description': description, 'warnings': warnings, + 'cover_url': cover_url } return data @@ -70,19 +82,154 @@ def reading_progress(book_id: str, cookies: Dict[str, str]) -> str: content = BooksScraper.book_page_authenticated(book_id, cookies) soup = BeautifulSoup(content, 'html.parser') + status_label = soup.find('button', class_='read-status-label') + if isinstance(status_label, Tag) and status_label.text.strip() == 'read': + return "100%" + progress_bar_div = soup.find('div', class_='progress-bar') - if not isinstance(progress_bar_div, Tag): - raise Exception("Could not find progress bar. 
Ensure book is 'currently-reading'.") + if isinstance(progress_bar_div, Tag): + progress_span = progress_bar_div.find('span') + if isinstance(progress_span, Tag) and progress_span.string: + return progress_span.string.strip() + + inner_div = progress_bar_div.find('div', style=lambda v: 'width: 0%' in v if v else False) + if inner_div is not None: + return "0%" + + to_read_button = soup.find('button', string=re.compile(r'\s*to read\s*')) + if isinstance(to_read_button, Tag): + return "0%" - progress_span = progress_bar_div.find('span') - if isinstance(progress_span, Tag) and progress_span.string: - return progress_span.string.strip() + raise Exception("Could not determine reading status from the page.") - inner_div = progress_bar_div.find('div', style=lambda v: 'width: 0%' in v if v else False) - if inner_div is not None: - return "0%" + @staticmethod + @parsing_exception + def get_read_dates(book_id: str, cookies: Dict[str, str]) -> Dict[str, Any]: + try: + from storygraph_api.parse.user_parser import UserParser + from storygraph_api.request.user_request import UserScraper + + all_entries = [] + page = 1 + while True: + content = UserScraper.all_journal_entries(cookies, page) + entries = UserParser.all_journal_entries(content) + if not entries: + break + all_entries.extend(entries) + page += 1 + + start_date = None + finish_date = None + + for entry in all_entries: + if entry.get('book_id') == book_id: + if entry.get('status') == 'Started reading': + date_str = entry.get('date', '') + if date_str: + try: + from datetime import datetime + parsed_date = datetime.strptime(date_str, '%d %B %Y') + start_date = parsed_date.strftime('%Y-%m-%d') + except: + pass + elif entry.get('status') == 'Finished': + date_str = entry.get('date', '') + if date_str: + try: + from datetime import datetime + parsed_date = datetime.strptime(date_str, '%d %B %Y') + finish_date = parsed_date.strftime('%Y-%m-%d') + except: + pass + + return {'start_date': start_date, 'finish_date': 
finish_date} + + except Exception: + pass + + content = BooksScraper.book_page_authenticated(book_id, cookies) + soup = BeautifulSoup(content, 'html.parser') + + edit_link = soup.find('a', href=re.compile(r'/edit-(read-instance|journal-entry)-from-book')) + if not (isinstance(edit_link, Tag) and edit_link.get('href')): + return {'start_date': None, 'finish_date': None} + + href = edit_link['href'] + if not isinstance(href, str): + raise Exception("Could not find a valid edit link href.") + + parsed_url = urlparse(href) + query_params = parse_qs(parsed_url.query) + + id_val = None + form_content = b'' + id_type = '' + + if 'read_instance_id' in query_params: + id_val = query_params.get('read_instance_id', [None])[0] + id_type = 'read_instance' + if not id_val: + raise Exception("Could not extract read_instance_id from edit link.") + try: + form_content = BooksScraper.get_read_dates_form(book_id, id_val, cookies) + except: + return {'start_date': None, 'finish_date': None} + + elif 'journal_entry_id' in query_params: + id_val = query_params.get('journal_entry_id', [None])[0] + id_type = 'journal_entry' + if not id_val: + raise Exception("Could not extract journal_entry_id from edit link.") + try: + form_content = BooksScraper.get_journal_entry_form(book_id, id_val, cookies) + except: + return {'start_date': None, 'finish_date': None} + + if not form_content: + return {'start_date': None, 'finish_date': None} + + form_soup = BeautifulSoup(form_content, 'html.parser') + + def get_date(date_prefix: str) -> str | None: + day_select = form_soup.find('select', id=f'{id_type}_{date_prefix}day') + month_select = form_soup.find('select', id=f'{id_type}_{date_prefix}month') + year_select = form_soup.find('select', id=f'{id_type}_{date_prefix}year') + + if not (isinstance(day_select, Tag) and isinstance(month_select, Tag) and isinstance(year_select, Tag)): + return None + + day_option = day_select.find('option', selected=True) + month_option = month_select.find('option', 
selected=True) + year_option = year_select.find('option', selected=True) - raise Exception("Could not extract percentage from progress bar.") + if isinstance(day_option, Tag) and isinstance(month_option, Tag) and isinstance(year_option, Tag): + day = day_option.get('value') + month = month_option.get('value') + year = year_option.get('value') + if day and month and year: + return f"{year}-{str(month).zfill(2)}-{str(day).zfill(2)}" + return None + + start_date_prefix = 'started_at_' if id_type == 'journal_entry' else 'start_' + start_date = get_date(start_date_prefix) + finish_date = get_date('finished_at_') if id_type == 'journal_entry' else get_date('') + + return {'start_date': start_date, 'finish_date': finish_date} + + @staticmethod + @parsing_exception + def get_ai_summary(book_id: str, user_id: str) -> Dict[str, str]: + content = BooksScraper.get_ai_summary(book_id, user_id) + soup = BeautifulSoup(content, 'html.parser') + + template = soup.find('template') + if isinstance(template, Tag): + p_tag = template.find('p') + if isinstance(p_tag, Tag) and p_tag.string: + return {'summary': p_tag.string.strip()} + + raise Exception("Could not parse AI summary.") @staticmethod @parsing_exception @@ -141,3 +288,78 @@ def search(query: str) -> List[Dict[str, str]]: search_results.append({'title': title, 'author': author, 'book_id': book_id}) return search_results + + @staticmethod + @parsing_exception + def journal_entries(book_id: str, cookies: Dict[str, str]) -> List[Dict[str, Any]]: + content = BooksScraper.get_journal_page(book_id, cookies) + soup = BeautifulSoup(content, 'html.parser') + + journal_entries: List[Dict[str, Any]] = [] + + entry_panes = soup.find_all('span', class_="journal-entry-panes") + if not entry_panes: + return journal_entries + + for entry in entry_panes[0].find_all(lambda tag: tag.name == 'div' and 'grid-cols-4' in tag.get('class', [])): + date_tag = entry.find('p', class_="font-semibold") + date = date_tag.text.strip().split('\n')[0] if 
date_tag else "N/A" + + progress_percent_tag = entry.find('div', class_="text-teal-500") + progress_percent = int(progress_percent_tag.text.strip().replace('%', '')) if progress_percent_tag else None + + pages_read_tag = entry.find('p', class_="clear-both") + pages_read_this_session = None + total_pages_read = None + total_pages = None + + if pages_read_tag: + pages_text = pages_read_tag.text + session_match = re.search(r'(\d+) pages read', pages_text) + if session_match: + pages_read_this_session = int(session_match.group(1)) + + total_match = re.search(r'\((\d+) pages out of (\d+)\)', pages_text) + if total_match: + total_pages_read = int(total_match.group(1)) + total_pages = int(total_match.group(2)) + + note_tag = entry.find('div', class_="trix-content") + note = note_tag.text.strip() if note_tag else None + + status_tag = entry.find('span', class_=lambda x: x and 'inline-flex' in x) + status = status_tag.text.strip() if status_tag else None + + if status == "Started reading": + if progress_percent is None: + progress_percent = 0 + if pages_read_this_session is None: + pages_read_this_session = 0 + if total_pages_read is None: + total_pages_read = 0 + elif status == "Finished": + if progress_percent is None: + progress_percent = 100 + + journal_entries.append({ + 'date': date, + 'status': status, + 'progress_percent': progress_percent, + 'pages_read_this_session': pages_read_this_session, + 'total_pages_read': total_pages_read, + 'total_pages': total_pages, + 'note': note + }) + + book_total_pages = None + for entry in journal_entries: + if entry.get('total_pages') is not None: + book_total_pages = entry['total_pages'] + break + + if book_total_pages is not None: + for entry in journal_entries: + if entry.get('total_pages') is None: + entry['total_pages'] = book_total_pages + + return journal_entries diff --git a/storygraph_api/parse/user_parser.py b/storygraph_api/parse/user_parser.py index b4f30c4..2ca80f5 100644 --- a/storygraph_api/parse/user_parser.py +++ 
b/storygraph_api/parse/user_parser.py @@ -1,10 +1,24 @@ from storygraph_api.request.user_request import UserScraper from storygraph_api.exception_handler import parsing_exception -from bs4 import BeautifulSoup +from bs4 import BeautifulSoup, Tag +from typing import Dict +import re class UserParser: @staticmethod - @parsing_exception + @parsing_exception + def get_user_id(username: str) -> Dict[str, str]: + content = UserScraper.get_profile_page(username) + soup = BeautifulSoup(content, 'html.parser') + profile_pane = soup.find('div', id='profile-heading-pane') + if isinstance(profile_pane, Tag): + user_id = profile_pane.get('data-user-id') + if user_id and isinstance(user_id, str): + return {'user_id': user_id} + raise Exception(f"Could not find user_id for username '{username}'.") + + @staticmethod + @parsing_exception def parse_html(html): soup = BeautifulSoup(html, 'html.parser') books_list = [] @@ -21,15 +35,96 @@ def parse_html(html): @staticmethod def currently_reading(uname, cookie): - content = UserScraper.currently_reading(uname,cookie) + content = UserScraper.currently_reading(uname, cookie) return UserParser.parse_html(content) @staticmethod def to_read(uname, cookie): - content = UserScraper.to_read(uname,cookie) + content = UserScraper.to_read(uname, cookie) return UserParser.parse_html(content) @staticmethod def books_read(uname, cookie): - content = UserScraper.books_read(uname,cookie) + content = UserScraper.books_read(uname, cookie) return UserParser.parse_html(content) + + @staticmethod + @parsing_exception + def all_journal_entries(html_content): + soup = BeautifulSoup(html_content, 'html.parser') + + journal_entries = [] + + for entry in soup.find_all('div', class_="mb-7"): + try: + book_title_tag = entry.find('p', class_="font-semibold text-sm md:text-base font-semibold").find('a') + book_title = book_title_tag.text.strip() if book_title_tag else "N/A" + + book_id = book_title_tag['href'].split('/')[-1] if book_title_tag else "N/A" + + date_tag 
= entry.find('p', class_="font-semibold text-xs md:text-sm") + date = date_tag.text.strip().split('\n')[0] if date_tag else "N/A" + + progress_percent_tag = entry.find('div', class_="text-teal-500") + progress_percent = int(progress_percent_tag.text.strip().replace('%', '')) if progress_percent_tag else None + + pages_read_tag = entry.find('p', class_=re.compile(r'clear-both.*')) + pages_read_this_session = None + total_pages_read = None + total_pages = None + + if pages_read_tag: + pages_text = pages_read_tag.text + session_match = re.search(r'(\d+) pages read', pages_text) + if session_match: + pages_read_this_session = int(session_match.group(1)) + + total_match = re.search(r'\((\d+) pages out of (\d+)\)', pages_text) + if total_match: + total_pages_read = int(total_match.group(1)) + total_pages = int(total_match.group(2)) + + note_tag = entry.find('div', class_="trix-content") + note = note_tag.text.strip() if note_tag else None + + status_tag = entry.find('span', class_=lambda x: x and 'inline-flex' in x) + status = status_tag.text.strip() if status_tag else None + + if status == "Started reading": + if progress_percent is None: + progress_percent = 0 + if pages_read_this_session is None: + pages_read_this_session = 0 + if total_pages_read is None: + total_pages_read = 0 + elif status == "Finished": + if progress_percent is None: + progress_percent = 100 + + journal_entries.append({ + 'book_title': book_title, + 'book_id': book_id, + 'date': date, + 'status': status, + 'progress_percent': progress_percent, + 'pages_read_this_session': pages_read_this_session, + 'total_pages_read': total_pages_read, + 'total_pages': total_pages, + 'note': note + }) + except Exception: + continue + + book_total_pages = {} + + for entry in journal_entries: + book_id = entry.get('book_id') + if book_id and entry.get('total_pages') is not None: + book_total_pages[book_id] = entry['total_pages'] + + for entry in journal_entries: + book_id = entry.get('book_id') + if book_id and 
entry.get('total_pages') is None and book_id in book_total_pages: + entry['total_pages'] = book_total_pages[book_id] + + return journal_entries diff --git a/storygraph_api/request/books_request.py b/storygraph_api/request/books_request.py index d208559..9cb2433 100644 --- a/storygraph_api/request/books_request.py +++ b/storygraph_api/request/books_request.py @@ -1,49 +1,70 @@ import requests -from storygraph_api.exception_handler import request_exception +from typing import Dict class BooksScraper: @staticmethod - @request_exception - def fetch_url(url): + def fetch_url(url: str, cookies: Dict[str, str] | None = None, params: Dict[str, str] | None = None) -> bytes: headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' } - response = requests.get(url, headers=headers) + response = requests.get(url, cookies=cookies, headers=headers, params=params) response.raise_for_status() return response.content @staticmethod - @request_exception - def fetch_url_authenticated(url, cookies): + def post_url(url: str, cookies: Dict[str, str] | None = None, data: Dict[str, str] | None = None) -> bytes: headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', + 'Content-Type': 'application/x-www-form-urlencoded' } - response = requests.get(url, cookies=cookies, headers=headers) + response = requests.post(url, cookies=cookies, headers=headers, data=data) response.raise_for_status() return response.content @staticmethod - def main(book_id): + def main(book_id: str) -> bytes: url = f"https://app.thestorygraph.com/books/{book_id}" return BooksScraper.fetch_url(url) @staticmethod - def book_page_authenticated(book_id, cookies): + def book_page_authenticated(book_id: str, cookies: 
Dict[str, str]) -> bytes: url = f"https://app.thestorygraph.com/books/{book_id}" - return BooksScraper.fetch_url_authenticated(url, cookies) + return BooksScraper.fetch_url(url, cookies=cookies) @staticmethod - def community_reviews(book_id): + def community_reviews(book_id: str) -> bytes: url = f"https://app.thestorygraph.com/books/{book_id}/community_reviews" return BooksScraper.fetch_url(url) @staticmethod - def content_warnings(book_id): + def content_warnings(book_id: str) -> bytes: url = f"https://app.thestorygraph.com/books/{book_id}/content_warnings" return BooksScraper.fetch_url(url) + + @staticmethod + def get_read_dates_form(book_id: str, read_instance_id: str, cookies: Dict[str, str]) -> bytes: + url = f"https://app.thestorygraph.com/edit-read-instance-from-book?book_id={book_id}&read_instance_id={read_instance_id}" + return BooksScraper.post_url(url, cookies=cookies) + + @staticmethod + def get_journal_entry_form(book_id: str, journal_entry_id: str, cookies: Dict[str, str]) -> bytes: + url = f"https://app.thestorygraph.com/edit-journal-entry-from-book?book_id={book_id}&journal_entry_id={journal_entry_id}" + return BooksScraper.post_url(url, cookies=cookies) + + @staticmethod + def get_ai_summary(book_id: str, user_id: str) -> bytes: + url = f"https://app.thestorygraph.com/personalized-preview.turbo_stream" + params = {'book_id': book_id, 'personalized': 'false', 'user_id': user_id} + return BooksScraper.fetch_url(url, params=params) + + @staticmethod + def get_journal_page(book_id: str, cookies: Dict[str, str]) -> bytes: + url = "https://app.thestorygraph.com/journal" + params = {'book_id': book_id} + return BooksScraper.fetch_url(url, cookies=cookies, params=params) @staticmethod - def search(query): - formatted_query = query.replace(' ', '%20') - url = f"https://app.thestorygraph.com/browse?search_term={formatted_query}" - return BooksScraper.fetch_url(url) \ No newline at end of file + def search(query: str) -> bytes: + url = 
"https://app.thestorygraph.com/browse" + params = {'search_term': query} + return BooksScraper.fetch_url(url, params=params) \ No newline at end of file diff --git a/storygraph_api/request/user_request.py b/storygraph_api/request/user_request.py index c1b24e3..00bfdb5 100644 --- a/storygraph_api/request/user_request.py +++ b/storygraph_api/request/user_request.py @@ -1,46 +1,42 @@ -from storygraph_api.exception_handler import request_exception -from selenium import webdriver -from selenium.webdriver.chrome.options import Options -import time +import requests +from typing import Dict class UserScraper: @staticmethod - @request_exception - def fetch_url(url,cookie): - options = Options() - options.add_argument("--headless") - driver = webdriver.Chrome(options=options) - driver.get(url) - if cookie: - driver.add_cookie({ - 'name': 'remember_user_token', - 'value': cookie, - }) - driver.refresh() - SCROLL_PAUSE_TIME = 2 - last_height = driver.execute_script("return document.body.scrollHeight") - while True: - driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") - time.sleep(SCROLL_PAUSE_TIME) - new_height = driver.execute_script("return document.body.scrollHeight") - if new_height == last_height: - break - last_height = new_height - html_content = driver.page_source - driver.quit() - return html_content + def get_profile_page(username: str) -> bytes: + url = f"https://app.thestorygraph.com/profile/{username}" + headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' + } + response = requests.get(url, headers=headers) + response.raise_for_status() + return response.content @staticmethod - def currently_reading(uname, cookie): - url = f"https://app.thestorygraph.com/currently-reading/{uname}" - return UserScraper.fetch_url(url,cookie) + def fetch_paginated_url(url: str, cookies: dict) -> bytes: + headers = { + 'User-agent': 'mozilla/5.0 (windows nt 10.0; win64; 
x64) applewebkit/537.36 (khtml, like gecko) chrome/91.0.4472.124 safari/537.36' + } + response = requests.get(url, cookies=cookies, headers=headers) + response.raise_for_status() + return response.content @staticmethod - def to_read(uname, cookie): - url = f"https://app.thestorygraph.com/to-read/{uname}" - return UserScraper.fetch_url(url,cookie) + def currently_reading(uname: str, cookies: Dict[str, str], page: int) -> bytes: + url = f"https://app.thestorygraph.com/currently-reading/{uname}?page={page}" + return UserScraper.fetch_paginated_url(url, cookies) @staticmethod - def books_read(uname, cookie): - url = f"https://app.thestorygraph.com/books-read/{uname}" - return UserScraper.fetch_url(url,cookie) + def to_read(uname: str, cookies: Dict[str, str], page: int) -> bytes: + url = f"https://app.thestorygraph.com/to-read/{uname}?page={page}" + return UserScraper.fetch_paginated_url(url, cookies) + + @staticmethod + def books_read(uname: str, cookies: Dict[str, str], page: int) -> bytes: + url = f"https://app.thestorygraph.com/books-read/{uname}?page={page}" + return UserScraper.fetch_paginated_url(url, cookies) + + @staticmethod + def all_journal_entries(cookies: Dict[str, str], page: int) -> bytes: + url = f"https://app.thestorygraph.com/journal?page={page}" + return UserScraper.fetch_paginated_url(url, cookies) diff --git a/storygraph_api/users_client.py b/storygraph_api/users_client.py index 0b4c41e..759c46f 100644 --- a/storygraph_api/users_client.py +++ b/storygraph_api/users_client.py @@ -1,19 +1,50 @@ from storygraph_api.parse.user_parser import UserParser +from storygraph_api.request.user_request import UserScraper from storygraph_api.exception_handler import handle_exceptions import json class User: @handle_exceptions - def currently_reading(self,uname,cookie): - data = UserParser.currently_reading(uname,cookie) - return json.dumps(data,indent=4) + def get_user_id(self, username: str) -> str: + data = UserParser.get_user_id(username) + return 
json.dumps(data, indent=4) + + def _fetch_paginated_books(self, fetch_function, uname, cookies): + all_books = [] + page = 1 + while True: + content = fetch_function(uname, cookies, page) + books = UserParser.parse_html(content) + if not books: + break + all_books.extend(books) + page += 1 + return all_books + + @handle_exceptions + def currently_reading(self, uname, cookies): + data = self._fetch_paginated_books(UserScraper.currently_reading, uname, cookies) + return json.dumps(data, indent=4) + + @handle_exceptions + def to_read(self, uname, cookies): + data = self._fetch_paginated_books(UserScraper.to_read, uname, cookies) + return json.dumps(data, indent=4) @handle_exceptions - def to_read(self,uname,cookie): - data = UserParser.to_read(uname,cookie) - return json.dumps(data,indent=4) + def books_read(self, uname, cookies): + data = self._fetch_paginated_books(UserScraper.books_read, uname, cookies) + return json.dumps(data, indent=4) @handle_exceptions - def books_read(self,uname,cookie): - data = UserParser.books_read(uname,cookie) - return json.dumps(data,indent=4) + def get_all_journal_entries(self, cookies): + all_entries = [] + page = 1 + while True: + content = UserScraper.all_journal_entries(cookies, page) + entries = UserParser.all_journal_entries(content) + if not entries: + break + all_entries.extend(entries) + page += 1 + return json.dumps(all_entries, indent=4) \ No newline at end of file diff --git a/test.py b/test.py new file mode 100644 index 0000000..2f3bf78 --- /dev/null +++ b/test.py @@ -0,0 +1,169 @@ +import os +import json +from dotenv import load_dotenv + +from storygraph_api import Book, User + +def run_tests(): + print("🚀 Starting StoryGraph API tests...") + + print("\n--- 1. 
Loading Environment & Initializing Clients ---") + load_dotenv() + + username = os.getenv("STORYGRAPH_USERNAME") + session_cookie = os.getenv("_STORYGRAPH_SESSION") + remember_token = os.getenv("REMEMBER_USER_TOKEN") + + if not all([username, session_cookie, remember_token]): + print("🛑 Error: Missing one or more required environment variables.") + print("Please ensure _STORYGRAPH_SESSION, REMEMBER_USER_TOKEN, and STORYGRAPH_USERNAME are in your .env file.") + else: + auth_cookies = { + "_storygraph_session": session_cookie, + "remember_user_token": remember_token + } + + book_client = Book() + user_client = User() + print("✅ Setup complete.") + + print("\n--- 2. Testing User Client ---") + + user_id = None + try: + print("\nFetching User ID for:", username) + user_id_json = user_client.get_user_id(username) + user_id_data = json.loads(user_id_json) + if "user_id" in user_id_data: + user_id = user_id_data["user_id"] + print(f"✅ Success! User ID found: {user_id}") + else: + print(f"⚠️ Could not extract user_id from response: {user_id_json}") + except Exception as e: + print(f"🛑 Error fetching user ID: {e}") + + try: + print("\nFetching 'Currently Reading' list...") + currently_reading = user_client.currently_reading(username, auth_cookies) + print("✅ Success! Result:") + print(currently_reading) + except Exception as e: + print(f"🛑 Error fetching 'Currently Reading' list: {e}") + + try: + print("\nFetching 'To-Read' list...") + to_read = user_client.to_read(username, auth_cookies) + print("✅ Success! Result:") + print(to_read) + except Exception as e: + print(f"🛑 Error fetching 'To-Read' list: {e}") + + try: + print("\nFetching 'Read' list...") + books_read = user_client.books_read(username, auth_cookies) + print("✅ Success! 
Result:") + print(books_read) + except Exception as e: + print(f"🛑 Error fetching 'Read' list: {e}") + + try: + print("\nFetching all journal entries...") + all_journal_entries = user_client.get_all_journal_entries(auth_cookies) + all_journal_data = json.loads(all_journal_entries) + if isinstance(all_journal_data, list): + print(f"✅ All Journal Entries: {len(all_journal_data)} entries found.") + print(all_journal_entries) + else: + print(f"🛑 Error: All journal entries not returned as a list.") + except Exception as e: + print(f"🛑 Error fetching all journal entries: {e}") + + print("\n--- 3. Testing Book Client ---") + + finished_book_id = "1c023e31-637b-41d9-ba64-260c3c1b0f3d" + reading_book_id = "87ca0994-06fb-4360-a0bf-660918a7fbc4" + unread_book_id = "c89e808f-39db-49f0-98af-6028d98097f9" + search_query = "Dune Frank Herbert" + + try: + print(f"\nSearching for '{search_query}'...") + search_results = book_client.search(search_query) + print("✅ Success! Result:") + print(search_results) + except Exception as e: + print(f"🛑 Error searching for book: {e}") + + try: + print(f"\nFetching info for book ID: {finished_book_id}") + book_info = book_client.book_info(finished_book_id) + print("✅ Success! 
Result:") + print(book_info) + info = json.loads(book_info) + cover_url = info.get('cover_url') + if cover_url and isinstance(cover_url, str) and cover_url.startswith("https://cdn.thestorygraph.com/"): + print(f"✅ cover_url present and valid: {cover_url}") + else: + print(f"🛑 cover_url missing or invalid: {cover_url}") + except Exception as e: + print(f"🛑 Error fetching book info: {e}") + + print("\nFetching reading progress for all 3 test cases...") + try: + progress_finished = book_client.reading_progress(finished_book_id, auth_cookies) + print(f"✅ Finished Book Progress: {json.loads(progress_finished).get('progress')}") + + progress_reading = book_client.reading_progress(reading_book_id, auth_cookies) + print(f"✅ Currently Reading Book Progress: {json.loads(progress_reading).get('progress')}") + + progress_unread = book_client.reading_progress(unread_book_id, auth_cookies) + print(f"✅ Unread Book Progress: {json.loads(progress_unread).get('progress')}") + except Exception as e: + print(f"🛑 Error fetching reading progress: {e}") + + print("\nFetching read dates for all 3 test cases...") + try: + dates_finished = book_client.get_read_dates(finished_book_id, auth_cookies) + print(f"✅ Finished Book Dates: {dates_finished}") + + dates_reading = book_client.get_read_dates(reading_book_id, auth_cookies) + print(f"✅ Currently Reading Book Dates: {dates_reading}") + + dates_unread = book_client.get_read_dates(unread_book_id, auth_cookies) + print(f"✅ Unread Book Dates: {dates_unread}") + except Exception as e: + print(f"🛑 Error fetching read dates: {e}") + + print("\nFetching journal entries for a finished book and an unread book...") + try: + journal_entries_finished = book_client.get_journal_entries(finished_book_id, auth_cookies) + journal_data = json.loads(journal_entries_finished) + if isinstance(journal_data, list): + print(f"✅ Finished Book Journal Entries: {len(journal_data)} entries found.") + print(journal_entries_finished) + else: + 
print(f"🛑 Error: Journal entries for finished book not returned as a list.") + + journal_entries_unread = book_client.get_journal_entries(unread_book_id, auth_cookies) + journal_data_unread = json.loads(journal_entries_unread) + if isinstance(journal_data_unread, list) and len(journal_data_unread) == 0: + print(f"✅ Unread Book Journal Entries: Correctly returned an empty list.") + else: + print(f"🛑 Error: Journal entries for unread book did not return an empty list.") + except Exception as e: + print(f"🛑 Error fetching journal entries: {e}") + + if user_id: + try: + print(f"\nFetching AI summary for book ID: {finished_book_id}") + ai_summary = book_client.get_ai_summary(finished_book_id, user_id) + print("✅ Success! Result:") + print(ai_summary) + except Exception as e: + print(f"🛑 Error fetching AI summary: {e}") + else: + print("\n⚠️ Skipping AI Summary test because user_id could not be retrieved.") + + print("\n🎉 All tests complete!") + +if __name__ == "__main__": + run_tests() From 9a23136528667c00083da3d93c7fe816fcb0b3e9 Mon Sep 17 00:00:00 2001 From: Nicholas Thompson Date: Thu, 31 Jul 2025 15:43:18 -0700 Subject: [PATCH 5/6] ensure --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 537ab1f..8fc00a9 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,5 @@ build/ dist/ *.egg-info/ .eggs/ + +**/.env From 921a9e68450056bbef1d359db262f55c53566813 Mon Sep 17 00:00:00 2001 From: Nicholas Thompson Date: Thu, 31 Jul 2025 15:43:45 -0700 Subject: [PATCH 6/6] ensure 2.0 --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 8fc00a9..6b3375d 100644 --- a/.gitignore +++ b/.gitignore @@ -15,3 +15,5 @@ dist/ .eggs/ **/.env +repomix-output.xml +**/repomix-output.xml \ No newline at end of file