diff --git a/.gitignore b/.gitignore index 537ab1f..6b3375d 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,7 @@ build/ dist/ *.egg-info/ .eggs/ + +**/.env +repomix-output.xml +**/repomix-output.xml \ No newline at end of file diff --git a/.vscode/ltex.dictionary.en-US.txt b/.vscode/ltex.dictionary.en-US.txt new file mode 100644 index 0000000..f2dba1c --- /dev/null +++ b/.vscode/ltex.dictionary.en-US.txt @@ -0,0 +1,2 @@ +StoryGraph +storygraph-api diff --git a/README.md b/README.md index 75f6adf..f6c555a 100644 --- a/README.md +++ b/README.md @@ -1,116 +1,121 @@ -# Storygraph API -A python package to interact with and fetch data from the [StoryGraph](https://app.thestorygraph.com/) website. +# Unofficial StoryGraph API for Python -## Features -- **Book Details**: Fetch detailed information about a book using its unique ID. -- **Search**: Perform a book search on StoryGraph and retrieve the results. -- **Fetch User lists**: - - currently reading - - planning to read - - books read +An unofficial Python wrapper for The StoryGraph API, forked from [ym496/storygraph-api](https://github.com/ym496/storygraph-api). + +This fork has been significantly refactored and enhanced to be more efficient, reliable, and feature-rich. + +## Key Enhancements in This Fork + +* **No More Selenium**: The original dependency on Selenium and a headless browser has been completely removed. This version uses the `requests` library directly for all API communication, resulting in a much lighter, faster, and more stable experience. +* **Expanded API Coverage**: Many new features have been added, including methods to: + * Fetch your reading progress. + * Get your read dates for a book. + * Retrieve all your journal entries or entries for a specific book. + * Get a book's AI-generated summary. + * Fetch a user's ID. +* **Modernized Codebase**: The code has been updated with type hints and a more robust project structure. 
+* **Cookie-Based Authentication**: Authentication is now handled by passing your browser's session cookies, which is a more reliable method than the previous implementation. ## Installation -``` -pip install storygraph-api + +```bash +pip install -r requirements.txt ``` -## Getting Started +## Configuration -The API is divided into two components, `Books Client` and `User Client`. +This wrapper requires authentication for most features. You'll need to provide your StoryGraph session cookies and username. -### Book Details: +1. **Create a `.env` file** in the root of the project. +2. **Find your cookies**: + * Open your web browser and log in to [The StoryGraph](https://app.thestorygraph.com/). + * Open your browser's developer tools (usually by pressing F12). + * Go to the "Application" (in Chrome) or "Storage" (in Firefox) tab. + * Under the "Cookies" section for `app.thestorygraph.com`, find the values for `_storygraph_session` and `remember_user_token`. +3. **Add your credentials to the `.env` file**: -```python -# Books Client -# Fetch details of a book using its ID - -from storygraph_api import Book -id = "fbdd6b7c-f512-47f2-aa94-d8bf0d5f5175" -book = Book() -result = book.book_info(id) -print(result) -``` -#### Result: -```json -{ - "title": "Hagakure: The Book of the Samurai", - "authors": [ - "Yamamoto Tsunetomo", - "William Scott Wilson" - ], - "pages": "179", - "first_pub": "1716", - "tags": [ - "nonfiction", - "history", - "philosophy", - "informative", - "reflective", - "slow-paced" - ], - "average_rating": "3.65", - "description": "
Hagakure<\\/em> (\\\"In the Shadow of Leaves\\\") is a manual for the samurai classes consisting of a series of short anecdotes and reflections that give both insight and instruction-in the philosophy and code of behavior that foster the true spirit of Bushido-the Way of the Warrior. It is not a book of philosophy as most would understand the word: it is a collection of thoughts and sayings recorded over a period of seven years, and as such covers a wide variety of subjects, often in no particular sequence.

The work represents an attitude far removed from our modern pragmatism and materialism, and possesses an intuitive rather than rational appeal in its assertion that Bushido is a Way of Dying, and that only a samurai retainer prepared and willing to die at any moment can be totally true to his lord. While Hagakure<\\/em> was for many years a secret text known only to the warrior vassals of the Hizen fief to which the author belonged, it later came to be recognized as a classic exposition of samurai thought and came to influence many subsequent generations, including Yukio Mishima.

This translation offers 300 selections that constitute the core texts of the 1,300 present in the original.
Hagakure<\\/em> was featured prominently in the film Ghost Dog<\\/em>, by Jim Jarmusch.<\\/div>", - "warnings": { - "graphic": [ - "Suicide", - "Violence" - ], - "moderate": [ - "Suicide", - "Suicide attempt", - "War" - ], - "minor": [ - "Gore" - ] - } -} -``` + ```dotenv + _STORYGRAPH_SESSION=your_session_cookie_value + REMEMBER_USER_TOKEN=your_remember_token_value + STORYGRAPH_USERNAME=your_storygraph_username + ``` +## Usage -### User List: +Here's a basic example of how to use the `Book` and `User` clients. ```python -# User Client -# works only for public profiles -# fetch user's currently reading list - -from storygraph_api import User +import os +import json from dotenv import load_dotenv +from storygraph_api import Book, User + +# Load environment variables from .env file load_dotenv() -cookie = os.getenv('COOKIE') # retrieve cookie from .env file -uname = 'sampleuname' #some username -user = User() -result = user.currently_reading(uname,cookie=cookie) -print(result) -``` +# --- Authentication --- +username = os.getenv("STORYGRAPH_USERNAME") +session_cookie = os.getenv("_STORYGRAPH_SESSION") +remember_token = os.getenv("REMEMBER_USER_TOKEN") -#### Result: - - ```json - [ - { - "title": "The Murder After the Night Before", - "book_id": "38cb5b56-23f1-48fd-b4b3-a80e07a19775" - }, - { - "title": "The Graces", - "book_id": "653b54b3-a79d-4c2e-ae40-eae281a91315" - } -] +auth_cookies = { + "_storygraph_session": session_cookie, + "remember_user_token": remember_token +} + +# --- Initialize Clients --- +book_client = Book() +user_client = User() + +# --- User Client Examples --- + +# Get user ID +user_id_json = user_client.get_user_id(username) +user_id = json.loads(user_id_json).get("user_id") +print(f"User ID: {user_id}") + +# Get 'Currently Reading' list +currently_reading = user_client.currently_reading(username, auth_cookies) +print(currently_reading) + +# Get 'To-Read' list +to_read = user_client.to_read(username, auth_cookies) +print(to_read) - ``` +# Get 
'Read' list +books_read = user_client.books_read(username, auth_cookies) +print(books_read) -## Further Information -* Refer to [books_client.py](https://github.com/ym496/storygraph-api/tree/main/storygraph_api/books_client.py) and [users_client.py](https://github.com/ym496/storygraph-api/tree/main/storygraph_api/users_client.py) files to know more functionalities. -* All the user related tasks require the `remember_user_token` cookie. It can be found in the `Application` section of your browser’s developer tools for the StoryGraph website. +# --- Book Client Examples --- -## Contributing -Contributions are welcome! Fork the repository, make your changes, and submit a pull request. +book_id = "1c023e31-637b-41d9-ba64-260c3c1b0f3d" # Example book ID -For bugs or feature requests, please open an issue on [GitHub](https://github.com/ym496/storygraph-api/issues). +# Search for a book +search_results = book_client.search("Dune Frank Herbert") +print(search_results) + +# Get book info +book_info = book_client.book_info(book_id) +print(book_info) + +# Get your reading progress for a book +progress = book_client.reading_progress(book_id, auth_cookies) +print(progress) + +# Get your read dates for a book +read_dates = book_client.get_read_dates(book_id, auth_cookies) +print(read_dates) + +# Get your journal entries for a book +journal_entries = book_client.get_journal_entries(book_id, auth_cookies) +print(journal_entries) + +# Get the AI summary for a book +if user_id: + ai_summary = book_client.get_ai_summary(book_id, user_id) + print(ai_summary) +``` -## License +## Disclaimer -This project is licensed under the MIT License. +This is an unofficial wrapper. It is not affiliated with or endorsed by The StoryGraph. Use it at your own risk. The StoryGraph's website structure could change at any time, which might break this wrapper. 
diff --git a/manual_tests.py b/manual_tests.py deleted file mode 100644 index 94f6dc9..0000000 --- a/manual_tests.py +++ /dev/null @@ -1,39 +0,0 @@ -""" -Basic Manual Testing of Components. -""" -import os -from dotenv import load_dotenv -# from storygraph_api.request.books_request import BooksScraper -# from storygraph_api.parse.books_parser import BooksParser -# from storygraph_api.request.user_request import UserScraper -# from storygraph_api.parse.user_parser import UserParser -# from storygraph_api.users_client import User -load_dotenv() - -id = "a5da6127-beb2-44b9-aba6-f63de432777" -query = "pride and prejudice" -# testing book page info -# print(BooksScraper.main(id)) -# print(BooksParser.book_page(id)) - -id = "e5a59ed0-31f0-46af-849e-cd8e624b68ff" -from storygraph_api import Book -book = Book() -print(book.book_info(id)) -# print(book.search(query)) - -# cookie = os.getenv('COOKIE') -# # print(UserScraper.currently_reading(uname,session_cookie=cookie)) -# # print(UserParser.books_read(uname,cookie=cookie)) -# user = User() -# print(user.books_read(uname,cookie=cookie)) - -# -# from storygraph_api.users_client import User -# from dotenv import load_dotenv -# load_dotenv() -# cookie = os.getenv('COOKIE') -# uname = 'clyrmze' -# user = User() -# result = user.books_read(uname,cookie=cookie) -# print(result) diff --git a/requirements.txt b/requirements.txt index 8dea80e..6ac3d6a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,20 +1,9 @@ -attrs==24.2.0 beautifulsoup4==4.12.3 certifi==2024.7.4 charset-normalizer==3.3.2 -exceptiongroup==1.2.2 -h11==0.14.0 idna==3.7 -outcome==1.3.0.post0 -PySocks==1.7.1 +python-dotenv==1.1.1 requests==2.32.3 -selenium==4.23.1 -sniffio==1.3.1 -sortedcontainers==2.4.0 soupsieve==2.6 -trio==0.26.2 -trio-websocket==0.11.1 typing_extensions==4.12.2 urllib3==2.2.2 -websocket-client==1.8.0 -wsproto==1.2.0 diff --git a/storygraph_api/books_client.py b/storygraph_api/books_client.py index 01b5a36..f3da804 100644 --- 
a/storygraph_api/books_client.py +++ b/storygraph_api/books_client.py @@ -1,14 +1,36 @@ from storygraph_api.parse.books_parser import BooksParser from storygraph_api.exception_handler import handle_exceptions import json +from typing import Dict class Book: @handle_exceptions - def book_info(self,book_id): + def book_info(self, book_id: str) -> str: data = BooksParser.book_page(book_id) - return json.dumps(data,indent=4) + return json.dumps(data, indent=4) @handle_exceptions - def search(self,query): + def reading_progress(self, book_id: str, cookies: Dict[str, str]) -> str: + progress = BooksParser.reading_progress(book_id, cookies) + data = {"progress": progress} + return json.dumps(data, indent=4) + + @handle_exceptions + def get_read_dates(self, book_id: str, cookies: Dict[str, str]) -> str: + data = BooksParser.get_read_dates(book_id, cookies) + return json.dumps(data, indent=4) + + @handle_exceptions + def get_ai_summary(self, book_id: str, user_id: str) -> str: + data = BooksParser.get_ai_summary(book_id, user_id) + return json.dumps(data, indent=4) + + @handle_exceptions + def get_journal_entries(self, book_id: str, cookies: Dict[str, str]) -> str: + data = BooksParser.journal_entries(book_id, cookies) + return json.dumps(data, indent=4) + + @handle_exceptions + def search(self, query: str) -> str: data = BooksParser.search(query) - return json.dumps(data,indent=4) + return json.dumps(data, indent=4) diff --git a/storygraph_api/exception_handler.py b/storygraph_api/exception_handler.py index c84f9a8..df90bba 100644 --- a/storygraph_api/exception_handler.py +++ b/storygraph_api/exception_handler.py @@ -2,18 +2,18 @@ import requests from functools import wraps from storygraph_api.exceptions import RequestError, ParsingError, UnexpectedError +from selenium.common.exceptions import WebDriverException def handle_exceptions(func): @wraps(func) def wrapper(*args, **kwargs): try: return func(*args, **kwargs) - except RequestError as e: - return json.dumps({"error": 
e.message}, indent=4) - except ParsingError as e: + except (RequestError, ParsingError) as e: return json.dumps({"error": e.message}, indent=4) except Exception as e: - raise UnexpectedError(f"Unexpected error: {str(e)}") + unexpected_error = UnexpectedError(f"An unexpected error occurred: {str(e)}") + return json.dumps({"error": unexpected_error.message}, indent=4) return wrapper def request_exception(func): @@ -22,9 +22,9 @@ def wrapper(*args, **kwargs): try: return func(*args, **kwargs) except requests.RequestException as e: - return json.dumps({"error": f"Scraping Error: {str(e)}"}, indent=4) - except Exception as e: - return json.dumps({"error": f"Scraping Error: {str(e)}"}, indent=4) + raise RequestError(f"A network error occurred: {str(e)}") from e + except WebDriverException as e: + raise RequestError(f"A browser automation error occurred: {str(e)}") from e return wrapper def parsing_exception(func): @@ -32,8 +32,6 @@ def parsing_exception(func): def wrapper(*args, **kwargs): try: return func(*args, **kwargs) - except ParsingError as e: - return json.dumps({"error": e.message}, indent=4) - except Exception as e: - return json.dumps({"error": f"Parsing Error: {str(e)}"}, indent=4) + except (AttributeError, IndexError, TypeError, ValueError) as e: + raise ParsingError(f"Failed to parse page content. The website structure may have changed. 
Details: {str(e)}") from e return wrapper diff --git a/storygraph_api/exceptions.py b/storygraph_api/exceptions.py index 3a6f046..ce548bd 100644 --- a/storygraph_api/exceptions.py +++ b/storygraph_api/exceptions.py @@ -1,21 +1,17 @@ class StoryGraphAPIError(Exception): - """Base class for exceptions in StoryGraphAPI""" pass class RequestError(StoryGraphAPIError): - """Exception raised for errors during the request.""" def __init__(self, message="An error occurred during the request."): self.message = message super().__init__(self.message) class ParsingError(StoryGraphAPIError): - """Exception raised for errors during parsing responses.""" def __init__(self, message="An error occurred while parsing the response."): self.message = message super().__init__(self.message) class UnexpectedError(StoryGraphAPIError): - """Exception raised for unexpected errors.""" def __init__(self, message="An unexpected error occurred."): self.message = message diff --git a/storygraph_api/parse/books_parser.py b/storygraph_api/parse/books_parser.py index 20014eb..8df03bb 100644 --- a/storygraph_api/parse/books_parser.py +++ b/storygraph_api/parse/books_parser.py @@ -1,95 +1,365 @@ from storygraph_api.request.books_request import BooksScraper from storygraph_api.exception_handler import parsing_exception -from bs4 import BeautifulSoup +from bs4 import BeautifulSoup, Tag, NavigableString import re +from typing import Dict, Any, List +from urllib.parse import parse_qs, urlparse class BooksParser: @staticmethod @parsing_exception - def book_page(book_id): + def book_page(book_id: str) -> Dict[str, Any]: content = BooksScraper.main(book_id) soup = BeautifulSoup(content, 'html.parser') - h3_tag = soup.find('h3',class_="font-serif font-bold text-2xl md:w-11/12") - title = h3_tag.contents[0].strip() + + h3_tag = soup.find('h3', class_="font-serif font-bold text-2xl md:w-11/12") + if not isinstance(h3_tag, Tag): + raise Exception("Could not find the main title header.") + + title = "" + if 
h3_tag.contents and isinstance(h3_tag.contents[0], NavigableString): + title = h3_tag.contents[0].strip() + authors = [] for a in h3_tag.find_all('a'): - if a["href"].startswith("/authors"): - authors.append(a.text) - p_tag = soup.find('p',class_="text-sm font-light text-darkestGrey dark:text-grey mt-1") - pages = p_tag.contents[0].strip().split()[0] - first_pub = p_tag.contents[1].find_all('span')[1].text.split()[2] - tags = [] - tag_div = soup.find('div',class_="book-page-tag-section").find_all('span') - for tag in tag_div: - tags.append(tag.text) - desc = soup.find_all('script')[5].text - pattern = re.compile(r"Description<\/h4>
(.*?)<\/div>", re.DOTALL) - match = pattern.search(desc) - description = match.group(1).strip() + if isinstance(a, Tag): + href = a.get("href") + if isinstance(href, str) and href.startswith("/authors"): + authors.append(a.text) + + p_tag = soup.find('p', class_="text-sm font-light text-darkestGrey dark:text-grey mt-1") + if not isinstance(p_tag, Tag) or not p_tag.contents: + raise Exception("Could not find book metadata paragraph.") + + pages_text = p_tag.contents[0] + pages = pages_text.strip().split()[0] if isinstance(pages_text, NavigableString) else "N/A" + + pub_info_span = p_tag.find('span', string=re.compile(r'first pub')) + first_pub = pub_info_span.text.split()[-1] if pub_info_span else "N/A" + + tag_div = soup.find('div', class_="book-page-tag-section") + tags = [tag.text for tag in tag_div.find_all('span')] if isinstance(tag_div, Tag) else [] + + cover_url = None + cover_div = soup.find('div', class_="book-cover") + if isinstance(cover_div, Tag): + img_tag = cover_div.find('img') + if isinstance(img_tag, Tag): + cover_url = img_tag.get('src') + + description = "Description not found." 
+ script_tag = soup.find('script', string=re.compile(r"\$\('\.read-more-btn'\)")) + if isinstance(script_tag, Tag): + script_content = script_tag.string + if script_content: + pattern = re.compile(r"\.html\('(.*)'\)", re.DOTALL) + match = pattern.search(str(script_content)) + if match: + html_str = match.group(1).replace(r'\/', r'/') + desc_soup = BeautifulSoup(html_str, 'html.parser') + desc_div = desc_soup.find('div', class_='trix-content') + if desc_div: + description = desc_div.get_text(separator="\n", strip=True) + review_content = BooksScraper.community_reviews(book_id) - rev_soup = BeautifulSoup(review_content,'html.parser') - avg_rating = rev_soup.find('span',class_="average-star-rating").text.strip() + rev_soup = BeautifulSoup(review_content, 'html.parser') + avg_rating_span = rev_soup.find('span', class_="average-star-rating") + avg_rating = avg_rating_span.text.strip() if avg_rating_span else "N/A" + warnings = BooksParser.content_warnings(book_id) + data = { - 'title':title, - 'authors': authors, - 'pages': pages, - 'first_pub': first_pub, - 'tags': tags, - 'average_rating': avg_rating, - 'description':description, - 'warnings': warnings - } + 'title': title, 'authors': authors, 'pages': pages, + 'first_pub': first_pub, 'tags': tags, 'average_rating': avg_rating, + 'description': description, 'warnings': warnings, + 'cover_url': cover_url + } return data @staticmethod @parsing_exception - def content_warnings(book_id): + def reading_progress(book_id: str, cookies: Dict[str, str]) -> str: + content = BooksScraper.book_page_authenticated(book_id, cookies) + soup = BeautifulSoup(content, 'html.parser') + + status_label = soup.find('button', class_='read-status-label') + if isinstance(status_label, Tag) and status_label.text.strip() == 'read': + return "100%" + + progress_bar_div = soup.find('div', class_='progress-bar') + if isinstance(progress_bar_div, Tag): + progress_span = progress_bar_div.find('span') + if isinstance(progress_span, Tag) and 
progress_span.string: + return progress_span.string.strip() + + inner_div = progress_bar_div.find('div', style=lambda v: 'width: 0%' in v if v else False) + if inner_div is not None: + return "0%" + + to_read_button = soup.find('button', string=re.compile(r'\s*to read\s*')) + if isinstance(to_read_button, Tag): + return "0%" + + raise Exception("Could not determine reading status from the page.") + + @staticmethod + @parsing_exception + def get_read_dates(book_id: str, cookies: Dict[str, str]) -> Dict[str, Any]: + try: + from storygraph_api.parse.user_parser import UserParser + from storygraph_api.request.user_request import UserScraper + + all_entries = [] + page = 1 + while True: + content = UserScraper.all_journal_entries(cookies, page) + entries = UserParser.all_journal_entries(content) + if not entries: + break + all_entries.extend(entries) + page += 1 + + start_date = None + finish_date = None + + for entry in all_entries: + if entry.get('book_id') == book_id: + if entry.get('status') == 'Started reading': + date_str = entry.get('date', '') + if date_str: + try: + from datetime import datetime + parsed_date = datetime.strptime(date_str, '%d %B %Y') + start_date = parsed_date.strftime('%Y-%m-%d') + except: + pass + elif entry.get('status') == 'Finished': + date_str = entry.get('date', '') + if date_str: + try: + from datetime import datetime + parsed_date = datetime.strptime(date_str, '%d %B %Y') + finish_date = parsed_date.strftime('%Y-%m-%d') + except: + pass + + return {'start_date': start_date, 'finish_date': finish_date} + + except Exception: + pass + + content = BooksScraper.book_page_authenticated(book_id, cookies) + soup = BeautifulSoup(content, 'html.parser') + + edit_link = soup.find('a', href=re.compile(r'/edit-(read-instance|journal-entry)-from-book')) + if not (isinstance(edit_link, Tag) and edit_link.get('href')): + return {'start_date': None, 'finish_date': None} + + href = edit_link['href'] + if not isinstance(href, str): + raise 
Exception("Could not find a valid edit link href.") + + parsed_url = urlparse(href) + query_params = parse_qs(parsed_url.query) + + id_val = None + form_content = b'' + id_type = '' + + if 'read_instance_id' in query_params: + id_val = query_params.get('read_instance_id', [None])[0] + id_type = 'read_instance' + if not id_val: + raise Exception("Could not extract read_instance_id from edit link.") + try: + form_content = BooksScraper.get_read_dates_form(book_id, id_val, cookies) + except: + return {'start_date': None, 'finish_date': None} + + elif 'journal_entry_id' in query_params: + id_val = query_params.get('journal_entry_id', [None])[0] + id_type = 'journal_entry' + if not id_val: + raise Exception("Could not extract journal_entry_id from edit link.") + try: + form_content = BooksScraper.get_journal_entry_form(book_id, id_val, cookies) + except: + return {'start_date': None, 'finish_date': None} + + if not form_content: + return {'start_date': None, 'finish_date': None} + + form_soup = BeautifulSoup(form_content, 'html.parser') + + def get_date(date_prefix: str) -> str | None: + day_select = form_soup.find('select', id=f'{id_type}_{date_prefix}day') + month_select = form_soup.find('select', id=f'{id_type}_{date_prefix}month') + year_select = form_soup.find('select', id=f'{id_type}_{date_prefix}year') + + if not (isinstance(day_select, Tag) and isinstance(month_select, Tag) and isinstance(year_select, Tag)): + return None + + day_option = day_select.find('option', selected=True) + month_option = month_select.find('option', selected=True) + year_option = year_select.find('option', selected=True) + + if isinstance(day_option, Tag) and isinstance(month_option, Tag) and isinstance(year_option, Tag): + day = day_option.get('value') + month = month_option.get('value') + year = year_option.get('value') + if day and month and year: + return f"{year}-{str(month).zfill(2)}-{str(day).zfill(2)}" + return None + + start_date_prefix = 'started_at_' if id_type == 
'journal_entry' else 'start_' + start_date = get_date(start_date_prefix) + finish_date = get_date('finished_at_') if id_type == 'journal_entry' else get_date('') + + return {'start_date': start_date, 'finish_date': finish_date} + + @staticmethod + @parsing_exception + def get_ai_summary(book_id: str, user_id: str) -> Dict[str, str]: + content = BooksScraper.get_ai_summary(book_id, user_id) + soup = BeautifulSoup(content, 'html.parser') + + template = soup.find('template') + if isinstance(template, Tag): + p_tag = template.find('p') + if isinstance(p_tag, Tag) and p_tag.string: + return {'summary': p_tag.string.strip()} + + raise Exception("Could not parse AI summary.") + + @staticmethod + @parsing_exception + def content_warnings(book_id: str) -> Dict[str, List[str]]: warnings_content = BooksScraper.content_warnings(book_id) - warnings_soup = BeautifulSoup(warnings_content,'html.parser') - user_warnings_pane = warnings_soup.find_all('div',class_='standard-pane')[1] - warnings_graphic = [] - warnings_moderate = [] - warnings_minor = [] - warnings_list = warnings_graphic + warnings_soup = BeautifulSoup(warnings_content, 'html.parser') + + standard_panes = warnings_soup.find_all('div', class_='standard-pane') + if len(standard_panes) < 2: + return {'graphic': [], 'moderate': [], 'minor': []} + + user_warnings_pane = standard_panes[1] + warnings: Dict[str, List[str]] = {'graphic': [], 'moderate': [], 'minor': []} + current_list_key = 'graphic' tag_re = re.compile(r'^(.*) \((\d+)\)$') + for tag in user_warnings_pane.children: - if tag == '\n': - continue + if not isinstance(tag, Tag): continue + if tag.name == 'p': - if tag.text == 'Graphic': - warnings_list = warnings_graphic - elif tag.text == 'Moderate': - warnings_list = warnings_moderate - elif tag.text == 'Minor': - warnings_list = warnings_minor + if tag.text == 'Graphic': current_list_key = 'graphic' + elif tag.text == 'Moderate': current_list_key = 'moderate' + elif tag.text == 'Minor': current_list_key = 
'minor' elif tag.name == 'div': match = tag_re.match(tag.text) - warnings_list.append(match[1]) - warnings = { - 'graphic': warnings_graphic, - 'moderate': warnings_moderate, - 'minor': warnings_minor - } + if match: warnings[current_list_key].append(match.group(1)) return warnings @staticmethod @parsing_exception - def search(query): + def search(query: str) -> List[Dict[str, str]]: content = BooksScraper.search(query) soup = BeautifulSoup(content, 'html.parser') - search_results = [] + search_results: List[Dict[str, str]] = [] + books = soup.find_all('div', class_="book-title-author-and-series w-11/12") for book in books: - title = book.find('a').text.strip() - for a in book.find_all('a'): - if a["href"].startswith('/author'): - author = a.text.strip() - break - book_id = book.find('a')['href'].split('/')[-1] - search_results.append({ - 'title': title, - 'author': author, - 'book_id': book_id - }) + if not isinstance(book, Tag): continue + + title_tag = book.find('a') + title = title_tag.text.strip() if isinstance(title_tag, Tag) else "N/A" + + href_val = title_tag.get('href') if isinstance(title_tag, Tag) else None + + href = href_val[0] if isinstance(href_val, list) else href_val + book_id = href.split('/')[-1] if isinstance(href, str) else "N/A" + + author = "N/A" + for a_tag in book.find_all('a'): + if isinstance(a_tag, Tag): + href = a_tag.get("href") + if isinstance(href, str) and href.startswith('/author'): + author = a_tag.text.strip() + break + + search_results.append({'title': title, 'author': author, 'book_id': book_id}) + return search_results + + @staticmethod + @parsing_exception + def journal_entries(book_id: str, cookies: Dict[str, str]) -> List[Dict[str, Any]]: + content = BooksScraper.get_journal_page(book_id, cookies) + soup = BeautifulSoup(content, 'html.parser') + + journal_entries: List[Dict[str, Any]] = [] + + entry_panes = soup.find_all('span', class_="journal-entry-panes") + if not entry_panes: + return journal_entries + + for entry in 
entry_panes[0].find_all(lambda tag: tag.name == 'div' and 'grid-cols-4' in tag.get('class', [])): + date_tag = entry.find('p', class_="font-semibold") + date = date_tag.text.strip().split('\n')[0] if date_tag else "N/A" + + progress_percent_tag = entry.find('div', class_="text-teal-500") + progress_percent = int(progress_percent_tag.text.strip().replace('%', '')) if progress_percent_tag else None + + pages_read_tag = entry.find('p', class_="clear-both") + pages_read_this_session = None + total_pages_read = None + total_pages = None + + if pages_read_tag: + pages_text = pages_read_tag.text + session_match = re.search(r'(\d+) pages read', pages_text) + if session_match: + pages_read_this_session = int(session_match.group(1)) + + total_match = re.search(r'\((\d+) pages out of (\d+)\)', pages_text) + if total_match: + total_pages_read = int(total_match.group(1)) + total_pages = int(total_match.group(2)) + + note_tag = entry.find('div', class_="trix-content") + note = note_tag.text.strip() if note_tag else None + + status_tag = entry.find('span', class_=lambda x: x and 'inline-flex' in x) + status = status_tag.text.strip() if status_tag else None + + if status == "Started reading": + if progress_percent is None: + progress_percent = 0 + if pages_read_this_session is None: + pages_read_this_session = 0 + if total_pages_read is None: + total_pages_read = 0 + elif status == "Finished": + if progress_percent is None: + progress_percent = 100 + + journal_entries.append({ + 'date': date, + 'status': status, + 'progress_percent': progress_percent, + 'pages_read_this_session': pages_read_this_session, + 'total_pages_read': total_pages_read, + 'total_pages': total_pages, + 'note': note + }) + + book_total_pages = None + for entry in journal_entries: + if entry.get('total_pages') is not None: + book_total_pages = entry['total_pages'] + break + + if book_total_pages is not None: + for entry in journal_entries: + if entry.get('total_pages') is None: + entry['total_pages'] = 
@staticmethod
@parsing_exception
def all_journal_entries(html_content):
    """Parse a StoryGraph journal page into a list of journal-entry dicts.

    Args:
        html_content: Markup of a journal page, in any form BeautifulSoup
            accepts.

    Returns:
        A list with one dict per parsed entry, in document order. Each dict
        has the keys ``book_title``, ``book_id``, ``date``, ``status``,
        ``progress_percent``, ``pages_read_this_session``,
        ``total_pages_read``, ``total_pages`` and ``note``. ``book_title``,
        ``book_id`` and ``date`` fall back to ``"N/A"`` when their element is
        absent; the remaining values fall back to ``None``.

    Candidate ``div.mb-7`` containers without a title paragraph, and entries
    whose markup cannot be interpreted (non-numeric percentage, link without
    ``href``), are skipped rather than failing the whole page.
    """
    unknown = "N/A"
    # Compiled once; they do not change from one entry to the next.
    pages_paragraph_class = re.compile(r'clear-both.*')
    session_pages_re = re.compile(r'(\d+) pages read')
    overall_pages_re = re.compile(r'\((\d+) pages out of (\d+)\)')

    def parse_entry(entry):
        """Return the dict for one entry container, or None if it has no title."""
        title_paragraph = entry.find(
            'p', class_="font-semibold text-sm md:text-base font-semibold")
        if title_paragraph is None:
            # Not a journal entry (or unexpected layout): nothing to extract.
            return None

        book_link = title_paragraph.find('a')
        if book_link is not None:
            book_title = book_link.text.strip()
            # The book id is the last path segment of the link target.
            book_id = book_link['href'].split('/')[-1]
        else:
            book_title = unknown
            book_id = unknown

        date_tag = entry.find('p', class_="font-semibold text-xs md:text-sm")
        # Only the first line of the date paragraph is kept.
        date = date_tag.text.strip().split('\n')[0] if date_tag is not None else unknown

        percent_tag = entry.find('div', class_="text-teal-500")
        progress_percent = None
        if percent_tag is not None:
            progress_percent = int(percent_tag.text.strip().replace('%', ''))

        pages_read_this_session = None
        total_pages_read = None
        total_pages = None
        pages_tag = entry.find('p', class_=pages_paragraph_class)
        if pages_tag is not None:
            pages_text = pages_tag.text
            session_match = session_pages_re.search(pages_text)
            if session_match:
                pages_read_this_session = int(session_match.group(1))
            overall_match = overall_pages_re.search(pages_text)
            if overall_match:
                total_pages_read = int(overall_match.group(1))
                total_pages = int(overall_match.group(2))

        note_tag = entry.find('div', class_="trix-content")
        note = note_tag.text.strip() if note_tag is not None else None

        status_tag = entry.find('span', class_=lambda x: x and 'inline-flex' in x)
        status = status_tag.text.strip() if status_tag is not None else None

        # Start/finish entries may carry no numbers; fill in the implied ones.
        if status == "Started reading":
            if progress_percent is None:
                progress_percent = 0
            if pages_read_this_session is None:
                pages_read_this_session = 0
            if total_pages_read is None:
                total_pages_read = 0
        elif status == "Finished" and progress_percent is None:
            progress_percent = 100

        return {
            'book_title': book_title,
            'book_id': book_id,
            'date': date,
            'status': status,
            'progress_percent': progress_percent,
            'pages_read_this_session': pages_read_this_session,
            'total_pages_read': total_pages_read,
            'total_pages': total_pages,
            'note': note,
        }

    soup = BeautifulSoup(html_content, 'html.parser')

    journal_entries = []
    for container in soup.find_all('div', class_="mb-7"):
        try:
            parsed = parse_entry(container)
        except (AttributeError, KeyError, TypeError, ValueError):
            # Best effort: one malformed entry must not discard the page.
            continue
        if parsed is not None:
            journal_entries.append(parsed)

    # Back-fill total_pages for entries that did not state it, using another
    # entry of the same book (the last one seen wins). The "N/A" placeholder
    # is not a real id, so it must not link unrelated entries together.
    known_totals = {}
    for item in journal_entries:
        book_id = item['book_id']
        if book_id and book_id != unknown and item['total_pages'] is not None:
            known_totals[book_id] = item['total_pages']

    for item in journal_entries:
        if item['total_pages'] is None and item['book_id'] in known_totals:
            item['total_pages'] = known_totals[item['book_id']]

    return journal_entries
@staticmethod
def get_ai_summary(book_id: str, user_id: str) -> bytes:
    """Fetch the preview for a book from the ``personalized-preview`` endpoint.

    The request is sent without cookies and with ``personalized=false``.

    Args:
        book_id: StoryGraph id of the book.
        user_id: StoryGraph id of the user, sent as the ``user_id`` query
            parameter.

    Returns:
        The raw response body as returned by ``BooksScraper.fetch_url``
        (presumably a Turbo Stream document, judging by the URL suffix;
        confirm against the parser that consumes it).
    """
    url = "https://app.thestorygraph.com/personalized-preview.turbo_stream"
    params = {'book_id': book_id, 'personalized': 'false', 'user_id': user_id}
    return BooksScraper.fetch_url(url, params=params)
class UserScraper:
    """Blocking HTTP fetchers for StoryGraph profile, shelf and journal pages.

    Every public method issues one GET request and returns the raw response
    body. A 4xx/5xx answer raises ``requests.HTTPError`` (via
    ``raise_for_status``); a server that does not answer within ``_TIMEOUT``
    seconds raises ``requests.Timeout`` instead of hanging forever.
    """

    _BASE_URL = "https://app.thestorygraph.com"

    # Single browser-like User-Agent shared by every request, so the profile
    # fetch and the paginated fetches identify themselves identically.
    _HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    # Seconds allowed for connecting and for each read; requests waits
    # indefinitely when no timeout is given.
    _TIMEOUT = 30

    @staticmethod
    def _get(url: str, cookies=None) -> bytes:
        """GET ``url`` (optionally with a cookie dict) and return the body."""
        response = requests.get(
            url,
            cookies=cookies,
            headers=UserScraper._HEADERS,
            timeout=UserScraper._TIMEOUT,
        )
        response.raise_for_status()
        return response.content

    @staticmethod
    def get_profile_page(username: str) -> bytes:
        """Return the profile page of ``username``, fetched without cookies."""
        return UserScraper._get(f"{UserScraper._BASE_URL}/profile/{username}")

    @staticmethod
    def fetch_paginated_url(url: str, cookies: Dict[str, str]) -> bytes:
        """Return the body of ``url``, sending ``cookies`` with the request.

        Args:
            url: Full URL, including any ``page`` query parameter.
            cookies: Cookie name/value pairs to send.
        """
        return UserScraper._get(url, cookies)

    @staticmethod
    def currently_reading(uname: str, cookies: Dict[str, str], page: int) -> bytes:
        """Return page ``page`` of the currently-reading list of ``uname``."""
        url = f"{UserScraper._BASE_URL}/currently-reading/{uname}?page={page}"
        return UserScraper.fetch_paginated_url(url, cookies)

    @staticmethod
    def to_read(uname: str, cookies: Dict[str, str], page: int) -> bytes:
        """Return page ``page`` of the to-read list of ``uname``."""
        url = f"{UserScraper._BASE_URL}/to-read/{uname}?page={page}"
        return UserScraper.fetch_paginated_url(url, cookies)

    @staticmethod
    def books_read(uname: str, cookies: Dict[str, str], page: int) -> bytes:
        """Return page ``page`` of the books-read list of ``uname``."""
        url = f"{UserScraper._BASE_URL}/books-read/{uname}?page={page}"
        return UserScraper.fetch_paginated_url(url, cookies)

    @staticmethod
    def all_journal_entries(cookies: Dict[str, str], page: int) -> bytes:
        """Return page ``page`` of the journal belonging to the cookie owner."""
        url = f"{UserScraper._BASE_URL}/journal?page={page}"
        return UserScraper.fetch_paginated_url(url, cookies)
def _fetch_and_print(intro, failure_label, fetch):
    """Announce a step, call ``fetch()`` and print its result or its error."""
    try:
        print(intro)
        result = fetch()
        print("✅ Success! Result:")
        print(result)
    except Exception as e:
        # Smoke-test boundary: report the failure and let later steps run.
        print(f"🛑 {failure_label}: {e}")


def run_tests():
    """Run a manual smoke test of the ``User`` and ``Book`` clients.

    Loads ``STORYGRAPH_USERNAME``, ``_STORYGRAPH_SESSION`` and
    ``REMEMBER_USER_TOKEN`` from the environment (after ``load_dotenv()``),
    then calls each client method in turn and prints the outcome. A failing
    step is reported and does not stop the steps after it. If any of the
    three variables is missing, an error is printed and nothing is run.
    """
    print("🚀 Starting StoryGraph API tests...")

    print("\n--- 1. Loading Environment & Initializing Clients ---")
    load_dotenv()

    username = os.getenv("STORYGRAPH_USERNAME")
    session_cookie = os.getenv("_STORYGRAPH_SESSION")
    remember_token = os.getenv("REMEMBER_USER_TOKEN")

    if not all([username, session_cookie, remember_token]):
        print("🛑 Error: Missing one or more required environment variables.")
        print("Please ensure _STORYGRAPH_SESSION, REMEMBER_USER_TOKEN, and STORYGRAPH_USERNAME are in your .env file.")
        return

    auth_cookies = {
        "_storygraph_session": session_cookie,
        "remember_user_token": remember_token
    }

    book_client = Book()
    user_client = User()
    print("✅ Setup complete.")

    print("\n--- 2. Testing User Client ---")

    # Needed later by the AI-summary step; stays None if the lookup fails.
    user_id = None
    try:
        print("\nFetching User ID for:", username)
        user_id_json = user_client.get_user_id(username)
        user_id_data = json.loads(user_id_json)
        if "user_id" in user_id_data:
            user_id = user_id_data["user_id"]
            print(f"✅ Success! User ID found: {user_id}")
        else:
            print(f"⚠️ Could not extract user_id from response: {user_id_json}")
    except Exception as e:
        print(f"🛑 Error fetching user ID: {e}")

    _fetch_and_print(
        "\nFetching 'Currently Reading' list...",
        "Error fetching 'Currently Reading' list",
        lambda: user_client.currently_reading(username, auth_cookies),
    )
    _fetch_and_print(
        "\nFetching 'To-Read' list...",
        "Error fetching 'To-Read' list",
        lambda: user_client.to_read(username, auth_cookies),
    )
    _fetch_and_print(
        "\nFetching 'Read' list...",
        "Error fetching 'Read' list",
        lambda: user_client.books_read(username, auth_cookies),
    )

    try:
        print("\nFetching all journal entries...")
        all_journal_entries = user_client.get_all_journal_entries(auth_cookies)
        all_journal_data = json.loads(all_journal_entries)
        if isinstance(all_journal_data, list):
            print(f"✅ All Journal Entries: {len(all_journal_data)} entries found.")
            print(all_journal_entries)
        else:
            print("🛑 Error: All journal entries not returned as a list.")
    except Exception as e:
        print(f"🛑 Error fetching all journal entries: {e}")

    print("\n--- 3. Testing Book Client ---")

    finished_book_id = "1c023e31-637b-41d9-ba64-260c3c1b0f3d"
    reading_book_id = "87ca0994-06fb-4360-a0bf-660918a7fbc4"
    unread_book_id = "c89e808f-39db-49f0-98af-6028d98097f9"
    search_query = "Dune Frank Herbert"

    # (label used in the output, book id) for the three reading states.
    book_cases = (
        ("Finished Book", finished_book_id),
        ("Currently Reading Book", reading_book_id),
        ("Unread Book", unread_book_id),
    )

    _fetch_and_print(
        f"\nSearching for '{search_query}'...",
        "Error searching for book",
        lambda: book_client.search(search_query),
    )

    try:
        print(f"\nFetching info for book ID: {finished_book_id}")
        book_info = book_client.book_info(finished_book_id)
        print("✅ Success! Result:")
        print(book_info)
        info = json.loads(book_info)
        cover_url = info.get('cover_url')
        if cover_url and isinstance(cover_url, str) and cover_url.startswith("https://cdn.thestorygraph.com/"):
            print(f"✅ cover_url present and valid: {cover_url}")
        else:
            print(f"🛑 cover_url missing or invalid: {cover_url}")
    except Exception as e:
        print(f"🛑 Error fetching book info: {e}")

    print("\nFetching reading progress for all 3 test cases...")
    try:
        # One try for all three: the first failure skips the remaining cases.
        for label, book_id in book_cases:
            progress = book_client.reading_progress(book_id, auth_cookies)
            print(f"✅ {label} Progress: {json.loads(progress).get('progress')}")
    except Exception as e:
        print(f"🛑 Error fetching reading progress: {e}")

    print("\nFetching read dates for all 3 test cases...")
    try:
        for label, book_id in book_cases:
            dates = book_client.get_read_dates(book_id, auth_cookies)
            print(f"✅ {label} Dates: {dates}")
    except Exception as e:
        print(f"🛑 Error fetching read dates: {e}")

    print("\nFetching journal entries for a finished book and an unread book...")
    try:
        journal_entries_finished = book_client.get_journal_entries(finished_book_id, auth_cookies)
        journal_data = json.loads(journal_entries_finished)
        if isinstance(journal_data, list):
            print(f"✅ Finished Book Journal Entries: {len(journal_data)} entries found.")
            print(journal_entries_finished)
        else:
            print("🛑 Error: Journal entries for finished book not returned as a list.")

        journal_entries_unread = book_client.get_journal_entries(unread_book_id, auth_cookies)
        journal_data_unread = json.loads(journal_entries_unread)
        if isinstance(journal_data_unread, list) and len(journal_data_unread) == 0:
            print("✅ Unread Book Journal Entries: Correctly returned an empty list.")
        else:
            print("🛑 Error: Journal entries for unread book did not return an empty list.")
    except Exception as e:
        print(f"🛑 Error fetching journal entries: {e}")

    if user_id:
        _fetch_and_print(
            f"\nFetching AI summary for book ID: {finished_book_id}",
            "Error fetching AI summary",
            lambda: book_client.get_ai_summary(finished_book_id, user_id),
        )
    else:
        print("\n⚠️ Skipping AI Summary test because user_id could not be retrieved.")

    print("\n🎉 All tests complete!")


if __name__ == "__main__":
    run_tests()