diff --git a/datalab_sdk/cli.py b/datalab_sdk/cli.py index 5a9ee6f..f86eccb 100644 --- a/datalab_sdk/cli.py +++ b/datalab_sdk/cli.py @@ -91,6 +91,11 @@ def marker_options(func): default="balanced", help="OCR mode", )(func) + func = click.option( + "--token_efficient_markdown", + is_flag=True, + help="Generate token-efficient markdown optimized for LLM consumption", + )(func) return func @@ -263,6 +268,7 @@ def process_documents( page_schema: Optional[str] = None, add_block_ids: bool = False, mode: str = "balanced", + token_efficient_markdown: bool = False, ): """Unified document processing function""" try: @@ -304,6 +310,7 @@ def process_documents( page_schema=page_schema, add_block_ids=add_block_ids, mode=mode, + token_efficient_markdown=token_efficient_markdown, ) else: # method == "ocr" options = OCROptions( @@ -364,6 +371,7 @@ def convert( page_schema: Optional[str], add_block_ids: bool, mode: str, + token_efficient_markdown: bool, ): """Convert documents to markdown, HTML, or JSON""" process_documents( @@ -386,6 +394,7 @@ def convert( page_schema=page_schema, add_block_ids=add_block_ids, mode=mode, + token_efficient_markdown=token_efficient_markdown, ) diff --git a/datalab_sdk/models.py b/datalab_sdk/models.py index 733d6ac..2026b07 100644 --- a/datalab_sdk/models.py +++ b/datalab_sdk/models.py @@ -51,10 +51,11 @@ class ConvertOptions(ProcessingOptions): keep_spreadsheet_formatting: bool = False webhook_url: Optional[str] = None # Comma-separated list of extra features: 'track_changes', 'chart_understanding', - # 'table_row_bboxes', 'extract_links', 'infographic', 'new_block_types' + # 'table_row_bboxes', 'extract_links', 'infographic', 'new_block_types', 'training_mode' extras: Optional[str] = None add_block_ids: bool = False # add block IDs to HTML output include_markdown_in_chunks: bool = False # include markdown field in chunks/JSON output + token_efficient_markdown: bool = False # generate token-efficient markdown optimized for LLM consumption def to_form_data(self) -> Dict[str, Any]: """Convert to form data format for API requests"""