Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions datalab_sdk/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,11 @@ def marker_options(func):
default="balanced",
help="OCR mode",
)(func)
func = click.option(
"--token_efficient_markdown",
is_flag=True,
help="Generate token-efficient markdown optimized for LLM consumption",
)(func)
return func


Expand Down Expand Up @@ -263,6 +268,7 @@ def process_documents(
page_schema: Optional[str] = None,
add_block_ids: bool = False,
mode: str = "balanced",
token_efficient_markdown: bool = False,
):
"""Unified document processing function"""
try:
Expand Down Expand Up @@ -304,6 +310,7 @@ def process_documents(
page_schema=page_schema,
add_block_ids=add_block_ids,
mode=mode,
token_efficient_markdown=token_efficient_markdown,
)
else: # method == "ocr"
options = OCROptions(
Expand Down Expand Up @@ -364,6 +371,7 @@ def convert(
page_schema: Optional[str],
add_block_ids: bool,
mode: str,
token_efficient_markdown: bool,
):
"""Convert documents to markdown, HTML, or JSON"""
process_documents(
Expand All @@ -386,6 +394,7 @@ def convert(
page_schema=page_schema,
add_block_ids=add_block_ids,
mode=mode,
token_efficient_markdown=token_efficient_markdown,
)


Expand Down
3 changes: 2 additions & 1 deletion datalab_sdk/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,10 +51,11 @@ class ConvertOptions(ProcessingOptions):
keep_spreadsheet_formatting: bool = False
webhook_url: Optional[str] = None
# Comma-separated list of extra features: 'track_changes', 'chart_understanding',
# 'table_row_bboxes', 'extract_links', 'infographic', 'new_block_types'
# 'table_row_bboxes', 'extract_links', 'infographic', 'new_block_types', 'training_mode'
extras: Optional[str] = None
add_block_ids: bool = False # add block IDs to HTML output
include_markdown_in_chunks: bool = False # include markdown field in chunks/JSON output
token_efficient_markdown: bool = False # generate token-efficient markdown optimized for LLM consumption

def to_form_data(self) -> Dict[str, Any]:
"""Convert to form data format for API requests"""
Expand Down