From daaec2e4aa83bee98019c8d4648dc6e9b9c260c7 Mon Sep 17 00:00:00 2001 From: "jas.bali" Date: Wed, 15 Oct 2025 12:49:28 -0400 Subject: [PATCH] Rename folder and remove v1.0 parser references - Rename workflow_with_ai_parse_document to job_with_ai_parse_document - Remove CASE statement handling v1.0 vs v2.0 in text extraction - Update README to remove v1.0 mention - Keep explicit version parameter in ai_parse_document call --- .../.gitignore | 0 .../README.md | 2 +- .../assets/document_summary.png | Bin .../assets/figure_description.png | Bin .../assets/page1_bounding_boxes.png | Bin .../assets/page1_elements_list.png | Bin .../assets/page2_contents_table.png | Bin .../assets/table_extraction.png | Bin .../databricks.yml | 0 .../resources/ai_parse_document_job.job.yml | 0 .../ai_parse_document -- debug output.py | 0 .../src/transformations/01_parse_documents.py | 0 .../src/transformations/02_extract_text.py | 6 +----- .../transformations/03_extract_structured_data.py | 0 14 files changed, 2 insertions(+), 6 deletions(-) rename contrib/{workflow_with_ai_parse_document => job_with_ai_parse_document}/.gitignore (100%) rename contrib/{workflow_with_ai_parse_document => job_with_ai_parse_document}/README.md (99%) rename contrib/{workflow_with_ai_parse_document => job_with_ai_parse_document}/assets/document_summary.png (100%) rename contrib/{workflow_with_ai_parse_document => job_with_ai_parse_document}/assets/figure_description.png (100%) rename contrib/{workflow_with_ai_parse_document => job_with_ai_parse_document}/assets/page1_bounding_boxes.png (100%) rename contrib/{workflow_with_ai_parse_document => job_with_ai_parse_document}/assets/page1_elements_list.png (100%) rename contrib/{workflow_with_ai_parse_document => job_with_ai_parse_document}/assets/page2_contents_table.png (100%) rename contrib/{workflow_with_ai_parse_document => job_with_ai_parse_document}/assets/table_extraction.png (100%) rename contrib/{workflow_with_ai_parse_document => 
job_with_ai_parse_document}/databricks.yml (100%) rename contrib/{workflow_with_ai_parse_document => job_with_ai_parse_document}/resources/ai_parse_document_job.job.yml (100%) rename contrib/{workflow_with_ai_parse_document => job_with_ai_parse_document}/src/explorations/ai_parse_document -- debug output.py (100%) rename contrib/{workflow_with_ai_parse_document => job_with_ai_parse_document}/src/transformations/01_parse_documents.py (100%) rename contrib/{workflow_with_ai_parse_document => job_with_ai_parse_document}/src/transformations/02_extract_text.py (87%) rename contrib/{workflow_with_ai_parse_document => job_with_ai_parse_document}/src/transformations/03_extract_structured_data.py (100%) diff --git a/contrib/workflow_with_ai_parse_document/.gitignore b/contrib/job_with_ai_parse_document/.gitignore similarity index 100% rename from contrib/workflow_with_ai_parse_document/.gitignore rename to contrib/job_with_ai_parse_document/.gitignore diff --git a/contrib/workflow_with_ai_parse_document/README.md b/contrib/job_with_ai_parse_document/README.md similarity index 99% rename from contrib/workflow_with_ai_parse_document/README.md rename to contrib/job_with_ai_parse_document/README.md index 2e0e921e..f899e1d0 100644 --- a/contrib/workflow_with_ai_parse_document/README.md +++ b/contrib/job_with_ai_parse_document/README.md @@ -96,7 +96,7 @@ Uses `ai_parse_document` to extract text, tables, and metadata from PDFs/images: Extracts clean concatenated text using `transform()`: - Reads from previous task's table via streaming -- Handles both parser v1.0 and v2.0 formats +- Extracts text from parsed document elements - Uses `transform()` for efficient text extraction - Includes error handling for failed parses diff --git a/contrib/workflow_with_ai_parse_document/assets/document_summary.png b/contrib/job_with_ai_parse_document/assets/document_summary.png similarity index 100% rename from contrib/workflow_with_ai_parse_document/assets/document_summary.png rename to 
contrib/job_with_ai_parse_document/assets/document_summary.png diff --git a/contrib/workflow_with_ai_parse_document/assets/figure_description.png b/contrib/job_with_ai_parse_document/assets/figure_description.png similarity index 100% rename from contrib/workflow_with_ai_parse_document/assets/figure_description.png rename to contrib/job_with_ai_parse_document/assets/figure_description.png diff --git a/contrib/workflow_with_ai_parse_document/assets/page1_bounding_boxes.png b/contrib/job_with_ai_parse_document/assets/page1_bounding_boxes.png similarity index 100% rename from contrib/workflow_with_ai_parse_document/assets/page1_bounding_boxes.png rename to contrib/job_with_ai_parse_document/assets/page1_bounding_boxes.png diff --git a/contrib/workflow_with_ai_parse_document/assets/page1_elements_list.png b/contrib/job_with_ai_parse_document/assets/page1_elements_list.png similarity index 100% rename from contrib/workflow_with_ai_parse_document/assets/page1_elements_list.png rename to contrib/job_with_ai_parse_document/assets/page1_elements_list.png diff --git a/contrib/workflow_with_ai_parse_document/assets/page2_contents_table.png b/contrib/job_with_ai_parse_document/assets/page2_contents_table.png similarity index 100% rename from contrib/workflow_with_ai_parse_document/assets/page2_contents_table.png rename to contrib/job_with_ai_parse_document/assets/page2_contents_table.png diff --git a/contrib/workflow_with_ai_parse_document/assets/table_extraction.png b/contrib/job_with_ai_parse_document/assets/table_extraction.png similarity index 100% rename from contrib/workflow_with_ai_parse_document/assets/table_extraction.png rename to contrib/job_with_ai_parse_document/assets/table_extraction.png diff --git a/contrib/workflow_with_ai_parse_document/databricks.yml b/contrib/job_with_ai_parse_document/databricks.yml similarity index 100% rename from contrib/workflow_with_ai_parse_document/databricks.yml rename to contrib/job_with_ai_parse_document/databricks.yml diff --git 
a/contrib/workflow_with_ai_parse_document/resources/ai_parse_document_job.job.yml b/contrib/job_with_ai_parse_document/resources/ai_parse_document_job.job.yml similarity index 100% rename from contrib/workflow_with_ai_parse_document/resources/ai_parse_document_job.job.yml rename to contrib/job_with_ai_parse_document/resources/ai_parse_document_job.job.yml diff --git a/contrib/workflow_with_ai_parse_document/src/explorations/ai_parse_document -- debug output.py b/contrib/job_with_ai_parse_document/src/explorations/ai_parse_document -- debug output.py similarity index 100% rename from contrib/workflow_with_ai_parse_document/src/explorations/ai_parse_document -- debug output.py rename to contrib/job_with_ai_parse_document/src/explorations/ai_parse_document -- debug output.py diff --git a/contrib/workflow_with_ai_parse_document/src/transformations/01_parse_documents.py b/contrib/job_with_ai_parse_document/src/transformations/01_parse_documents.py similarity index 100% rename from contrib/workflow_with_ai_parse_document/src/transformations/01_parse_documents.py rename to contrib/job_with_ai_parse_document/src/transformations/01_parse_documents.py diff --git a/contrib/workflow_with_ai_parse_document/src/transformations/02_extract_text.py b/contrib/job_with_ai_parse_document/src/transformations/02_extract_text.py similarity index 87% rename from contrib/workflow_with_ai_parse_document/src/transformations/02_extract_text.py rename to contrib/job_with_ai_parse_document/src/transformations/02_extract_text.py index 29fa9097..28a22449 100644 --- a/contrib/workflow_with_ai_parse_document/src/transformations/02_extract_text.py +++ b/contrib/job_with_ai_parse_document/src/transformations/02_extract_text.py @@ -47,11 +47,7 @@ "\n\n", expr(""" transform( - CASE - WHEN try_cast(parsed:metadata:version AS STRING) = '1.0' - THEN try_cast(parsed:document:pages AS ARRAY<VARIANT>) - ELSE try_cast(parsed:document:elements AS ARRAY<VARIANT>) - END, + try_cast(parsed:document:elements AS ARRAY<VARIANT>), element -> 
try_cast(element:content AS STRING) ) """), diff --git a/contrib/workflow_with_ai_parse_document/src/transformations/03_extract_structured_data.py b/contrib/job_with_ai_parse_document/src/transformations/03_extract_structured_data.py similarity index 100% rename from contrib/workflow_with_ai_parse_document/src/transformations/03_extract_structured_data.py rename to contrib/job_with_ai_parse_document/src/transformations/03_extract_structured_data.py