diff --git a/README.md b/README.md index 5142585..e3cd950 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,8 @@ -# Adobe India Hackathon: Document Intelligence (Round 1B) +# Document Intelligence System -This repository contains the solution for Round 1B of the "Connecting the Dots" Hackathon by Team Sentinels. +This repository contains an intelligent document analysis system that extracts and prioritizes the most relevant sections from a collection of PDFs based on a specific user persona and their job-to-be-done. The solution is designed to run entirely offline, leveraging a hybrid approach that combines structural analysis, semantic understanding, and keyword relevance. -The goal is to build a system that acts as an intelligent document analyst, extracting and prioritizing the most relevant sections from a collection of PDFs based on a specific user persona and their job-to-be-done. The solution is designed to run entirely offline, leveraging a hybrid approach that combines structural analysis, semantic understanding, and keyword relevance. - -## Team Sentinels +## Contributors - [Saksham Kumar](https://github.com/sakshamkumar04) - [Aloukik Joshi](https://github.com/aloukikjoshi) @@ -19,7 +17,7 @@ Our solution employs a multi-stage, hybrid pipeline that combines structural doc ### 1. Parallel Structure Extraction -- Reuses the high-performance outline extractor from Round 1A. +- Uses a high-performance outline extractor for document processing. - Processes all input PDFs in parallel, with each document handled by a separate CPU core. - Extracts all titles and headings (H1, H2, H3) for each document, serving as candidate sections for relevance ranking. @@ -50,7 +48,7 @@ Our solution employs a multi-stage, hybrid pipeline that combines structural doc ### 5. Final Output Generation - Aggregates results, including ranked section titles and refined subsection text. -- Formats output into `challenge1b_output.json` with all required metadata. 
+- Formats output into `output.json` with all required metadata. ## Models and Libraries Used @@ -63,7 +61,7 @@ Our solution employs a multi-stage, hybrid pipeline that combines structural doc - `torch`: Framework for running the sentence-transformer model. - `sentence-transformers`: For loading and using the embedding model. - `rank_bm25`: For keyword-based relevance scoring. -- `pymupdf`: For efficient PDF text extraction (reused from Round 1A). +- `pymupdf`: For efficient PDF text extraction. - `numpy` & `pandas`: For numerical operations and data handling. All dependencies are listed in [requirements.txt](requirements.txt) and are installed within the Docker container. @@ -87,7 +85,7 @@ Before running the solution, ensure your directories are organized as follows: ``` root/ ├── input/ -│ ├── challenge1b_input.json +│ ├── input.json │ └── PDFs/ # All required PDF documents ├── output/ # Results will be written here (create empty) ├── Dockerfile @@ -100,7 +98,7 @@ root/ After building the image: 1. Create an `input` directory containing in the root directory of your project. Refer to the expected directory structure above. -1. Place `challenge1b_input.json` and a sub-folder `PDFs` with all required documents in your local `input` directory. +1. Place `input.json` and a sub-folder `PDFs` with all required documents in your local `input` directory. 1. (Optional) Create an empty local directory for results (e.g., `output`). 1. Run the following command from the directory containing your `input` and `output` folders: @@ -108,7 +106,7 @@ After building the image: docker run --rm -v $(pwd)/input:/app/input -v $(pwd)/output:/app/output --network none document-intelligence:somerandomidentifier ``` -The script inside the container will process the input JSON and PDF collection, and generate `challenge1b_output.json` in `/app/output`. +The script inside the container will process the input JSON and PDF collection, and generate `output.json` in `/app/output`. 
--- @@ -173,8 +171,8 @@ This solution was realized with the support of Gemini, Perplexity, and GitHub Ch ## Copyright -© Team Sentinels (Saksham Kumar, Aloukik Joshi, Nihal Pandey). -All rights reserved. Team members possess exclusive rights to this solution, along with Adobe for the purpose of the competition. +© Contributors (Saksham Kumar, Aloukik Joshi, Nihal Pandey). +All rights reserved. Unauthorized copying, distribution, or use of this code or documentation is strictly prohibited and may result in legal action. --- diff --git a/main.py b/main.py index 08209ca..3f01e94 100644 --- a/main.py +++ b/main.py @@ -1,15 +1,15 @@ #!/usr/bin/env python3 """ -Round-1B | Persona-Aware PDF Pipeline +Document Intelligence | Persona-Aware PDF Pipeline ------------------------------------ CLI python main.py -• Reads challenge1b_input.json from . -• Runs the original Round-1A extractor *in parallel* on every PDF +• Reads input.json from . +• Runs the PDF outline extractor *in parallel* on every PDF (each worker opens the file from bytes once). • Embeds + ranks sections with Granite-107 M embeddings and BM25. -• Refines best sections (thread-pool) and writes challenge1b_output.json +• Refines best sections (thread-pool) and writes output.json to . 
""" @@ -23,7 +23,7 @@ from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed -# Round-1B modules +# Document Intelligence modules from semantic_analyzer import SemanticAnalyzer from relevance_scorer import RelevanceScorer from subsection_extractor import SubsectionExtractor @@ -34,20 +34,20 @@ # ───────────────────────── Helpers ────────────────────────── def _extract_outline_blob(pdf_path: str) -> Dict: - """Worker: run Round-1A on a PDF file path.""" + """Worker: run PDF outline extraction on a PDF file path.""" # This import is intentionally local to the worker process from r1a.enhanced_pdf_extractor import process_pdf_enhanced return process_pdf_enhanced(pdf_path) def _load_input(inp_dir: str) -> Dict: - with open(os.path.join(inp_dir, "challenge1b_input.json"), encoding="utf-8") as fh: + with open(os.path.join(inp_dir, "input.json"), encoding="utf-8") as fh: return json.load(fh) def _write_output(out_dir: str, data: Dict) -> None: os.makedirs(out_dir, exist_ok=True) - with open(os.path.join(out_dir, "challenge1b_output.json"), "w", encoding="utf-8") as fh: + with open(os.path.join(out_dir, "output.json"), "w", encoding="utf-8") as fh: json.dump(data, fh, indent=2, ensure_ascii=False) @@ -63,7 +63,7 @@ def run_pipeline(input_dir: str, output_dir: str) -> None: "processing_timestamp": datetime.now(timezone.utc).isoformat() } - # 1. Parallel outline extraction (Round-1A) + # 1. Parallel outline extraction pdf_dir = os.path.join(input_dir, "PDFs") outlines: Dict[str, Dict] = {} pdf_paths = [os.path.join(pdf_dir, d["filename"]) for d in req["documents"]]