From e8ebb47b4c1641d9274a00e244040a2931ae4b2c Mon Sep 17 00:00:00 2001
From: Adi
Date: Fri, 26 Sep 2025 19:07:01 +0530
Subject: [PATCH] feat: Add persistence for document generation

---
Review note: line breaks and indentation were reconstructed from a
whitespace-collapsed copy of this patch. Blank context lines were inferred
from the hunk counts (their position in the "-172,8 +197,9" hunk is
ambiguous), so run `git apply --check` before applying.

 tools/doc_generator/docker-compose.yml | 3 +--
 tools/doc_generator/src/main.py | 32 +++++++++++++++++++++++---
 2 files changed, 30 insertions(+), 5 deletions(-)

diff --git a/tools/doc_generator/docker-compose.yml b/tools/doc_generator/docker-compose.yml
index 3b1f6bb..e493064 100644
--- a/tools/doc_generator/docker-compose.yml
+++ b/tools/doc_generator/docker-compose.yml
@@ -1,4 +1,3 @@
-version: "3.8"
 services:
   doc_generator:
     build: .
@@ -6,4 +5,4 @@ services:
     volumes:
       - ./src/temp:/app/src/temp
     environment:
-      - GEMINI_API_KEY=${GEMINI_API_KEY}
+      - GOOGLE_API_KEY=${GOOGLE_API_KEY}
diff --git a/tools/doc_generator/src/main.py b/tools/doc_generator/src/main.py
index 792e814..dc5b28c 100644
--- a/tools/doc_generator/src/main.py
+++ b/tools/doc_generator/src/main.py
@@ -32,6 +32,7 @@ def __init__(self):
         self.llm = ChatGoogleGenerativeAI(model=Config.MODEL_NAME)
         self.structured_llm = self.llm.with_structured_output(DocumentIssues)
         self._ensure_output_dir()
+        self.starting_index = self._get_initial_index()  # resume point from existing output files
 
         logger.info(
             f"Initialized processor │ model={Config.MODEL_NAME} │ max_docs={Config.MAX_DOCUMENTS} │ delay={Config.API_DELAY_SECONDS}s"
@@ -40,6 +41,29 @@ def __init__(self):
     def _ensure_output_dir(self):
         os.makedirs(Config.OUTPUT_DIR, exist_ok=True)
 
+    def _get_initial_index(self) -> int:
+        """Return the index at which document generation should resume.
+
+        Scans Config.OUTPUT_DIR for "NNN_doc.json" files and returns the
+        largest NNN, or 0 if there are none or the directory is unreadable.
+        """
+        suffix = "_doc.json"
+        try:
+            filenames = os.listdir(Config.OUTPUT_DIR)
+        except OSError as e:
+            logger.error(f"Error determining initial index: {str(e)}. Starting from 0.")
+            return 0
+
+        # NNN is expected to be the zero-based document index + 1, so the
+        # largest NNN is also the next index to process.
+        # NOTE(review): confirm against the index passed to _generate_filename.
+        processed_indices = [
+            int(name[: -len(suffix)])
+            for name in filenames
+            if name.endswith(suffix) and name[: -len(suffix)].isdecimal()
+        ]
+        return max(processed_indices, default=0)
+
     def _generate_filename(self, index: int) -> str:
         return os.path.join(Config.OUTPUT_DIR, f"{index:03d}_doc.json")
 
@@ -85,8 +109,9 @@ def _save_document(self, content: str, index: int) -> bool:
     def initialize_processing(
         self, state: DocumentProcessingState
     ) -> DocumentProcessingState:
-        logger.info("Processing initialized │ starting_index=0")
-        return {"document_index": 0, "llm_output_content": ""}
+        # Resume from the persisted position rather than always restarting at 0.
+        logger.info(f"Processing initialized │ starting_index={self.starting_index}")
+        return {"document_index": self.starting_index, "llm_output_content": ""}
 
     def invoke_llm_for_document(
         self, state: DocumentProcessingState
@@ -172,8 +197,9 @@ def run(self):
 
         try:
             app = self.build_graph()
+            # Seed the graph with the resume index found in the output directory.
             app.invoke(
-                {"document_index": 0, "llm_output_content": ""},
+                {"document_index": self.starting_index, "llm_output_content": ""},
                 {"recursion_limit": Config.MAX_DOCUMENTS + 5},
             )
 