diff --git a/.gitignore b/.gitignore
index 7a848e39f..3418b2c7a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -51,3 +51,9 @@ VERCEL_CHANGES_SUMMARY.md
VERCEL_DEPLOYMENT_ASSESSMENT.md
VERCEL_MIGRATION_GUIDE.md
node_modules/
+
+# Build artifacts
+**/build/
+**/dist/
+**/public/*.es.js
+venv2/
diff --git a/TEST_FAILURES_ANALYSIS.md b/TEST_FAILURES_ANALYSIS.md
index b80f23783..5824972d1 100644
--- a/TEST_FAILURES_ANALYSIS.md
+++ b/TEST_FAILURES_ANALYSIS.md
@@ -162,3 +162,5 @@ But the tests expect formatted output with bullet points:
3. **Consider**: Some tests might need to navigate to specific pages first before checking for widgets
+
+
diff --git a/report_analyst/core/analyzer.py b/report_analyst/core/analyzer.py
index b6822a1b1..d2601f73c 100644
--- a/report_analyst/core/analyzer.py
+++ b/report_analyst/core/analyzer.py
@@ -823,7 +823,25 @@ async def process_document(
else:
logger.warning("No EVIDENCE field found in result")
- # 5. Save complete analysis
+ # 5. Add chunks to result before saving
+ # Prepare chunks with all metadata for saving
+ result_chunks = []
+ for i, chunk in enumerate(similar_chunks):
+ chunk_data = {
+ "text": chunk.get("text", ""),
+ "chunk_order": i,
+ "similarity_score": chunk.get("similarity_score", chunk.get("score", 0.0)),
+ "llm_score": chunk.get("llm_score"),
+ "is_evidence": chunk.get("is_evidence", False),
+ "evidence_order": chunk.get("evidence_order"),
+ "metadata": chunk.get("metadata", {}),
+ }
+ result_chunks.append(chunk_data)
+
+ result["chunks"] = result_chunks
+ logger.info(f"[ANALYSIS] Added {len(result_chunks)} chunks to result for saving")
+
+ # 6. Save complete analysis
logger.info(
f"[ANALYSIS] Saving analysis result for question {question_id}"
)
@@ -839,7 +857,7 @@ async def process_document(
"question_set": self.question_set,
}
- # Save analysis result
+ # Save analysis result (includes chunks)
self.cache_manager.save_analysis(
file_path=file_path,
question_id=question_id,
@@ -1067,6 +1085,15 @@ async def _analyze_chunks(
# Get LLM response
try:
+ if self.llm is None:
+ logger.error("LLM not initialized - cannot analyze chunks")
+ return {
+ "ANSWER": "Error: LLM not initialized. Please check your API keys and configuration.",
+ "SCORE": 0,
+ "EVIDENCE": [],
+ "GAPS": ["LLM service unavailable"],
+ "SOURCES": [],
+ }
response = await self.llm.achat(messages)
response_text = (
response.message.content
@@ -1417,6 +1444,11 @@ async def _get_similar_chunks(
try:
logger.info(f"Getting similar chunks for query: {query_text[:50]}...")
+ # Check if embeddings are available
+ if self.embeddings is None:
+ logger.error("Embeddings not initialized - cannot get similar chunks")
+ return []
+
# Get embedding for the query
query_embedding = self.embeddings.get_text_embedding(query_text)
diff --git a/report_analyst/core/cache_manager.py b/report_analyst/core/cache_manager.py
index c54d4e76f..7586b2c3a 100644
--- a/report_analyst/core/cache_manager.py
+++ b/report_analyst/core/cache_manager.py
@@ -316,18 +316,27 @@ def save_analysis(
logger.debug(f"Processing chunk: {json.dumps(chunk, indent=2)}")
# Get chunk ID from document_chunks table
+ # Must match on file_path, chunk_text, chunk_size, and chunk_overlap
result_obj = conn.execute(
text("""
SELECT id FROM document_chunks
- WHERE file_path = :file_path AND chunk_text = :chunk_text
+ WHERE file_path = :file_path
+ AND chunk_text = :chunk_text
+ AND chunk_size = :chunk_size
+ AND chunk_overlap = :chunk_overlap
"""),
- {"file_path": str(file_path), "chunk_text": chunk["text"]},
+ {
+ "file_path": str(file_path),
+ "chunk_text": chunk["text"],
+ "chunk_size": config["chunk_size"],
+ "chunk_overlap": config["chunk_overlap"],
+ },
)
row = result_obj.fetchone()
if row:
chunk_id = row[0]
logger.debug(f"Found chunk ID: {chunk_id}")
-
+
# Save chunk relevance with all available information
if self.db_manager.is_postgres():
conn.execute(
@@ -380,8 +389,128 @@ def save_analysis(
f"Saving raw values to DB - similarity_score: {chunk.get('similarity_score')}, llm_score: {chunk.get('llm_score')}, is_evidence: {chunk.get('is_evidence')}"
)
else:
- logger.warning(
- f"Could not find chunk in document_chunks table"
+ # Chunk doesn't exist in document_chunks - create it first (even without embedding)
+ logger.info(
+ f"Chunk not found in document_chunks, creating it for file_path={file_path}, chunk_size={config['chunk_size']}, chunk_overlap={config['chunk_overlap']}"
+ )
+
+ chunk_metadata = chunk.get("metadata", {})
+ timestamp = datetime.now().isoformat()
+
+ # Insert chunk into document_chunks (embedding can be NULL)
+ if self.db_manager.is_postgres():
+ insert_result = conn.execute(
+ text("""
+ INSERT INTO document_chunks
+ (file_path, chunk_text, chunk_size, chunk_overlap, embedding, metadata, created_at)
+ VALUES (:file_path, :chunk_text, :chunk_size, :chunk_overlap, :embedding, :metadata, :created_at)
+ ON CONFLICT (file_path, chunk_text, chunk_size, chunk_overlap) DO UPDATE
+ SET metadata = EXCLUDED.metadata
+ RETURNING id
+ """),
+ {
+ "file_path": str(file_path),
+ "chunk_text": chunk["text"],
+ "chunk_size": config["chunk_size"],
+ "chunk_overlap": config["chunk_overlap"],
+ "embedding": None, # No embedding available, but we still need the chunk
+ "metadata": json.dumps(chunk_metadata),
+ "created_at": timestamp,
+ },
+ )
+ chunk_id = insert_result.fetchone()[0]
+ else:
+ conn.execute(
+ text("""
+ INSERT OR IGNORE INTO document_chunks
+ (file_path, chunk_text, chunk_size, chunk_overlap, embedding, metadata, created_at)
+ VALUES (:file_path, :chunk_text, :chunk_size, :chunk_overlap, :embedding, :metadata, :created_at)
+ """),
+ {
+ "file_path": str(file_path),
+ "chunk_text": chunk["text"],
+ "chunk_size": config["chunk_size"],
+ "chunk_overlap": config["chunk_overlap"],
+ "embedding": None, # No embedding available, but we still need the chunk
+ "metadata": json.dumps(chunk_metadata),
+ "created_at": timestamp,
+ },
+ )
+ # Get the ID after insert
+ result_obj = conn.execute(
+ text("""
+ SELECT id FROM document_chunks
+ WHERE file_path = :file_path
+ AND chunk_text = :chunk_text
+ AND chunk_size = :chunk_size
+ AND chunk_overlap = :chunk_overlap
+ """),
+ {
+ "file_path": str(file_path),
+ "chunk_text": chunk["text"],
+ "chunk_size": config["chunk_size"],
+ "chunk_overlap": config["chunk_overlap"],
+ },
+ )
+ row = result_obj.fetchone()
+ if row:
+ chunk_id = row[0]
+ else:
+ logger.error(f"Failed to retrieve chunk ID after insert")
+ continue
+
+ logger.info(f"Created chunk in document_chunks with ID: {chunk_id}, now saving chunk_relevance")
+
+ # Now save chunk_relevance with the newly created chunk_id
+ if self.db_manager.is_postgres():
+ conn.execute(
+ text("""
+ INSERT INTO chunk_relevance
+ (question_analysis_id, document_chunk_id, chunk_order,
+ similarity_score, llm_score, is_evidence, evidence_order, metadata)
+ VALUES (:question_analysis_id, :document_chunk_id, :chunk_order,
+ :similarity_score, :llm_score, :is_evidence, :evidence_order, :metadata)
+ ON CONFLICT (question_analysis_id, document_chunk_id) DO UPDATE
+ SET chunk_order = EXCLUDED.chunk_order,
+ similarity_score = EXCLUDED.similarity_score,
+ llm_score = EXCLUDED.llm_score,
+ is_evidence = EXCLUDED.is_evidence,
+ evidence_order = EXCLUDED.evidence_order,
+ metadata = EXCLUDED.metadata
+ """),
+ {
+ "question_analysis_id": analysis_id,
+ "document_chunk_id": chunk_id,
+ "chunk_order": chunk.get("chunk_order", 0),
+ "similarity_score": chunk.get("similarity_score", 0.0),
+ "llm_score": chunk.get("llm_score"),
+ "is_evidence": chunk.get("is_evidence", False),
+ "evidence_order": chunk.get("evidence_order"),
+ "metadata": json.dumps(chunk.get("metadata", {})),
+ },
+ )
+ else:
+ conn.execute(
+ text("""
+ INSERT OR REPLACE INTO chunk_relevance
+ (question_analysis_id, document_chunk_id, chunk_order,
+ similarity_score, llm_score, is_evidence, evidence_order, metadata)
+ VALUES (:question_analysis_id, :document_chunk_id, :chunk_order,
+ :similarity_score, :llm_score, :is_evidence, :evidence_order, :metadata)
+ """),
+ {
+ "question_analysis_id": analysis_id,
+ "document_chunk_id": chunk_id,
+ "chunk_order": chunk.get("chunk_order", 0),
+ "similarity_score": chunk.get("similarity_score", 0.0),
+ "llm_score": chunk.get("llm_score"),
+ "is_evidence": chunk.get("is_evidence", False),
+ "evidence_order": chunk.get("evidence_order"),
+ "metadata": json.dumps(chunk.get("metadata", {})),
+ },
+ )
+ logger.info(
+ f"Saved chunk_relevance - similarity_score: {chunk.get('similarity_score')}, llm_score: {chunk.get('llm_score')}, is_evidence: {chunk.get('is_evidence')}"
)
# Save to analysis cache
@@ -505,6 +634,14 @@ def get_analysis(
for row in rows:
question_id, result_json = row
result = json.loads(result_json)
+
+ # Ensure SCORE is a number, not a string (fix for JSON deserialization)
+ if "SCORE" in result:
+ try:
+ result["SCORE"] = float(result["SCORE"]) if result["SCORE"] is not None else 0
+ except (ValueError, TypeError):
+ result["SCORE"] = 0
+
results[question_id] = {
"result": result,
"chunks": [], # Will be populated from chunk_relevance
@@ -527,7 +664,10 @@ def get_analysis(
cr.metadata as relevance_metadata
FROM analysis_cache ac
JOIN questions q ON q.question_id = ac.question_id
- JOIN question_analysis qa ON qa.question_id = q.id AND qa.file_path = ac.file_path
+ JOIN question_analysis qa ON qa.question_id = q.id
+ AND qa.file_path = ac.file_path
+ AND qa.model = ac.model
+ AND qa.top_k = ac.top_k
JOIN chunk_relevance cr ON cr.question_analysis_id = qa.id
JOIN document_chunks dc ON cr.document_chunk_id = dc.id
WHERE ac.file_path = :file_path
@@ -552,9 +692,145 @@ def get_analysis(
chunk_params[f"qid_{i}"] = qid
logger.info(f"Executing chunk query with params: {list(chunk_params.keys())}")
+ logger.info(f"Chunk query params values: file_path={file_path}, chunk_size={config['chunk_size']}, chunk_overlap={config['chunk_overlap']}, top_k={config['top_k']}, model={config['model']}, question_set={db_question_set}, question_ids={list(results.keys())}")
+
+ # Debug: Check if question_analysis records exist
+ qid_placeholders_test = ",".join(f":qid_{i}" for i in range(len(results)))
+ test_params = {"file_path": str(file_path), "model": config["model"], "top_k": config["top_k"]}
+ for i, qid in enumerate(results.keys()):
+ test_params[f"qid_{i}"] = qid
+
+ test_query = text(f"""
+ SELECT COUNT(*) FROM question_analysis qa
+ JOIN questions q ON q.id = qa.question_id
+ WHERE q.question_id IN ({qid_placeholders_test})
+ AND qa.file_path = :file_path
+ AND qa.model = :model
+ AND qa.top_k = :top_k
+ """)
+ test_result = conn.execute(test_query, test_params)
+ test_count = test_result.scalar()
+ logger.info(f"Found {test_count} question_analysis records matching file_path, model, and top_k")
+
+ # Debug: Check if chunk_relevance records exist
+ if test_count > 0:
+ chunk_relevance_query = text(f"""
+ SELECT COUNT(*) FROM question_analysis qa
+ JOIN questions q ON q.id = qa.question_id
+ JOIN chunk_relevance cr ON cr.question_analysis_id = qa.id
+ WHERE q.question_id IN ({qid_placeholders_test})
+ AND qa.file_path = :file_path
+ AND qa.model = :model
+ AND qa.top_k = :top_k
+ """)
+ cr_result = conn.execute(chunk_relevance_query, test_params)
+ cr_count = cr_result.scalar()
+ logger.info(f"Found {cr_count} chunk_relevance records for these question_analysis records")
+
chunk_result = conn.execute(text(chunk_query), chunk_params)
chunk_rows = chunk_result.fetchall()
- logger.info(f"Retrieved {len(chunk_rows)} chunk rows")
+ logger.info(f"Retrieved {len(chunk_rows)} chunk rows from database via chunk_relevance JOIN")
+
+ # If no chunks were found via chunk_relevance, look the chunks up directly in document_chunks.
+ # Diagnostic only: the rows found here are counted and logged but never attached to any question.
+ if len(chunk_rows) == 0:
+ logger.warning("No chunks found via chunk_relevance JOIN, trying fallback: get chunks directly from document_chunks")
+
+ # Get all document_chunks for this file with matching parameters
+ fallback_query = text("""
+ SELECT
+ dc.id,
+ dc.chunk_text,
+ dc.metadata as chunk_metadata
+ FROM document_chunks dc
+ WHERE dc.file_path = :file_path
+ AND dc.chunk_size = :chunk_size
+ AND dc.chunk_overlap = :chunk_overlap
+ ORDER BY dc.id
+ """)
+ fallback_params = {
+ "file_path": str(file_path),
+ "chunk_size": config["chunk_size"],
+ "chunk_overlap": config["chunk_overlap"],
+ }
+ fallback_result = conn.execute(fallback_query, fallback_params)
+ fallback_chunks = fallback_result.fetchall()
+ logger.info(f"Found {len(fallback_chunks)} chunks in document_chunks (fallback)")
+
+ # If we have chunks but no chunk_relevance, we can't match them to questions,
+ # so we only log a warning here and leave each question's chunk list untouched
+ if fallback_chunks and len(results) > 0:
+ logger.warning("Chunks exist in document_chunks but not linked via chunk_relevance. Cannot match to specific questions without chunk_relevance data.")
+ # For now, we'll skip the fallback since we can't match chunks to questions without chunk_relevance
+ # The chunks need to be properly linked during analysis save
+
+ if len(chunk_rows) == 0:
+ logger.warning(f"No chunks found in database for file_path={file_path}, question_set={db_question_set}")
+ logger.warning(f"Query was: {chunk_query[:500]}...")
+ # Check each step of the JOIN
+ logger.warning("Debugging JOIN query step by step:")
+
+ # Step 1: Check analysis_cache
+ ac_query = text(f"""
+ SELECT COUNT(*) FROM analysis_cache ac
+ WHERE ac.file_path = :file_path
+ AND ac.chunk_size = :chunk_size
+ AND ac.chunk_overlap = :chunk_overlap
+ AND ac.top_k = :top_k
+ AND ac.model = :model
+ AND ac.question_set = :question_set
+ AND ac.question_id IN ({qid_placeholders_test})
+ """)
+ ac_result = conn.execute(ac_query, chunk_params)
+ ac_count = ac_result.scalar()
+ logger.warning(f" - analysis_cache records: {ac_count}")
+
+ # Step 2: Check questions
+ q_query = text(f"""
+ SELECT COUNT(*) FROM questions q
+ WHERE q.question_id IN ({qid_placeholders_test})
+ AND q.question_set = :question_set
+ """)
+ q_params = {"question_set": db_question_set}
+ for i, qid in enumerate(results.keys()):
+ q_params[f"qid_{i}"] = qid
+ q_result = conn.execute(q_query, q_params)
+ q_count = q_result.scalar()
+ logger.warning(f" - questions records: {q_count}")
+
+ # Step 3: Check question_analysis (already done above)
+ logger.warning(f" - question_analysis records: {test_count}")
+
+ # Step 4: Check chunk_relevance
+ if test_count > 0:
+ cr_query = text(f"""
+ SELECT COUNT(*) FROM question_analysis qa
+ JOIN questions q ON q.id = qa.question_id
+ JOIN chunk_relevance cr ON cr.question_analysis_id = qa.id
+ WHERE q.question_id IN ({qid_placeholders_test})
+ AND qa.file_path = :file_path
+ AND qa.model = :model
+ AND qa.top_k = :top_k
+ """)
+ cr_result = conn.execute(cr_query, test_params)
+ cr_count = cr_result.scalar()
+ logger.warning(f" - chunk_relevance records: {cr_count}")
+
+ # Step 5: Check document_chunks
+ dc_query = text("""
+ SELECT COUNT(*) FROM document_chunks dc
+ WHERE dc.file_path = :file_path
+ AND dc.chunk_size = :chunk_size
+ AND dc.chunk_overlap = :chunk_overlap
+ """)
+ dc_params = {
+ "file_path": str(file_path),
+ "chunk_size": config["chunk_size"],
+ "chunk_overlap": config["chunk_overlap"],
+ }
+ dc_result = conn.execute(dc_query, dc_params)
+ dc_count = dc_result.scalar()
+ logger.warning(f" - document_chunks records: {dc_count}")
# Add chunks to their respective questions
for row in chunk_rows:
@@ -569,7 +845,7 @@ def get_analysis(
"evidence_order": row[7],
"relevance_metadata": json.loads(row[8]) if row[8] else {},
}
- logger.info(
+ logger.debug(
f"Raw DB values for chunk - similarity_score: {row[4]}, llm_score: {row[5]}, is_evidence: {row[6]}"
)
results[question_id]["chunks"].append(chunk_info)
diff --git a/report_analyst/core/dataframe_manager.py b/report_analyst/core/dataframe_manager.py
index dc39516e4..6ec63e33d 100644
--- a/report_analyst/core/dataframe_manager.py
+++ b/report_analyst/core/dataframe_manager.py
@@ -62,11 +62,17 @@ def create_analysis_dataframes(
f"Processing question {question_id} with keys: {list(result.keys())}"
)
- # Create analysis row
+ # Create analysis row - ensure score is a number
+ score = result.get("SCORE", 0)
+ try:
+ score = float(score) if score is not None else 0
+ except (ValueError, TypeError):
+ score = 0
+
analysis_row = {
"Question ID": question_id,
"Analysis": result.get("ANSWER", ""),
- "Score": float(result.get("SCORE", 0)),
+ "Score": score,
"Key Evidence": format_list_field(result.get("EVIDENCE", [])),
"Gaps": format_list_field(result.get("GAPS", [])),
"Sources": format_list_field(result.get("SOURCES", [])),
@@ -74,8 +80,9 @@ def create_analysis_dataframes(
analysis_rows.append(analysis_row)
logger.info(f"Added analysis row for question {question_id}")
- # Process chunks - use exactly what's in the database
- chunks = data.get("chunks", [])
+ # Process chunks - prefer the chunks stored on result (added during analysis);
+ # data["chunks"] (loaded from the database) is used only when result has no "chunks" key
+ chunks = result.get("chunks", data.get("chunks", []))
logger.info(
f"Processing {len(chunks)} chunks for question {question_id}"
)
diff --git a/report_analyst/streamlit_app.py b/report_analyst/streamlit_app.py
index 4d3a02a86..125b30bba 100644
--- a/report_analyst/streamlit_app.py
+++ b/report_analyst/streamlit_app.py
@@ -266,10 +266,11 @@ def process_document(
use_llm_scoring: bool = False,
single_call: bool = True,
force_recompute: bool = False,
+ pre_retrieved_chunks: Optional[List[Dict[str, Any]]] = None,
):
"""Delegate to the analyzer's process_document method"""
return self.analyzer.process_document(
- file_path, selected_questions, use_llm_scoring, single_call, force_recompute
+ file_path, selected_questions, use_llm_scoring, single_call, force_recompute, pre_retrieved_chunks
)
@@ -741,7 +742,7 @@ def get_uploaded_files_history(backend_config=None) -> List[Dict]:
def display_analysis_results(
- analysis_df: pd.DataFrame, chunks_df: pd.DataFrame, file_key: str = None
+ analysis_df: pd.DataFrame, chunks_df: pd.DataFrame, file_key: str = None, file_path: str = None, question_set: str = None
) -> None:
"""Display analysis results in a consistent format for both individual and consolidated views"""
try:
@@ -749,6 +750,14 @@ def display_analysis_results(
st.warning("No analysis results to display")
return
+ # Try to import and use PDF viewer component if available
+ pdf_viewer_available = False
+ try:
+ from report_analyst_enterprise.components.streamlit_component.backend import pdf_viewer
+ pdf_viewer_available = True
+ except ImportError:
+ pass
+
# Analysis Results Table
st.subheader("Analysis Results")
st.dataframe(
@@ -783,6 +792,100 @@ def display_analysis_results(
},
)
+ # PDF Viewer with Chunks (if available, file_path provided, and chunks_df is non-empty)
+ if pdf_viewer_available and file_path and not chunks_df.empty:
+ try:
+ # Get question set if not provided
+ if not question_set:
+ question_set = st.session_state.get("question_set", "tcfd")
+
+ # Load questions
+ question_set_obj = question_loader.get_question_set(question_set)
+ questions_data = {}
+ if question_set_obj:
+ for q_id, q_data in question_set_obj.questions.items():
+ questions_data[q_id] = q_data.get("text", q_id)
+
+ # Try to get chunks with full metadata from cache if available
+ # Otherwise, reconstruct from dataframe (without page_number)
+ chunks_by_question = {}
+ try:
+ # Try to get from analyzer cache if available
+ from report_analyst.core.analyzer import DocumentAnalyzer
+ analyzer = DocumentAnalyzer()
+
+ # Get config from session state
+ config = {
+ "chunk_size": st.session_state.get("chunk_size", 500),
+ "chunk_overlap": st.session_state.get("chunk_overlap", 0),
+ "top_k": st.session_state.get("top_k", 10),
+ "model": st.session_state.get("llm_model", "gpt-4o-mini"),
+ "question_set": question_set,
+ }
+
+ # Get cached results with full chunk metadata
+ cached_results = analyzer.cache_manager.get_analysis(
+ file_path=file_path,
+ config=config
+ )
+
+ if cached_results:
+ # Extract chunks with full metadata and normalize page numbers
+ for q_id, data in cached_results.items():
+ if q_id not in chunks_by_question:
+ chunks_by_question[q_id] = []
+ chunks = data.get("chunks", [])
+ # Normalize page_number in metadata (convert from 'source' if needed)
+ for chunk in chunks:
+ if chunk.get("metadata"):
+ metadata = chunk["metadata"]
+ # PyMuPDFReader uses 'source' as page number string, normalize to 'page_number' as integer
+ if "page_number" not in metadata and "source" in metadata:
+ try:
+ metadata["page_number"] = int(metadata["source"])
+ except (ValueError, TypeError):
+ metadata["page_number"] = 1
+ elif "page_number" in metadata:
+ # Ensure it's an integer
+ try:
+ metadata["page_number"] = int(metadata["page_number"])
+ except (ValueError, TypeError):
+ metadata["page_number"] = 1
+ else:
+ # Default to page 1 if no page info
+ metadata["page_number"] = 1
+ chunks_by_question[q_id].extend(chunks)
+ except Exception as cache_error:
+ logger.debug(f"Could not get chunks from cache: {cache_error}")
+ # Fallback (only reached when the cache lookup above raises): reconstruct from dataframe (without page_number)
+ for _, row in chunks_df.iterrows():
+ q_id = row.get("Question ID", "")
+ if q_id not in chunks_by_question:
+ chunks_by_question[q_id] = []
+
+ chunk = {
+ "text": row.get("Chunk Text", ""),
+ "metadata": {}, # No metadata available from dataframe
+ "is_evidence": row.get("Is Evidence", False),
+ "similarity_score": row.get("Vector Similarity", 0.0),
+ "llm_score": row.get("LLM Score"),
+ "chunk_order": row.get("Position", 0),
+ }
+ chunks_by_question[q_id].append(chunk)
+
+ # Display PDF viewer in a collapsed expander
+ with st.expander("📄 PDF Viewer with Chunks", expanded=False):
+ pdf_viewer(
+ pdf_path=file_path,
+ chunks_data=chunks_by_question,
+ questions_data=questions_data,
+ height=800,
+ key=f"pdf_viewer_{file_key}" if file_key else "pdf_viewer"
+ )
+ except Exception as e:
+ logger.warning(f"Could not display PDF viewer: {e}", exc_info=True)
+ # Fall back to table view
+
# Document Chunks Table
if not chunks_df.empty:
st.subheader("Document Chunks")
@@ -1247,7 +1350,13 @@ def display_consolidated_results(analyzer, question_set):
# Display results using the existing display function
file_key = f"{Path(file_path).stem}_cs{selected_config['config']['chunk_size']}"
- display_analysis_results(analysis_df, chunks_df, file_key)
+ display_analysis_results(
+ analysis_df,
+ chunks_df,
+ file_key,
+ file_path=file_path,
+ question_set=question_set
+ )
else:
st.warning("No results found in stored for this configuration")
else:
@@ -1415,7 +1524,11 @@ async def run_analysis(analyzer, file_path, selected_questions, progress_text):
if cached_results and not st.session_state.get("force_recompute", False):
logger.info(f"[CACHE] Cache HIT for config: {config}")
progress_text.success("Found stored results!")
- st.session_state.results = cached_results
+ # Convert cached_results to the expected format: {"answers": {question_id: result}}
+ if "results" not in st.session_state:
+ st.session_state.results = {"answers": {}}
+ for question_id, data in cached_results.items():
+ st.session_state.results["answers"][question_id] = data
logger.info(
f"[ANALYSIS] Writing results to session state for file: {file_path}"
)
@@ -1504,7 +1617,11 @@ async def run_analysis(analyzer, file_path, selected_questions, progress_text):
logger.info(
f"[ANALYSIS] Writing results to session state for file: {file_path}"
)
- st.session_state.results = final_results
+ # Convert final_results to the expected format: {"answers": {question_id: result}}
+ if "results" not in st.session_state:
+ st.session_state.results = {"answers": {}}
+ for question_id, data in final_results.items():
+ st.session_state.results["answers"][question_id] = data
logger.info(f"[ANALYSIS] Attempting to display results for file: {file_path}")
progress_text.success("Analysis complete!")
@@ -2756,8 +2873,8 @@ def main():
with st.sidebar:
nav_page = option_menu(
menu_title=None,
- options=["Upload Report", "Report Analyst", "All Results"],
- icons=["house", "file-text", "bar-chart"],
+ options=["Upload Report", "Report Analyst", "View Report", "All Results"],
+ icons=["house", "file-text", "file-pdf", "bar-chart"],
menu_icon=None,
default_index=0,
orientation="vertical",
@@ -2784,7 +2901,7 @@ def main():
)
except ImportError:
# Fallback to regular radio if package not installed
- nav_options = ["Upload Report", "Report Analyst", "All Results"]
+ nav_options = ["Upload Report", "Report Analyst", "View Report", "All Results"]
nav_page = st.sidebar.radio(
"",
nav_options,
@@ -3548,8 +3665,13 @@ def main():
create_analysis_dataframes(all_results)
)
file_key = Path(file_path).stem
+ question_set = config.get("question_set", st.session_state.get("question_set", "tcfd"))
display_analysis_results(
- analysis_df, chunks_df, file_key
+ analysis_df,
+ chunks_df,
+ file_key,
+ file_path=str(file_path),
+ question_set=question_set
)
progress_text.success(
f"✓ Analysis complete for {len(selected_questions)} questions"
@@ -3571,6 +3693,216 @@ def main():
st.error("File not found: No file path available. Please select a valid file.")
else:
st.error(f"File not found: {file_path}. Please ensure the file exists.")
+
+ # Display results if they exist - check both session state and database
+ # First, check if we have results in session state
+ has_dataframes = (
+ "analysis_df" in st.session_state and
+ "chunks_df" in st.session_state and
+ not st.session_state.analysis_df.empty
+ )
+ has_raw_results = (
+ "results" in st.session_state and
+ "answers" in st.session_state.results and
+ len(st.session_state.results["answers"]) > 0
+ )
+
+ # Always try to load from database/cache manager when a file is selected (even if session state has results, database is source of truth)
+ if previous_files and "previous_file" in st.session_state:
+ try:
+ # Get the selected file
+ prev_file = st.session_state.previous_file
+ selected_file_obj = None
+ if isinstance(prev_file, dict):
+ selected_file_obj = prev_file
+ else:
+ for f in previous_files:
+ if f["name"] == prev_file or f.get("path") == prev_file:
+ selected_file_obj = f
+ break
+
+ if selected_file_obj and "analyzer" in st.session_state:
+ # Get file path
+ selected_uri = selected_file_obj.get("uri", selected_file_obj.get("path", ""))
+ is_backend = selected_uri.startswith("urn:report-analyst:backend:")
+
+ if is_backend:
+ file_path_for_cache = selected_uri
+ else:
+ file_path_for_cache = selected_file_obj.get("path", "")
+ if file_path_for_cache.startswith("file://"):
+ file_path_for_cache = file_path_for_cache.replace("file://", "")
+
+ # Normalize path - resolve to absolute path for comparison
+ try:
+ file_path_for_cache = str(Path(file_path_for_cache).resolve())
+ except Exception:
+ pass # Keep original if resolve fails
+
+ # Get question set
+ question_set = st.session_state.get("new_question_set", "tcfd")
+
+ # Map question set to database identifier
+ question_set_mapping = {
+ "tcfd": "tcfd",
+ "s4m": "s4m",
+ "lucia": "lucia",
+ "everest": "ev",
+ }
+ db_question_set = question_set_mapping.get(question_set, question_set)
+
+ # Get current config
+ config = {
+ "chunk_size": st.session_state.get("new_chunk_size", 500),
+ "chunk_overlap": st.session_state.get("new_overlap", 20),
+ "top_k": st.session_state.get("new_top_k", 5),
+ "model": st.session_state.get("new_llm_model", "gpt-4o-mini"),
+ "question_set": question_set,
+ }
+
+ # Get all question IDs for this question set to load all results
+ try:
+ # Use global question_loader or create a new one
+ from report_analyst.core.question_loader import get_question_loader
+ q_loader = get_question_loader()
+ question_set_obj = q_loader.get_question_set(question_set)
+ all_question_ids = list(question_set_obj.questions.keys()) if question_set_obj else []
+ except Exception:
+ all_question_ids = None
+
+ # Try to load results from database with current config
+ logger.info(f"Attempting to load results from database for file: {file_path_for_cache}, config: {config}")
+ cached_results = st.session_state.analyzer.analyzer.cache_manager.get_analysis(
+ file_path=file_path_for_cache,
+ config=config,
+ question_ids=all_question_ids
+ )
+
+ # If no results with exact config match, try to find any config for this file and question set
+ if not cached_results:
+ cache_configs = st.session_state.analyzer.analyzer.cache_manager.check_cache_status()
+ matching_configs = []
+ for cache_config in cache_configs:
+ if len(cache_config) == 6:
+ cfg_file_path, chunk_size, chunk_overlap, top_k, model, qs = cache_config
+ # Normalize both paths for comparison
+ try:
+ cfg_path_normalized = str(Path(str(cfg_file_path)).resolve())
+ file_path_normalized = str(Path(file_path_for_cache).resolve())
+ except Exception:
+ cfg_path_normalized = str(cfg_file_path)
+ file_path_normalized = file_path_for_cache
+
+ if cfg_path_normalized == file_path_normalized and qs == db_question_set:
+ matching_configs.append({
+ "chunk_size": chunk_size,
+ "chunk_overlap": chunk_overlap,
+ "top_k": top_k,
+ "model": model,
+ "question_set": question_set,
+ })
+
+ # If we found any matching configs, use the first one
+ if matching_configs:
+ logger.info(f"Found {len(matching_configs)} matching configs, using first one")
+ config = matching_configs[0]
+ cached_results = st.session_state.analyzer.analyzer.cache_manager.get_analysis(
+ file_path=file_path_for_cache,
+ config=config,
+ question_ids=all_question_ids
+ )
+
+ # If we have results, load them
+ if cached_results:
+ logger.info(f"Successfully loaded {len(cached_results)} results from database")
+ # Store in session state
+ if "results" not in st.session_state:
+ st.session_state.results = {"answers": {}}
+ for question_id, data in cached_results.items():
+ st.session_state.results["answers"][question_id] = data
+
+ # Create dataframes
+ file_key = generate_file_key(file_path_for_cache, st)
+ analysis_df, chunks_df = create_analysis_dataframes(
+ st.session_state.results["answers"],
+ file_key
+ )
+ st.session_state.analysis_df = analysis_df
+ st.session_state.chunks_df = chunks_df
+ st.session_state.analysis_complete = True
+
+ has_dataframes = True
+ has_raw_results = True
+ logger.info(f"Created dataframes: analysis_df has {len(analysis_df)} rows, chunks_df has {len(chunks_df)} rows")
+ else:
+ logger.info(f"No cached results found for file: {file_path_for_cache} with config: {config}")
+ except Exception as e:
+ logger.error(f"Error loading results from database: {e}", exc_info=True)
+
+ # Display results if we have them
+ if has_dataframes or has_raw_results:
+ # If we have raw results but no dataframes, create them
+ if has_raw_results and not has_dataframes:
+ try:
+ # Get file path for generating file key
+ display_file_path = None
+ if previous_files and "previous_file" in st.session_state:
+ prev_file = st.session_state.previous_file
+ if isinstance(prev_file, dict):
+ display_file_path = prev_file.get("path", "")
+ else:
+ for f in previous_files:
+ if f["name"] == prev_file or f.get("path") == prev_file:
+ display_file_path = f.get("path", "")
+ break
+
+ file_key = generate_file_key(display_file_path, st) if display_file_path else "analysis"
+
+ # Create dataframes from raw results
+ analysis_df, chunks_df = create_analysis_dataframes(
+ st.session_state.results["answers"],
+ file_key
+ )
+ st.session_state.analysis_df = analysis_df
+ st.session_state.chunks_df = chunks_df
+ st.session_state.analysis_complete = True
+ has_dataframes = True
+ except Exception as e:
+ logger.error(f"Error creating dataframes from session state results: {e}", exc_info=True)
+ st.warning("Results found but could not be displayed. Please re-run analysis.")
+ has_dataframes = False
+
+ # Display results if we have dataframes
+ if has_dataframes:
+ # Get file path for display
+ display_file_path = None
+ if previous_files and "previous_file" in st.session_state:
+ prev_file = st.session_state.previous_file
+ if isinstance(prev_file, dict):
+ display_file_path = prev_file.get("path", "")
+ else:
+ for f in previous_files:
+ if f["name"] == prev_file or f.get("path") == prev_file:
+ display_file_path = f.get("path", "")
+ break
+
+ # Get question set
+ question_set = st.session_state.get("new_question_set", "tcfd")
+
+ # Generate file key
+ if display_file_path:
+ file_key = Path(display_file_path).stem
+ else:
+ file_key = "analysis"
+
+ # Display the results
+ display_analysis_results(
+ st.session_state.analysis_df,
+ st.session_state.chunks_df,
+ file_key=file_key,
+ file_path=display_file_path,
+ question_set=question_set
+ )
else:
st.info("No previously analyzed reports found")
@@ -3683,6 +4015,34 @@ def main():
""", unsafe_allow_html=True)
+ # Try to import JSON Schema form component (enterprise feature)
+ try:
+ # Use the proper Streamlit custom component
+ from report_analyst_enterprise.components.streamlit_component.backend import json_schema_form
+ import json
+ # Path is already imported at the top of the file
+
+ JSON_SCHEMA_FORM_AVAILABLE = True
+
+ # Load PDF upload schema
+ schema_path = Path(__file__).parent.parent / "report_analyst_enterprise" / "components" / "schemas" / "pdf_upload_schema.json"
+ ui_schema_path = Path(__file__).parent.parent / "report_analyst_enterprise" / "components" / "schemas" / "pdf_upload_ui_schema.json"
+
+ if schema_path.exists() and ui_schema_path.exists():
+ with open(schema_path) as f:
+ pdf_upload_schema = json.load(f)
+ with open(ui_schema_path) as f:
+ pdf_upload_ui_schema = json.load(f)
+ else:
+ JSON_SCHEMA_FORM_AVAILABLE = False
+ pdf_upload_schema = None
+ pdf_upload_ui_schema = None
+ except ImportError:
+ JSON_SCHEMA_FORM_AVAILABLE = False
+ pdf_upload_schema = None
+ pdf_upload_ui_schema = None
+
+ # File upload with optional metadata form
uploaded_file = st.file_uploader(
"Choose a PDF file",
type="pdf",
@@ -3690,6 +4050,48 @@ def main():
help="Limit 200MB per file • PDF"
)
+ # Show metadata form if JSON Schema form is available
+ pdf_metadata = None
+ company_metadata = None
+
+ if JSON_SCHEMA_FORM_AVAILABLE:
+ # ESRS Company Information Form
+ esrs_schema_path = Path(__file__).parent.parent / "report_analyst_enterprise" / "components" / "schemas" / "esrs_company_schema.json"
+ esrs_ui_schema_path = Path(__file__).parent.parent / "report_analyst_enterprise" / "components" / "schemas" / "esrs_company_ui_schema.json"
+
+ if esrs_schema_path.exists() and esrs_ui_schema_path.exists():
+ with open(esrs_schema_path) as f:
+ esrs_company_schema = json.load(f)
+ with open(esrs_ui_schema_path) as f:
+ esrs_company_ui_schema = json.load(f)
+
+ with st.expander("ESRS Company Information", expanded=True):
+ st.caption("Enter company data aligned with ESRS XBRL taxonomy requirements")
+ company_metadata = json_schema_form(
+ schema=esrs_company_schema,
+ ui_schema=esrs_company_ui_schema,
+ key="esrs_company_form",
+ height=700
+ )
+ if company_metadata and company_metadata.get("type") == "submit":
+ st.success("Company information saved!")
+ st.session_state.esrs_company_metadata = company_metadata.get("formData", company_metadata)
+
+ # Basic PDF metadata form
+ if pdf_upload_schema:
+ with st.expander("Add Document Metadata (Optional)", expanded=False):
+ st.caption("Add metadata like category, tags, and description to help organize your documents.")
+ pdf_metadata = json_schema_form(
+ schema=pdf_upload_schema,
+ ui_schema=pdf_upload_ui_schema,
+ key="pdf_metadata_form",
+ height=500
+ )
+ if pdf_metadata:
+ st.success("Metadata saved!")
+ # Store in session state for use after upload
+ st.session_state.pdf_metadata = pdf_metadata
+
if uploaded_file:
# Handle upload based on mode
if use_s3_upload and BACKEND_INTEGRATION_AVAILABLE:
@@ -3778,6 +4180,317 @@ def main():
st.rerun()
# All Results page
+ elif nav_page == "View Report":
+ st.header("View Report")
+ st.write("View PDF with chunks and analysis results by question")
+
+ # Get file list for dropdown (including backend resources if enabled)
+ backend_config = st.session_state.get("backend_config")
+ previous_files = get_uploaded_files_history(backend_config=backend_config)
+
+ if not previous_files:
+ st.info("No reports available. Please upload a report first.")
+ else:
+ # File selector
+ selected_file_dropdown = st.selectbox(
+ "Select Report",
+ options=previous_files,
+ format_func=lambda x: x["name"],
+ key="view_report_file",
+ )
+
+ if selected_file_dropdown:
+ selected_uri = selected_file_dropdown.get("uri", selected_file_dropdown.get("path", ""))
+ is_backend = selected_uri.startswith("urn:report-analyst:backend:")
+
+ # Determine file path: use URI for backend, absolute path for local files
+ if is_backend:
+ file_path = selected_uri # Use URN for backend resources
+ else:
+ file_path = selected_file_dropdown.get("path", "")
+ # Handle file:// URI format
+ if file_path.startswith("file://"):
+ file_path = file_path.replace("file://", "")
+ # Resolve to absolute path (same as Report Analyst)
+ file_path = str(Path(file_path).resolve()) if file_path else file_path
+
+ # Question set selection
+ selected_set = st.selectbox(
+ "Select Question Set",
+ options=list(question_sets.keys()),
+ format_func=lambda x: question_sets[x]["name"],
+ key="view_report_set",
+ )
+
+ if selected_set and file_path:
+ # Load questions (always needed for PDF viewer)
+ # Import the loader accessor locally and obtain a question loader instance
+ from report_analyst.core.question_loader import get_question_loader
+ q_loader = get_question_loader()
+ question_set_obj = q_loader.get_question_set(selected_set)
+ questions_data = {}
+ if question_set_obj:
+ for q_id, q_data in question_set_obj.questions.items():
+ questions_data[q_id] = q_data.get("text", q_id)
+
+ # Try to get cached results (optional - PDF will show even without them)
+ cached_results = None
+ selected_config = None
+ chunks_by_question = {}
+ analysis_by_question = {}
+
+ try:
+ # Map question set to database identifier
+ question_set_mapping = {
+ "tcfd": "tcfd",
+ "s4m": "s4m",
+ "lucia": "lucia",
+ "everest": "ev",
+ }
+ db_question_set = question_set_mapping.get(selected_set, selected_set)
+
+ # Get all cache configs
+ cache_configs = analyzer.analyzer.cache_manager.check_cache_status()
+ logger.info(f"Found {len(cache_configs)} total cache configs")
+ logger.info(f"Looking for file_path: {file_path}, question_set: {db_question_set}")
+
+ # Filter configs for this file and question set
+ matching_configs = []
+ for config in cache_configs:
+ if len(config) == 6:
+ cfg_file_path, chunk_size, chunk_overlap, top_k, model, qs = config
+ # Match file path and question set
+ # Compare both as strings to handle path variations
+ if str(cfg_file_path) == str(file_path) and qs == db_question_set:
+ matching_configs.append({
+ "chunk_size": chunk_size,
+ "chunk_overlap": chunk_overlap,
+ "top_k": top_k,
+ "model": model,
+ "question_set": selected_set, # Use original question set ID - get_analysis will map it internally
+ })
+
+ logger.info(f"Found {len(matching_configs)} matching configs for file and question set")
+
+ if matching_configs:
+ # Let user select config if multiple, otherwise use first
+ if len(matching_configs) > 1:
+ config_options = [
+ f"Chunk: {cfg['chunk_size']}, Overlap: {cfg['chunk_overlap']}, Top-K: {cfg['top_k']}, Model: {cfg['model']}"
+ for cfg in matching_configs
+ ]
+ selected_config_idx = st.selectbox(
+ "Select Configuration",
+ options=range(len(matching_configs)),
+ format_func=lambda i: config_options[i],
+ key="view_report_config",
+ )
+ selected_config = matching_configs[selected_config_idx]
+ else:
+ selected_config = matching_configs[0]
+
+ # Get cached results with the selected config
+ # Note: get_analysis will map question_set internally, so we pass the ID
+ logger.info(f"Retrieving cached results with config: {selected_config}")
+ # Get all question IDs for this question set
+ all_question_ids = list(questions_data.keys())
+ logger.info(f"Retrieving chunks for {len(all_question_ids)} questions: {all_question_ids}")
+ cached_results = analyzer.analyzer.cache_manager.get_analysis(
+ file_path=file_path,
+ config=selected_config,
+ question_ids=all_question_ids
+ )
+ logger.info(f"Retrieved cached results for {len(cached_results) if cached_results else 0} questions")
+
+ if cached_results:
+ # Prepare chunks by question and normalize page numbers
+ for q_id, data in cached_results.items():
+ chunks = data.get("chunks", [])
+ # Normalize page_number in metadata (convert from 'source' if needed)
+ for chunk in chunks:
+ if chunk.get("metadata"):
+ metadata = chunk["metadata"]
+ # PyMuPDFReader uses 'source' as page number string, normalize to 'page_number' as integer
+ if "page_number" not in metadata and "source" in metadata:
+ try:
+ metadata["page_number"] = int(metadata["source"])
+ except (ValueError, TypeError):
+ metadata["page_number"] = 1
+ elif "page_number" in metadata:
+ # Ensure it's an integer
+ try:
+ metadata["page_number"] = int(metadata["page_number"])
+ except (ValueError, TypeError):
+ metadata["page_number"] = 1
+ else:
+ # Default to page 1 if no page info
+ metadata["page_number"] = 1
+ chunks_by_question[q_id] = chunks
+ logger.info(f"Question {q_id}: Found {len(chunks)} chunks")
+ if chunks:
+ logger.debug(f"First chunk sample for {q_id}: {chunks[0] if chunks else 'None'}")
+ result = data.get("result", {})
+ # Ensure score is a number, not a string
+ score = result.get("SCORE", 0)
+ try:
+ score = float(score) if score is not None else 0
+ except (ValueError, TypeError):
+ score = 0
+
+ analysis_by_question[q_id] = {
+ "answer": result.get("ANSWER", ""),
+ "score": score,
+ "evidence": result.get("EVIDENCE", []),
+ "gaps": result.get("GAPS", []),
+ }
+
+ # Log total chunks for debugging
+ total_chunks = sum(len(chunks) for chunks in chunks_by_question.values())
+ logger.info(f"Total chunks prepared for PDF viewer: {total_chunks}")
+ else:
+ st.info("No cached analysis results found. PDF will display without chunks.")
+ else:
+ st.info(f"No cached results found for this file and question set '{selected_set}'. PDF will display without chunks. Run analysis in 'Report Analyst' tab to see chunks.")
+
+ except Exception as e:
+ logger.error(f"Error getting cached results: {e}", exc_info=True)
+ st.warning(f"Could not load cached results: {str(e)}. PDF will display without chunks.")
+
+ # Try to import PDF viewer
+ pdf_viewer_available = False
+ try:
+ from report_analyst_enterprise.components.streamlit_component.backend import pdf_viewer
+ pdf_viewer_available = True
+ except ImportError:
+ pass
+
+ # Create two-column layout: questions on left, PDF viewer on right
+ if pdf_viewer_available:
+ left_col, right_col = st.columns([1, 1])
+ else:
+ left_col = st.container()
+ right_col = None
+
+ with left_col:
+ st.subheader("Questions & Chunks")
+
+ if cached_results and chunks_by_question:
+ # Sort questions by question_id for consistent display
+ sorted_question_ids = sorted(questions_data.keys())
+
+ for q_id in sorted_question_ids:
+ question_text = questions_data[q_id]
+ chunks = chunks_by_question.get(q_id, [])
+ analysis = analysis_by_question.get(q_id, {})
+
+ with st.expander(f"**{q_id}**: {question_text[:80]}{'...' if len(question_text) > 80 else ''}", expanded=False):
+ if chunks:
+ # Sort chunks: evidence first, then by score (higher is better)
+ sorted_chunks = sorted(
+ chunks,
+ key=lambda c: (
+ not c.get("is_evidence", False), # Evidence first (False < True)
+ -(c.get("llm_score") if c.get("llm_score") is not None else c.get("similarity_score", 0)) # Higher scores first
+ )
+ )
+
+ # Create dataframe for chunks with chunk IDs for navigation
+ chunk_rows = []
+ chunk_id_map = {} # Map row index to chunk_id
+ for idx, chunk in enumerate(sorted_chunks):
+ chunk_order = chunk.get('chunk_order', 0)
+ # Generate chunk ID: "question_id_chunk_order"
+ chunk_id = f"{q_id}_{chunk_order}"
+ chunk_id_map[idx] = chunk_id
+ chunk_rows.append({
+ "Chunk": f"Chunk {chunk_order + 1}",
+ "Text": chunk.get("text", "")[:200] + ("..." if len(chunk.get("text", "")) > 200 else ""),
+ "Page": chunk.get("metadata", {}).get("page_number", "N/A"),
+ "Evidence": "✓" if chunk.get("is_evidence", False) else "",
+ "Similarity": f"{chunk.get('similarity_score', 0):.3f}",
+ "LLM Score": f"{chunk.get('llm_score', 0):.3f}" if chunk.get("llm_score") else "N/A",
+ })
+
+ chunks_df = pd.DataFrame(chunk_rows)
+
+ # Use session state to track selected chunk for this question
+ chunk_selection_key = f"selected_chunk_{q_id}_{selected_set}"
+
+ # Add a "Select" column with buttons for each chunk
+ select_buttons = []
+ for idx in range(len(chunks_df)):
+ chunk_id = chunk_id_map[idx]
+ select_buttons.append(chunk_id)
+
+ # Display chunks with clickable select buttons
+ for idx, row in chunks_df.iterrows():
+ chunk_id = chunk_id_map[idx]
+ col1, col2 = st.columns([0.12, 0.88])
+ with col1:
+ if st.button("📍", key=f"select_chunk_{chunk_id}", help="Click to highlight this chunk in PDF", use_container_width=True):
+ st.session_state[chunk_selection_key] = chunk_id
+ st.rerun()
+ with col2:
+ st.markdown(f"**{row['Chunk']}** | Page {row['Page']} | {row['Evidence']} | Similarity: {row['Similarity']}")
+ st.caption(row['Text'])
+
+ # Also show as compact dataframe for overview
+ st.dataframe(
+ chunks_df,
+ use_container_width=True,
+ hide_index=True,
+ column_config={
+ "Chunk": st.column_config.TextColumn("Chunk", width="small"),
+ "Text": st.column_config.TextColumn("Text", width="large"),
+ "Page": st.column_config.TextColumn("Page", width="small"),
+ "Evidence": st.column_config.TextColumn("Evidence", width="small"),
+ "Similarity": st.column_config.TextColumn("Similarity", width="small"),
+ "LLM Score": st.column_config.TextColumn("LLM Score", width="small"),
+ }
+ )
+
+ # Show analysis result below chunks
+ st.markdown("---")
+ st.markdown("**Analysis Result:**")
+ if analysis.get("answer"):
+ st.write(analysis["answer"])
+ if analysis.get("score") is not None:
+ # Handle score as either number or string
+ try:
+ score_value = float(analysis["score"])
+ st.metric("Score", f"{score_value:.1f}")
+ except (ValueError, TypeError):
+ # If score is not a number, display as-is
+ st.metric("Score", str(analysis["score"]))
+ else:
+ st.info("No chunks available for this question.")
+ else:
+ st.info("No cached analysis results available. Run analysis in 'Report Analyst' tab to see chunks and analysis.")
+
+ # PDF viewer on the right - always show if file is selected
+ if pdf_viewer_available and right_col:
+ with right_col:
+ st.subheader("PDF Viewer")
+
+ # Get selected chunk ID from session state (check all questions)
+ selected_chunk_id = None
+ for q_id_check in questions_data.keys():
+ chunk_key = f"selected_chunk_{q_id_check}_{selected_set}"
+ if chunk_key in st.session_state:
+ selected_chunk_id = st.session_state[chunk_key]
+ break # Use first found, or could use most recent
+
+ pdf_viewer(
+ pdf_path=file_path,
+ chunks_data=chunks_by_question,
+ questions_data=questions_data,
+ highlight_chunk_id=selected_chunk_id,
+ height=800,
+ key=f"view_report_pdf_viewer_{selected_set}"
+ )
+ elif not pdf_viewer_available:
+ st.info("PDF viewer component not available. Install enterprise components to enable PDF viewing.")
+
elif nav_page == "All Results":
st.header("View All Results")
st.write("View and export consolidated results for all analyzed reports")
diff --git a/report_analyst_enterprise/components/streamlit_component/PDF_VIEWER_README.md b/report_analyst_enterprise/components/streamlit_component/PDF_VIEWER_README.md
new file mode 100644
index 000000000..9cbbbebfd
--- /dev/null
+++ b/report_analyst_enterprise/components/streamlit_component/PDF_VIEWER_README.md
@@ -0,0 +1,180 @@
+# PDF Viewer Component with Chunks
+
+A Streamlit custom component that displays PDFs with chunk annotations, allowing users to view chunks per question and filter by evidence.
+
+## Features
+
+- **PDF Display**: Renders PDF documents using PDF.js
+- **Chunk Annotations**: Shows chunks associated with each question
+- **Evidence Filtering**: Filter to show only evidence chunks
+- **Question Navigation**: Select a question to see its associated chunks
+- **Page Navigation**: Navigate to specific pages and see chunk highlights
+- **Works Standalone**: Can be used outside Streamlit as a web component
+
+## Architecture
+
+The component follows a three-layer architecture:
+
+1. **Web Component** (`web/src/pdf-viewer.js`): Framework-agnostic web component built directly on PDF.js (a custom implementation that does not use the streamlit-pdf-viewer project)
+2. **React Wrapper** (`frontend/src/pdf-viewer.tsx`): React component that wraps the web component for Streamlit
+3. **Streamlit Backend** (`backend/pdf_viewer.py`): Python interface for Streamlit
+
+**Note**: This is a custom implementation built from scratch on PDF.js. It does not use or depend on the streamlit-pdf-viewer repository; it shares only the underlying PDF.js library and is purpose-built for displaying chunks per question with evidence filtering.
+
+## Development Setup
+
+### Prerequisites
+
+- Node.js and npm
+- Python with Streamlit
+
+### Building the Component
+
+1. **Build the web component** (framework-agnostic):
+```bash
+cd report_analyst_enterprise/components/web
+npm install
+npm run build
+```
+
+This creates `dist/pdf-viewer.es.js` which is used by both standalone and Streamlit versions.
+
+2. **Build the Streamlit component**:
+```bash
+cd report_analyst_enterprise/components/streamlit_component/frontend
+npm install
+npm run build:pdf-viewer
+```
+
+### Development Mode
+
+For hot-reload during development:
+
+1. **Start the PDF viewer dev server** (in one terminal):
+```bash
+cd report_analyst_enterprise/components/streamlit_component/frontend
+npm run dev:pdf-viewer
+```
+
+This starts a dev server on port 3002.
+
+2. **Run your Streamlit app** (in another terminal):
+```bash
+streamlit run report_analyst/streamlit_app.py
+```
+
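+The component automatically uses the dev server when it is running: the backend probes `localhost` ports 3002, 3003, and 3004 and uses the first one that accepts a connection.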
+
+## Usage in Streamlit
+
+```python
+from report_analyst_enterprise.components.streamlit_component.backend import pdf_viewer
+
+# Prepare data
+chunks_by_question = {
+ "q1": [
+ {
+ "text": "Chunk text...",
+ "metadata": {"page_number": 1},
+ "is_evidence": True,
+ "similarity_score": 0.85,
+ "llm_score": 0.92,
+ "chunk_order": 0
+ }
+ ]
+}
+
+questions_data = {
+ "q1": "How does the organization identify climate risks?"
+}
+
+# Display the component
+pdf_viewer(
+ pdf_path="/path/to/document.pdf",
+ chunks_data=chunks_by_question,
+ questions_data=questions_data,
+ selected_question_id="q1", # Optional
+ show_evidence_only=False, # Optional
+ height=800,
+ key="my_pdf_viewer"
+)
+```
+
+## Standalone Usage
+
+The web component can be used outside Streamlit. See `web/examples/pdf-viewer-standalone.html` for an example.
+
+```html
+
+
+
+
+
+
+
+
+
+
+
+```
+
+## Data Format
+
+### Chunks
+
+Each chunk should have:
+- `text`: The chunk text content
+- `metadata`: Object containing metadata (should include `page_number`)
+- `is_evidence`: Boolean indicating if this chunk is evidence
+- `similarity_score`: Float similarity score
+- `llm_score`: Optional float LLM relevance score
+- `chunk_order`: Integer position of chunk
+
+### Questions
+
+Questions should be provided as a dictionary mapping question_id to question text:
+```python
+{
+ "q1": "Question text here",
+ "q2": "Another question..."
+}
+```
+
+## Integration with Streamlit App
+
+The component is integrated into `report_analyst/streamlit_app.py` in the `display_analysis_results()` function. It automatically appears when:
+- The PDF viewer component is available (enterprise feature)
+- A file path is provided
+- Chunks data is available
+
+The component appears in an expander section titled "📄 PDF Viewer with Chunks".
+
+## Troubleshooting
+
+### Component not loading
+
+1. Check that the dev server is running (for development) or the component is built (for production)
+2. Check browser console for errors
+3. Verify PDF.js is loading correctly
+
+### PDF not displaying
+
+1. Check that the PDF path is correct and accessible
+2. For local files, ensure the path is absolute or relative to the Streamlit app
+3. Check browser console for PDF.js errors
+
+### Chunks not showing
+
+1. Verify chunks data format matches the expected structure
+2. Check that `page_number` is included in chunk metadata
+3. Verify questions data is provided correctly
+
+
diff --git a/report_analyst_enterprise/components/streamlit_component/backend/__init__.py b/report_analyst_enterprise/components/streamlit_component/backend/__init__.py
index 01ab57640..dfadea552 100644
--- a/report_analyst_enterprise/components/streamlit_component/backend/__init__.py
+++ b/report_analyst_enterprise/components/streamlit_component/backend/__init__.py
@@ -1,9 +1,10 @@
"""
-Streamlit custom component backend for JSON Schema form.
+Streamlit custom component backend for JSON Schema form and PDF viewer.
"""
from .json_schema_form import json_schema_form
+from .pdf_viewer import pdf_viewer
-__all__ = ['json_schema_form']
+__all__ = ['json_schema_form', 'pdf_viewer']
diff --git a/report_analyst_enterprise/components/streamlit_component/backend/pdf_viewer.py b/report_analyst_enterprise/components/streamlit_component/backend/pdf_viewer.py
new file mode 100644
index 000000000..5d9f72b72
--- /dev/null
+++ b/report_analyst_enterprise/components/streamlit_component/backend/pdf_viewer.py
@@ -0,0 +1,192 @@
+"""
+Streamlit custom component backend for PDF viewer with chunks.
+
+This creates a proper Streamlit custom component using the framework-agnostic
+web component, which internally uses PDF.js.
+"""
+
+import base64
+import json
+import logging
+import socket
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+import streamlit.components.v1 as components
+
+logger = logging.getLogger(__name__)
+
+# Get the path to the frontend
+_COMPONENT_DIR = Path(__file__).parent.parent / "frontend"
+_RELEASE_DIR = _COMPONENT_DIR / "build"
+
+
+def pdf_viewer(
+ pdf_path: str,
+ chunks_data: Dict[str, List[Dict[str, Any]]],
+ questions_data: Dict[str, str],
+ selected_question_id: Optional[str] = None,
+ show_evidence_only: bool = False,
+ highlight_chunk_id: Optional[str] = None,
+ key: Optional[str] = None,
+ height: int = 800,
+) -> Optional[Dict[str, Any]]:
+ """
+ Render a PDF viewer with chunk annotations in Streamlit using a custom component.
+
+ Args:
+ pdf_path: Path to PDF file (local file path or URI)
+ chunks_data: Dictionary mapping question_id to list of chunk dictionaries.
+ Each chunk should have:
+ - text: str
+ - metadata: dict (with page_number)
+ - is_evidence: bool
+ - similarity_score: float
+ - llm_score: float (optional)
+ - chunk_order: int
+ questions_data: Dictionary mapping question_id to question text
+ selected_question_id: Optional question ID to highlight initially
+ show_evidence_only: Whether to filter to show only evidence chunks
+ highlight_chunk_id: Optional chunk ID to highlight (format: "question_id_chunk_order")
+ key: Optional key for Streamlit component (for state management)
+ height: Height of the component in pixels
+
+ Returns:
+ Dictionary with event data if chunk was selected, None otherwise
+ """
+ # Check for dev server availability (prefer dev server for hot reload)
+ dev_server_port = None
+ dev_server_available = False
+
+ # Check common dev server ports (use 3002 for PDF viewer, different from JSON form)
+ for port in [3002, 3003, 3004]:
+ try:
+ sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+ sock.settimeout(1)
+ result = sock.connect_ex(('localhost', port))
+ sock.close()
+ if result == 0:
+ dev_server_port = port
+ dev_server_available = True
+ break
+ except:
+ pass
+
+ if dev_server_available:
+ # Use dev server (hot reload) - need to specify the HTML file
+ logger.info(f"Using PDF viewer component from dev server (http://localhost:{dev_server_port})")
+ component = components.declare_component(
+ "pdf_viewer",
+ url=f"http://localhost:{dev_server_port}",
+ )
+ elif _RELEASE_DIR.exists() and any(_RELEASE_DIR.iterdir()):
+ # Use built component - check for PDF viewer subdirectory
+ pdf_viewer_dir = _RELEASE_DIR / "pdf-viewer"
+ if pdf_viewer_dir.exists() and (pdf_viewer_dir / "index.html").exists():
+ # Use PDF viewer specific subdirectory
+ logger.info(f"Using PDF viewer component from build: {pdf_viewer_dir}")
+ component = components.declare_component(
+ "pdf_viewer",
+ path=str(pdf_viewer_dir),
+ )
+ else:
+ # Fallback: check for index-pdf-viewer.html and create subdirectory structure
+ pdf_viewer_html = _RELEASE_DIR / "index-pdf-viewer.html"
+ if pdf_viewer_html.exists():
+ logger.warning("PDF viewer build found but not in expected structure. Please rebuild with: npm run build:pdf-viewer")
+ # Still try to use the build directory
+ logger.info(f"Using PDF viewer component from build (fallback): {_RELEASE_DIR}")
+ component = components.declare_component(
+ "pdf_viewer",
+ path=str(_RELEASE_DIR),
+ )
+ else:
+ # No build and no dev server - show helpful error
+ logger.warning(
+ f"PDF viewer component not built and dev server not running.\n"
+ f"To build the component, run:\n"
+ f" cd {_COMPONENT_DIR}\n"
+ f" npm install\n"
+ f" npm run build:pdf-viewer\n"
+ f"Or for development, start the dev server:\n"
+ f" cd {_COMPONENT_DIR}\n"
+ f" npm run dev:pdf-viewer (in a separate terminal)"
+ )
+ # Still try to declare component - Streamlit will show its own error
+ component = components.declare_component(
+ "pdf_viewer",
+ url="http://localhost:3002",
+ )
+
+ # Prepare PDF data
+ pdf_url = None
+ pdf_data = None
+
+ # Check if it's a local file or URI
+ if pdf_path.startswith("file://") or pdf_path.startswith("http://") or pdf_path.startswith("https://") or pdf_path.startswith("urn:"):
+ # It's a URI, pass it directly
+ pdf_url = pdf_path
+ else:
+ # It's a local file path, convert to base64
+ try:
+ pdf_file = Path(pdf_path)
+ if pdf_file.exists():
+ with open(pdf_file, 'rb') as f:
+ pdf_bytes = f.read()
+ pdf_base64 = base64.b64encode(pdf_bytes).decode('utf-8')
+ pdf_data = f"data:application/pdf;base64,{pdf_base64}"
+ else:
+ logger.warning(f"PDF file not found: {pdf_path}")
+ pdf_url = pdf_path # Fallback: pass as URL
+ except Exception as e:
+ logger.error(f"Error reading PDF file: {e}")
+ pdf_url = pdf_path # Fallback: pass as URL
+
+ # Prepare questions in the format expected by the component
+ questions_list = []
+ for question_id, question_text in questions_data.items():
+ # Get chunks for this question
+ question_chunks = chunks_data.get(question_id, [])
+ questions_list.append({
+ "question_id": question_id,
+ "text": question_text,
+ "chunks": question_chunks
+ })
+
+ # Flatten all chunks for the component (it will filter by question)
+ all_chunks = []
+ for question_id, chunks in chunks_data.items():
+ for chunk in chunks:
+ # Add question_id to chunk for filtering
+ chunk_with_qid = chunk.copy()
+ chunk_with_qid["question_id"] = question_id
+ all_chunks.append(chunk_with_qid)
+
+ # Log chunk data for debugging
+ logger.info(f"PDF viewer: Preparing {len(all_chunks)} total chunks for {len(questions_list)} questions")
+ if all_chunks:
+ logger.debug(f"Sample chunk structure: {all_chunks[0]}")
+ else:
+ logger.warning(f"No chunks found in chunks_data. Keys: {list(chunks_data.keys())}, Total chunks per question: {[len(chunks) for chunks in chunks_data.values()]}")
+
+ # Render component and get result
+ result = component(
+ pdfUrl=pdf_url,
+ pdfData=pdf_data,
+ chunks=json.dumps(all_chunks),
+ questions=json.dumps(questions_list),
+ selectedQuestionId=selected_question_id,
+ showEvidenceOnly=show_evidence_only,
+ key=key,
+ height=height,
+ )
+
+ # Parse result if it's a string
+ if isinstance(result, str):
+ try:
+ result = json.loads(result)
+ except (json.JSONDecodeError, TypeError):
+ pass
+
+ return result
+
diff --git a/report_analyst_enterprise/components/streamlit_component/frontend/index-pdf-viewer.html b/report_analyst_enterprise/components/streamlit_component/frontend/index-pdf-viewer.html
new file mode 100644
index 000000000..6a8a8f953
--- /dev/null
+++ b/report_analyst_enterprise/components/streamlit_component/frontend/index-pdf-viewer.html
@@ -0,0 +1,13 @@
+
+
+
+
+
+ PDF Viewer Component
+
+
+
+
+
+
+
diff --git a/report_analyst_enterprise/components/streamlit_component/frontend/package.json b/report_analyst_enterprise/components/streamlit_component/frontend/package.json
index 0c37438a2..42157f2b9 100644
--- a/report_analyst_enterprise/components/streamlit_component/frontend/package.json
+++ b/report_analyst_enterprise/components/streamlit_component/frontend/package.json
@@ -4,8 +4,10 @@
"description": "Streamlit custom component for JSON Schema forms",
"main": "src/index.tsx",
"scripts": {
- "start": "vite",
- "build": "vite build && node -e \"const fs=require('fs'); const html=fs.readFileSync('build/index.html','utf8'); fs.writeFileSync('build/index.html', html.replace('src=\\\"/index.js\\\"','src=\\\"./index.js\\\"'));\""
+ "start": "vite --config vite.config.ts",
+ "dev:pdf-viewer": "vite --config vite.config.pdf-viewer.ts",
+ "build": "vite build --config vite.config.ts && node -e \"const fs=require('fs'); const html=fs.readFileSync('build/index.html','utf8'); fs.writeFileSync('build/index.html', html.replace('src=\\\"/index.js\\\"','src=\\\"./index.js\\\"'));\"",
+ "build:pdf-viewer": "vite build --config vite.config.pdf-viewer.ts && node -e \"const fs=require('fs'); const path=require('path'); const html=fs.readFileSync('build/index-pdf-viewer.html','utf8'); const fixedHtml=html.replace('src=\\\"/index-pdf-viewer.js\\\"','src=\\\"index-pdf-viewer.js\\\"'); fs.writeFileSync('build/index-pdf-viewer.html', fixedHtml); fs.mkdirSync('build/pdf-viewer', {recursive: true}); if(fs.existsSync('build/index-pdf-viewer.js')) fs.copyFileSync('build/index-pdf-viewer.js', 'build/pdf-viewer/index-pdf-viewer.js'); if(fs.existsSync('build/pdf-viewer.es.js')) { fs.copyFileSync('build/pdf-viewer.es.js', 'build/pdf-viewer/pdf-viewer.es.js'); } else { console.warn('Warning: build/pdf-viewer.es.js not found, PDF viewer web component may not load correctly'); } const pdfViewerHtml=html.replace('src=\\\"/index-pdf-viewer.js\\\"','src=\\\"index-pdf-viewer.js\\\"'); fs.writeFileSync('build/pdf-viewer/index.html', pdfViewerHtml);\""
},
"dependencies": {
"@emotion/react": "^11.14.0",
diff --git a/report_analyst_enterprise/components/streamlit_component/frontend/src/main-pdf-viewer.tsx b/report_analyst_enterprise/components/streamlit_component/frontend/src/main-pdf-viewer.tsx
new file mode 100644
index 000000000..1783a63a6
--- /dev/null
+++ b/report_analyst_enterprise/components/streamlit_component/frontend/src/main-pdf-viewer.tsx
@@ -0,0 +1,66 @@
+import React, { useEffect, useState } from 'react';
+import ReactDOM from 'react-dom/client';
+import { Streamlit } from 'streamlit-component-lib';
+import PdfViewer from './pdf-viewer';
+
+// Call setComponentReady IMMEDIATELY - before React renders
+Streamlit.setComponentReady();
+
+// Streamlit component entry point
+function App() {
+ const [args, setArgs] = useState({});
+
+ // Subscribe to Streamlit render events once on mount; the returned cleanup unsubscribes
+ useEffect(() => {
+ const handleRender = (event: any) => {
+ // The render payload is read from event.detail, falling back to the event object itself
+ const renderData = event.detail || event;
+ if (renderData && renderData.args) {
+ setArgs(renderData.args);
+ }
+ };
+
+ // Listen to Streamlit's event target
+ Streamlit.events.addEventListener(Streamlit.RENDER_EVENT, handleRender);
+
+ // Also listen on window as fallback (NOTE(review): confirm one render is not delivered to both targets)
+ window.addEventListener(Streamlit.RENDER_EVENT, handleRender);
+
+ return () => {
+ Streamlit.events.removeEventListener(Streamlit.RENDER_EVENT, handleRender);
+ window.removeEventListener(Streamlit.RENDER_EVENT, handleRender);
+ };
+ }, []);
+
+ // Until a non-empty args object has been received, render a loading placeholder
+ if (!args || Object.keys(args).length === 0) {
+ return (
+
+
Loading PDF viewer...
+
+ );
+ }
+
+ return (
+
+ );
+}
+
+const root = ReactDOM.createRoot(
+ document.getElementById('root') as HTMLElement
+);
+
+root.render(
+
+
+
+);
+
diff --git a/report_analyst_enterprise/components/streamlit_component/frontend/src/pdf-viewer.tsx b/report_analyst_enterprise/components/streamlit_component/frontend/src/pdf-viewer.tsx
new file mode 100644
index 000000000..935f4d7e1
--- /dev/null
+++ b/report_analyst_enterprise/components/streamlit_component/frontend/src/pdf-viewer.tsx
@@ -0,0 +1,248 @@
+/**
+ * PDF Viewer React component for Streamlit
+ *
+ * Wraps the framework-agnostic web component for use in Streamlit.
+ */
+
+import React, { useEffect, useRef } from "react";
+import { Streamlit } from "streamlit-component-lib";
+
+/** Props passed from Streamlit to the PdfViewer component. */
+interface PdfViewerProps {
+  // URL of the PDF; takes precedence over `pdfData` when both are set.
+  pdfUrl?: string;
+  // PDF content handed to the web component's setPdfData() when no `pdfUrl` is given.
+  pdfData?: string;
+  chunks: string; // JSON string
+  questions: string; // JSON string
+  selectedQuestionId?: string;
+  showEvidenceOnly?: boolean;
+  // Id of the chunk the viewer should navigate to (passed to navigateToChunkById).
+  highlightChunkId?: string;
+}
+
+/**
+ * Imperative API of the `pdf-viewer-with-chunks` custom element as used by
+ * this wrapper. Extends HTMLElement so the result of document.createElement
+ * can be cast to this type.
+ */
+interface PdfViewerElement extends HTMLElement {
+ // Data/display setters: called when the element is created and again when props change.
+ setPdfUrl(url: string): void;
+ setPdfData(data: string): void;
+ setChunks(chunks: any[]): void;
+ setQuestions(questions: any[]): void;
+ setSelectedQuestionId(questionId: string | null): void;
+ setShowEvidenceOnly(show: boolean): void;
+ // Navigation helpers: of these, only navigateToChunkById is called from this file.
+ navigateToPage(pageNum: number): Promise;
+ navigateToChunk(chunk: any): Promise;
+ navigateToChunkById(chunkId: string): Promise;
+}
+
+/**
+ * Parse a JSON-encoded array prop without letting malformed input crash rendering.
+ *
+ * @param raw   JSON string from Streamlit (empty/undefined is treated as "[]").
+ * @param label Prop name, used only in the error log.
+ * @returns The parsed array, or [] when the input is invalid JSON or not an array.
+ */
+const parseJsonArray = (raw: string | undefined, label: string): any[] => {
+  try {
+    const parsed = JSON.parse(raw || "[]");
+    return Array.isArray(parsed) ? parsed : [];
+  } catch (e) {
+    console.error(`PdfViewer: could not parse "${label}" prop as JSON:`, e);
+    return [];
+  }
+};
+
+/**
+ * React wrapper that mounts the `pdf-viewer-with-chunks` web component inside
+ * a container element and forwards the Streamlit props to it.
+ *
+ * - Loads PDF.js and the web component bundle on demand.
+ * - Recreates the viewer element when one of the viewer props changes.
+ * - Reports `chunk-selected` events to Streamlit via setComponentValue.
+ * - Keeps the Streamlit frame height in sync (debounced, minimum 800px).
+ */
+const PdfViewer: React.FC = (props) => {
+  const viewerRef = useRef(null);
+  const containerRef = useRef(null);
+  const heightUpdateTimeoutRef = useRef(null);
+  const lastHeightRef = useRef(0);
+  const observerRef = useRef(null);
+
+  // Parse props. Memoized on the raw strings so the arrays keep their identity
+  // across renders; otherwise the effect below would see "changed" dependencies
+  // on every render and tear down / recreate the viewer each time.
+  const chunks = React.useMemo(() => parseJsonArray(props.chunks, "chunks"), [props.chunks]);
+  const questions = React.useMemo(() => parseJsonArray(props.questions, "questions"), [props.questions]);
+
+  // Debounced height update function
+  const updateFrameHeight = React.useCallback(() => {
+    if (heightUpdateTimeoutRef.current) {
+      clearTimeout(heightUpdateTimeoutRef.current);
+    }
+
+    heightUpdateTimeoutRef.current = setTimeout(() => {
+      try {
+        const container = containerRef.current;
+        if (!container) return;
+
+        const height = Math.max(
+          container.offsetHeight || container.scrollHeight || 800,
+          800 // minimum height
+        );
+
+        // Only report changes larger than 50px (or the very first measurement)
+        // to avoid resizing the frame on every small layout shift.
+        if (Math.abs(height - lastHeightRef.current) > 50 || lastHeightRef.current === 0) {
+          lastHeightRef.current = height;
+          Streamlit.setFrameHeight(height);
+        }
+      } catch (e) {
+        console.debug('Could not set frame height yet:', e);
+      }
+    }, 150);
+  }, []);
+
+  useEffect(() => {
+    // Set by the cleanup below. Script-load and polling callbacks that fire
+    // after this effect run was torn down must not create a viewer, because
+    // they would replace the current one with this run's stale props.
+    let cancelled = false;
+
+    // Push the current props into a viewer element.
+    const applyProps = (viewer: PdfViewerElement) => {
+      if (props.pdfUrl) {
+        viewer.setPdfUrl(props.pdfUrl);
+      } else if (props.pdfData) {
+        viewer.setPdfData(props.pdfData);
+      }
+      viewer.setChunks(chunks);
+      viewer.setQuestions(questions);
+      if (props.selectedQuestionId) {
+        viewer.setSelectedQuestionId(props.selectedQuestionId);
+      }
+      viewer.setShowEvidenceOnly(props.showEvidenceOnly || false);
+    };
+
+    const createViewerElement = () => {
+      if (cancelled || !containerRef.current) return;
+
+      // Remove existing viewer if any
+      const existing = containerRef.current.querySelector('pdf-viewer-with-chunks');
+      if (existing) {
+        existing.remove();
+      }
+
+      // Disconnect previous observer
+      if (observerRef.current) {
+        observerRef.current.disconnect();
+        observerRef.current = null;
+      }
+
+      // Create web component element
+      const viewerElement = document.createElement('pdf-viewer-with-chunks') as PdfViewerElement;
+      viewerRef.current = viewerElement;
+
+      // Set properties
+      applyProps(viewerElement);
+
+      // Set up event listeners
+      const handleChunkSelected = (e: CustomEvent) => {
+        Streamlit.setComponentValue({
+          type: "chunk-selected",
+          chunk: e.detail.chunk,
+          pageNum: e.detail.pageNum,
+        });
+        updateFrameHeight();
+      };
+
+      viewerElement.addEventListener('chunk-selected', handleChunkSelected as EventListener);
+
+      // Append to container
+      containerRef.current.appendChild(viewerElement);
+
+      // Set up mutation observer for dynamic height updates
+      observerRef.current = new MutationObserver(() => {
+        updateFrameHeight();
+      });
+
+      if (containerRef.current) {
+        observerRef.current.observe(containerRef.current, {
+          childList: true,
+          subtree: true,
+          attributes: false
+        });
+      }
+
+      // Initial height update
+      setTimeout(updateFrameHeight, 500);
+    };
+
+    // Poll until the custom element is registered (up to maxAttempts * 100ms),
+    // then create the viewer.
+    const waitForCustomElement = (maxAttempts = 50) => {
+      let attempts = 0;
+      const check = () => {
+        if (cancelled) return;
+        if (customElements.get('pdf-viewer-with-chunks')) {
+          createViewerElement();
+        } else if (attempts < maxAttempts) {
+          attempts++;
+          setTimeout(check, 100);
+        } else {
+          console.error('Custom element pdf-viewer-with-chunks not defined after loading script');
+        }
+      };
+      setTimeout(check, 100);
+    };
+
+    // Load web component script if not already loaded
+    const loadWebComponent = async () => {
+      // Check if web component is already defined
+      if (customElements.get('pdf-viewer-with-chunks')) {
+        createViewerElement();
+        return;
+      }
+
+      // Load PDF.js first
+      if (typeof window.pdfjsLib === 'undefined') {
+        const pdfjsScript = document.createElement('script');
+        pdfjsScript.src = 'https://cdnjs.cloudflare.com/ajax/libs/pdf.js/3.11.174/pdf.min.js';
+        pdfjsScript.async = true;
+        await new Promise((resolve, reject) => {
+          pdfjsScript.onload = resolve;
+          pdfjsScript.onerror = reject;
+          document.head.appendChild(pdfjsScript);
+        });
+      }
+
+      // This run may have been cleaned up while PDF.js was loading.
+      if (cancelled) return;
+
+      // Load the web component script
+      const script = document.createElement('script');
+      script.type = 'module';
+      script.src = './pdf-viewer.es.js';
+
+      script.onload = () => {
+        waitForCustomElement();
+      };
+      script.onerror = (e) => {
+        console.error('Failed to load web component from', script.src, e);
+        // Try absolute path as fallback (for dev server)
+        const fallbackScript = document.createElement('script');
+        fallbackScript.type = 'module';
+        fallbackScript.src = '/pdf-viewer.es.js';
+        fallbackScript.onload = () => {
+          waitForCustomElement();
+        };
+        fallbackScript.onerror = (e2) => {
+          console.error('Failed to load web component from fallback path:', e2);
+        };
+        document.head.appendChild(fallbackScript);
+      };
+      document.head.appendChild(script);
+    };
+
+    // A failed PDF.js download rejects the promise above; log it instead of
+    // leaving an unhandled rejection.
+    loadWebComponent().catch((e) => {
+      console.error('Failed to load PDF viewer dependencies:', e);
+    });
+
+    // Update when props change. When the custom element was already defined,
+    // createViewerElement() has run synchronously above, so this re-applies
+    // the props to the element now that it is attached to the container.
+    if (viewerRef.current) {
+      applyProps(viewerRef.current);
+      updateFrameHeight();
+    }
+
+    return () => {
+      // Cleanup
+      cancelled = true;
+      if (heightUpdateTimeoutRef.current) {
+        clearTimeout(heightUpdateTimeoutRef.current);
+      }
+      if (observerRef.current) {
+        observerRef.current.disconnect();
+        observerRef.current = null;
+      }
+      if (viewerRef.current) {
+        viewerRef.current.remove();
+        viewerRef.current = null;
+      }
+    };
+  }, [props.pdfUrl, props.pdfData, chunks, questions, props.selectedQuestionId, props.showEvidenceOnly, updateFrameHeight]);
+
+  // Watch for highlightChunkId changes and navigate to chunk
+  useEffect(() => {
+    if (props.highlightChunkId && viewerRef.current) {
+      viewerRef.current.navigateToChunkById(props.highlightChunkId);
+      updateFrameHeight();
+    }
+  }, [props.highlightChunkId, updateFrameHeight]);
+
+  return (
+
+  );
+};
+
+export default PdfViewer;
+
diff --git a/report_analyst_enterprise/components/streamlit_component/frontend/vite.config.pdf-viewer.ts b/report_analyst_enterprise/components/streamlit_component/frontend/vite.config.pdf-viewer.ts
new file mode 100644
index 000000000..a6dd4bdf2
--- /dev/null
+++ b/report_analyst_enterprise/components/streamlit_component/frontend/vite.config.pdf-viewer.ts
@@ -0,0 +1,76 @@
+import { defineConfig } from 'vite';
+import react from '@vitejs/plugin-react';
+import { copyFileSync, existsSync } from 'fs';
+import { join } from 'path';
+
+/**
+ * Vite configuration for the Streamlit PDF viewer component.
+ *
+ * Builds `index-pdf-viewer.html` into `build/` (without emptying it) and,
+ * once the bundle is written, copies the prebuilt web component
+ * `../../web/dist/pdf-viewer.es.js` into both `public/` and `build/`.
+ */
+export default defineConfig({
+  // Ensure relative paths in HTML. `base` is a top-level (shared) Vite option;
+  // nested under `build` it is ignored and asset URLs are emitted as absolute.
+  base: './',
+  plugins: [
+    react(),
+    // Plugin to copy web component to build directory
+    {
+      name: 'copy-web-component',
+      writeBundle() {
+        const webComponentPath = join(__dirname, '../../web/dist/pdf-viewer.es.js');
+
+        if (!existsSync(webComponentPath)) {
+          // Previously skipped silently; surface it so a missing web component
+          // build is visible in the build log.
+          console.warn('PDF viewer web component not found at', webComponentPath);
+          return;
+        }
+
+        // public/ is served by the dev server, build/ is the production output.
+        const targets = [
+          { label: 'public', path: join(__dirname, 'public/pdf-viewer.es.js') },
+          { label: 'build', path: join(__dirname, 'build/pdf-viewer.es.js') },
+        ];
+
+        // Each copy is best-effort: a failure is logged and does not fail the build.
+        for (const { label, path } of targets) {
+          try {
+            copyFileSync(webComponentPath, path);
+            console.log(`✓ Copied PDF viewer web component to ${label}/`);
+          } catch (e) {
+            console.warn(`Could not copy PDF viewer web component to ${label}:`, e);
+          }
+        }
+      },
+    },
+  ],
+  define: {
+    'process.env': '{}',
+    'process': JSON.stringify({ env: {} }),
+  },
+  build: {
+    outDir: 'build',
+    emptyOutDir: false, // Don't clean build directory to preserve JSON schema form files
+    rollupOptions: {
+      input: 'index-pdf-viewer.html', // Use PDF viewer HTML as entry point
+      output: {
+        entryFileNames: 'index-pdf-viewer.js',
+        format: 'es',
+      },
+    },
+    // Copy pdf-viewer.es.js to build directory
+    copyPublicDir: true,
+    commonjsOptions: {
+      include: [/node_modules/],
+      transformMixedEsModules: true,
+      strictRequires: true,
+    },
+    target: 'es2020',
+  },
+  optimizeDeps: {
+    include: ['react', 'react-dom', 'streamlit-component-lib'],
+    esbuildOptions: {
+      target: 'es2020',
+    },
+  },
+  server: {
+    port: 3002, // Different port from JSON schema form
+    cors: true,
+  },
+  publicDir: 'public',
+});
+
diff --git a/report_analyst_enterprise/components/web/examples/pdf-viewer-standalone.html b/report_analyst_enterprise/components/web/examples/pdf-viewer-standalone.html
new file mode 100644
index 000000000..fc8e1ccf4
--- /dev/null
+++ b/report_analyst_enterprise/components/web/examples/pdf-viewer-standalone.html
@@ -0,0 +1,92 @@
+
+
+
+
+
+ PDF Viewer with Chunks - Standalone Example
+
+
+
+