From 56cb0b94591e76ab406eff50403d3791b06af6c6 Mon Sep 17 00:00:00 2001 From: John Hall Date: Wed, 12 Nov 2025 17:06:47 -0500 Subject: [PATCH] fix gemini api request and response in rag search --- vector_database_tutorial.qmd | 420 ++++++++++++++++++----------------- 1 file changed, 214 insertions(+), 206 deletions(-) diff --git a/vector_database_tutorial.qmd b/vector_database_tutorial.qmd index ed57522..d5409dc 100644 --- a/vector_database_tutorial.qmd +++ b/vector_database_tutorial.qmd @@ -556,10 +556,84 @@ limits—you can adjust this based on your API tier. In production, you might cache embeddings so you don't need to regenerate them every time you restart your application. -### Google AI API Approach: +### Google Gemini API for Embeddings Generation ```{r} +# General-purpose Gemini API call function +gemini_api_call <- function(prompt, + model = "gemini-2.5-flash", + #model = "gemini-2.5-pro", + max_tokens = 1000, + google_api_key = Sys.getenv("GOOGLE_API_KEY")) { + + # Validate API key + if (google_api_key == "") { + stop("GOOGLE_API_KEY environment variable not set") + } + + # Build the API URL with the API key as query parameter + url <- sprintf( + "https://generativelanguage.googleapis.com/v1beta/models/%s:generateContent?key=%s", + model, + google_api_key + ) + + response <- httr::POST( + url = url, + httr::add_headers("Content-Type" = "application/json"), + body = jsonlite::toJSON(list( + contents = list(list( + parts = list(list( + text = prompt + )) + )), + generationConfig = list( + maxOutputTokens = max_tokens + ) + ), auto_unbox = TRUE) + ) + + if (httr::status_code(response) == 200) { + result <- jsonlite::fromJSON(httr::content(response, "text")) + message("Gemini API request successful") + + if (!is.null(result$candidates) && length(result$candidates) > 0 && !is.null(result$candidates[[1]])) { + + # Extract text from Gemini response structure + content <- result$candidates[[1]] + if (!is.null(content$parts) && !is.null(content$parts[[1]]) && 
!is.null(content$parts[[1]]$text)) { + return(content$parts[[1]]$text) + } + + } else { + + # Fallback if structure is unexpected + warning("Unexpected Gemini API response structure") + return(result) + + } + + } else { + status_code <- httr::status_code(response) + error_content <- httr::content(response, "text") + warning(paste("Gemini API error - Status:", status_code, "Response:", error_content)) + + # Check specific error cases + if (status_code == 401 || status_code == 403) { + warning("Authentication failed - check your GOOGLE_API_KEY") + } else if (status_code == 400) { + warning("Bad request - check API request format") + } else if (status_code == 429) { + warning("Rate limit exceeded - too many requests") + } else if (status_code == 500) { + warning("Internal server error - try again later") + } + + stop("API call failed - see warnings above") + } +} + generate_google_embeddings <- function(texts, api_key = Sys.getenv("GOOGLE_API_KEY")) { embeddings <- list() batch_size <- 100 # API rate limiting @@ -642,6 +716,7 @@ generate_vertex_embeddings <- function(texts, project_id = Sys.getenv("GOOGLE_CL return(embeddings) } + ``` ### Local Embedding Models: @@ -664,22 +739,20 @@ generate_local_embeddings <- function(texts) { } ``` -### Claude Integration for Document Analysis: +### Claude API Integration for Document Analysis: ```{r} -# Enhanced document processing with Claude for intelligent analysis -enhance_document_with_claude <- function(text, claude_api_key = Sys.getenv("ANTHROPIC_API_KEY")) { - # Use Claude for document analysis and enhancement - prompt <- paste( - "Analyze this document and provide:", - "1. A concise summary (2-3 sentences)", - "2. Key topics and themes", - "3. Important entities (people, places, organizations)", - "4. 
Suggested metadata tags", - "Document text:", text, - sep = "\n" - ) +# General-purpose Anthropic API call function +claude_api_call <- function(prompt, + model = "claude-sonnet-4-20250514", + max_tokens = 1000, + claude_api_key = Sys.getenv("ANTHROPIC_API_KEY")) { + + # Validate API key + if (claude_api_key == "") { + stop("ANTHROPIC_API_KEY environment variable not set") + } response <- httr::POST( url = "https://api.anthropic.com/v1/messages", @@ -689,8 +762,8 @@ enhance_document_with_claude <- function(text, claude_api_key = Sys.getenv("ANTH "anthropic-version" = "2023-06-01" ), body = jsonlite::toJSON(list( - model = "claude-sonnet-4-20250514", - max_tokens = 1000, + model = model, + max_tokens = max_tokens, messages = list(list( role = "user", content = prompt @@ -701,6 +774,7 @@ enhance_document_with_claude <- function(text, claude_api_key = Sys.getenv("ANTH if (httr::status_code(response) == 200) { result <- jsonlite::fromJSON(httr::content(response, "text")) message("Claude API request successful") + # Handle the data.frame structure of content if (is.data.frame(result$content)) { return(result$content$text[1]) @@ -720,22 +794,26 @@ enhance_document_with_claude <- function(text, claude_api_key = Sys.getenv("ANTH warning("Bad request - check API request format") } else if (status_code == 429) { warning("Rate limit exceeded - too many requests") + } else if (status_code == 500) { + warning("Internal server error - try again later") } - return("Analysis unavailable - API connection failed") + stop("API call failed - see warnings above") } } -# General-purpose Anthropic API call function -anthropic_api_call <- function(prompt, - model = "claude-sonnet-4-20250514", - max_tokens = 1000, - claude_api_key = Sys.getenv("ANTHROPIC_API_KEY")) { - - # Validate API key - if (claude_api_key == "") { - stop("ANTHROPIC_API_KEY environment variable not set") - } +# Enhanced document processing with Claude for intelligent analysis +enhance_document_with_claude <- 
function(text, claude_api_key = Sys.getenv("ANTHROPIC_API_KEY")) { + # Use Claude for document analysis and enhancement + prompt <- paste( + "Analyze this document and provide:", + "1. A concise summary (2-3 sentences)", + "2. Key topics and themes", + "3. Important entities (people, places, organizations)", + "4. Suggested metadata tags", + "Document text:", text, + sep = "\n" + ) response <- httr::POST( url = "https://api.anthropic.com/v1/messages", @@ -745,8 +823,8 @@ anthropic_api_call <- function(prompt, "anthropic-version" = "2023-06-01" ), body = jsonlite::toJSON(list( - model = model, - max_tokens = max_tokens, + model = "claude-sonnet-4-20250514", + max_tokens = 1000, messages = list(list( role = "user", content = prompt @@ -757,7 +835,6 @@ anthropic_api_call <- function(prompt, if (httr::status_code(response) == 200) { result <- jsonlite::fromJSON(httr::content(response, "text")) message("Claude API request successful") - # Handle the data.frame structure of content if (is.data.frame(result$content)) { return(result$content$text[1]) @@ -777,80 +854,9 @@ anthropic_api_call <- function(prompt, warning("Bad request - check API request format") } else if (status_code == 429) { warning("Rate limit exceeded - too many requests") - } else if (status_code == 500) { - warning("Internal server error - try again later") - } - - stop("API call failed - see warnings above") - } -} - -# General-purpose Gemini API call function -gemini_api_call <- function(prompt, - model = "gemini-2.5-flash", - #model = "gemini-2.5-pro", - max_tokens = 1000, - google_api_key = Sys.getenv("GOOGLE_API_KEY")) { - - # Validate API key - if (google_api_key == "") { - stop("GOOGLE_API_KEY environment variable not set") - } - - # Build the API URL with the API key as query parameter - url <- sprintf( - "https://generativelanguage.googleapis.com/v1beta/models/%s:generateContent?key=%s", - model, - google_api_key - ) - - response <- httr::POST( - url = url, - httr::add_headers("Content-Type" = 
"application/json"), - body = jsonlite::toJSON(list( - contents = list(list( - parts = list(list( - text = prompt - )) - )), - generationConfig = list( - maxOutputTokens = max_tokens - ) - ), auto_unbox = TRUE) - ) - - if (httr::status_code(response) == 200) { - result <- jsonlite::fromJSON(httr::content(response, "text")) - message("Gemini API request successful") - - # Extract text from Gemini response structure - if (!is.null(result$content) && length(result$content) > 0) { - content <- result$content - if (!is.null(content$parts) && !is.null(content$parts[[1]]) && !is.null(content$parts[[1]]$text)) { - return(content$parts[[1]]$text) - } - } - - # Fallback if structure is unexpected - warning("Unexpected Gemini API response structure") - return(result) - } else { - status_code <- httr::status_code(response) - error_content <- httr::content(response, "text") - warning(paste("Gemini API error - Status:", status_code, "Response:", error_content)) - - # Check specific error cases - if (status_code == 401 || status_code == 403) { - warning("Authentication failed - check your GOOGLE_API_KEY") - } else if (status_code == 400) { - warning("Bad request - check API request format") - } else if (status_code == 429) { - warning("Rate limit exceeded - too many requests") - } else if (status_code == 500) { - warning("Internal server error - try again later") } - stop("API call failed - see warnings above") + return("Analysis unavailable - API connection failed") } } ``` @@ -1148,93 +1154,93 @@ rag_search <- function(query, return(list( query = query, found_documents = 0, - structured_data = NULL, + # structured_data = NULL, response = "I couldn't find any relevant information to answer your question." )) } - # 2. 
Check if top result has metadata (indicating a query pattern) - structured_data <- NULL - if (!is.na(search_results$metadata[1]) && !is.null(parquet_path)) { - # This is a query pattern match - execute structured query - tryCatch({ - matched_metadata <- jsonlite::fromJSON(search_results$metadata[1]) - - if (!is.null(matched_metadata$parquet_filter)) { - # Execute DuckDB query - library(duckdb) - con_duckdb <- dbConnect(duckdb::duckdb(), dbdir = ":memory:") - on.exit(dbDisconnect(con_duckdb, shutdown = TRUE), add = TRUE) - - dbExecute(con_duckdb, sprintf( - "CREATE VIEW data_view AS SELECT * FROM read_parquet('%s')", - parquet_path - )) - - # Select only relevant columns (exclude large BLOB columns like geometry/centroid) - query_sql <- sprintf(" - SELECT geoid_co, rin_community, county, namelsad, statefp, - lat, lon, aland, awater, intptlat, intptlon - FROM data_view WHERE %s", matched_metadata$parquet_filter) - structured_data <- dbGetQuery(con_duckdb, query_sql) - - cat(sprintf("\nExecuted structured query, found %d records\n", nrow(structured_data))) - } - }, error = function(e) { - warning(paste("Could not execute structured query:", e$message)) - }) - } - - # 2b. If no query pattern but parquet_path provided, try to extract entity from query - if (is.null(structured_data) && !is.null(parquet_path)) { - tryCatch({ - # Use Claude to extract RIN community name from the query - extraction_prompt <- sprintf(" -Extract the RIN community name from this question. Return ONLY the community name, nothing else. -If no community name is found, return 'NONE'. 
- -Question: %s - -Community name:", query) - - print(extraction_prompt) - - community_name <- anthropic_api_call( - prompt = extraction_prompt, - model = "claude-sonnet-4-20250514", - max_tokens = 50, - claude_api_key = claude_api_key - ) - - community_name <- trimws(community_name) - - if (community_name != "NONE" && nchar(community_name) > 0) { - # Execute DuckDB query with extracted name - library(duckdb) - con_duckdb <- dbConnect(duckdb::duckdb(), dbdir = ":memory:") - on.exit(dbDisconnect(con_duckdb, shutdown = TRUE), add = TRUE) - - dbExecute(con_duckdb, sprintf( - "CREATE VIEW data_view AS SELECT * FROM read_parquet('%s')", - parquet_path - )) - - # Remove common articles - search_term <- gsub("^(the|a|an)\\s+", "", community_name, ignore.case = TRUE) - - # Select only relevant columns (exclude large BLOB columns like geometry/centroid) - query_sql <- sprintf(" - SELECT geoid_co, rin_community, county, namelsad, statefp, - lat, lon, aland, awater, intptlat, intptlon - FROM data_view WHERE rin_community ILIKE '%%%s%%'", search_term) - structured_data <- dbGetQuery(con_duckdb, query_sql) - - cat(sprintf("\nExtracted '%s', found %d matching records\n", search_term, nrow(structured_data))) - } - }, error = function(e) { - warning(paste("Could not extract entity or query data:", e$message)) - }) - } +# # 2. 
Check if top result has metadata (indicating a query pattern) +# structured_data <- NULL +# if (!is.na(search_results$metadata[1]) && !is.null(parquet_path)) { +# # This is a query pattern match - execute structured query +# tryCatch({ +# matched_metadata <- jsonlite::fromJSON(search_results$metadata[1]) + +# if (!is.null(matched_metadata$parquet_filter)) { +# # Execute DuckDB query +# library(duckdb) +# con_duckdb <- dbConnect(duckdb::duckdb(), dbdir = ":memory:") +# on.exit(dbDisconnect(con_duckdb, shutdown = TRUE), add = TRUE) + +# dbExecute(con_duckdb, sprintf( +# "CREATE VIEW data_view AS SELECT * FROM read_parquet('%s')", +# parquet_path +# )) + +# # Select only relevant columns (exclude large BLOB columns like geometry/centroid) +# query_sql <- sprintf(" +# SELECT geoid_co, rin_community, county, namelsad, statefp, +# lat, lon, aland, awater, intptlat, intptlon +# FROM data_view WHERE %s", matched_metadata$parquet_filter) +# structured_data <- dbGetQuery(con_duckdb, query_sql) + +# cat(sprintf("\nExecuted structured query, found %d records\n", nrow(structured_data))) +# } +# }, error = function(e) { +# warning(paste("Could not execute structured query:", e$message)) +# }) +# } + +# # 2b. If no query pattern but parquet_path provided, try to extract entity from query +# if (is.null(structured_data) && !is.null(parquet_path)) { +# tryCatch({ +# # Use Claude to extract RIN community name from the query +# extraction_prompt <- sprintf(" +# Extract the RIN community name from this question. Return ONLY the community name, nothing else. +# If no community name is found, return 'NONE'. 
+ +# Question: %s + +# Community name:", query) + +# print(extraction_prompt) + +# community_name <- claude_api_call( +# prompt = extraction_prompt, +# model = "claude-sonnet-4-20250514", +# max_tokens = 50, +# claude_api_key = claude_api_key +# ) + +# community_name <- trimws(community_name) + +# if (community_name != "NONE" && nchar(community_name) > 0) { +# # Execute DuckDB query with extracted name +# library(duckdb) +# con_duckdb <- dbConnect(duckdb::duckdb(), dbdir = ":memory:") +# on.exit(dbDisconnect(con_duckdb, shutdown = TRUE), add = TRUE) + +# dbExecute(con_duckdb, sprintf( +# "CREATE VIEW data_view AS SELECT * FROM read_parquet('%s')", +# parquet_path +# )) + +# # Remove common articles +# search_term <- gsub("^(the|a|an)\\s+", "", community_name, ignore.case = TRUE) + +# # Select only relevant columns (exclude large BLOB columns like geometry/centroid) +# query_sql <- sprintf(" +# SELECT geoid_co, rin_community, county, namelsad, statefp, +# lat, lon, aland, awater, intptlat, intptlon +# FROM data_view WHERE rin_community ILIKE '%%%s%%'", search_term) +# structured_data <- dbGetQuery(con_duckdb, query_sql) + +# cat(sprintf("\nExtracted '%s', found %d matching records\n", search_term, nrow(structured_data))) +# } +# }, error = function(e) { +# warning(paste("Could not extract entity or query data:", e$message)) +# }) +# } # 3. 
Build context for LLM (HYBRID: documents + structured data) context_parts <- c() @@ -1250,14 +1256,14 @@ Community name:", query) context_parts <- c(context_parts, paste("=== Relevant Documentation ===\n", doc_context)) } - # Add structured data if available - if (!is.null(structured_data) && nrow(structured_data) > 0) { - data_context <- paste( - sprintf("=== Structured Data Query Results (%d records) ===\n", nrow(structured_data)), - paste(capture.output(print(head(structured_data, 10))), collapse = "\n") - ) - context_parts <- c(context_parts, data_context) - } + # # Add structured data if available + # if (!is.null(structured_data) && nrow(structured_data) > 0) { + # data_context <- paste( + # sprintf("=== Structured Data Query Results (%d records) ===\n", nrow(structured_data)), + # paste(capture.output(print(head(structured_data, 10))), collapse = "\n") + # ) + # context_parts <- c(context_parts, data_context) + # } context <- paste(context_parts, collapse = "\n\n") @@ -1274,7 +1280,7 @@ Be specific and cite relevant details from the data. # 5. Get LLM response # claude_response <- tryCatch({ - # anthropic_api_call( + # claude_api_call( # prompt = enhanced_prompt, # model = "claude-sonnet-4-20250514", # max_tokens = 2000 @@ -1286,7 +1292,8 @@ Be specific and cite relevant details from the data. # 5.b Get Gemini response gemini_response <- tryCatch({ gemini_api_call( - prompt = enhanced_prompt + prompt = enhanced_prompt, + model = "gemini-2.5-pro" ) }, error = function(e) { paste("Error getting LLM response:", e$message, "\n\nRaw data:\n", context) @@ -1301,7 +1308,7 @@ Be specific and cite relevant details from the data. found_documents = nrow(search_results), similarity_scores = search_results$similarity_score, source_files = unique(search_results$source_file), - structured_data = structured_data, + # structured_data = structured_data, response = gemini_response, search_results = search_results )) @@ -1445,7 +1452,7 @@ cat("\n==================\n") # 7. 
Send to LLM for natural language response llm_response <- tryCatch({ - anthropic_api_call( + claude_api_call( prompt = llm_prompt, model = "claude-sonnet-4-20250514", max_tokens = 1000 @@ -1484,11 +1491,12 @@ conversational response. The vector database acts as the "intent recognition" layer, translating user questions into precise database operations. -**Important**: Query patterns must use **SQL syntax**: - ✅ -`IS NOT NULL` (SQL) — ❌ `.notna()` (Pandas method) - ✅ -`column = 'value'` (standard SQL) — ⚠️ `column == 'value'` (works in -DuckDB but non-standard) - ✅ `ILIKE '%pattern%'` (PostgreSQL/DuckDB) — -❌ `.str.contains()` (Pandas method) +**Important**: +Query patterns must use **SQL syntax**: +- ✅ `IS NOT NULL` (SQL) +- ✅ `ILIKE '%pattern%'` (PostgreSQL/DuckDB) +- ✅ `column = 'value'` (standard SQL) +— ⚠️ `column == 'value'` (works in DuckDB but non-standard) ### Example 2: Hybrid RAG with Documents + Structured Data @@ -1568,10 +1576,10 @@ cat("\n=== RAG Search Results ===\n") cat("Query:", rag_result$query, "\n") cat("Found", rag_result$found_documents, "relevant documents\n") cat("Sources:", paste(rag_result$source_files, collapse = ", "), "\n") -if (!is.null(rag_result$structured_data)) { - cat("Structured data records:", nrow(rag_result$structured_data), "\n") -} -cat("\n=== Claude's Response ===\n") +# if (!is.null(rag_result$structured_data)) { +# cat("Structured data records:", nrow(rag_result$structured_data), "\n") +# } +cat("\n=== LLM Response ===\n") cat(rag_result$response) cat("\n=========================\n")