Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,8 @@ jobs:
- name: Build and lint
run: cargo clippy --all-targets -- -D warnings

- name: Run tests
run: cargo test
- name: Run unit tests
run: cargo test --lib --all

- name: Check build
run: cargo check --release
Expand Down
10 changes: 7 additions & 3 deletions mise.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,15 @@ description = "Lint with cargo clippy"
run = "cargo clippy --all-targets -- -D warnings"

[tasks.test]
description = "Run tests with cargo test"
run = "RUSTFLAGS=\"-D warnings\" cargo test --all-targets"
description = "Run unit tests"
run = "RUSTFLAGS=\"-D warnings\" cargo test --lib --all"

[tasks.test-e2e]
description = "Run e2e tests"
run = "cargo test --test e2e_cli --test e2e_mcp"

[tasks.pre-commit]
description = "Run all of the above"
description = "Run fmt, clippy, and unit tests"
depends = ["fmt", "clippy", "test"]

[tasks.skill-test]
Expand Down
60 changes: 17 additions & 43 deletions src/core/models.rs
Original file line number Diff line number Diff line change
Expand Up @@ -126,12 +126,19 @@ impl SearchOptions {
q_parts.push(query.clone());
}

// Assignee is handled manually later to support comma separation

if !q_parts.is_empty() {
serializer.append_pair("q", &q_parts.join(" "));
}

// Add assignee as separate parameter (no quotes)
if let Some(assignees) = &self.assignee
&& !assignees.is_empty()
{
for a in assignees {
serializer.append_pair("assignee", a);
}
}

if let Some(country) = &self.country {
serializer.append_pair("country", country);
match country.to_uppercase().as_str() {
Expand Down Expand Up @@ -176,31 +183,7 @@ impl SearchOptions {
}
}

let mut url_str = url.to_string();

// Manually append assignee parameter if present
if let Some(assignees) = &self.assignee
&& !assignees.is_empty()
{
let encoded_assignees: Vec<String> = assignees
.iter()
.map(|a| {
// Encode each assignee value, including quotes, using form_urlencoded logic
let quoted = format!("\"{}\"", a);
url::form_urlencoded::byte_serialize(quoted.as_bytes()).collect::<String>()
})
.collect();

// Determine if we need to add '?' or '&'
let separator = if !url_str.contains('?') {
"?"
} else if url_str.ends_with('?') {
""
} else {
"&"
};
url_str.push_str(&format!("{}assignee={}", separator, encoded_assignees.join(",")));
}
let url_str = url.to_string();

// Manual check for empty params (after constructing)
// Check if url string ends with / or /? and has no params
Expand Down Expand Up @@ -260,33 +243,26 @@ mod tests {
// Test assignee only (single assignee)
let options =
SearchOptions { assignee: Some(vec!["Google LLC".to_string()]), ..Default::default() };
// assignee="Google LLC" -> encoded %22Google%20LLC%22
let url = options.to_url().unwrap();

// Since no other params, it should start with ?assignee=
// form_urlencoded::byte_serialize uses + for spaces in query values
assert!(url.contains("?assignee=%22Google+LLC%22"));
assert_eq!(url, "https://patents.google.com/?assignee=Google+LLC");

// Test assignee (multiple assignees)
let options = SearchOptions {
assignee: Some(vec!["Google LLC".to_string(), "Microsoft Corp".to_string()]),
..Default::default()
};
// assignee="Google LLC","Microsoft Corp"
// Encoded individual values, joined by comma
let url = options.to_url().unwrap();
assert!(url.contains("?assignee=%22Google+LLC%22,%22Microsoft+Corp%22"));
assert!(url.contains("assignee=Google+LLC"));
assert!(url.contains("assignee=Microsoft+Corp"));

// Test assignee (comma handling)
// Test assignee with comma in name
let options = SearchOptions {
assignee: Some(vec!["Salesforce.com, inc.".to_string()]),
..Default::default()
};
let url = options.to_url().unwrap();
// assignee="Salesforce.com, inc."
// comma inside quotes encoded as %2C. space as %20.
// %22Salesforce.com%2C%20inc.%22
assert!(url.contains("?assignee=%22Salesforce.com%2C+inc.%22"));
// comma encoded as %2C
assert!(url.contains("assignee=Salesforce.com%2C+inc."));

// Test query with assignee
let options = SearchOptions {
Expand All @@ -295,11 +271,9 @@ mod tests {
country: None,
..Default::default()
};
// q=foo&assignee="Google LLC"
// q is added via serializer (foo). assignee appended manually (&assignee=...)
let url = options.to_url().unwrap();
assert!(url.contains("q=foo"));
assert!(url.contains("&assignee=%22Google+LLC%22"));
assert!(url.contains("assignee=Google+LLC"));

// Test query with country (JP should add language=JAPANESE)
let options = SearchOptions {
Expand Down
213 changes: 36 additions & 177 deletions src/core/patent_search.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,74 +3,6 @@ use crate::core::{BrowserManager, CdpPage};
use crate::core::{Error, Result};
use async_trait::async_trait;

// API response types for Google Patents /xhr/query endpoint
/// Top-level payload deserialized from the Google Patents `/xhr/query` response.
#[derive(serde::Deserialize)]
struct ApiResponse {
/// Wrapper object carrying the total result count and the result clusters.
results: ApiResults,
}

/// The `results` object of an [`ApiResponse`].
#[derive(serde::Deserialize)]
struct ApiResults {
/// Total number of results reported by the API; `convert_api_response`
/// stringifies it into `SearchResult::total_results`.
total_num_results: u64,
/// Result clusters; `convert_api_response` flattens them into one patent list.
cluster: Vec<ApiCluster>,
}

/// One element of [`ApiResults::cluster`]: a list of patent entries.
// NOTE(review): what the API groups into a cluster is not visible here — confirm.
#[derive(serde::Deserialize)]
struct ApiCluster {
/// Patent entries contained in this cluster.
result: Vec<ApiPatentEntry>,
}

/// One element of [`ApiCluster::result`]; wraps the patent record itself.
#[derive(serde::Deserialize)]
struct ApiPatentEntry {
/// The patent fields returned for this entry.
patent: ApiPatent,
}

/// Patent fields read from a search API entry. Every field is optional.
#[derive(serde::Deserialize)]
struct ApiPatent {
/// Patent title; `convert_api_response` substitutes an empty string when absent.
title: Option<String>,
/// Snippet text, passed through unchanged to `Patent::snippet`.
snippet: Option<String>,
/// Filing date as a string, passed through unchanged.
// NOTE(review): date format is not established by this file — confirm.
filing_date: Option<String>,
/// Assignee name, passed through unchanged.
assignee: Option<String>,
/// Publication number; used as the patent id and to build the patent URL
/// (empty string when absent).
publication_number: Option<String>,
}

/// Converts a deserialized `/xhr/query` API response into a `SearchResult`.
///
/// Every patent entry of every cluster is flattened, in order, into a single
/// list. Only the fields the API supplies (id, title, snippet, filing date,
/// assignee) are populated; all detail-page fields are left as `None`, as are
/// `top_assignees` and `top_cpcs`. A missing title or publication number
/// becomes an empty string, and the patent URL is built from the publication
/// number.
///
/// `api` is taken by value, so its strings are moved into the result instead
/// of being cloned.
fn convert_api_response(api: ApiResponse) -> SearchResult {
    let results = api.results;
    // Read the count up front so `results.cluster` can be consumed below.
    let total_results = results.total_num_results.to_string();

    let patents = results
        .cluster
        .into_iter()
        .flat_map(|group| group.result)
        .map(|entry| {
            let p = entry.patent;
            let id = p.publication_number.unwrap_or_default();
            // Build the URL before `id` is moved into the struct.
            let url = format!("https://patents.google.com/patent/{}", id);
            Patent {
                id,
                title: p.title.unwrap_or_default(),
                abstract_text: None,
                description_paragraphs: None,
                claims: None,
                images: None,
                snippet: p.snippet,
                description: None,
                filing_date: p.filing_date,
                assignee: p.assignee,
                related_application: None,
                claiming_priority: None,
                family_applications: None,
                legal_status: None,
                url,
            }
        })
        .collect();

    SearchResult { total_results, top_assignees: None, top_cpcs: None, patents }
}

#[async_trait]
pub trait PatentSearch: Send + Sync {
async fn search(&self, options: &SearchOptions) -> Result<SearchResult>;
Expand Down Expand Up @@ -174,129 +106,56 @@ impl PatentSearcher {
patents,
})
} else {
// Search results page - fetch via /xhr/query API
let mut all_patents: Vec<Patent> = Vec::new();
// Search results page - scrape from DOM
let limit = options.limit.unwrap_or(10);
let mut total_results_str = "Unknown".to_string();
let mut top_assignees: Option<Vec<crate::core::models::SummaryItem>> = None;
let mut top_cpcs: Option<Vec<crate::core::models::SummaryItem>> = None;

if self.verbose {
eprintln!("Fetching search results (limit: {})...", limit);
}

// Append num=100 to base_url to fetch more results per page if needed
let base_url = if limit > 10 { format!("{}&num=100", base_url) } else { base_url };

// Calculate pagination
let results_per_page = if limit > 10 { 100 } else { 10 };
let pages_needed = limit.div_ceil(results_per_page);

for page_num in 0..pages_needed {
let page_url = if page_num == 0 {
base_url.clone()
} else {
format!("{}&page={}", base_url, page_num)
};

if self.verbose {
eprintln!("Loading page {} of {}...", page_num + 1, pages_needed);
eprintln!("URL: {}", page_url);
}

page.goto(&page_url).await?;

// Check for bot detection / rate limiting page
let title = page
.evaluate("document.title")
.await
.ok()
.and_then(|v| v.as_str().map(String::from))
.unwrap_or_default();
if title == "Sorry..." {
let _ = page.close().await;
return Err(Error::Search(
"Google blocked this request (bot detection / rate limiting). \
The IP address may be temporarily blocked. Try again later."
.to_string(),
));
}

// Build API URL from the search URL
let api_path =
base_url.strip_prefix("https://patents.google.com/").unwrap_or(&base_url);
let api_url = format!("/xhr/query?url={}", api_path);
let fetch_script = format!(
r#"(async () => {{
try {{
const resp = await fetch("{}");
if (!resp.ok) return {{ error: "HTTP " + resp.status }};
return await resp.json();
}} catch(e) {{
return {{ error: e.message }};
}}
}})()"#,
api_url
);

let api_result = page.evaluate(&fetch_script).await?;

if self.verbose {
if let Some(err) = api_result.get("error") {
eprintln!("API error: {}", err);
} else {
eprintln!("API response received");
}
}

let sr = serde_json::from_value::<ApiResponse>(api_result)
.map_err(|e| Error::Search(format!("Failed to parse API response: {}", e)))
.map(convert_api_response)?;

if page_num == 0 {
total_results_str = sr.total_results.clone();
if self.verbose {
eprintln!("Total results found: {}", total_results_str);
}
top_assignees = sr.top_assignees;
top_cpcs = sr.top_cpcs;
}

let page_patents = sr.patents;

if self.verbose {
eprintln!("Found {} patents on this page", page_patents.len());
}

if page_patents.is_empty() {
break;
}

all_patents.extend(page_patents);

if all_patents.len() >= limit {
break;
}
page.goto(&base_url).await?;

// Check for bot detection / rate limiting page
let title = page
.evaluate("document.title")
.await
.ok()
.and_then(|v| v.as_str().map(String::from))
.unwrap_or_default();
if title == "Sorry..." {
let _ = page.close().await;
return Err(Error::Search(
"Google blocked this request (bot detection / rate limiting). \
The IP address may be temporarily blocked. Try again later."
.to_string(),
));
}

if self.verbose {
eprintln!("Waiting for search results to load...");
}
// Wait for search results to render
tokio::time::sleep(std::time::Duration::from_secs(3)).await;

if self.verbose {
eprintln!("Extracting search results from DOM...");
}
let result = page.evaluate(include_str!("scripts/extract_search_results.js")).await?;
let mut sr: SearchResult = serde_json::from_value(result)
.map_err(|e| Error::Search(format!("Failed to parse search results: {}", e)))?;

let _ = page.close().await;

if self.verbose {
eprintln!("Total patents collected: {}", all_patents.len());
eprintln!("Total results found: {}", sr.total_results);
eprintln!("Patents on page: {}", sr.patents.len());
}

if all_patents.len() > limit {
if self.verbose {
eprintln!("Truncating to limit: {}", limit);
}
all_patents.truncate(limit);
if sr.patents.len() > limit {
sr.patents.truncate(limit);
}

Ok(SearchResult {
total_results: total_results_str,
top_assignees,
top_cpcs,
patents: all_patents,
})
Ok(sr)
}
}
}
Expand Down
Loading