diff --git a/docs/norgolith.toml b/docs/norgolith.toml index dc7d6d6..e4c33e0 100644 --- a/docs/norgolith.toml +++ b/docs/norgolith.toml @@ -15,6 +15,17 @@ description = "Latest posts" ttl = 60 image = "/assets/norgolith.svg" +# SEO: sitemap.xml, OpenGraph meta tags +[seo] +sitemap = true +open_graph = true +default_image = "/assets/norgolith.svg" + +# Robots.txt: controls crawler access +[robots] +enable = true +preset = "allow_all" # options: "allow_all", "no_llms", "block_all" + [extra] license = "GPLv2" footer_author_link = "https://github.com/NTBBloodbath" diff --git a/docs/theme/templates/base.html b/docs/theme/templates/base.html index a7757a7..2c1c1e3 100644 --- a/docs/theme/templates/base.html +++ b/docs/theme/templates/base.html @@ -123,6 +123,36 @@ {% endif %} + {# Canonical URL #} + {% if metadata.permalink %} + + {% endif %} + + {# OpenGraph + Twitter Cards #} + {% if config.seo is defined and config.seo.open_graph %} + + {% if metadata.description and not metadata.description == "nil" %} + + {% endif %} + {% if metadata.permalink %} + + {% endif %} + + + {% if metadata.image %} + + + {% elif config.seo is defined and config.seo.default_image %} + + + {% endif %} + + + {% if metadata.description and not metadata.description == "nil" %} + + {% endif %} + {% endif %} + {% block title %}{% endblock title %} - {{ config.title | title }} {% endblock head %} diff --git a/scripts/update-robots-presets.sh b/scripts/update-robots-presets.sh new file mode 100755 index 0000000..7aaa243 --- /dev/null +++ b/scripts/update-robots-presets.sh @@ -0,0 +1,50 @@ +#!/usr/bin/env bash +# +# Fetches the latest ai.robots.txt list from GitHub and updates the +# ROBOTS_NO_LLMS const in src/cmd/seo.rs. 
+# +# Usage: ./scripts/update-robots-presets.sh + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(dirname "$SCRIPT_DIR")" +SEO_FILE="$REPO_ROOT/src/cmd/seo.rs" +UPSTREAM_URL="https://raw.githubusercontent.com/ai-robots-txt/ai.robots.txt/main/robots.txt" + +echo "Fetching ai.robots.txt from GitHub..." +ROBOTS_RAW=$(curl -fsSL "$UPSTREAM_URL") + +# Extract only User-agent and Disallow lines +ROBOTS_FILTERED=$(echo "$ROBOTS_RAW" | grep -E '^User-agent:|^Disallow:') + +# Find line numbers for the const block +START_LINE=$(grep -n '^const ROBOTS_NO_LLMS: &str = r"' "$SEO_FILE" | head -1 | cut -d: -f1) +# Find the closing line (contains just "); +END_LINE=$(awk "NR>=$START_LINE && /\";\$/{print NR; exit}" "$SEO_FILE") + +if [ -z "$START_LINE" ] || [ -z "$END_LINE" ]; then + echo "Error: Could not find ROBOTS_NO_LLMS const in $SEO_FILE" + exit 1 +fi + +# Build new file +TEMP_FILE=$(mktemp) + +# Lines before the const (1 to START_LINE-1) +if [ "$START_LINE" -gt 1 ]; then + head -n $((START_LINE - 1)) "$SEO_FILE" > "$TEMP_FILE" +fi + +# The new const +echo "const ROBOTS_NO_LLMS: &str = r\"$ROBOTS_FILTERED\";" >> "$TEMP_FILE" + +# Lines after the const (END_LINE+1 to end) +TOTAL_LINES=$(wc -l < "$SEO_FILE") +if [ "$END_LINE" -lt "$TOTAL_LINES" ]; then + tail -n +$((END_LINE + 1)) "$SEO_FILE" >> "$TEMP_FILE" +fi + +mv "$TEMP_FILE" "$SEO_FILE" + +echo "Done. Updated ROBOTS_NO_LLMS const in $SEO_FILE (lines $START_LINE-$END_LINE)" diff --git a/src/cmd/build.rs b/src/cmd/build.rs index d588147..31c96e6 100644 --- a/src/cmd/build.rs +++ b/src/cmd/build.rs @@ -18,6 +18,7 @@ fn href_root_re() -> &'static regex::Regex { } use crate::{cache::BuildCache, config, fs, shared}; +use super::seo; /// Represents the directory structure of a Norgolith site. 
/// @@ -138,16 +139,17 @@ fn generate_xml_feeds( tera: &Tera, shared_context: &Context, public_dir: &Path, -) -> Result { +) -> Result<(usize, Vec)> { let xml_templates = collect_xml_templates(tera); let count = xml_templates.len(); if count == 0 { - return Ok(0); + return Ok((0, Vec::new())); } let mut context = shared_context.clone(); context.insert("now", &chrono::Utc::now()); + let mut feed_names = Vec::with_capacity(count); for template_name in &xml_templates { let rendered = tera .render(template_name, &context) @@ -173,9 +175,10 @@ fn generate_xml_feeds( } std::fs::write(&output_path, &rendered) .wrap_err(format!("Failed to write '{}'", output_path.display()))?; + feed_names.push(template_name.clone()); } - Ok(count) + Ok((count, feed_names)) } /// Generates the final public build from intermediate build artifacts @@ -197,7 +200,7 @@ fn build_contents( shared_context: &Context, cache: &mut BuildCache, minify: bool, -) -> Result<(usize, BuildTimings)> { +) -> Result<(usize, Vec, BuildTimings)> { use rayon::prelude::*; let entries: Vec<_> = WalkDir::new(&paths.content) @@ -231,10 +234,12 @@ fn build_contents( // Collect results and handle errors let mut buffered_writes = Vec::new(); + let mut permalinks = Vec::new(); for result in results { match result { - Ok(Some((public_path, content, cache_entry))) => { + Ok(Some((public_path, content, permalink, cache_entry))) => { buffered_writes.push((public_path, content)); + permalinks.push(permalink); if let Some((key, content_str, metadata)) = cache_entry { cache.insert(&key, &content_str, metadata); } @@ -258,13 +263,13 @@ fn build_contents( timings.page_write_ms = write_ms; timings.page_count = built_count; - Ok((built_count, timings)) + Ok((built_count, permalinks, timings)) } /// (cache_key, content, metadata) for cache insertion type CacheInsert = (PathBuf, String, serde_json::Value); /// Result of building a single content entry -type BuildResult = Result)>>; +type BuildResult = Result)>>; /// Processes a 
single build entry (HTML file with metadata) /// @@ -359,7 +364,14 @@ fn build_content_entry( rendered }; - Ok(Some((public_path, rendered, cache_insert))) + // Extract permalink for SEO generation + let permalink = metadata + .get("permalink") + .and_then(|v| v.as_str()) + .unwrap_or("/") + .to_string(); + + Ok(Some((public_path, rendered, permalink, cache_insert))) } /// Generates category listing pages @@ -693,6 +705,7 @@ struct BuildTimings { content_ms: u128, categories_ms: u128, feeds_ms: u128, + seo_ms: u128, assets_ms: u128, cache_save_ms: u128, // Per-page sub-timing (sums across all pages) @@ -722,6 +735,7 @@ impl BuildTimings { content_ms: 0, categories_ms: 0, feeds_ms: 0, + seo_ms: 0, assets_ms: 0, cache_save_ms: 0, page_file_ms: 0, @@ -750,6 +764,7 @@ impl BuildTimings { .saturating_sub(self.content_ms) .saturating_sub(self.categories_ms) .saturating_sub(self.feeds_ms) + .saturating_sub(self.seo_ms) .saturating_sub(self.assets_ms) .saturating_sub(self.cache_save_ms); @@ -765,6 +780,7 @@ impl BuildTimings { println!(" {:<30} {:>6}ms ({:>4.1}%)", "Content build (all pages)", self.content_ms, pct(self.content_ms, total_ms)); println!(" {:<30} {:>6}ms ({:>4.1}%)", "Category pages", self.categories_ms, pct(self.categories_ms, total_ms)); println!(" {:<30} {:>6}ms ({:>4.1}%)", "XML feeds", self.feeds_ms, pct(self.feeds_ms, total_ms)); + println!(" {:<30} {:>6}ms ({:>4.1}%)", "SEO (sitemap+robots)", self.seo_ms, pct(self.seo_ms, total_ms)); println!(" {:<30} {:>6}ms ({:>4.1}%)", "Asset copy", self.assets_ms, pct(self.assets_ms, total_ms)); println!(" {:<30} {:>6}ms ({:>4.1}%)", "Cache save", self.cache_save_ms, pct(self.cache_save_ms, total_ms)); println!(" {:<30} {:>6}ms ({:>4.1}%)", "Overhead/other", overhead, pct(overhead, total_ms)); @@ -908,7 +924,7 @@ pub fn build(minify: bool) -> Result<()> { // Build content let t = Instant::now(); - let (page_count, content_timings) = build_contents(&tera, &paths, &posts, &site_config, &shared_context, &mut cache, 
minify)?; + let (page_count, permalinks, content_timings) = build_contents(&tera, &paths, &posts, &site_config, &shared_context, &mut cache, minify)?; timings.content_ms = t.elapsed().as_millis(); timings.page_count = page_count; // Copy per-page sub-timings from the concurrent build @@ -937,7 +953,7 @@ pub fn build(minify: bool) -> Result<()> { // XML feeds let t = Instant::now(); - let feed_count = generate_xml_feeds(&tera, &shared_context, &paths.public)?; + let (feed_count, feed_names) = generate_xml_feeds(&tera, &shared_context, &paths.public)?; timings.feeds_ms = t.elapsed().as_millis(); if feed_count > 0 { println!( @@ -949,6 +965,103 @@ pub fn build(minify: bool) -> Result<()> { ); } + // SEO generation + let t = Instant::now(); + let mut seo_count = 0usize; + let seo_enabled = site_config.seo.is_some() || site_config.robots.is_some(); + if seo_enabled { + // Sitemap + let sitemap_enabled = site_config + .seo + .as_ref() + .is_none_or(|s| s.sitemap); + if sitemap_enabled { + // Build date map from posts: permalink → updated/created + use std::collections::HashMap; + let date_map: HashMap<&str, &str> = posts.iter() + .filter_map(|p| { + let permalink = p.get("permalink")?.as_str()?; + let date = p.get("updated") + .or_else(|| p.get("created"))? 
+ .as_str()?; + Some((permalink, date)) + }) + .collect(); + + let mut urls = Vec::with_capacity(permalinks.len() + 16); + + // Homepage + urls.push(seo::SitemapUrl { + loc: "/".into(), + lastmod: None, + }); + + // Content pages (with dates from posts where available) + for p in &permalinks { + let lastmod = date_map.get(p.as_str()).map(|s| s.to_string()); + urls.push(seo::SitemapUrl { + loc: p.clone(), + lastmod, + }); + } + + // Category pages + if !posts.is_empty() { + let categories = shared::collect_all_posts_categories(&posts); + let categories_dir = &site_config.categories_dir; + urls.push(seo::SitemapUrl { + loc: format!("/{}/", categories_dir), + lastmod: None, + }); + for cat in &categories { + urls.push(seo::SitemapUrl { + loc: format!("/{}/{}/", categories_dir, cat), + lastmod: None, + }); + } + } + + // Feed URLs + for feed_name in &feed_names { + urls.push(seo::SitemapUrl { + loc: format!("/{}", feed_name), + lastmod: None, + }); + } + + let xml = seo::generate_sitemap_xml(&urls, &site_config.root_url); + let output_path = paths.public.join("sitemap.xml"); + std::fs::write(&output_path, &xml) + .wrap_err("Failed to write sitemap.xml")?; + seo_count += 1; + } + + // Robots.txt + if let Some(ref robots_config) = site_config.robots { + if robots_config.enable { + let content = seo::generate_robots_txt( + &site_config, + robots_config, + sitemap_enabled, + ); + let output_path = paths.public.join("robots.txt"); + std::fs::write(&output_path, &content) + .wrap_err("Failed to write robots.txt")?; + seo_count += 1; + } + } + } + timings.seo_ms = t.elapsed().as_millis(); + if seo_count > 0 { + println!( + " {} {} {:<12} {}", + "•".green(), + format!("{:<12}", "SEO").bold(), + format!("{} files", seo_count), + shared::get_elapsed_time(t).dimmed() + ); + } + // Assets let t = Instant::now(); let public_assets_dir = paths.public.join("assets"); diff --git a/src/cmd/mod.rs b/src/cmd/mod.rs index 8be10cc..d05693e 100644 --- a/src/cmd/mod.rs +++ b/src/cmd/mod.rs 
@@ -3,6 +3,7 @@ mod dev; mod init; mod new; mod preview; +pub mod seo; mod theme; pub use build::build; diff --git a/src/cmd/seo.rs b/src/cmd/seo.rs new file mode 100644 index 0000000..cd86bbd --- /dev/null +++ b/src/cmd/seo.rs @@ -0,0 +1,409 @@ +use std::fmt::Write; + +use tracing::warn; + +use crate::config::{RobotsPreset, SiteConfig, SiteConfigRobots}; + +const ROBOTS_ALLOW_ALL: &str = "\ +User-agent: * +Allow: / +"; + +const ROBOTS_BLOCK_ALL: &str = "\ +User-agent: * +Disallow: / +"; + +// This const is updated by scripts/update-robots-presets.sh +// To update: run the script, which fetches from ai.robots.txt +const ROBOTS_NO_LLMS: &str = r"User-agent: AddSearchBot +User-agent: AgentTimes +User-agent: AI2Bot +User-agent: AI2Bot-DeepResearchEval +User-agent: Ai2Bot-Dolma +User-agent: aiHitBot +User-agent: amazon-kendra +User-agent: Amazonbot +User-agent: AmazonBuyForMe +User-agent: Amzn-SearchBot +User-agent: Amzn-User +User-agent: Andibot +User-agent: Anomura +User-agent: anthropic-ai +User-agent: ApifyBot +User-agent: ApifyWebsiteContentCrawler +User-agent: Applebot +User-agent: Applebot-Extended +User-agent: Aranet-SearchBot +User-agent: atlassian-bot +User-agent: Awario +User-agent: AzureAI-SearchBot +User-agent: bedrockbot +User-agent: bigsur.ai +User-agent: Bravebot +User-agent: Brightbot +User-agent: Brightbot 1.0 +User-agent: BuddyBot +User-agent: Bytespider +User-agent: CCBot +User-agent: Channel3Bot +User-agent: ChatGLM-Spider +User-agent: ChatGPT Agent +User-agent: ChatGPT-User +User-agent: Claude-Code +User-agent: Claude-SearchBot +User-agent: Claude-User +User-agent: Claude-Web +User-agent: ClaudeBot +User-agent: Cloudflare-AutoRAG +User-agent: CloudVertexBot +User-agent: Code +User-agent: cohere-ai +User-agent: cohere-training-data-crawler +User-agent: Cotoyogi +User-agent: CragCrawler +User-agent: Crawl4AI +User-agent: Crawlspace +User-agent: Datenbank Crawler +User-agent: DeepSeekBot +User-agent: Devin +User-agent: Diffbot +User-agent: 
DuckAssistBot +User-agent: Echobot Bot +User-agent: EchoboxBot +User-agent: ExaBot +User-agent: FacebookBot +User-agent: facebookexternalhit +User-agent: Factset_spyderbot +User-agent: FirecrawlAgent +User-agent: FriendlyCrawler +User-agent: GeistHaus-PageFetcher +User-agent: Gemini-Deep-Research +User-agent: Google-Agent +User-agent: Google-CloudVertexBot +User-agent: Google-Extended +User-agent: Google-Firebase +User-agent: Google-Gemini-CLI +User-agent: Google-NotebookLM +User-agent: GoogleAgent-Mariner +User-agent: GoogleAgent-URLContext +User-agent: GoogleOther +User-agent: GoogleOther-Image +User-agent: GoogleOther-Video +User-agent: GPTBot +User-agent: HenkBot +User-agent: iAskBot +User-agent: iaskspider +User-agent: iaskspider/2.0 +User-agent: IbouBot +User-agent: ICC-Crawler +User-agent: ImagesiftBot +User-agent: imageSpider +User-agent: img2dataset +User-agent: ISSCyberRiskCrawler +User-agent: kagi-fetcher +User-agent: Kangaroo Bot +User-agent: Kimi-User +User-agent: KlaviyoAIBot +User-agent: KunatoCrawler +User-agent: laion-huggingface-processor +User-agent: LAIONDownloader +User-agent: LCC +User-agent: LinerBot +User-agent: Linguee Bot +User-agent: LinkupBot +User-agent: Manus-User +User-agent: meta-externalagent +User-agent: Meta-ExternalAgent +User-agent: meta-externalfetcher +User-agent: Meta-ExternalFetcher +User-agent: meta-webindexer +User-agent: MistralAI-User +User-agent: MistralAI-User/1.0 +User-agent: MyCentralAIScraperBot +User-agent: NagetBot +User-agent: netEstate Imprint Crawler +User-agent: newsai +User-agent: NotebookLM +User-agent: NovaAct +User-agent: OAI-SearchBot +User-agent: omgili +User-agent: omgilibot +User-agent: OpenAI +User-agent: opencode +User-agent: Operator +User-agent: PanguBot +User-agent: Panscient +User-agent: panscient.com +User-agent: Perplexity-User +User-agent: PerplexityBot +User-agent: PetalBot +User-agent: PhindBot +User-agent: Poggio-Citations +User-agent: Poseidon Research Crawler +User-agent: QualifiedBot 
+User-agent: Querit-SearchBot +User-agent: QueritBot +User-agent: QuillBot +User-agent: quillbot.com +User-agent: SBIntuitionsBot +User-agent: Scrapy +User-agent: SemrushBot-OCOB +User-agent: SemrushBot-SWA +User-agent: Shap-User +User-agent: ShapBot +User-agent: Sidetrade indexer bot +User-agent: Spider +User-agent: TavilyBot +User-agent: Terra Cotta +User-agent: TerraCotta +User-agent: Thinkbot +User-agent: TikTokSpider +User-agent: Timpibot +User-agent: Trae +User-agent: TwinAgent +User-agent: UseAI +User-agent: VelenPublicWebCrawler +User-agent: WARDBot +User-agent: Webzio-Extended +User-agent: webzio-extended +User-agent: wpbot +User-agent: WRTNBot +User-agent: YaK +User-agent: YandexAdditional +User-agent: YandexAdditionalBot +User-agent: YouBot +User-agent: ZanistaBot +Disallow: /"; + +/// Represents a URL entry for the sitemap XML. +#[derive(Debug, Clone)] +pub struct SitemapUrl { + pub loc: String, + pub lastmod: Option, +} + +/// Generates a `robots.txt` content based on the site configuration. +/// +/// If `custom_file` is set, reads that file and appends the Sitemap directive. +/// Otherwise, generates from the configured preset. 
+pub fn generate_robots_txt(
+    site_config: &SiteConfig,
+    robots_config: &SiteConfigRobots,
+    sitemap_enabled: bool,
+) -> String {
+    let mut buf = String::with_capacity(512);
+
+    // Rule source, in priority order: custom file, then preset, then the
+    // permissive default.
+    if let Some(ref custom_path) = robots_config.custom {
+        match std::fs::read_to_string(custom_path) {
+            Ok(content) => buf.push_str(&content),
+            Err(e) => {
+                warn!(
+                    "Failed to read custom robots file '{}': {}",
+                    custom_path, e
+                );
+                // Fallback to a permissive robots.txt
+                buf.push_str(ROBOTS_ALLOW_ALL);
+            }
+        }
+    } else if let Some(ref preset) = robots_config.preset {
+        match preset {
+            RobotsPreset::AllowAll => buf.push_str(ROBOTS_ALLOW_ALL),
+            RobotsPreset::BlockAll => buf.push_str(ROBOTS_BLOCK_ALL),
+            RobotsPreset::NoLlms => buf.push_str(ROBOTS_NO_LLMS),
+        }
+    } else {
+        // No preset and no custom file: default to permissive
+        buf.push_str(ROBOTS_ALLOW_ALL);
+    }
+
+    // ROBOTS_NO_LLMS has no final newline, and a custom file may not have one
+    // either. Terminate the last rule line so the file always ends cleanly and
+    // nothing appended below can be glued onto a rule.
+    if !buf.is_empty() && !buf.ends_with('\n') {
+        buf.push('\n');
+    }
+
+    // Append Sitemap directive if sitemap is enabled
+    if sitemap_enabled {
+        // A blank line keeps the directive apart from the user-agent groups;
+        // skipped when there are no rules at all (empty custom file).
+        if !buf.is_empty() {
+            buf.push('\n');
+        }
+        // Trim a trailing slash from the root URL, the same way
+        // `generate_sitemap_xml` does, so "https://example.com/" does not
+        // produce "https://example.com//sitemap.xml".
+        // Writing into a `String` cannot fail, hence the ignored result.
+        let _ = writeln!(
+            buf,
+            "Sitemap: {}/sitemap.xml",
+            site_config.root_url.trim_end_matches('/')
+        );
+    }
+
+    buf
+}
+
+/// Generates a `sitemap.xml` content from the given list of URLs.
+pub fn generate_sitemap_xml(urls: &[SitemapUrl], root_url: &str) -> String {
+    // Estimate: ~200 bytes per URL entry
+    let mut buf = String::with_capacity(urls.len() * 200 + 128);
+
+    // Joined as "<base>/<loc>" below, so strip the slash from both sides to
+    // avoid "//" regardless of how root_url / loc were written.
+    let base = root_url.trim_end_matches('/');
+
+    buf.push_str(
+        r#"<?xml version="1.0" encoding="UTF-8"?>
+<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+"#,
+    );
+
+    for url in urls {
+        buf.push_str("  <url>\n");
+
+        // The sitemap protocol requires entity-escaped values; a permalink
+        // containing `&` would otherwise make the whole document invalid XML.
+        buf.push_str("    <loc>");
+        push_xml_escaped(&mut buf, base);
+        buf.push('/');
+        push_xml_escaped(&mut buf, url.loc.trim_start_matches('/'));
+        buf.push_str("</loc>\n");
+
+        if let Some(ref lastmod) = url.lastmod {
+            buf.push_str("    <lastmod>");
+            push_xml_escaped(&mut buf, lastmod);
+            buf.push_str("</lastmod>\n");
+        }
+
+        // Every entry currently gets the same crawl hints.
+        buf.push_str("    <changefreq>weekly</changefreq>\n");
+        buf.push_str("    <priority>0.5</priority>\n");
+        buf.push_str("  </url>\n");
+    }
+
+    buf.push_str("</urlset>\n");
+    buf
+}
+
+/// Appends `text` to `buf`, replacing the five XML special characters
+/// (`&`, `<`, `>`, `"`, `'`) with their predefined entities.
+fn push_xml_escaped(buf: &mut String, text: &str) {
+    for ch in text.chars() {
+        match ch {
+            '&' => buf.push_str("&amp;"),
+            '<' => buf.push_str("&lt;"),
+            '>' => buf.push_str("&gt;"),
+            '"' => buf.push_str("&quot;"),
+            '\'' => buf.push_str("&apos;"),
+            _ => buf.push(ch),
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    // --- generate_sitemap_xml ---
+
+    #[test]
+    fn sitemap_empty() {
+        let xml = generate_sitemap_xml(&[], "https://example.com");
+        assert!(xml.contains("<urlset"));
+        assert!(xml.contains("</urlset>"));
+        assert!(!xml.contains("<url>"));
+    }
+
+    #[test]
+    fn sitemap_single_url() {
+        let urls = vec![SitemapUrl {
+            loc: "/about/".into(),
+            lastmod: None,
+        }];
+        let xml = generate_sitemap_xml(&urls, "https://example.com");
+        assert!(xml.contains("<loc>https://example.com/about/</loc>"));
+        assert!(xml.contains("<changefreq>weekly</changefreq>"));
+        assert!(xml.contains("<priority>0.5</priority>"));
+        assert!(!xml.contains("<lastmod>"));
+    }
+
+    #[test]
+    fn sitemap_with_lastmod() {
+        let urls = vec![SitemapUrl {
+            loc: "/posts/hello/".into(),
+            lastmod: Some("2026-01-15T12:00:00Z".into()),
+        }];
+        let xml = generate_sitemap_xml(&urls, "https://example.com");
+        assert!(xml.contains("<lastmod>2026-01-15T12:00:00Z</lastmod>"));
+    }
+
+    #[test]
+    fn sitemap_multiple_urls() {
+        let urls = vec![
+            SitemapUrl { loc: "/".into(), lastmod: None },
+            SitemapUrl { loc: "/about/".into(), lastmod: None },
+            SitemapUrl { loc: "/posts/".into(), lastmod: None },
+        ];
+        let xml = generate_sitemap_xml(&urls, "https://example.com");
+        assert!(xml.contains("<loc>https://example.com/</loc>"));
+        assert!(xml.contains("<loc>https://example.com/about/</loc>"));
+        assert!(xml.contains("<loc>https://example.com/posts/</loc>"));
+    }
+
+    #[test]
+    fn sitemap_escapes_xml_entities_and_trims_root_slash() {
+        let urls = vec![SitemapUrl {
+            loc: "/search/?a=1&b=2".into(),
+            lastmod: None,
+        }];
+        let xml = generate_sitemap_xml(&urls, "https://example.com/");
+        assert!(xml.contains("<loc>https://example.com/search/?a=1&amp;b=2</loc>"));
+        assert!(!xml.contains("&b=2"));
+    }
+
+    // --- generate_robots_txt ---
+
+    fn test_config(root_url: &str) -> SiteConfig {
+ SiteConfig { + root_url: root_url.to_string(), + ..Default::default() + } + } + + #[test] + fn robots_allow_all() { + let config = test_config("https://example.com"); + let robots = SiteConfigRobots { + enable: true, + preset: Some(RobotsPreset::AllowAll), + custom: None, + }; + let txt = generate_robots_txt(&config, &robots, false); + assert!(txt.contains("User-agent: *")); + assert!(txt.contains("Allow: /")); + } + + #[test] + fn robots_block_all() { + let config = test_config("https://example.com"); + let robots = SiteConfigRobots { + enable: true, + preset: Some(RobotsPreset::BlockAll), + custom: None, + }; + let txt = generate_robots_txt(&config, &robots, false); + assert!(txt.contains("User-agent: *")); + assert!(txt.contains("Disallow: /")); + } + + #[test] + fn robots_no_llms() { + let config = test_config("https://example.com"); + let robots = SiteConfigRobots { + enable: true, + preset: Some(RobotsPreset::NoLlms), + custom: None, + }; + let txt = generate_robots_txt(&config, &robots, false); + assert!(txt.contains("User-agent: GPTBot")); + assert!(txt.contains("User-agent: ClaudeBot")); + assert!(txt.contains("Disallow: /")); + } + + #[test] + fn robots_sitemap_line_when_enabled() { + let config = test_config("https://example.com"); + let robots = SiteConfigRobots { + enable: true, + preset: Some(RobotsPreset::AllowAll), + custom: None, + }; + let txt = generate_robots_txt(&config, &robots, true); + assert!(txt.contains("Sitemap: https://example.com/sitemap.xml")); + } + + #[test] + fn robots_no_sitemap_line_when_disabled() { + let config = test_config("https://example.com"); + let robots = SiteConfigRobots { + enable: true, + preset: Some(RobotsPreset::AllowAll), + custom: None, + }; + let txt = generate_robots_txt(&config, &robots, false); + assert!(!txt.contains("Sitemap:")); + } + + #[test] + fn robots_custom_file_fallback() { + let config = test_config("https://example.com"); + let robots = SiteConfigRobots { + enable: true, + preset: None, + 
custom: Some("/nonexistent/robots.txt".into()), + }; + // Should warn and fall back to allow_all + let txt = generate_robots_txt(&config, &robots, false); + assert!(txt.contains("User-agent: *")); + assert!(txt.contains("Allow: /")); + } + + #[test] + fn robots_no_preset_no_custom_defaults_to_allow() { + let config = test_config("https://example.com"); + let robots = SiteConfigRobots { + enable: true, + preset: None, + custom: None, + }; + let txt = generate_robots_txt(&config, &robots, false); + assert!(txt.contains("User-agent: *")); + assert!(txt.contains("Allow: /")); + } +} diff --git a/src/config/mod.rs b/src/config/mod.rs index 047a58a..4ab6549 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -25,6 +25,39 @@ pub struct CollectionConfig { pub dir: String, } +#[derive(Debug, Clone, Default, Deserialize, Serialize)] +pub struct SiteConfigSeo { + #[serde(default = "default_true")] + pub sitemap: bool, + #[serde(default = "default_true")] + pub open_graph: bool, + #[serde(default, rename = "default_image")] + pub default_image: Option, +} + +#[derive(Debug, Clone, Deserialize, Serialize)] +pub struct SiteConfigRobots { + #[serde(default = "default_true")] + pub enable: bool, + pub preset: Option, + #[serde(default, rename = "custom_file")] + pub custom: Option, +} + +#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize)] +pub enum RobotsPreset { + #[serde(rename = "allow_all")] + AllowAll, + #[serde(rename = "no_llms")] + NoLlms, + #[serde(rename = "block_all")] + BlockAll, +} + +fn default_true() -> bool { + true +} + fn default_collections() -> Vec { vec![CollectionConfig { name: "posts".into(), @@ -52,6 +85,9 @@ pub struct SiteConfig { pub collections: Vec, #[serde(default = "default_categories_dir", rename = "categoriesDir")] pub categories_dir: String, + #[serde(default)] + pub seo: Option, + pub robots: Option, } impl Default for SiteConfig { @@ -67,6 +103,8 @@ impl Default for SiteConfig { extra: None, collections: default_collections(), 
categories_dir: default_categories_dir(), + seo: None, + robots: None, } } } diff --git a/src/resources/templates/base.html b/src/resources/templates/base.html index 5eb1d52..216aff8 100644 --- a/src/resources/templates/base.html +++ b/src/resources/templates/base.html @@ -48,6 +48,37 @@ + + {# Canonical URL #} + {% if metadata.permalink %} + + {% endif %} + + {# OpenGraph + Twitter Cards #} + {% if config.seo is defined and config.seo.open_graph %} + + {% if metadata.description and not metadata.description == "nil" %} + + {% endif %} + {% if metadata.permalink %} + + {% endif %} + + + {% if metadata.image %} + + + {% elif config.seo is defined and config.seo.default_image %} + + + {% endif %} + + + {% if metadata.description and not metadata.description == "nil" %} + + {% endif %} + {% endif %} + {% block title %}{% endblock title %} - {{ config.title | title }} {% endblock head %}