From 2b2d8a64259d4ff4ae31bd53864bf305ffff62ec Mon Sep 17 00:00:00 2001
From: Shivansh <srawat@enine.dev>
Date: Tue, 10 Feb 2026 16:38:50 +0530
Subject: [PATCH] Fix: make SEO extraction order-independent by replacing regex
 with BeautifulSoup

---
 seo/scanner.py | 70 +++++++++++++++++++++++++++++++++++---------------
 1 file changed, 50 insertions(+), 20 deletions(-)
diff --git a/seo/scanner.py b/seo/scanner.py
index cd151ad..8765824 100644
--- a/seo/scanner.py
+++ b/seo/scanner.py
@@ -12,7 +12,7 @@
 import re
 from urllib.parse import urlparse
 from xml.etree import ElementTree
-
+from bs4 import BeautifulSoup
 import requests
 from django.utils import timezone
 
@@ -24,19 +24,32 @@
 # SEO extraction patterns
 # ---------------------------------------------------------------------------
 
-SEO_PATTERNS = {
-    "title": r"<title>(.*?)</title>",
-    "meta_description": r'name="description"\s+content="([^"]*)"',
-    "canonical": r'rel="canonical"\s+href="([^"]*)"',
-    "og_url": r'property="og:url"\s+content="([^"]*)"',
-    "og_site_name": r'property="og:site_name"\s+content="([^"]*)"',
-    "og_title": r'property="og:title"\s+content="([^"]*)"',
-    "og_description": r'property="og:description"\s+content="([^"]*)"',
-    "og_image": r'property="og:image"\s+content="([^"]*)"',
-    "twitter_card": r'name="twitter:card"\s+content="([^"]*)"',
-    "twitter_title": r'name="twitter:title"\s+content="([^"]*)"',
-    "twitter_description": r'name="twitter:description"\s+content="([^"]*)"',
-    "twitter_image": r'name="twitter:image"\s+content="([^"]*)"',
+# Lookup table for <meta> tags: identifier -> key in the dict returned by
+# extract_seo().
+#
+# The identifier is the value of the tag's ``name`` attribute or, failing
+# that, of its ``property`` attribute; extract_seo() consults this single
+# table for both.
+#
+# Example: <meta property="og:title" content="Home"> is reported under
+# "og_title".
+#
+# "title" and "canonical" are not listed because they are read from <title>
+# and <link rel="canonical"> rather than from <meta> tags.
+META_TAGS_MAP = {
+    # Plain HTML metadata
+    "description": "meta_description",
+    # Twitter Card tags
+    "twitter:card": "twitter_card",
+    "twitter:title": "twitter_title",
+    "twitter:description": "twitter_description",
+    "twitter:image": "twitter_image",
+    # Open Graph tags
+    "og:url": "og_url",
+    "og:site_name": "og_site_name",
+    "og:title": "og_title",
+    "og:description": "og_description",
+    "og:image": "og_image",
 }
 
 # Tags that must have non-empty values on every public page.
@@ -58,13 +71,30 @@
 # ---------------------------------------------------------------------------
 
 
+# def extract_seo(html):
+#     """Extract SEO tag values from rendered HTML into a dict."""
+#     values = {}
+#     for key, pattern in SEO_PATTERNS.items():
+#         match = re.search(pattern, html, re.DOTALL)
+#         values[key] = match.group(1).strip() if match else None
+#     return values
+
+
 def extract_seo(html):
-    """Extract SEO tag values from rendered HTML into a dict."""
-    values = {}
-    for key, pattern in SEO_PATTERNS.items():
-        match = re.search(pattern, html, re.DOTALL)
-        values[key] = match.group(1).strip() if match else None
-    return values
+    soup = BeautifulSoup(html, "html.parser")
+    data = {
+        "title": soup.title.string.strip() if soup.title and soup.title.string else None,
+        "canonical": (link := soup.find("link", rel="canonical")) and link.get("href"),
+    }
+    for tag in soup.find_all("meta"):
+        content = tag.get("content")
+        if not content:
+            continue
+        if (name := tag.get("name")) in META_TAGS_MAP:
+            data[META_TAGS_MAP[name]] = content
+        elif (name := tag.get("property")) in META_TAGS_MAP:
+            data[META_TAGS_MAP[name]] = content
+    return data
 
 
 def check_seo_quality(seo):