From 2b2d8a64259d4ff4ae31bd53864bf305ffff62ec Mon Sep 17 00:00:00 2001 From: Shivansh Date: Tue, 10 Feb 2026 16:38:50 +0530 Subject: [PATCH] Fix: make SEO extraction order-independent by replacing regex with BeautifulSoup --- seo/scanner.py | 70 +++++++++++++++++++++++++++++++++++--------------- 1 file changed, 50 insertions(+), 20 deletions(-) diff --git a/seo/scanner.py b/seo/scanner.py index cd151ad..8765824 100644 --- a/seo/scanner.py +++ b/seo/scanner.py @@ -12,7 +12,7 @@ import re from urllib.parse import urlparse from xml.etree import ElementTree - +from bs4 import BeautifulSoup import requests from django.utils import timezone @@ -24,19 +24,32 @@ # SEO extraction patterns # --------------------------------------------------------------------------- -SEO_PATTERNS = { - "title": r"(.*?)", - "meta_description": r'name="description"\s+content="([^"]*)"', - "canonical": r'rel="canonical"\s+href="([^"]*)"', - "og_url": r'property="og:url"\s+content="([^"]*)"', - "og_site_name": r'property="og:site_name"\s+content="([^"]*)"', - "og_title": r'property="og:title"\s+content="([^"]*)"', - "og_description": r'property="og:description"\s+content="([^"]*)"', - "og_image": r'property="og:image"\s+content="([^"]*)"', - "twitter_card": r'name="twitter:card"\s+content="([^"]*)"', - "twitter_title": r'name="twitter:title"\s+content="([^"]*)"', - "twitter_description": r'name="twitter:description"\s+content="([^"]*)"', - "twitter_image": r'name="twitter:image"\s+content="([^"]*)"', +# SEO_PATTERNS = { +# "title": r"(.*?)", +# "meta_description": r'name="description"\s+content="([^"]*)"', +# "canonical": r'rel="canonical"\s+href="([^"]*)"', +# "og_url": r'property="og:url"\s+content="([^"]*)"', +# "og_site_name": r'property="og:site_name"\s+content="([^"]*)"', +# "og_title": r'property="og:title"\s+content="([^"]*)"', +# "og_description": r'property="og:description"\s+content="([^"]*)"', +# "og_image": r'property="og:image"\s+content="([^"]*)"', +# "twitter_card": r'name="twitter:card"\s+content="([^"]*)"', +# "twitter_title": r'name="twitter:title"\s+content="([^"]*)"', +# "twitter_description": r'name="twitter:description"\s+content="([^"]*)"', +# "twitter_image": r'name="twitter:image"\s+content="([^"]*)"', +# } + +META_TAGS_MAP = { + "description": "meta_description", + "twitter:card": "twitter_card", + "twitter:title": "twitter_title", + "twitter:description": "twitter_description", + "twitter:image": "twitter_image", + "og:url": "og_url", + "og:site_name": "og_site_name", + "og:title": "og_title", + "og:description": "og_description", + "og:image": "og_image", } # Tags that must have non-empty values on every public page. @@ -58,13 +71,30 @@ # --------------------------------------------------------------------------- +# def extract_seo(html): +# """Extract SEO tag values from rendered HTML into a dict.""" +# values = {} +# for key, pattern in SEO_PATTERNS.items(): +# match = re.search(pattern, html, re.DOTALL) +# values[key] = match.group(1).strip() if match else None +# return values + + def extract_seo(html): - """Extract SEO tag values from rendered HTML into a dict.""" - values = {} - for key, pattern in SEO_PATTERNS.items(): - match = re.search(pattern, html, re.DOTALL) - values[key] = match.group(1).strip() if match else None - return values + soup = BeautifulSoup(html, "html.parser") + data = { + "title": soup.title.string.strip() if soup.title and soup.title.string else None, + "canonical": (link := soup.find("link", rel="canonical")) and link.get("href"), + } + for tag in soup.find_all("meta"): + content = tag.get("content") + if not content: + continue + if (name := tag.get("name")) in META_TAGS_MAP: + data[META_TAGS_MAP[name]] = content + elif (name := tag.get("property")) in META_TAGS_MAP: + data[META_TAGS_MAP[name]] = content + return data def check_seo_quality(seo):