Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 50 additions & 20 deletions seo/scanner.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
import re
from urllib.parse import urlparse
from xml.etree import ElementTree

from bs4 import BeautifulSoup
import requests
from django.utils import timezone

Expand All @@ -24,19 +24,32 @@
# SEO extraction patterns
# ---------------------------------------------------------------------------

SEO_PATTERNS = {
"title": r"<title>(.*?)</title>",
"meta_description": r'name="description"\s+content="([^"]*)"',
"canonical": r'rel="canonical"\s+href="([^"]*)"',
"og_url": r'property="og:url"\s+content="([^"]*)"',
"og_site_name": r'property="og:site_name"\s+content="([^"]*)"',
"og_title": r'property="og:title"\s+content="([^"]*)"',
"og_description": r'property="og:description"\s+content="([^"]*)"',
"og_image": r'property="og:image"\s+content="([^"]*)"',
"twitter_card": r'name="twitter:card"\s+content="([^"]*)"',
"twitter_title": r'name="twitter:title"\s+content="([^"]*)"',
"twitter_description": r'name="twitter:description"\s+content="([^"]*)"',
"twitter_image": r'name="twitter:image"\s+content="([^"]*)"',
# SEO_PATTERNS = {
# "title": r"<title>(.*?)</title>",
# "meta_description": r'name="description"\s+content="([^"]*)"',
# "canonical": r'rel="canonical"\s+href="([^"]*)"',
# "og_url": r'property="og:url"\s+content="([^"]*)"',
# "og_site_name": r'property="og:site_name"\s+content="([^"]*)"',
# "og_title": r'property="og:title"\s+content="([^"]*)"',
# "og_description": r'property="og:description"\s+content="([^"]*)"',
# "og_image": r'property="og:image"\s+content="([^"]*)"',
# "twitter_card": r'name="twitter:card"\s+content="([^"]*)"',
# "twitter_title": r'name="twitter:title"\s+content="([^"]*)"',
# "twitter_description": r'name="twitter:description"\s+content="([^"]*)"',
# "twitter_image": r'name="twitter:image"\s+content="([^"]*)"',
# }

# Lookup table used when scanning <meta> tags: the tag's identifying
# attribute value (left) selects the key under which its ``content`` is
# stored in the extracted SEO dict (right).
META_TAGS_MAP = {
    # Plain description tag.
    "description": "meta_description",
    # Twitter Card tags.
    "twitter:card": "twitter_card",
    "twitter:title": "twitter_title",
    "twitter:description": "twitter_description",
    "twitter:image": "twitter_image",
    # Open Graph tags.
    "og:url": "og_url",
    "og:site_name": "og_site_name",
    "og:title": "og_title",
    "og:description": "og_description",
    "og:image": "og_image",
}

# Tags that must have non-empty values on every public page.
Expand All @@ -58,13 +71,30 @@
# ---------------------------------------------------------------------------


# def extract_seo(html):
# """Extract SEO tag values from rendered HTML into a dict."""
# values = {}
# for key, pattern in SEO_PATTERNS.items():
# match = re.search(pattern, html, re.DOTALL)
# values[key] = match.group(1).strip() if match else None
# return values


def extract_seo(html):
    """Extract SEO tag values from rendered HTML into a dict.

    Args:
        html: HTML markup of the page; parsed with BeautifulSoup's
            ``html.parser`` backend.

    Returns:
        A dict that always contains ``"title"``, ``"canonical"`` and every
        value of ``META_TAGS_MAP`` as keys. Each entry holds the
        whitespace-stripped value taken from the first matching tag, or
        ``None`` when the page provides no usable value for it.
    """
    soup = BeautifulSoup(html, "html.parser")

    # Seed every known key with None so a missing tag is reported explicitly
    # instead of being absent from the result (callers may index by key).
    data = dict.fromkeys(("title", "canonical", *META_TAGS_MAP.values()))

    title = soup.title
    # ``.string`` is None when <title> is empty or has nested markup.
    if title and title.string:
        data["title"] = title.string.strip()

    link = soup.find("link", rel="canonical")
    href = link.get("href") if link else None
    if href is not None:
        data["canonical"] = href.strip()

    for tag in soup.find_all("meta"):
        content = (tag.get("content") or "").strip()
        if not content:
            continue
        # A tag is identified by ``name`` first, then by ``property``.
        for attr in ("name", "property"):
            key = META_TAGS_MAP.get(tag.get(attr))
            if key is None:
                continue
            # First occurrence wins; later duplicates never overwrite it.
            if data[key] is None:
                data[key] = content
            break
    return data


def check_seo_quality(seo):
Expand Down