Skip to content

Commit 24dccb0

Browse files
committed
Testing the compliance engine fix
1 parent ec0867c commit 24dccb0

2 files changed

Lines changed: 80 additions & 37 deletions

File tree

ai-service/app.py

Lines changed: 47 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
from fastapi import FastAPI, UploadFile, File, Form, HTTPException
22
import requests
33
import os
4+
import json
45
from dotenv import load_dotenv
56
from compliance_engine import ComplianceAuditor
6-
import json
77

88
# Load your Hugging Face token from the .env file in ai-service/
99
load_dotenv()
@@ -21,28 +21,63 @@ async def run_audit(
2121
boxes: str = Form(...)
2222
):
2323
try:
24-
# 1. Package the incoming data to forward to Hugging Face
2524
headers = {"Authorization": f"Bearer {HF_TOKEN}"}
25+
26+
# Read the image into memory ONCE so we can reuse it for multiple chunks
2627
image_bytes = await screenshot.read()
27-
files = {"screenshot": (screenshot.filename, image_bytes, screenshot.content_type)}
28-
data = {"words": words, "boxes": boxes}
28+
29+
# Parse the incoming JSON strings into Python lists
30+
words_list = json.loads(words)
31+
boxes_list = json.loads(boxes)
32+
33+
# --- THE SLIDING WINDOW CHUNKING LOGIC ---
34+
CHUNK_SIZE = 400
35+
all_flagged_predictions = []
36+
37+
total_elements = len(words_list)
38+
print(f"[*] Starting Sliding Window analysis for {total_elements} elements on {target_url}...")
39+
40+
# Loop through the page in batches of 400 to prevent HF from hitting the 512 token limit
41+
for i in range(0, total_elements, CHUNK_SIZE):
42+
chunk_words = words_list[i : i + CHUNK_SIZE]
43+
chunk_boxes = boxes_list[i : i + CHUNK_SIZE]
44+
45+
if not chunk_words:
46+
continue
47+
48+
print(f"[*] Sending Chunk to Hugging Face: {i} to {i + len(chunk_words)}...")
49+
50+
# Package this specific chunk to send to Hugging Face
51+
files = {"screenshot": (screenshot.filename, image_bytes, screenshot.content_type)}
52+
data = {
53+
"words": json.dumps(chunk_words),
54+
"boxes": json.dumps(chunk_boxes)
55+
}
2956

30-
print(f"[*] Analyzing {target_url} via Hugging Face Cloud...")
31-
hf_response = requests.post(HF_API_URL, headers=headers, files=files, data=data)
57+
hf_response = requests.post(HF_API_URL, headers=headers, files=files, data=data)
3258

33-
if hf_response.status_code != 200:
34-
raise Exception(f"Hugging Face API Error: {hf_response.text}")
59+
if hf_response.status_code != 200:
60+
print(f"[!] Warning: HF API Error on chunk {i}: {hf_response.text}")
61+
continue # Skip this chunk if HF throws a timeout or error, but keep processing the rest of the page!
3562

36-
ai_predictions = hf_response.json()
63+
chunk_predictions = hf_response.json()
64+
65+
# Combine the flagged items from this chunk into our master list
66+
if isinstance(chunk_predictions, list):
67+
all_flagged_predictions.extend(chunk_predictions)
3768

69+
print(f"[+] Chunking complete! Found {len(all_flagged_predictions)} total suspicious elements to audit.")
3870

39-
# 2. Pass the AI predictions into your Compliance Engine
40-
print("[*] Generating Legal Compliance Report...")
71+
# --- STAGE 2: THE GEMINI LOGICAL AUDIT ---
72+
print("[*] Generating Legal Compliance Report via Gemini...")
4173
auditor = ComplianceAuditor(target_url=target_url)
42-
final_report = auditor.analyze_detections(ai_predictions)
74+
75+
# We pass the MASSIVE stitched list of all found patterns to your Gemini engine
76+
final_report = auditor.analyze_detections(all_flagged_predictions)
4377

4478
# 3. Return the formatted JSON to your Node.js backend
4579
return final_report
4680

4781
except Exception as e:
82+
print(f"[!] Fatal Audit Error: {str(e)}")
4883
raise HTTPException(status_code=500, detail=str(e))

ai-service/compliance_engine.py

Lines changed: 33 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,12 @@
77

88
load_dotenv()
99

10-
# Initialize the stable google-generativeai SDK
10+
# Initialize the SDK and force strict JSON output to prevent parsing crashes
1111
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
12-
llm_model = genai.GenerativeModel('gemini-2.5-flash')
12+
llm_model = genai.GenerativeModel(
13+
model_name='gemini-2.5-flash',
14+
generation_config={"response_mime_type": "application/json"}
15+
)
1316

1417
REGULATORY_MAP = {
1518
"preselected_invasive_default": {
@@ -81,46 +84,43 @@ def _verify_with_llm(self, element_text, layout_label):
8184
"""The Stage 2 Classifier: Asks Gemini to legally categorize the text using dynamic map data."""
8285

8386
legal_framework = "\n".join([f"{i+1}. {k.upper()}: {v['description']}" for i, (k, v) in enumerate(REGULATORY_MAP.items())])
84-
8587
allowed_categories = "[" + ", ".join(REGULATORY_MAP.keys()) + ", safe]"
8688

8789
prompt = f"""
88-
ACT AS: A Senior Digital Rights Attorney and GDPR Auditor specialized in the Digital Services Act (DSA) Article 25 & 27.
90+
ACT AS: A Senior Digital Rights Attorney and GDPR Auditor.
8991
90-
TASK: Conduct a high-stakes audit on a specific UI element to determine if it constitutes a "Dark Pattern" (deceptive design).
92+
TASK: Evaluate this specific UI element text flagged by a Vision AI. Is it a genuine "Dark Pattern" (deceptive design) or a normal UI element?
9193
9294
CONTEXT:
9395
- Element Type: {layout_label}
9496
- Detected Text: "{element_text}"
9597
96-
LEGAL REFERENCE FRAMEWORK:
98+
LEGAL FRAMEWORK (CATEGORIES):
9799
{legal_framework}
98100
101+
CRITICAL: NEGATIVE EXAMPLES (IGNORE THESE - LABEL AS 'safe')
102+
- Standard navigation ("Home", "About Us", "Contact").
103+
- Standard actions ("Login", "Submit", "Search", "Read More", "Accept").
104+
- Cookie banners with fair choices ("Accept All" alongside "Decline All").
105+
- "No thanks" or "Close" buttons.
106+
99107
AUDIT RULES:
100-
- ZERO TOLERANCE FOR FALSE POSITIVES: If the text is standard, polite, or merely descriptive (e.g., "We use cookies", "Learn More", "Accept"), it MUST be labeled 'safe'.
108+
- ZERO TOLERANCE FOR FALSE POSITIVES: If the text is standard, polite, or merely descriptive, it MUST be labeled 'safe'.
101109
- CONTEXT MATTERS: "No thanks" is safe. "No, I prefer to pay more" is emotional_steering.
102-
- DEFAULT TO SAFE: If you are less than 95% certain a pattern exists, return 'safe'.
103110
104-
STEP-BY-STEP REASONING:
105-
1. Analyze the literal meaning of the text.
106-
2. Evaluate the psychological intent (Is it steering, shaming, or confusing?).
107-
3. Compare against the legal frameworks above.
108-
109111
OUTPUT FORMAT:
110-
You must return a raw JSON object with this exact structure:
112+
Return ONLY a JSON object. You MUST provide the "reasoning" key BEFORE the "category" key to ensure logical chain-of-thought analysis.
111113
{{
112-
"reasoning": "A 1-sentence legal justification for your decision.",
114+
"reasoning": "CHAIN OF THOUGHT: Step-by-step, logically explain why this text violates user intent OR why it is perfectly safe.",
113115
"category": "one_of_the_categories_below_or_safe"
114116
}}
115117
116-
CATEGORIES:
117-
{allowed_categories}
118+
ALLOWED CATEGORIES: {allowed_categories}
118119
"""
119120

120121
try:
121122
response = llm_model.generate_content(prompt)
122-
result_text = response.text.replace('```json', '').replace('```', '').strip()
123-
data = json.loads(result_text)
123+
data = json.loads(response.text) # Clean parsing since response_mime_type is JSON
124124

125125
category = data.get("category", "safe")
126126
reasoning = data.get("reasoning", "No explanation provided.")
@@ -133,21 +133,29 @@ def _verify_with_llm(self, element_text, layout_label):
133133

134134
def analyze_detections(self, hf_api_response):
135135
"""Orchestrates the two-stage pipeline."""
136-
ai_predictions = hf_api_response.get("flagged_elements", [])
137-
print("\n[*] Running Stage 2 LLM Classification on flagged elements...")
136+
137+
# Accommodates both the new list structure (stitched chunk results) and the old dictionary structure
138+
if isinstance(hf_api_response, list):
139+
ai_predictions = hf_api_response
140+
else:
141+
ai_predictions = hf_api_response.get("flagged_elements", [])
142+
143+
print(f"\n[*] Running Stage 2 LLM Classification on {len(ai_predictions)} flagged elements...")
138144

139145
for detection in ai_predictions:
140-
layout_label = detection.get("predicted_label")
141-
bbox = detection.get("box_2d") or [0, 0, 0, 0]
146+
# Safely extract data handling potential missing keys from Hugging Face output
147+
layout_label = detection.get("layoutlm_label", "deceptive_element")
148+
bbox = detection.get("box_2d", [0, 0, 0, 0])
142149
element_text = detection.get("text", "")
143150

144-
if layout_label in ["action_button", "overlay_content", "deceptive_element"]:
151+
if element_text:
145152
print(f"[*] Auditing text: '{element_text}'")
146153

147154
category, reasoning = self._verify_with_llm(element_text, layout_label)
148-
time.sleep(1.5)
155+
time.sleep(1.5) # Prevents Gemini API rate limiting
149156

150157
if category in REGULATORY_MAP:
158+
print(f" [!] Violation Found: {category}")
151159
rule = REGULATORY_MAP[category]
152160
self.trust_score -= rule["penalty"]
153161

0 commit comments

Comments
 (0)