Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 13 additions & 11 deletions simdrive/src/simdrive/observe.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,17 +206,19 @@ def observe(

marks: list[Mark] = []
annotated_path: Path | None = None
if annotate:
marks = som.detect_marks(raw_path)
if marks:
annotated_path = out_dir / f"observe-{ts}-som.png"
# Annotate the *unfiltered* image so the on-disk PNG keeps the full
# context for human review — filtering is for the JSON payload only.
som.annotate(raw_path, marks, annotated_path)
# Apply token-efficiency filters AFTER annotation so the PNG retains
# every detected mark, but the in-memory + JSON `marks` list reflects
# what the agent actually receives.
marks = _apply_filters(marks, confidence_floor, mark_limit)
# F#7 (b5): always detect marks so text targeting works regardless of annotate flag.
# When annotate=True, also draw the SoM overlay and set annotated_path.
# When annotate=False, skip drawing — marks are still returned, annotated_path stays None.
marks = som.detect_marks(raw_path)
if annotate and marks:
annotated_path = out_dir / f"observe-{ts}-som.png"
# Annotate the *unfiltered* image so the on-disk PNG keeps the full
# context for human review — filtering is for the JSON payload only.
som.annotate(raw_path, marks, annotated_path)
# Apply token-efficiency filters AFTER annotation so the PNG retains
# every detected mark, but the in-memory + JSON `marks` list reflects
# what the agent actually receives.
marks = _apply_filters(marks, confidence_floor, mark_limit)

logs_text: str | None = None
if capture_logs:
Expand Down
9 changes: 8 additions & 1 deletion simdrive/src/simdrive/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -942,7 +942,14 @@ def _resolve_target_xy(s, args: dict) -> tuple[int, int, str, "som.Mark | dict |
candidates, tier = som.find_text_candidates(marks, query)
if not candidates:
available = [_mark_attr(mk, "text") for mk in marks]
raise errors.target_not_found("text", query, available)
err = errors.target_not_found("text", query, available)
# F#5 — include fuzzy suggestion so agents know the closest real mark.
import difflib
available_texts = [t for t in available if t]
matches = difflib.get_close_matches(query, available_texts, n=1, cutoff=0.5)
if matches:
err.details["suggestion"] = matches[0]
raise err
# F#6 — >1 marks tied at the winning tier ⇒ refuse to silent-pick.
# The agent must re-target by stable_id / mark / xy. Single-match (even
# when other tiers also have matches) still resolves unambiguously.
Expand Down
16 changes: 16 additions & 0 deletions simdrive/src/simdrive/som.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,15 @@
# misc UI
"welcome", "hello", "goodbye", "logout", "trial", "free", "premium",
"upgrade", "subscribe", "subscription",
# iOS settings / system UI vocabulary (F#18 — Apple Preferences labels)
"general", "privacy", "bluetooth", "wi-fi", "wifi", "notifications",
"sounds", "haptics", "focus", "screen", "time", "accessibility",
"siri", "safari", "maps", "health", "wallet", "facetime", "photos",
"camera", "messages", "mail", "calendar", "contacts", "reminders",
"notes", "icloud", "itunes", "store", "appstore", "airdrop", "airplay",
"display", "brightness", "battery", "storage", "privacy", "security",
"passcode", "touchid", "faceid", "cellular", "vpn", "hotspot",
"language", "region", "keyboard", "reset", "update", "software",
# common content nouns / verbs that show up in titles & cells
"dance", "partner", "story", "tale", "world", "people", "person",
"place", "thing", "year", "day", "way", "man", "woman", "child",
Expand Down Expand Up @@ -190,6 +199,11 @@ class Mark:
raw_confidence: Optional[float] = None
# `confidence_band` is the dictionary-gated quality bucket. None = compute lazily.
_band: Optional[str] = field(default=None, repr=False)
# F#4 — b5: alternate OCR readings for this element seen across consecutive
# observations. Populated by the OCR smoothing layer when consecutive observes
# produce different text for the same spatial region. Defaults to empty list;
# callers may set this after construction.
alternates: list = field(default_factory=list)

def __post_init__(self) -> None:
# If callers constructed a Mark with only `confidence`, that value is
Expand Down Expand Up @@ -276,6 +290,8 @@ def to_dict(self) -> dict:
"raw_confidence": round(float(self.raw_confidence or 0.0), 3),
# `confidence_band` is the human-readable quality bucket.
"confidence_band": self.confidence_band,
# F#4 — alternate OCR readings seen across consecutive observations.
"alternates": list(self.alternates),
}

def to_compact_dict(self) -> dict:
Expand Down
1 change: 1 addition & 0 deletions simdrive/tests/test_a12_marks_parity.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
"confidence",
"raw_confidence",
"confidence_band",
"alternates", # F#4 (b5): OCR alternate readings field
})

# ── Minimal 1×1 PNG (PIL-readable) ───────────────────────────────────────────
Expand Down
Loading
Loading