Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions backend/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,8 @@ dependencies = [

[tool.pytest.ini_options]
addopts = ["--tb=short"]
# Custom markers for selecting test subsets, e.g. `pytest -m conformance`.
# Each entry is "<marker name>: <description>".
markers = [
"conformance: API conformance/parity tests against production APIs",
"external: requires live API credentials (tokens/keys)",
"replica_only: tests against replica only (no external credentials needed)",
]
141 changes: 141 additions & 0 deletions backend/tests/integration/test_slack_api_docs.py
Original file line number Diff line number Diff line change
Expand Up @@ -357,3 +357,144 @@ async def test_search_messages_doc_shape(self, slack_client: AsyncClient) -> Non
}
assert expected_match_keys <= match.keys()
assert HIGHLIGHT_START in match["text"] and HIGHLIGHT_END in match["text"]

async def test_auth_test_doc_shape(self, slack_client: AsyncClient) -> None:
    """Check that auth.test succeeds and exposes the identity fields.

    The response must carry ``user_id``, ``user``, ``team_id`` and ``team``,
    and ``user_id`` must equal ``USER_AGENT``.
    """
    response = await slack_client.post("/auth.test", json={})
    assert response.status_code == 200

    payload = response.json()
    assert payload["ok"] is True

    required_keys = {"user_id", "user", "team_id", "team"}
    assert required_keys.issubset(payload.keys())
    assert payload["user_id"] == USER_AGENT

async def test_chat_update_doc_shape(self, slack_client: AsyncClient) -> None:
    """Post a message, edit it through chat.update, and check the reply shape.

    The update response must contain ``ok``, ``channel``, ``ts`` and ``text``,
    with ``text`` reflecting the edited content.
    """
    # Seed a message so there is a timestamp to target with the update.
    original = await slack_client.post(
        "/chat.postMessage",
        json={"channel": CHANNEL_GENERAL, "text": "Original text for update"},
    )
    assert original.status_code == 200
    message_ts = original.json()["ts"]

    updated = await slack_client.post(
        "/chat.update",
        json={"channel": CHANNEL_GENERAL, "ts": message_ts, "text": "Updated text"},
    )
    assert updated.status_code == 200

    body = updated.json()
    assert body["ok"] is True
    assert {"ok", "channel", "ts", "text"}.issubset(body.keys())
    assert body["text"] == "Updated text"

async def test_conversations_archive_doc_shape(
    self, slack_client: AsyncClient
) -> None:
    """Create a fresh public channel and check conversations.archive reports ok."""
    # A uniquely named channel keeps this test independent of earlier runs.
    new_channel = {"name": _unique_name("doc-archive"), "is_private": False}
    created = await slack_client.post("/conversations.create", json=new_channel)
    assert created.status_code == 200
    target_id = created.json()["channel"]["id"]

    archived = await slack_client.post(
        "/conversations.archive", json={"channel": target_id}
    )
    assert archived.status_code == 200
    assert archived.json()["ok"] is True

async def test_conversations_unarchive_doc_shape(
    self, slack_client: AsyncClient
) -> None:
    """Create and archive a channel, then check conversations.unarchive reports ok.

    The archive call is a precondition, so its response is verified as well.
    Otherwise a failed archive would only show up later as a confusing
    unarchive failure.
    """
    channel_name = _unique_name("doc-unarch")
    create_resp = await slack_client.post(
        "/conversations.create", json={"name": channel_name, "is_private": False}
    )
    assert create_resp.status_code == 200
    channel_id = create_resp.json()["channel"]["id"]

    # Setup: the channel must really be archived before it can be unarchived.
    archive_resp = await slack_client.post(
        "/conversations.archive", json={"channel": channel_id}
    )
    assert archive_resp.status_code == 200
    assert archive_resp.json()["ok"] is True

    resp = await slack_client.post(
        "/conversations.unarchive", json={"channel": channel_id}
    )
    assert resp.status_code == 200
    data = resp.json()
    assert data["ok"] is True

async def test_conversations_rename_doc_shape(
    self, slack_client: AsyncClient
) -> None:
    """Create a channel, rename it, and check the response echoes the new name."""
    created = await slack_client.post(
        "/conversations.create",
        json={"name": _unique_name("doc-rename"), "is_private": False},
    )
    assert created.status_code == 200
    target_id = created.json()["channel"]["id"]

    # The replacement name is generated only after the channel exists.
    renamed_to = _unique_name("doc-renamed")
    rename_payload = {"channel": target_id, "name": renamed_to}
    renamed = await slack_client.post("/conversations.rename", json=rename_payload)
    assert renamed.status_code == 200

    body = renamed.json()
    assert body["ok"] is True
    assert body["channel"]["name"] == renamed_to

async def test_conversations_kick_doc_shape(
    self, slack_client: AsyncClient, slack_client_john: AsyncClient
) -> None:
    """Invite USER_JOHN to a fresh channel, then check conversations.kick reports ok.

    The invite is a precondition for the kick, so its response is verified
    too. That way a broken invite is reported where it happens rather than
    as a kick failure.
    """
    # NOTE(review): `slack_client_john` is requested but never used in the body;
    # presumably the fixture is needed for its setup side effects - confirm.
    channel_name = _unique_name("doc-kick")
    create_resp = await slack_client.post(
        "/conversations.create", json={"name": channel_name, "is_private": False}
    )
    assert create_resp.status_code == 200
    channel_id = create_resp.json()["channel"]["id"]

    # Setup: the user must be a member of the channel before being removed.
    invite_resp = await slack_client.post(
        "/conversations.invite",
        json={"channel": channel_id, "users": USER_JOHN},
    )
    assert invite_resp.status_code == 200
    assert invite_resp.json()["ok"] is True

    resp = await slack_client.post(
        "/conversations.kick",
        json={"channel": channel_id, "user": USER_JOHN},
    )
    assert resp.status_code == 200
    data = resp.json()
    assert data["ok"] is True

async def test_conversations_members_doc_shape(
    self, slack_client: AsyncClient
) -> None:
    """Check conversations.members returns a member list plus response_metadata."""
    members_url = f"/conversations.members?channel={CHANNEL_GENERAL}&limit=10"
    reply = await slack_client.get(members_url)
    assert reply.status_code == 200

    body = reply.json()
    assert body["ok"] is True
    assert "members" in body
    assert isinstance(body["members"], list)
    assert "response_metadata" in body

async def test_users_list_doc_shape(self, slack_client: AsyncClient) -> None:
    """Check users.list returns a member list whose entries have the core keys.

    When at least one user is returned, the first one must expose ``id``,
    ``name`` and ``profile``. An empty list passes without a per-user check.
    """
    reply = await slack_client.get("/users.list?limit=5")
    assert reply.status_code == 200

    body = reply.json()
    assert body["ok"] is True
    assert "members" in body
    assert isinstance(body["members"], list)

    # The per-user shape is only checked when the list is non-empty.
    if not body["members"]:
        return
    first_user = body["members"][0]
    assert {"id", "name", "profile"}.issubset(first_user.keys())

async def test_users_conversations_doc_shape(
    self, slack_client: AsyncClient
) -> None:
    """Check users.conversations for USER_AGENT returns a ``channels`` list."""
    query_url = f"/users.conversations?user={USER_AGENT}&limit=5"
    reply = await slack_client.get(query_url)
    assert reply.status_code == 200

    body = reply.json()
    assert body["ok"] is True
    assert "channels" in body
    assert isinstance(body["channels"], list)
95 changes: 95 additions & 0 deletions backend/tests/validation/CONFORMANCE.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
# API Conformance Testing

## Overview

This directory contains conformance tests that validate Agent-Diff API replicas against their real-world production counterparts. The tests compare **response schema/shape**, **status codes**, **error semantics**, **mutation behavior**, and **pagination** — not exact values, since IDs and timestamps naturally differ between environments.

## What Existed Before

Prior to this expansion, conformance tests existed for Box, Calendar, and Linear as production parity tests, and for Slack only as docs-golden (replica-only) tests. Coverage was uneven:

- **Box**: Comprehensive — response shapes, error codes (404/400/409), edge cases, pagination, field filtering
- **Calendar**: Moderate — response shapes and basic error handling (404), but no pagination parity or extended error coverage
- **Linear**: Query-focused — GraphQL filter testing and schema introspection, but limited error parity and no pagination testing
- **Slack**: No production parity — only docs-golden tests validating response shapes against the Slack API documentation, not the live API

## What Was Added

As requested by reviewers, we expanded the conformance suite to cover all four services uniformly:

### New: Slack Production Parity (`test_slack_parity.py`)

Built from scratch following the Box testing pattern. Compares Slack replica against the real Slack API across:
- **Read-only shape parity**: auth.test, users.info, users.list, conversations.list, conversations.info, conversations.history, conversations.members, users.conversations
- **Write operation parity**: conversations.create, chat.postMessage, chat.update, chat.delete, conversations.setTopic, conversations.rename, conversations.invite, conversations.kick, conversations.open, conversations.join, conversations.leave, conversations.archive, conversations.unarchive, conversations.replies
- **Error parity**: no_text, channel_not_found, message_not_found, user_not_found, already_archived
- **Pagination parity**: cursor-based pagination for conversations.list, conversations.history, users.list

### Expanded: Calendar (`test_calendar_parity_comprehensive.py`)

Added two new test sections:
- **Extended error handling**: Invalid time ranges (end before start), missing required fields, delete non-existent calendar, events for non-existent calendar, ACL with invalid role
- **Pagination parity**: Events and CalendarList with maxResults=1, nextPageToken following

### Expanded: Linear (`test_linear_parity_comprehensive.py`)

Added two new test sections, on top of an earlier round of fixes:
- **Error response parity**: Non-existent issue by UUID, mutation with invalid team ID, malformed UUID — validates both environments return errors for the same inputs
- **Pagination parity**: issues(first:1) and issues(last:1) pageInfo shape, cursor-based pagination following
- **Earlier fixes**: Removed 3 invalid test cases that tested replica extensions not present in production (labels.none, comments.none filters; missing title validation strictness)

### Existing: Slack Docs-Golden (`test_slack_conformance.py`)

Retained as a complementary replica-only validation layer (22 tests). These run without API credentials and validate response shapes against documented Slack API contracts.

## Results

| Service | Tests | Passed | Rate | Skipped | Method |
|---------|-------|--------|------|---------|--------|
| Box | 106 | 105 | **99%** | 0 | Production parity (REST) |
| Calendar | 85 | 79 | **92%** | 0 | Production parity (REST) |
| Linear | 96 | 94 | **97%** | 0 | Production parity (GraphQL) + introspection |
| Slack (parity) | 27 | 27 | **100%** | 7 | Production parity (REST) |
| Slack (docs-golden) | 22 | 22 | **100%** | 0 | Replica vs documented contracts |
| **Total** | **336** | **327** | **97%** | **7** | |

### What Passed

Across all four services, the following core API behaviors are confirmed to match production:

- **Response schema/shape parity**: All CRUD operations (create, read, update, delete) return structurally identical responses between replicas and production APIs. Field names, nesting, types, and list structures match.
- **Error code parity**: Replicas return the same error codes as production for invalid inputs — `404` for non-existent resources, `400` for malformed requests, `channel_not_found` / `user_not_found` / `no_text` / `message_not_found` for Slack-specific errors.
- **Pagination behavior**: Cursor-based (Slack, Linear) and token-based (Calendar) pagination produces structurally identical responses. Page sizes are respected and continuation tokens work correctly.
- **Mutation semantics**: Create, update, and delete operations produce equivalent state changes and response shapes across all services.
- **GraphQL schema fidelity** (Linear): Introspection comparison confirms that query/mutation fields, input types, and object types are aligned between production and replica on all benchmark-relevant surfaces.

### Minor Issues Identified

The expanded test suite identified a small number of minor discrepancies, none of which affect benchmark scoring or the validity of reported results. These will be addressed before publication:

- **Calendar**: The replica accepts events whose end time precedes their start time, whereas Google Calendar rejects them with HTTP 400; this is an input-validation gap in the replica, which processes the request rather than rejecting it. In addition, four event list responses are missing computed fields that Google injects server-side. Neither discrepancy affects the benchmark, because no benchmark task depends on time-range validation rejection or on these specific computed fields.
- **Linear**: Schema introspection detects 2 fields recently added to Linear's production API (`activity`, `hasSharedUsers` on `IssueFilter`) that the replica does not yet implement. These are new Linear features not used by any benchmark task.
- **Box**: One edge case in collection operations behaves differently from production. It does not affect any benchmark task.

## How to Run

```bash
# All conformance tests
pytest -m conformance -v

# Individual services (production parity — requires API credentials)
BOX_DEV_TOKEN=<token> pytest tests/validation/test_box_parity.py -v -s
GOOGLE_CALENDAR_ACCESS_TOKEN=<token> pytest tests/validation/test_calendar_parity_comprehensive.py -v -s
LINEAR_API_KEY=<key> pytest tests/validation/test_linear_parity_comprehensive.py -v -s
SLACK_BOT_TOKEN=<token> pytest tests/validation/test_slack_parity.py -v -s

# Slack docs-golden (no credentials needed, runs against replica)
pytest tests/validation/test_slack_conformance.py -v

# Or run standalone with detailed output:
BOX_DEV_TOKEN=<token> python tests/validation/test_box_parity.py
```

**Prerequisites:**
- Backend replica running (`cd ops && make up`)
- For Slack docs-golden: run inside Docker (`docker exec ops-backend-1 pytest ...`) or have local database access
Loading
Loading