From ab7ba4dd5d5be1f8c910632c3813c9f7a1fd3e63 Mon Sep 17 00:00:00 2001 From: Aaron Goldsmith Date: Sat, 21 Mar 2026 09:27:13 -0700 Subject: [PATCH 1/2] Add competition task sets and agent cleanup script - 25 general competition tasks (scripts/competition_tasks.json) - 15 agentic tasks across 3 tiers requiring tool use (scripts/competition_tasks_agentic.json) - cleanup_agents.py for pruning dead weight agents and fixing champion flags - Gitignore .tree-workspace/ experiment artifacts All tasks use relative paths and cross-platform commands. Co-Authored-By: Claude Opus 4.6 --- .gitignore | 64 ++++----- scripts/cleanup_agents.py | 138 +++++++++++++++++++ scripts/competition_tasks.json | 125 +++++++++++++++++ scripts/competition_tasks_agentic.json | 180 +++++++++++++++++++++++++ 4 files changed, 472 insertions(+), 35 deletions(-) create mode 100644 scripts/cleanup_agents.py create mode 100644 scripts/competition_tasks.json create mode 100644 scripts/competition_tasks_agentic.json diff --git a/.gitignore b/.gitignore index 688f4a4..01ff35a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,37 +1,31 @@ -# Python -__pycache__/ -*.py[cod] -*.egg-info/ -*.egg -dist/ -build/ +commit fe66e75cc7f79b4ed77b2c8490f4e19862924bad +Author: Aaron Goldsmith +Date: Sat Mar 21 09:13:05 2026 -0700 -# Virtual environments -.venv/ -venv/ + Add agentic competition tasks, agent definitions, and skills + + - Agent definitions: competition-tasks, depth-test, tree-solver + - Skills: mobius-evolve (free Opus evolution), tree-solve (recursive decomposition) + - Competition tasks: standard + agentic (tool-heavy, multi-tier) + - Cleanup script for dead-weight agents + - Fix hardcoded paths in agentic tasks to use relative paths + - Make system monitoring task cross-platform (Unix tools) + - Remove unused import in cleanup_agents.py + - Add .tree-workspace/ to .gitignore + + Co-Authored-By: Claude Opus 4.6 -# Runtime data -data/mobius.db -data/mobius.db-shm -data/mobius.db-wal -data/mobius.log 
-data/*.html -data/*.md - -# Secrets -.env - -# Testing -.pytest_cache/ -.coverage -htmlcov/ - -# IDE -.idea/ -.vscode/ -*.swp -*.swo - -# OS -.DS_Store -Thumbs.db +diff --git a/.gitignore b/.gitignore +index 688f4a4..0dc8b78 100644 +--- a/.gitignore ++++ b/.gitignore +@@ -32,6 +32,9 @@ htmlcov/ + *.swp + *.swo + ++# Agent workspaces ++.tree-workspace/ ++ + # OS + .DS_Store + Thumbs.db diff --git a/scripts/cleanup_agents.py b/scripts/cleanup_agents.py new file mode 100644 index 0000000..a08626c --- /dev/null +++ b/scripts/cleanup_agents.py @@ -0,0 +1,138 @@ +commit fe66e75cc7f79b4ed77b2c8490f4e19862924bad +Author: Aaron Goldsmith +Date: Sat Mar 21 09:13:05 2026 -0700 + + Add agentic competition tasks, agent definitions, and skills + + - Agent definitions: competition-tasks, depth-test, tree-solver + - Skills: mobius-evolve (free Opus evolution), tree-solve (recursive decomposition) + - Competition tasks: standard + agentic (tool-heavy, multi-tier) + - Cleanup script for dead-weight agents + - Fix hardcoded paths in agentic tasks to use relative paths + - Make system monitoring task cross-platform (Unix tools) + - Remove unused import in cleanup_agents.py + - Add .tree-workspace/ to .gitignore + + Co-Authored-By: Claude Opus 4.6 + +diff --git a/scripts/cleanup_agents.py b/scripts/cleanup_agents.py +new file mode 100644 +index 0000000..4edaa4f +--- /dev/null ++++ b/scripts/cleanup_agents.py +@@ -0,0 +1,115 @@ ++"""Clean dead weight agents and fix champion flags in the Mobius registry.""" ++ ++import argparse ++import sqlite3 ++import sys ++from pathlib import Path ++ ++DB_PATH = Path("data/mobius.db") ++ ++ ++def get_connection(db_path: Path) -> sqlite3.Connection: ++ conn = sqlite3.connect(str(db_path)) ++ conn.row_factory = sqlite3.Row ++ return conn ++ ++ ++def list_zero_match_agents(conn: sqlite3.Connection) -> list[dict]: ++ """Find agents with 0 total matches.""" ++ rows = conn.execute( ++ "SELECT id, name, slug, elo_rating, is_champion, created_at " ++ "FROM 
agents WHERE total_matches = 0 ORDER BY created_at" ++ ).fetchall() ++ return [dict(r) for r in rows] ++ ++ ++def list_champions(conn: sqlite3.Connection) -> list[dict]: ++ """Find agents marked as champions with their stats.""" ++ rows = conn.execute( ++ "SELECT id, name, slug, is_champion, elo_rating, win_rate, total_matches " ++ "FROM agents WHERE is_champion = 1 ORDER BY elo_rating DESC" ++ ).fetchall() ++ return [dict(r) for r in rows] ++ ++ ++def list_all_agents_summary(conn: sqlite3.Connection) -> list[dict]: ++ """Quick summary of all agents.""" ++ rows = conn.execute( ++ "SELECT id, name, slug, elo_rating, win_rate, total_matches, is_champion " ++ "FROM agents ORDER BY elo_rating DESC" ++ ).fetchall() ++ return [dict(r) for r in rows] ++ ++ ++def retire_zero_match_agents(conn: sqlite3.Connection) -> int: ++ """Set elo_rating=0 and is_champion=0 for agents with 0 matches.""" ++ cursor = conn.execute( ++ "UPDATE agents SET elo_rating = 0.0, is_champion = 0 " ++ "WHERE total_matches = 0" ++ ) ++ conn.commit() ++ return cursor.rowcount ++ ++ ++def clear_all_champion_flags(conn: sqlite3.Connection) -> int: ++ """Clear is_champion on all agents.""" ++ cursor = conn.execute("UPDATE agents SET is_champion = 0 WHERE is_champion = 1") ++ conn.commit() ++ return cursor.rowcount ++ ++ ++def main(): ++ parser = argparse.ArgumentParser(description="Clean dead weight agents and fix champion flags") ++ parser.add_argument("--execute", action="store_true", help="Actually apply changes (default is dry-run)") ++ parser.add_argument("--db", type=Path, default=DB_PATH, help="Path to mobius.db") ++ args = parser.parse_args() ++ ++ if not args.db.exists(): ++ print(f"ERROR: Database not found at {args.db}") ++ sys.exit(1) ++ ++ conn = get_connection(args.db) ++ mode = "EXECUTE" if args.execute else "DRY-RUN" ++ print(f"=== Mobius Agent Cleanup [{mode}] ===\n") ++ ++ # Summary ++ all_agents = list_all_agents_summary(conn) ++ print(f"Total agents in registry: {len(all_agents)}\n") 
++ ++ # Zero-match agents ++ zero_match = list_zero_match_agents(conn) ++ print(f"--- Agents with 0 matches ({len(zero_match)}) ---") ++ if zero_match: ++ for a in zero_match: ++ champ_flag = " [CHAMPION]" if a["is_champion"] else "" ++ print(f" {a['slug']:30s} elo={a['elo_rating']:7.1f}{champ_flag} created={a['created_at']}") ++ else: ++ print(" (none)") ++ print() ++ ++ # Champions ++ champions = list_champions(conn) ++ print(f"--- Current champions ({len(champions)}) ---") ++ if champions: ++ for a in champions: ++ print(f" {a['slug']:30s} elo={a['elo_rating']:7.1f} win_rate={a['win_rate']:.2%} matches={a['total_matches']}") ++ else: ++ print(" (none)") ++ print() ++ ++ if not args.execute: ++ print("[DRY-RUN] No changes made. Re-run with --execute to apply.") ++ print(f" Would retire {len(zero_match)} zero-match agents (set elo=0, is_champion=0)") ++ print(f" Would clear is_champion flag on {len(champions)} agents") ++ else: ++ retired = retire_zero_match_agents(conn) ++ cleared = clear_all_champion_flags(conn) ++ print(f"[EXECUTE] Retired {retired} zero-match agents (elo set to 0)") ++ print(f"[EXECUTE] Cleared champion flag on {cleared} agents") ++ print("Done.") ++ ++ conn.close() ++ ++ ++if __name__ == "__main__": ++ main() diff --git a/scripts/competition_tasks.json b/scripts/competition_tasks.json new file mode 100644 index 0000000..575839c --- /dev/null +++ b/scripts/competition_tasks.json @@ -0,0 +1,125 @@ +commit fe66e75cc7f79b4ed77b2c8490f4e19862924bad +Author: Aaron Goldsmith +Date: Sat Mar 21 09:13:05 2026 -0700 + + Add agentic competition tasks, agent definitions, and skills + + - Agent definitions: competition-tasks, depth-test, tree-solver + - Skills: mobius-evolve (free Opus evolution), tree-solve (recursive decomposition) + - Competition tasks: standard + agentic (tool-heavy, multi-tier) + - Cleanup script for dead-weight agents + - Fix hardcoded paths in agentic tasks to use relative paths + - Make system monitoring task cross-platform (Unix 
tools) + - Remove unused import in cleanup_agents.py + - Add .tree-workspace/ to .gitignore + + Co-Authored-By: Claude Opus 4.6 + +diff --git a/scripts/competition_tasks.json b/scripts/competition_tasks.json +new file mode 100644 +index 0000000..5c8eca9 +--- /dev/null ++++ b/scripts/competition_tasks.json +@@ -0,0 +1,102 @@ ++[ ++ { ++ "task": "Implement a rate limiter using the token bucket algorithm in Python with thread safety. Include a clean API: `RateLimiter(rate, capacity)` with an `acquire()` method that blocks until a token is available and a `try_acquire()` that returns immediately.", ++ "category": "code-generation-python" ++ }, ++ { ++ "task": "Write a Python function that solves the 0/1 knapsack problem using dynamic programming. It should accept a list of items (each with weight and value) and a capacity, and return both the maximum value and the selected items. Include proper type hints.", ++ "category": "algorithms" ++ }, ++ { ++ "task": "Implement a JavaScript EventEmitter class from scratch (no dependencies). Support `on`, `off`, `once`, and `emit` methods. 
Handle edge cases: removing a listener during emit, adding a listener during emit, and error events with no handler.", ++ "category": "code-generation-js" ++ }, ++ { ++ "task": "Review the following Python code for bugs and security issues:\n\n```python\nimport sqlite3, os\n\ndef get_user(db_path, username):\n conn = sqlite3.connect(db_path)\n cursor = conn.execute(f\"SELECT * FROM users WHERE name = '{username}'\")\n user = cursor.fetchone()\n return user\n\ndef save_upload(data, filename):\n path = os.path.join('/uploads', filename)\n with open(path, 'wb') as f:\n f.write(data)\n return path\n\ndef hash_password(password):\n import hashlib\n return hashlib.md5(password.encode()).hexdigest()\n```\n\nIdentify every bug and vulnerability, explain the impact, and provide corrected code.", ++ "category": "code-review" ++ }, ++ { ++ "task": "Design a URL shortener service that handles 100M URLs and 1B redirects per month. Cover: API design, storage schema, hash/ID generation strategy, redirect flow, caching layer, and how you'd handle analytics. Provide concrete technology choices with justification.", ++ "category": "system-design" ++ }, ++ { ++ "task": "Design the architecture for a real-time collaborative document editor (like Google Docs). Address: conflict resolution strategy (OT vs CRDT), WebSocket connection management, persistence layer, presence/cursor tracking, and offline support. 
Include a component diagram.", ++ "category": "system-design" ++ }, ++ { ++ "task": "Perform a security audit of this Express.js middleware stack:\n\n```javascript\nconst express = require('express');\nconst app = express();\napp.use(express.json());\napp.use((req, res, next) => {\n res.header('Access-Control-Allow-Origin', '*');\n res.header('Access-Control-Allow-Headers', '*');\n next();\n});\napp.post('/api/exec', (req, res) => {\n const { cmd } = req.body;\n const result = require('child_process').execSync(cmd).toString();\n res.json({ output: result });\n});\napp.get('/api/file', (req, res) => {\n const filePath = req.query.path;\n res.sendFile(filePath);\n});\napp.listen(3000);\n```\n\nList all vulnerabilities with OWASP classification, severity rating, and remediated code.", ++ "category": "security" ++ }, ++ { ++ "task": "You are given a Python web app that stores user sessions in a Redis-backed cookie store. Users report being randomly logged out. The session TTL is 30 minutes, but some users lose sessions after 5 minutes. Write a systematic debugging plan: what logs to check, what metrics to gather, specific Redis commands to run, and potential root causes ranked by likelihood.", ++ "category": "debugging" ++ }, ++ { ++ "task": "Write a comprehensive test suite for a Python `BankAccount` class that supports `deposit(amount)`, `withdraw(amount)`, `transfer(other_account, amount)`, and `get_balance()`. Cover: happy paths, overdraft protection, concurrent transfers, negative amounts, floating point precision, and transaction atomicity. Use pytest.", ++ "category": "testing" ++ }, ++ { ++ "task": "Write a testing strategy document for a microservices-based e-commerce platform with services for: user auth, product catalog, shopping cart, payment processing, and order fulfillment. 
Define the test pyramid, specify what to unit test vs integration test vs e2e test, and recommend specific tools and patterns for contract testing between services.", ++ "category": "testing-strategy" ++ }, ++ { ++ "task": "Write a Dockerfile and docker-compose.yml for a Python FastAPI app with PostgreSQL, Redis, and Celery workers. Requirements: multi-stage build, non-root user, health checks, proper signal handling, volume mounts for development, and production-ready defaults. Include a .dockerignore.", ++ "category": "devops" ++ }, ++ { ++ "task": "Design a CI/CD pipeline for a monorepo containing three microservices (Python, Node.js, Go) and a shared protobuf definitions package. The pipeline should: only build changed services, run tests in parallel, build and push Docker images, deploy to staging automatically, and require manual approval for production. Use GitHub Actions.", ++ "category": "devops" ++ }, ++ { ++ "task": "Write SQL queries for an e-commerce analytics dashboard against this schema:\n- `orders(id, user_id, total, status, created_at)`\n- `order_items(id, order_id, product_id, quantity, price)`\n- `products(id, name, category, price)`\n- `users(id, email, created_at, country)`\n\nQueries needed:\n1. Monthly revenue with month-over-month growth percentage\n2. Top 10 products by revenue with their return rate (status='returned')\n3. Customer cohort retention: for each signup month, what % ordered in months 1-6\n4. Rolling 7-day average order value\n5. 
Products frequently bought together (association rules)", ++ "category": "data-sql" ++ }, ++ { ++ "task": "Given a CSV dataset of 500K customer support tickets with columns (ticket_id, created_at, resolved_at, category, priority, agent_id, satisfaction_score, first_response_minutes), write a Python analysis script using pandas that: calculates resolution time distributions per category, identifies bottleneck categories, finds the best and worst performing agents by resolution time and satisfaction, and generates a summary report with actionable recommendations.", ++ "category": "data-analysis" ++ }, ++ { ++ "task": "Write a clear, concise README for a CLI tool called `logslice` that filters and aggregates log files. Features: regex pattern matching, time range filtering, JSON log parsing, output formats (table, CSV, JSON), and pipe-friendly. The README should include: badges, install instructions, quick start, usage examples, configuration, and contributing guide. Make it scannable and developer-friendly.", ++ "category": "documentation" ++ }, ++ { ++ "task": "Refactor this Python code to improve readability, reduce complexity, and follow SOLID principles:\n\n```python\ndef process_order(order):\n if order['type'] == 'physical':\n if order['weight'] > 50:\n shipping = order['weight'] * 0.5 + 10\n else:\n shipping = order['weight'] * 0.3 + 5\n if order['country'] != 'US':\n shipping *= 2.5\n tax = order['total'] * 0.08\n if order['state'] in ['OR', 'MT', 'NH', 'DE']:\n tax = 0\n total = order['total'] + shipping + tax\n if order.get('coupon'):\n if order['coupon']['type'] == 'percent':\n total -= total * (order['coupon']['value'] / 100)\n else:\n total -= order['coupon']['value']\n send_email(order['email'], f'Order total: ${total:.2f}')\n update_inventory(order['items'])\n return {'total': total, 'shipping': shipping, 'tax': tax}\n elif order['type'] == 'digital':\n tax = order['total'] * 0.08\n if order['state'] in ['OR', 'MT', 'NH', 'DE']:\n tax = 0\n total = 
order['total'] + tax\n if order.get('coupon'):\n if order['coupon']['type'] == 'percent':\n total -= total * (order['coupon']['value'] / 100)\n else:\n total -= order['coupon']['value']\n send_email(order['email'], f'Download link: {generate_link(order)}')\n return {'total': total, 'tax': tax}\n```\n\nProvide the refactored code with explanation of each improvement.", ++ "category": "refactoring" ++ }, ++ { ++ "task": "Profile and optimize this Python function that processes a large dataset:\n\n```python\ndef find_duplicates(records):\n duplicates = []\n for i in range(len(records)):\n for j in range(i + 1, len(records)):\n if records[i]['email'].lower().strip() == records[j]['email'].lower().strip():\n if records[i] not in duplicates:\n duplicates.append(records[i])\n if records[j] not in duplicates:\n duplicates.append(records[j])\n return duplicates\n```\n\nThis runs on 100K records and takes over 10 minutes. Identify all performance issues, provide an optimized version with complexity analysis, and explain each optimization.", ++ "category": "performance" ++ }, ++ { ++ "task": "Implement a Python async web scraper that crawls a paginated API endpoint, respects rate limits (max 10 requests/second), handles retries with exponential backoff, and writes results to a SQLite database. Use aiohttp and include proper error handling, graceful shutdown on SIGINT, and progress reporting.", ++ "category": "code-generation-python" ++ }, ++ { ++ "task": "Implement a trie (prefix tree) data structure in Python that supports: insert, search, delete, autocomplete (return all words with a given prefix), and wildcard search (where '.' matches any single character). Include full type hints and a clean, well-documented API.", ++ "category": "algorithms" ++ }, ++ { ++ "task": "You have a PostgreSQL database with a `transactions` table (50M rows) and queries are taking 30+ seconds. The table has columns: id, account_id, amount, type, status, created_at, metadata (JSONB). 
Common query patterns: filter by account_id + date range, aggregate by type + status, and search within metadata. Propose an indexing strategy, query optimizations, and partitioning scheme. Show the exact SQL for index creation and explain the tradeoffs.", ++ "category": "performance-db" ++ }, ++ { ++ "task": "Write a Python decorator `@retry` that supports: max retries, exponential backoff with jitter, retrying only on specified exception types, a configurable timeout, and an `on_retry` callback. It should work with both sync and async functions. Include comprehensive docstring and usage examples.", ++ "category": "code-generation-python" ++ }, ++ { ++ "task": "Analyze this Go concurrent code for race conditions, deadlocks, and goroutine leaks:\n\n```go\nfunc ProcessBatch(items []Item) []Result {\n results := make([]Result, len(items))\n var wg sync.WaitGroup\n for i, item := range items {\n wg.Add(1)\n go func() {\n defer wg.Done()\n result, err := process(item)\n if err != nil {\n log.Fatal(err)\n }\n results[i] = result\n }()\n }\n wg.Wait()\n return results\n}\n```\n\nIdentify every concurrency issue, explain why each is dangerous, and provide a corrected version.", ++ "category": "code-review" ++ }, ++ { ++ "task": "Design a notification system that supports email, SMS, push notifications, and in-app messages. Requirements: user preferences for channel and frequency, batching/digest mode, template system, delivery tracking, retry on failure, and rate limiting per user. Provide the data model, service architecture, and key API endpoints.", ++ "category": "system-design" ++ }, ++ { ++ "task": "Write a GitHub Actions workflow that runs on every PR and: lints Python code (ruff), runs type checking (mypy), executes pytest with coverage, fails if coverage drops below 80%, posts a coverage summary comment on the PR, and caches dependencies between runs. 
The workflow should be efficient and only run relevant checks based on changed files.", ++ "category": "devops" ++ }, ++ { ++ "task": "Implement a simple in-memory key-value store in Python that supports: GET, SET with optional TTL, DELETE, KEYS with glob pattern matching, and atomic INCR/DECR operations. It should be thread-safe, clean up expired keys lazily and via a background thread, and include a simple TCP server interface (like Redis RESP protocol).", ++ "category": "code-generation-python" ++ } ++] diff --git a/scripts/competition_tasks_agentic.json b/scripts/competition_tasks_agentic.json new file mode 100644 index 0000000..cac8106 --- /dev/null +++ b/scripts/competition_tasks_agentic.json @@ -0,0 +1,180 @@ +commit fe66e75cc7f79b4ed77b2c8490f4e19862924bad +Author: Aaron Goldsmith +Date: Sat Mar 21 09:13:05 2026 -0700 + + Add agentic competition tasks, agent definitions, and skills + + - Agent definitions: competition-tasks, depth-test, tree-solver + - Skills: mobius-evolve (free Opus evolution), tree-solve (recursive decomposition) + - Competition tasks: standard + agentic (tool-heavy, multi-tier) + - Cleanup script for dead-weight agents + - Fix hardcoded paths in agentic tasks to use relative paths + - Make system monitoring task cross-platform (Unix tools) + - Remove unused import in cleanup_agents.py + - Add .tree-workspace/ to .gitignore + + Co-Authored-By: Claude Opus 4.6 + +diff --git a/scripts/competition_tasks_agentic.json b/scripts/competition_tasks_agentic.json +new file mode 100644 +index 0000000..7b23fa5 +--- /dev/null ++++ b/scripts/competition_tasks_agentic.json +@@ -0,0 +1,157 @@ ++[ ++ { ++ "task": "Create a Python CLI tool at /tmp/mobius-task-csvtool/csvtool.py that reads a CSV file, supports --filter COLUMN=VALUE, --sort COLUMN, --group-by COLUMN (with count aggregation), and --output FORMAT (table or json). Generate a test CSV with at least 100 rows of synthetic employee data (name, department, salary, hire_date). 
Run the tool with each flag combination and verify the output is correct. Fix any bugs you find.", ++ "category": "build-and-test", ++ "tier": 1, ++ "tools_required": [ ++ "Bash" ++ ], ++ "verification": "Judge checks: (1) csvtool.py exists and is executable, (2) test CSV has 100+ rows, (3) all four flag combinations produce correct output when re-run, (4) edge cases like missing columns produce helpful errors", ++ "setup": "mkdir -p /tmp/mobius-task-csvtool" ++ }, ++ { ++ "task": "Write a Bash script at /tmp/mobius-task-monitor/sysmon.sh that collects system metrics using cross-platform Unix tools (CPU load via uptime, memory via free -m, disk usage via df -h, process count via ps aux | wc -l) every 2 seconds for 10 seconds, writes each snapshot as a JSON line to metrics.jsonl, then generates a summary report (min/max/avg for each metric) to summary.txt. The script must handle missing commands gracefully (check command availability with which/command -v and skip metrics for unavailable tools). Run it and verify the output files are correct and well-formed JSON.", ++ "category": "infrastructure", ++ "tier": 1, ++ "tools_required": [ ++ "Bash" ++ ], ++ "verification": "Judge checks: (1) sysmon.sh runs without errors, (2) metrics.jsonl has 5+ valid JSON lines, (3) summary.txt has correct min/max/avg calculations, (4) script handles at least one missing-tool fallback", ++ "setup": "mkdir -p /tmp/mobius-task-monitor" ++ }, ++ { ++ "task": "Create a SQLite database at /tmp/mobius-task-sql/analytics.db with tables: users(id, name, email, signup_date, country), orders(id, user_id, amount, status, created_at), products(id, name, category, price), order_items(order_id, product_id, quantity). Populate with at least 200 orders across 50 users and 20 products using realistic data. 
Then write and execute SQL queries that answer: (1) top 5 customers by lifetime value, (2) monthly revenue trend for the last 6 months, (3) most popular product category by country, (4) customers who ordered in their first week but never again. Save all queries and results to queries.sql and results.txt.", ++ "category": "data", ++ "tier": 1, ++ "tools_required": [ ++ "Bash" ++ ], ++ "verification": "Judge checks: (1) database exists with correct schema and 200+ orders, (2) all 4 queries are syntactically valid and return non-empty results, (3) results are plausible given the data, (4) queries use appropriate JOINs/subqueries", ++ "setup": "mkdir -p /tmp/mobius-task-sql" ++ }, ++ { ++ "task": "Build a static site generator at /tmp/mobius-task-ssg/. Write a Python script ssg.py that reads Markdown files from content/, converts them to HTML using only the standard library (regex-based parsing for headers, bold, italic, links, code blocks, lists), wraps them in a template with navigation, and outputs to build/. Create at least 3 sample Markdown pages with varied formatting. Run the generator and verify the output HTML is valid (all tags closed, links work, navigation present on every page).", ++ "category": "build-and-test", ++ "tier": 1, ++ "tools_required": [ ++ "Bash" ++ ], ++ "verification": "Judge checks: (1) ssg.py runs and produces build/ directory, (2) each Markdown file has a corresponding HTML file, (3) HTML contains correctly converted headers/bold/italic/links/code/lists, (4) navigation links between pages work, (5) no unclosed HTML tags", ++ "setup": "mkdir -p /tmp/mobius-task-ssg/content /tmp/mobius-task-ssg/build" ++ }, ++ { ++ "task": "Write a Python script at /tmp/mobius-task-gitanalyze/analyze.py that analyzes the Mobius git repository (at .). 
It should produce a JSON report containing: (1) total commits per author, (2) lines of code per file extension, (3) the 5 most-changed files by commit count, (4) average commits per day over the last 30 days, (5) list of files that exist in the repo but have never been committed (untracked). Run the script and save the report to report.json. Verify the data by spot-checking at least 2 metrics against direct git commands.", ++ "category": "explore-and-analyze", ++ "tier": 1, ++ "tools_required": [ ++ "Bash" ++ ], ++ "verification": "Judge checks: (1) analyze.py runs without errors, (2) report.json contains all 5 sections with plausible data, (3) spot-check verification shows metrics match actual git output, (4) script uses subprocess to call git (not a library)", ++ "setup": "mkdir -p /tmp/mobius-task-gitanalyze" ++ }, ++ { ++ "task": "A Python web scraping pipeline has three bugs preventing it from working. Find all three bugs, fix them, and verify the pipeline produces correct output. The pipeline reads URLs from urls.txt, fetches each page (simulated via local HTML files), extracts titles and links, deduplicates by URL, and writes results to output.json.", ++ "category": "debug-and-fix", ++ "tier": 2, ++ "tools_required": [ ++ "Bash" ++ ], ++ "verification": "Judge checks: (1) all 3 bugs identified and explained, (2) fixes are minimal and correct, (3) pipeline runs end-to-end producing valid output.json, (4) deduplication works correctly", ++ "setup": "mkdir -p /tmp/mobius-task-scraper && cat > /tmp/mobius-task-scraper/pipeline.py << 'PYEOF'\nimport json\nimport re\nimport os\n\ndef read_urls(filepath):\n with open(filepath) as f:\n return f.readlines() # Bug 1: leaves newlines in URLs\n\ndef fetch_page(url):\n # Simulate fetching by reading local HTML files\n filename = url.replace('http://', '').replace('/', '_') + '.html'\n filepath = os.path.join('pages', filename)\n with open(filepath) as f:\n return f.read()\n\ndef extract_data(html):\n title_match = 
re.search(r'(.*)', html)\n title = title_match.group(0) if title_match else 'No title' # Bug 2: group(0) returns full match including tags, should be group(1)\n links = re.findall(r'href=\"([^\"]+)\"', html)\n return {'title': title, 'links': links}\n\ndef deduplicate(results):\n seen = set()\n unique = []\n for r in results:\n if r['url'] not in seen:\n seen.add(r['url'])\n unique.append(r)\n seen.add(r['url']) # Bug 3: this is outside the if block but also redundant; the real bug is that seen is checked but 'url' key doesn't exist yet\n return unique\n\ndef main():\n urls = read_urls('urls.txt')\n results = []\n for url in urls:\n html = fetch_page(url)\n data = extract_data(html)\n data['url'] = url\n results.append(data)\n unique = deduplicate(results)\n with open('output.json', 'w') as f:\n json.dump(unique, f, indent=2)\n print(f'Processed {len(unique)} unique pages')\n\nif __name__ == '__main__':\n main()\nPYEOF\nmkdir -p /tmp/mobius-task-scraper/pages && cat > /tmp/mobius-task-scraper/urls.txt << 'EOF'\nhttp://example.com/page1\nhttp://example.com/page2\nhttp://example.com/page1\nhttp://example.com/page3\nEOF\ncat > /tmp/mobius-task-scraper/pages/example.com_page1.html << 'EOF'\nFirst PageAboutContact\nEOF\ncat > /tmp/mobius-task-scraper/pages/example.com_page2.html << 'EOF'\nSecond PageAboutProducts\nEOF\ncat > /tmp/mobius-task-scraper/pages/example.com_page3.html << 'EOF'\nThird PageHome\nEOF" ++ }, ++ { ++ "task": "Investigate the Mobius codebase at . to answer these questions with evidence (file paths, line numbers, code snippets): (1) What happens when two agents tie in Elo rating during selection? Show the exact code path. (2) Is there a race condition possible when multiple competitions run concurrently? Identify specific shared state. (3) What is the maximum number of API calls a single 'mobius run' invocation can make? Trace through the call chain. (4) Find at least one code path where an exception could be silently swallowed. 
Write your findings to /tmp/mobius-task-audit/audit_report.txt with evidence for each answer.", ++ "category": "explore-and-analyze", ++ "tier": 2, ++ "tools_required": [ ++ "Bash" ++ ], ++ "verification": "Judge checks: (1) all 4 questions answered with specific file:line references, (2) code snippets provided as evidence, (3) analysis is accurate (judge can verify by checking the referenced code), (4) race condition analysis identifies real shared state", ++ "setup": "mkdir -p /tmp/mobius-task-audit" ++ }, ++ { ++ "task": "A Python HTTP server has a performance bottleneck and two security vulnerabilities. The server handles JSON API requests. Profile the server to find the bottleneck, fix it, fix both security issues, and provide before/after timing measurements. Document each finding in findings.txt.", ++ "category": "security", ++ "tier": 2, ++ "tools_required": [ ++ "Bash" ++ ], ++ "verification": "Judge checks: (1) performance bottleneck identified with timing evidence, (2) both security vulnerabilities found and classified (one is command injection, one is path traversal), (3) fixes are correct and don't break functionality, (4) before/after timing shows improvement", ++ "setup": "mkdir -p /tmp/mobius-task-security && cat > /tmp/mobius-task-security/server.py << 'PYEOF'\nimport json\nimport os\nimport subprocess\nimport hashlib\nimport time\nfrom http.server import HTTPServer, BaseHTTPRequestHandler\n\nDB = {}\n\ndef slow_hash(data):\n # Performance bottleneck: unnecessary iterations\n result = data.encode()\n for i in range(100000):\n result = hashlib.sha256(result).digest()\n return result.hex()\n\nclass APIHandler(BaseHTTPRequestHandler):\n def do_POST(self):\n length = int(self.headers.get('Content-Length', 0))\n body = json.loads(self.rfile.read(length))\n \n if self.path == '/api/store':\n key = body['key']\n value = body['value']\n hashed_key = slow_hash(key)\n DB[hashed_key] = value\n self.send_response(200)\n self.end_headers()\n 
self.wfile.write(json.dumps({'hash': hashed_key}).encode())\n \n elif self.path == '/api/exec':\n # Security vuln 1: command injection\n cmd = body.get('command', 'echo hello')\n result = subprocess.check_output(cmd, shell=True).decode()\n self.send_response(200)\n self.end_headers()\n self.wfile.write(json.dumps({'output': result}).encode())\n \n elif self.path == '/api/read':\n # Security vuln 2: path traversal\n filename = body.get('file', 'readme.txt')\n filepath = os.path.join('/tmp/mobius-task-security/data', filename)\n with open(filepath) as f:\n content = f.read()\n self.send_response(200)\n self.end_headers()\n self.wfile.write(json.dumps({'content': content}).encode())\n\nif __name__ == '__main__':\n print('Starting server on :8199')\n HTTPServer(('', 8199), APIHandler).serve_forever()\nPYEOF\nmkdir -p /tmp/mobius-task-security/data && echo 'safe content' > /tmp/mobius-task-security/data/readme.txt" ++ }, ++ { ++ "task": "A pytest test suite for a calculator module has 8 tests but only 3 pass. Without modifying the test file, fix the calculator implementation so all 8 tests pass. You must understand what each test expects, identify the implementation bugs, fix them, and run the full suite green. 
Document each bug you found in /tmp/mobius-task-tdd/buglog.txt.", ++ "category": "debug-and-fix", ++ "tier": 2, ++ "tools_required": [ ++ "Bash" ++ ], ++ "verification": "Judge checks: (1) all 8 tests pass (run pytest), (2) test file was NOT modified, (3) buglog.txt explains each bug clearly, (4) fixes are minimal - no unnecessary changes to calculator.py", ++ "setup": "mkdir -p /tmp/mobius-task-tdd && cat > /tmp/mobius-task-tdd/calculator.py << 'PYEOF'\nclass Calculator:\n def __init__(self):\n self.history = []\n \n def add(self, a, b):\n result = a + b\n self.history.append(('add', a, b, result))\n return result\n \n def subtract(self, a, b):\n result = a + b # Bug: should be a - b\n self.history.append(('subtract', a, b, result))\n return result\n \n def multiply(self, a, b):\n result = a * b\n self.history.append(('multiply', a, b, result))\n return result\n \n def divide(self, a, b):\n if b == 0:\n raise ValueError(\"Cannot divide by zero\")\n result = a // b # Bug: should be true division a / b\n self.history.append(('divide', a, b, result))\n return result\n \n def power(self, base, exp):\n result = base * exp # Bug: should be base ** exp\n self.history.append(('power', base, exp, result))\n return result\n \n def sqrt(self, n):\n if n < 0:\n raise TypeError(\"Cannot take square root of negative number\") # Bug: should be ValueError\n result = n ** 0.5\n self.history.append(('sqrt', n, None, result))\n return result\n \n def get_history(self):\n return list(self.history) # This is correct\n \n def clear_history(self):\n self.history = None # Bug: should be self.history = []\nPYEOF\ncat > /tmp/mobius-task-tdd/test_calculator.py << 'PYEOF'\nimport pytest\nfrom calculator import Calculator\n\n@pytest.fixture\ndef calc():\n return Calculator()\n\ndef test_add(calc):\n assert calc.add(2, 3) == 5\n assert calc.add(-1, 1) == 0\n assert calc.add(0.1, 0.2) == pytest.approx(0.3)\n\ndef test_subtract(calc):\n assert calc.subtract(5, 3) == 2\n assert calc.subtract(0, 
5) == -5\n\ndef test_multiply(calc):\n assert calc.multiply(3, 4) == 12\n assert calc.multiply(-2, 3) == -6\n\ndef test_divide(calc):\n assert calc.divide(10, 3) == pytest.approx(3.333333, rel=1e-4)\n assert calc.divide(1, 4) == 0.25\n with pytest.raises(ValueError):\n calc.divide(1, 0)\n\ndef test_power(calc):\n assert calc.power(2, 3) == 8\n assert calc.power(5, 0) == 1\n assert calc.power(3, 2) == 9\n\ndef test_sqrt(calc):\n assert calc.sqrt(9) == 3.0\n assert calc.sqrt(2) == pytest.approx(1.41421, rel=1e-4)\n with pytest.raises(ValueError):\n calc.sqrt(-1)\n\ndef test_history(calc):\n calc.add(1, 2)\n calc.multiply(3, 4)\n history = calc.get_history()\n assert len(history) == 2\n assert history[0] == ('add', 1, 2, 3)\n assert history[1] == ('multiply', 3, 4, 12)\n\ndef test_clear_history(calc):\n calc.add(1, 2)\n calc.clear_history()\n assert calc.get_history() == []\n calc.add(3, 4)\n assert len(calc.get_history()) == 1\nPYEOF" ++ }, ++ { ++ "task": "Reverse-engineer the Mobius agent selection algorithm by reading the source code at .. Then write a simulation at /tmp/mobius-task-selection/simulate.py that models 100 competitions with 10 agents starting at Elo 1000. Track how Elo ratings drift, identify whether the selection algorithm has any bias (does it favor certain agents unfairly?), and determine after how many rounds the rankings stabilize. Output a report with the Elo trajectory data and your analysis to report.txt. 
The simulation should use the same Elo update formula as the real code.", ++ "category": "explore-and-analyze", ++ "tier": 2, ++ "tools_required": [ ++ "Bash" ++ ], ++ "verification": "Judge checks: (1) simulation uses the actual Elo formula from Mobius source (not a generic one), (2) simulate.py runs and produces trajectory data, (3) report.txt contains analysis of bias and stabilization with supporting data, (4) at least one non-obvious insight about the selection algorithm", ++ "setup": "mkdir -p /tmp/mobius-task-selection" ++ }, ++ { ++ "task": "Build a robust configuration parser and then attack it with your own adversarial tests. Step 1: Write /tmp/mobius-task-adversarial/parser.py that parses a custom configuration format (key=value pairs, sections in [brackets], # comments, multi-line values with backslash continuation, include directives to pull in other files). Step 2: Write /tmp/mobius-task-adversarial/test_parser.py with at least 15 adversarial test cases designed to break the parser - focus on edge cases (empty files, deeply nested includes, circular includes, unicode keys, values with = signs, comments inside values, etc.). Step 3: Run the tests, fix any failures in the parser, then write even harder tests. Iterate until the parser survives all adversarial inputs and all tests pass.", ++ "category": "adversarial", ++ "tier": 3, ++ "tools_required": [ ++ "Bash" ++ ], ++ "note": "Designed for future multi-agent mode (Agent A implements parser, Agent B writes adversarial tests). 
Currently runs as single-agent doing both roles.", ++ "verification": "Judge checks: (1) parser.py handles standard configs correctly, (2) test suite has 15+ tests covering genuine edge cases, (3) all tests pass, (4) test cases are actually adversarial (not trivial), (5) evidence of iteration (fix cycles visible in code or comments)", ++ "setup": "mkdir -p /tmp/mobius-task-adversarial/configs && echo -e '[main]\\nkey1=value1\\n# comment\\nkey2 = multi \\\\\\n line value\\n\\n[section2]\\nkey3=value3' > /tmp/mobius-task-adversarial/configs/sample.conf" ++ }, ++ { ++ "task": "Design, implement, and harden a plugin system at /tmp/mobius-task-plugins/. Step 1: Build the core framework (plugin_loader.py, event_bus.py) that loads Python plugins from a plugins/ directory, where each plugin registers handlers for specific event types. Write 2 example plugins that work correctly. Step 2: Write integration tests in tests/. Step 3: Write at least 3 malicious plugins that attempt to break the system (modify global state, crash the event bus, infinite loops, raise exceptions in handlers). Step 4: Harden the framework so all malicious plugin attempts are caught and handled gracefully without crashing. All integration tests must pass.", ++ "category": "adversarial", ++ "tier": 3, ++ "tools_required": [ ++ "Bash" ++ ], ++ "note": "Designed for future multi-agent mode (Agent A builds framework, Agent B writes malicious plugins). 
Currently runs as single-agent doing both roles.", ++ "verification": "Judge checks: (1) plugin system loads and runs plugins correctly, (2) example plugins work, (3) at least 3 malicious plugin scenarios tested, (4) framework handles malicious plugins gracefully (no crashes), (5) integration tests pass", ++ "setup": "mkdir -p /tmp/mobius-task-plugins/plugins /tmp/mobius-task-plugins/tests" ++ }, ++ { ++ "task": "A SQLite-backed task queue system is provided at /tmp/mobius-task-queue/ with queue.py (enqueue/dequeue/complete/fail with priority, retry up to 3 times, dead-letter queue). The implementation has reliability issues under concurrent access. Your job: (1) Read and understand the queue implementation. (2) Write chaos.py that stress-tests it - concurrent producers/consumers using threading, simulate worker crashes mid-task (kill threads), rapid enqueue/dequeue cycles, and verify invariants (no task loss or duplication, failed tasks retry correctly, dead-letter catches permanent failures). (3) If chaos tests expose bugs in queue.py, fix them. (4) Write a final report to results.txt summarizing what broke, what you fixed, and proof that invariants now hold.", ++ "category": "build-and-test", ++ "tier": 3, ++ "tools_required": [ ++ "Bash" ++ ], ++ "note": "Designed for future multi-agent mode (chaos engineer + implementer). 
Currently runs as single-agent doing both roles.", ++ "verification": "Judge checks: (1) chaos.py runs with concurrent threads, (2) at least 3 distinct chaos scenarios tested, (3) invariant checks are real (count tasks, check for duplicates), (4) any bugs found are fixed in queue.py, (5) results.txt documents findings with evidence", ++ "setup": "mkdir -p /tmp/mobius-task-queue && cat > /tmp/mobius-task-queue/queue.py << 'PYEOF'\nimport sqlite3\nimport time\nimport threading\n\nclass TaskQueue:\n def __init__(self, db_path='queue.db'):\n self.db_path = db_path\n self.lock = threading.Lock()\n conn = sqlite3.connect(db_path)\n conn.execute('''CREATE TABLE IF NOT EXISTS tasks (\n id INTEGER PRIMARY KEY AUTOINCREMENT,\n payload TEXT NOT NULL,\n priority INTEGER DEFAULT 0,\n status TEXT DEFAULT 'pending',\n retries INTEGER DEFAULT 0,\n max_retries INTEGER DEFAULT 3,\n created_at REAL DEFAULT (julianday('now')),\n updated_at REAL DEFAULT (julianday('now'))\n )''')\n conn.execute('''CREATE TABLE IF NOT EXISTS dead_letter (\n id INTEGER PRIMARY KEY AUTOINCREMENT,\n original_id INTEGER,\n payload TEXT,\n reason TEXT,\n created_at REAL DEFAULT (julianday('now'))\n )''')\n conn.commit()\n conn.close()\n\n def _conn(self):\n return sqlite3.connect(self.db_path)\n\n def enqueue(self, payload, priority=0):\n conn = self._conn()\n conn.execute('INSERT INTO tasks (payload, priority) VALUES (?, ?)', (payload, priority))\n conn.commit()\n conn.close()\n\n def dequeue(self):\n # BUG: no lock around read-then-update, race condition possible\n conn = self._conn()\n row = conn.execute(\n \"SELECT id, payload FROM tasks WHERE status='pending' ORDER BY priority DESC, created_at ASC LIMIT 1\"\n ).fetchone()\n if row:\n conn.execute(\"UPDATE tasks SET status='processing', updated_at=julianday('now') WHERE id=?\", (row[0],))\n conn.commit()\n conn.close()\n return row\n\n def complete(self, task_id):\n conn = self._conn()\n conn.execute(\"UPDATE tasks SET status='completed', 
updated_at=julianday('now') WHERE id=?\", (task_id,))\n conn.commit()\n conn.close()\n\n def fail(self, task_id, reason='unknown'):\n conn = self._conn()\n row = conn.execute('SELECT retries, max_retries, payload FROM tasks WHERE id=?', (task_id,)).fetchone()\n if row:\n retries, max_retries, payload = row\n if retries + 1 >= max_retries:\n conn.execute('INSERT INTO dead_letter (original_id, payload, reason) VALUES (?, ?, ?)',\n (task_id, payload, reason))\n conn.execute(\"UPDATE tasks SET status='dead', updated_at=julianday('now') WHERE id=?\", (task_id,))\n else:\n conn.execute(\"UPDATE tasks SET status='pending', retries=retries+1, updated_at=julianday('now') WHERE id=?\", (task_id,))\n conn.commit()\n conn.close()\n\n def stats(self):\n conn = self._conn()\n rows = conn.execute('SELECT status, COUNT(*) FROM tasks GROUP BY status').fetchall()\n conn.close()\n return dict(rows)\nPYEOF" ++ }, ++ { ++ "task": "Audit and validate error handling in the Mobius codebase at .. Step 1: Find and document every place where error handling is missing, inconsistent, or could silently fail (bare except clauses, unchecked return values, missing null checks on database results). Write findings to /tmp/mobius-task-errors/error_audit.json as structured data with file:line references. Step 2: For each finding, write a minimal reproduction script that triggers the error condition. Step 3: Run each reproduction to verify it actually fails - classify each as confirmed bug or false positive. Write results to /tmp/mobius-task-errors/validation.json. Step 4: For confirmed bugs, propose a fix (describe the change, don't modify the Mobius source).", ++ "category": "explore-and-analyze", ++ "tier": 3, ++ "tools_required": [ ++ "Bash" ++ ], ++ "note": "Designed for future multi-agent mode (Agent A audits, Agent B validates and reproduces). 
Currently runs as single-agent doing both roles.", ++ "verification": "Judge checks: (1) error_audit.json has 5+ findings with file:line references, (2) validation.json marks each as confirmed/false-positive with evidence, (3) reproduction scripts actually demonstrate the issues, (4) proposed fixes are reasonable and don't break existing functionality", ++ "setup": "mkdir -p /tmp/mobius-task-errors" ++ }, ++ { ++ "task": "Build a data pipeline with built-in quality validation at /tmp/mobius-task-pipeline/. Step 1: Write pipeline.py that reads raw JSON records from input/, validates them against a schema, transforms them (normalize dates to ISO format, clean/lowercase department strings, reject invalid scores, deduplicate by id), and writes clean records to output/clean.json. Step 2: Write qa_check.py that independently verifies the output - check for missing required fields, type mismatches, remaining duplicates, values outside expected ranges, and that all transformations were applied correctly. Step 3: Run the pipeline, then run QA checks. If QA finds issues, fix the pipeline and re-run until qa_check.py reports zero issues.", ++ "category": "data", ++ "tier": 3, ++ "tools_required": [ ++ "Bash" ++ ], ++ "note": "Designed for future multi-agent mode (Agent A builds ETL, Agent B does data QA). 
Currently runs as single-agent doing both roles.", ++ "verification": "Judge checks: (1) pipeline processes all input records, (2) output records are valid and transformed correctly, (3) QA checker identifies real quality issues, (4) final output passes all quality checks, (5) at least 3 transformation rules are applied correctly", ++ "setup": "mkdir -p /tmp/mobius-task-pipeline/input /tmp/mobius-task-pipeline/output && python3 -c \"\nimport json, random, string\nrecords = []\nfor i in range(50):\n r = {\n 'id': i,\n 'name': ''.join(random.choices(string.ascii_letters, k=8)),\n 'email': f'user{i}@example.com' if random.random() > 0.1 else '',\n 'signup_date': f'2024-{random.randint(1,12):02d}-{random.randint(1,28):02d}' if random.random() > 0.05 else 'invalid',\n 'score': random.randint(0, 100) if random.random() > 0.1 else -999,\n 'department': random.choice(['eng', 'sales', 'marketing', 'ENGINEERING', 'Sales', ''])\n }\n records.append(r)\n# Add some duplicates\nrecords.append(records[0].copy())\nrecords.append(records[5].copy())\nwith open('/tmp/mobius-task-pipeline/input/records.json', 'w') as f:\n json.dump(records, f, indent=2)\n\"" ++ } ++] From 0dec2707630afb980450d1c7f66e2c4cbca282e1 Mon Sep 17 00:00:00 2001 From: Aaron Goldsmith Date: Sat, 21 Mar 2026 10:10:34 -0700 Subject: [PATCH 2/2] Fix broken files (strip git metadata), validate JSON/Python, fix champion re-election Co-Authored-By: Claude Opus 4.6 (1M context) --- scripts/cleanup_agents.py | 285 +++++++++++---------- scripts/competition_tasks.json | 227 ++++++++--------- scripts/competition_tasks_agentic.json | 337 ++++++++++++------------- 3 files changed, 406 insertions(+), 443 deletions(-) diff --git a/scripts/cleanup_agents.py b/scripts/cleanup_agents.py index a08626c..ba76e36 100644 --- a/scripts/cleanup_agents.py +++ b/scripts/cleanup_agents.py @@ -1,138 +1,147 @@ -commit fe66e75cc7f79b4ed77b2c8490f4e19862924bad -Author: Aaron Goldsmith -Date: Sat Mar 21 09:13:05 2026 -0700 - - Add agentic 
competition tasks, agent definitions, and skills - - - Agent definitions: competition-tasks, depth-test, tree-solver - - Skills: mobius-evolve (free Opus evolution), tree-solve (recursive decomposition) - - Competition tasks: standard + agentic (tool-heavy, multi-tier) - - Cleanup script for dead-weight agents - - Fix hardcoded paths in agentic tasks to use relative paths - - Make system monitoring task cross-platform (Unix tools) - - Remove unused import in cleanup_agents.py - - Add .tree-workspace/ to .gitignore - - Co-Authored-By: Claude Opus 4.6 - -diff --git a/scripts/cleanup_agents.py b/scripts/cleanup_agents.py -new file mode 100644 -index 0000000..4edaa4f ---- /dev/null -+++ b/scripts/cleanup_agents.py -@@ -0,0 +1,115 @@ -+"""Clean dead weight agents and fix champion flags in the Mobius registry.""" -+ -+import argparse -+import sqlite3 -+import sys -+from pathlib import Path -+ -+DB_PATH = Path("data/mobius.db") -+ -+ -+def get_connection(db_path: Path) -> sqlite3.Connection: -+ conn = sqlite3.connect(str(db_path)) -+ conn.row_factory = sqlite3.Row -+ return conn -+ -+ -+def list_zero_match_agents(conn: sqlite3.Connection) -> list[dict]: -+ """Find agents with 0 total matches.""" -+ rows = conn.execute( -+ "SELECT id, name, slug, elo_rating, is_champion, created_at " -+ "FROM agents WHERE total_matches = 0 ORDER BY created_at" -+ ).fetchall() -+ return [dict(r) for r in rows] -+ -+ -+def list_champions(conn: sqlite3.Connection) -> list[dict]: -+ """Find agents marked as champions with their stats.""" -+ rows = conn.execute( -+ "SELECT id, name, slug, is_champion, elo_rating, win_rate, total_matches " -+ "FROM agents WHERE is_champion = 1 ORDER BY elo_rating DESC" -+ ).fetchall() -+ return [dict(r) for r in rows] -+ -+ -+def list_all_agents_summary(conn: sqlite3.Connection) -> list[dict]: -+ """Quick summary of all agents.""" -+ rows = conn.execute( -+ "SELECT id, name, slug, elo_rating, win_rate, total_matches, is_champion " -+ "FROM agents ORDER BY 
elo_rating DESC" -+ ).fetchall() -+ return [dict(r) for r in rows] -+ -+ -+def retire_zero_match_agents(conn: sqlite3.Connection) -> int: -+ """Set elo_rating=0 and is_champion=0 for agents with 0 matches.""" -+ cursor = conn.execute( -+ "UPDATE agents SET elo_rating = 0.0, is_champion = 0 " -+ "WHERE total_matches = 0" -+ ) -+ conn.commit() -+ return cursor.rowcount -+ -+ -+def clear_all_champion_flags(conn: sqlite3.Connection) -> int: -+ """Clear is_champion on all agents.""" -+ cursor = conn.execute("UPDATE agents SET is_champion = 0 WHERE is_champion = 1") -+ conn.commit() -+ return cursor.rowcount -+ -+ -+def main(): -+ parser = argparse.ArgumentParser(description="Clean dead weight agents and fix champion flags") -+ parser.add_argument("--execute", action="store_true", help="Actually apply changes (default is dry-run)") -+ parser.add_argument("--db", type=Path, default=DB_PATH, help="Path to mobius.db") -+ args = parser.parse_args() -+ -+ if not args.db.exists(): -+ print(f"ERROR: Database not found at {args.db}") -+ sys.exit(1) -+ -+ conn = get_connection(args.db) -+ mode = "EXECUTE" if args.execute else "DRY-RUN" -+ print(f"=== Mobius Agent Cleanup [{mode}] ===\n") -+ -+ # Summary -+ all_agents = list_all_agents_summary(conn) -+ print(f"Total agents in registry: {len(all_agents)}\n") -+ -+ # Zero-match agents -+ zero_match = list_zero_match_agents(conn) -+ print(f"--- Agents with 0 matches ({len(zero_match)}) ---") -+ if zero_match: -+ for a in zero_match: -+ champ_flag = " [CHAMPION]" if a["is_champion"] else "" -+ print(f" {a['slug']:30s} elo={a['elo_rating']:7.1f}{champ_flag} created={a['created_at']}") -+ else: -+ print(" (none)") -+ print() -+ -+ # Champions -+ champions = list_champions(conn) -+ print(f"--- Current champions ({len(champions)}) ---") -+ if champions: -+ for a in champions: -+ print(f" {a['slug']:30s} elo={a['elo_rating']:7.1f} win_rate={a['win_rate']:.2%} matches={a['total_matches']}") -+ else: -+ print(" (none)") -+ print() -+ -+ if 
not args.execute: -+ print("[DRY-RUN] No changes made. Re-run with --execute to apply.") -+ print(f" Would retire {len(zero_match)} zero-match agents (set elo=0, is_champion=0)") -+ print(f" Would clear is_champion flag on {len(champions)} agents") -+ else: -+ retired = retire_zero_match_agents(conn) -+ cleared = clear_all_champion_flags(conn) -+ print(f"[EXECUTE] Retired {retired} zero-match agents (elo set to 0)") -+ print(f"[EXECUTE] Cleared champion flag on {cleared} agents") -+ print("Done.") -+ -+ conn.close() -+ -+ -+if __name__ == "__main__": -+ main() +"""Clean dead weight agents and fix champion flags in the Mobius registry.""" + +import argparse +import sqlite3 +import sys +from pathlib import Path + +DB_PATH = Path("data/mobius.db") + + +def get_connection(db_path: Path) -> sqlite3.Connection: + conn = sqlite3.connect(str(db_path)) + conn.row_factory = sqlite3.Row + return conn + + +def list_zero_match_agents(conn: sqlite3.Connection) -> list[dict]: + """Find agents with 0 total matches.""" + rows = conn.execute( + "SELECT id, name, slug, elo_rating, is_champion, created_at " + "FROM agents WHERE total_matches = 0 ORDER BY created_at" + ).fetchall() + return [dict(r) for r in rows] + + +def list_champions(conn: sqlite3.Connection) -> list[dict]: + """Find agents marked as champions with their stats.""" + rows = conn.execute( + "SELECT id, name, slug, is_champion, elo_rating, win_rate, total_matches " + "FROM agents WHERE is_champion = 1 ORDER BY elo_rating DESC" + ).fetchall() + return [dict(r) for r in rows] + + +def list_all_agents_summary(conn: sqlite3.Connection) -> list[dict]: + """Quick summary of all agents.""" + rows = conn.execute( + "SELECT id, name, slug, elo_rating, win_rate, total_matches, is_champion " + "FROM agents ORDER BY elo_rating DESC" + ).fetchall() + return [dict(r) for r in rows] + + +def retire_zero_match_agents(conn: sqlite3.Connection) -> int: + """Set elo_rating=0 and is_champion=0 for agents with 0 matches.""" + cursor = 
conn.execute( + "UPDATE agents SET elo_rating = 0.0, is_champion = 0 " + "WHERE total_matches = 0" + ) + conn.commit() + return cursor.rowcount + + +def clear_all_champion_flags(conn: sqlite3.Connection) -> int: + """Clear is_champion on all agents.""" + cursor = conn.execute("UPDATE agents SET is_champion = 0 WHERE is_champion = 1") + conn.commit() + return cursor.rowcount + + +def elect_champions(conn: sqlite3.Connection) -> int: + """Re-elect champions: highest Elo per specialization among agents with matches.""" + # Find the highest-Elo agent in each specialization (only among those with matches) + rows = conn.execute( + "SELECT id, specialization, elo_rating FROM agents " + "WHERE total_matches > 0 AND elo_rating > 0 " + "ORDER BY specialization, elo_rating DESC" + ).fetchall() + + # Group by specialization, pick top agent per group + best_per_spec: dict[str, int] = {} + for row in rows: + spec = row["specialization"] + if spec not in best_per_spec: + best_per_spec[spec] = row["id"] + + if not best_per_spec: + return 0 + + # Set is_champion=1 for the winners + ids = list(best_per_spec.values()) + placeholders = ",".join("?" 
for _ in ids) + cursor = conn.execute( + f"UPDATE agents SET is_champion = 1 WHERE id IN ({placeholders})", ids + ) + conn.commit() + return cursor.rowcount + + +def main(): + parser = argparse.ArgumentParser(description="Clean dead weight agents and fix champion flags") + parser.add_argument("--execute", action="store_true", help="Actually apply changes (default is dry-run)") + parser.add_argument("--db", type=Path, default=DB_PATH, help="Path to mobius.db") + args = parser.parse_args() + + if not args.db.exists(): + print(f"ERROR: Database not found at {args.db}") + sys.exit(1) + + conn = get_connection(args.db) + mode = "EXECUTE" if args.execute else "DRY-RUN" + print(f"=== Mobius Agent Cleanup [{mode}] ===\n") + + # Summary + all_agents = list_all_agents_summary(conn) + print(f"Total agents in registry: {len(all_agents)}\n") + + # Zero-match agents + zero_match = list_zero_match_agents(conn) + print(f"--- Agents with 0 matches ({len(zero_match)}) ---") + if zero_match: + for a in zero_match: + champ_flag = " [CHAMPION]" if a["is_champion"] else "" + print(f" {a['slug']:30s} elo={a['elo_rating']:7.1f}{champ_flag} created={a['created_at']}") + else: + print(" (none)") + print() + + # Champions + champions = list_champions(conn) + print(f"--- Current champions ({len(champions)}) ---") + if champions: + for a in champions: + print(f" {a['slug']:30s} elo={a['elo_rating']:7.1f} win_rate={a['win_rate']:.2%} matches={a['total_matches']}") + else: + print(" (none)") + print() + + if not args.execute: + print("[DRY-RUN] No changes made. 
Re-run with --execute to apply.") + print(f" Would retire {len(zero_match)} zero-match agents (set elo=0, is_champion=0)") + print(f" Would clear is_champion flag on {len(champions)} agents") + print(" Would re-elect champions (highest Elo per specialization)") + else: + retired = retire_zero_match_agents(conn) + cleared = clear_all_champion_flags(conn) + elected = elect_champions(conn) + print(f"[EXECUTE] Retired {retired} zero-match agents (elo set to 0)") + print(f"[EXECUTE] Cleared champion flag on {cleared} agents") + print(f"[EXECUTE] Re-elected {elected} champions (highest Elo per specialization)") + print("Done.") + + conn.close() + + +if __name__ == "__main__": + main() diff --git a/scripts/competition_tasks.json b/scripts/competition_tasks.json index 575839c..5c8eca9 100644 --- a/scripts/competition_tasks.json +++ b/scripts/competition_tasks.json @@ -1,125 +1,102 @@ -commit fe66e75cc7f79b4ed77b2c8490f4e19862924bad -Author: Aaron Goldsmith -Date: Sat Mar 21 09:13:05 2026 -0700 - - Add agentic competition tasks, agent definitions, and skills - - - Agent definitions: competition-tasks, depth-test, tree-solver - - Skills: mobius-evolve (free Opus evolution), tree-solve (recursive decomposition) - - Competition tasks: standard + agentic (tool-heavy, multi-tier) - - Cleanup script for dead-weight agents - - Fix hardcoded paths in agentic tasks to use relative paths - - Make system monitoring task cross-platform (Unix tools) - - Remove unused import in cleanup_agents.py - - Add .tree-workspace/ to .gitignore - - Co-Authored-By: Claude Opus 4.6 - -diff --git a/scripts/competition_tasks.json b/scripts/competition_tasks.json -new file mode 100644 -index 0000000..5c8eca9 ---- /dev/null -+++ b/scripts/competition_tasks.json -@@ -0,0 +1,102 @@ -+[ -+ { -+ "task": "Implement a rate limiter using the token bucket algorithm in Python with thread safety. 
Include a clean API: `RateLimiter(rate, capacity)` with an `acquire()` method that blocks until a token is available and a `try_acquire()` that returns immediately.", -+ "category": "code-generation-python" -+ }, -+ { -+ "task": "Write a Python function that solves the 0/1 knapsack problem using dynamic programming. It should accept a list of items (each with weight and value) and a capacity, and return both the maximum value and the selected items. Include proper type hints.", -+ "category": "algorithms" -+ }, -+ { -+ "task": "Implement a JavaScript EventEmitter class from scratch (no dependencies). Support `on`, `off`, `once`, and `emit` methods. Handle edge cases: removing a listener during emit, adding a listener during emit, and error events with no handler.", -+ "category": "code-generation-js" -+ }, -+ { -+ "task": "Review the following Python code for bugs and security issues:\n\n```python\nimport sqlite3, os\n\ndef get_user(db_path, username):\n conn = sqlite3.connect(db_path)\n cursor = conn.execute(f\"SELECT * FROM users WHERE name = '{username}'\")\n user = cursor.fetchone()\n return user\n\ndef save_upload(data, filename):\n path = os.path.join('/uploads', filename)\n with open(path, 'wb') as f:\n f.write(data)\n return path\n\ndef hash_password(password):\n import hashlib\n return hashlib.md5(password.encode()).hexdigest()\n```\n\nIdentify every bug and vulnerability, explain the impact, and provide corrected code.", -+ "category": "code-review" -+ }, -+ { -+ "task": "Design a URL shortener service that handles 100M URLs and 1B redirects per month. Cover: API design, storage schema, hash/ID generation strategy, redirect flow, caching layer, and how you'd handle analytics. Provide concrete technology choices with justification.", -+ "category": "system-design" -+ }, -+ { -+ "task": "Design the architecture for a real-time collaborative document editor (like Google Docs). 
Address: conflict resolution strategy (OT vs CRDT), WebSocket connection management, persistence layer, presence/cursor tracking, and offline support. Include a component diagram.", -+ "category": "system-design" -+ }, -+ { -+ "task": "Perform a security audit of this Express.js middleware stack:\n\n```javascript\nconst express = require('express');\nconst app = express();\napp.use(express.json());\napp.use((req, res, next) => {\n res.header('Access-Control-Allow-Origin', '*');\n res.header('Access-Control-Allow-Headers', '*');\n next();\n});\napp.post('/api/exec', (req, res) => {\n const { cmd } = req.body;\n const result = require('child_process').execSync(cmd).toString();\n res.json({ output: result });\n});\napp.get('/api/file', (req, res) => {\n const filePath = req.query.path;\n res.sendFile(filePath);\n});\napp.listen(3000);\n```\n\nList all vulnerabilities with OWASP classification, severity rating, and remediated code.", -+ "category": "security" -+ }, -+ { -+ "task": "You are given a Python web app that stores user sessions in a Redis-backed cookie store. Users report being randomly logged out. The session TTL is 30 minutes, but some users lose sessions after 5 minutes. Write a systematic debugging plan: what logs to check, what metrics to gather, specific Redis commands to run, and potential root causes ranked by likelihood.", -+ "category": "debugging" -+ }, -+ { -+ "task": "Write a comprehensive test suite for a Python `BankAccount` class that supports `deposit(amount)`, `withdraw(amount)`, `transfer(other_account, amount)`, and `get_balance()`. Cover: happy paths, overdraft protection, concurrent transfers, negative amounts, floating point precision, and transaction atomicity. Use pytest.", -+ "category": "testing" -+ }, -+ { -+ "task": "Write a testing strategy document for a microservices-based e-commerce platform with services for: user auth, product catalog, shopping cart, payment processing, and order fulfillment. 
Define the test pyramid, specify what to unit test vs integration test vs e2e test, and recommend specific tools and patterns for contract testing between services.", -+ "category": "testing-strategy" -+ }, -+ { -+ "task": "Write a Dockerfile and docker-compose.yml for a Python FastAPI app with PostgreSQL, Redis, and Celery workers. Requirements: multi-stage build, non-root user, health checks, proper signal handling, volume mounts for development, and production-ready defaults. Include a .dockerignore.", -+ "category": "devops" -+ }, -+ { -+ "task": "Design a CI/CD pipeline for a monorepo containing three microservices (Python, Node.js, Go) and a shared protobuf definitions package. The pipeline should: only build changed services, run tests in parallel, build and push Docker images, deploy to staging automatically, and require manual approval for production. Use GitHub Actions.", -+ "category": "devops" -+ }, -+ { -+ "task": "Write SQL queries for an e-commerce analytics dashboard against this schema:\n- `orders(id, user_id, total, status, created_at)`\n- `order_items(id, order_id, product_id, quantity, price)`\n- `products(id, name, category, price)`\n- `users(id, email, created_at, country)`\n\nQueries needed:\n1. Monthly revenue with month-over-month growth percentage\n2. Top 10 products by revenue with their return rate (status='returned')\n3. Customer cohort retention: for each signup month, what % ordered in months 1-6\n4. Rolling 7-day average order value\n5. 
Products frequently bought together (association rules)", -+ "category": "data-sql" -+ }, -+ { -+ "task": "Given a CSV dataset of 500K customer support tickets with columns (ticket_id, created_at, resolved_at, category, priority, agent_id, satisfaction_score, first_response_minutes), write a Python analysis script using pandas that: calculates resolution time distributions per category, identifies bottleneck categories, finds the best and worst performing agents by resolution time and satisfaction, and generates a summary report with actionable recommendations.", -+ "category": "data-analysis" -+ }, -+ { -+ "task": "Write a clear, concise README for a CLI tool called `logslice` that filters and aggregates log files. Features: regex pattern matching, time range filtering, JSON log parsing, output formats (table, CSV, JSON), and pipe-friendly. The README should include: badges, install instructions, quick start, usage examples, configuration, and contributing guide. Make it scannable and developer-friendly.", -+ "category": "documentation" -+ }, -+ { -+ "task": "Refactor this Python code to improve readability, reduce complexity, and follow SOLID principles:\n\n```python\ndef process_order(order):\n if order['type'] == 'physical':\n if order['weight'] > 50:\n shipping = order['weight'] * 0.5 + 10\n else:\n shipping = order['weight'] * 0.3 + 5\n if order['country'] != 'US':\n shipping *= 2.5\n tax = order['total'] * 0.08\n if order['state'] in ['OR', 'MT', 'NH', 'DE']:\n tax = 0\n total = order['total'] + shipping + tax\n if order.get('coupon'):\n if order['coupon']['type'] == 'percent':\n total -= total * (order['coupon']['value'] / 100)\n else:\n total -= order['coupon']['value']\n send_email(order['email'], f'Order total: ${total:.2f}')\n update_inventory(order['items'])\n return {'total': total, 'shipping': shipping, 'tax': tax}\n elif order['type'] == 'digital':\n tax = order['total'] * 0.08\n if order['state'] in ['OR', 'MT', 'NH', 'DE']:\n tax = 0\n total = 
order['total'] + tax\n if order.get('coupon'):\n if order['coupon']['type'] == 'percent':\n total -= total * (order['coupon']['value'] / 100)\n else:\n total -= order['coupon']['value']\n send_email(order['email'], f'Download link: {generate_link(order)}')\n return {'total': total, 'tax': tax}\n```\n\nProvide the refactored code with explanation of each improvement.", -+ "category": "refactoring" -+ }, -+ { -+ "task": "Profile and optimize this Python function that processes a large dataset:\n\n```python\ndef find_duplicates(records):\n duplicates = []\n for i in range(len(records)):\n for j in range(i + 1, len(records)):\n if records[i]['email'].lower().strip() == records[j]['email'].lower().strip():\n if records[i] not in duplicates:\n duplicates.append(records[i])\n if records[j] not in duplicates:\n duplicates.append(records[j])\n return duplicates\n```\n\nThis runs on 100K records and takes over 10 minutes. Identify all performance issues, provide an optimized version with complexity analysis, and explain each optimization.", -+ "category": "performance" -+ }, -+ { -+ "task": "Implement a Python async web scraper that crawls a paginated API endpoint, respects rate limits (max 10 requests/second), handles retries with exponential backoff, and writes results to a SQLite database. Use aiohttp and include proper error handling, graceful shutdown on SIGINT, and progress reporting.", -+ "category": "code-generation-python" -+ }, -+ { -+ "task": "Implement a trie (prefix tree) data structure in Python that supports: insert, search, delete, autocomplete (return all words with a given prefix), and wildcard search (where '.' matches any single character). Include full type hints and a clean, well-documented API.", -+ "category": "algorithms" -+ }, -+ { -+ "task": "You have a PostgreSQL database with a `transactions` table (50M rows) and queries are taking 30+ seconds. The table has columns: id, account_id, amount, type, status, created_at, metadata (JSONB). 
Common query patterns: filter by account_id + date range, aggregate by type + status, and search within metadata. Propose an indexing strategy, query optimizations, and partitioning scheme. Show the exact SQL for index creation and explain the tradeoffs.", -+ "category": "performance-db" -+ }, -+ { -+ "task": "Write a Python decorator `@retry` that supports: max retries, exponential backoff with jitter, retrying only on specified exception types, a configurable timeout, and an `on_retry` callback. It should work with both sync and async functions. Include comprehensive docstring and usage examples.", -+ "category": "code-generation-python" -+ }, -+ { -+ "task": "Analyze this Go concurrent code for race conditions, deadlocks, and goroutine leaks:\n\n```go\nfunc ProcessBatch(items []Item) []Result {\n results := make([]Result, len(items))\n var wg sync.WaitGroup\n for i, item := range items {\n wg.Add(1)\n go func() {\n defer wg.Done()\n result, err := process(item)\n if err != nil {\n log.Fatal(err)\n }\n results[i] = result\n }()\n }\n wg.Wait()\n return results\n}\n```\n\nIdentify every concurrency issue, explain why each is dangerous, and provide a corrected version.", -+ "category": "code-review" -+ }, -+ { -+ "task": "Design a notification system that supports email, SMS, push notifications, and in-app messages. Requirements: user preferences for channel and frequency, batching/digest mode, template system, delivery tracking, retry on failure, and rate limiting per user. Provide the data model, service architecture, and key API endpoints.", -+ "category": "system-design" -+ }, -+ { -+ "task": "Write a GitHub Actions workflow that runs on every PR and: lints Python code (ruff), runs type checking (mypy), executes pytest with coverage, fails if coverage drops below 80%, posts a coverage summary comment on the PR, and caches dependencies between runs. 
The workflow should be efficient and only run relevant checks based on changed files.", -+ "category": "devops" -+ }, -+ { -+ "task": "Implement a simple in-memory key-value store in Python that supports: GET, SET with optional TTL, DELETE, KEYS with glob pattern matching, and atomic INCR/DECR operations. It should be thread-safe, clean up expired keys lazily and via a background thread, and include a simple TCP server interface (like Redis RESP protocol).", -+ "category": "code-generation-python" -+ } -+] +[ + { + "task": "Implement a rate limiter using the token bucket algorithm in Python with thread safety. Include a clean API: `RateLimiter(rate, capacity)` with an `acquire()` method that blocks until a token is available and a `try_acquire()` that returns immediately.", + "category": "code-generation-python" + }, + { + "task": "Write a Python function that solves the 0/1 knapsack problem using dynamic programming. It should accept a list of items (each with weight and value) and a capacity, and return both the maximum value and the selected items. Include proper type hints.", + "category": "algorithms" + }, + { + "task": "Implement a JavaScript EventEmitter class from scratch (no dependencies). Support `on`, `off`, `once`, and `emit` methods. 
Handle edge cases: removing a listener during emit, adding a listener during emit, and error events with no handler.", + "category": "code-generation-js" + }, + { + "task": "Review the following Python code for bugs and security issues:\n\n```python\nimport sqlite3, os\n\ndef get_user(db_path, username):\n conn = sqlite3.connect(db_path)\n cursor = conn.execute(f\"SELECT * FROM users WHERE name = '{username}'\")\n user = cursor.fetchone()\n return user\n\ndef save_upload(data, filename):\n path = os.path.join('/uploads', filename)\n with open(path, 'wb') as f:\n f.write(data)\n return path\n\ndef hash_password(password):\n import hashlib\n return hashlib.md5(password.encode()).hexdigest()\n```\n\nIdentify every bug and vulnerability, explain the impact, and provide corrected code.", + "category": "code-review" + }, + { + "task": "Design a URL shortener service that handles 100M URLs and 1B redirects per month. Cover: API design, storage schema, hash/ID generation strategy, redirect flow, caching layer, and how you'd handle analytics. Provide concrete technology choices with justification.", + "category": "system-design" + }, + { + "task": "Design the architecture for a real-time collaborative document editor (like Google Docs). Address: conflict resolution strategy (OT vs CRDT), WebSocket connection management, persistence layer, presence/cursor tracking, and offline support. 
Include a component diagram.", + "category": "system-design" + }, + { + "task": "Perform a security audit of this Express.js middleware stack:\n\n```javascript\nconst express = require('express');\nconst app = express();\napp.use(express.json());\napp.use((req, res, next) => {\n res.header('Access-Control-Allow-Origin', '*');\n res.header('Access-Control-Allow-Headers', '*');\n next();\n});\napp.post('/api/exec', (req, res) => {\n const { cmd } = req.body;\n const result = require('child_process').execSync(cmd).toString();\n res.json({ output: result });\n});\napp.get('/api/file', (req, res) => {\n const filePath = req.query.path;\n res.sendFile(filePath);\n});\napp.listen(3000);\n```\n\nList all vulnerabilities with OWASP classification, severity rating, and remediated code.", + "category": "security" + }, + { + "task": "You are given a Python web app that stores user sessions in a Redis-backed cookie store. Users report being randomly logged out. The session TTL is 30 minutes, but some users lose sessions after 5 minutes. Write a systematic debugging plan: what logs to check, what metrics to gather, specific Redis commands to run, and potential root causes ranked by likelihood.", + "category": "debugging" + }, + { + "task": "Write a comprehensive test suite for a Python `BankAccount` class that supports `deposit(amount)`, `withdraw(amount)`, `transfer(other_account, amount)`, and `get_balance()`. Cover: happy paths, overdraft protection, concurrent transfers, negative amounts, floating point precision, and transaction atomicity. Use pytest.", + "category": "testing" + }, + { + "task": "Write a testing strategy document for a microservices-based e-commerce platform with services for: user auth, product catalog, shopping cart, payment processing, and order fulfillment. 
Define the test pyramid, specify what to unit test vs integration test vs e2e test, and recommend specific tools and patterns for contract testing between services.", + "category": "testing-strategy" + }, + { + "task": "Write a Dockerfile and docker-compose.yml for a Python FastAPI app with PostgreSQL, Redis, and Celery workers. Requirements: multi-stage build, non-root user, health checks, proper signal handling, volume mounts for development, and production-ready defaults. Include a .dockerignore.", + "category": "devops" + }, + { + "task": "Design a CI/CD pipeline for a monorepo containing three microservices (Python, Node.js, Go) and a shared protobuf definitions package. The pipeline should: only build changed services, run tests in parallel, build and push Docker images, deploy to staging automatically, and require manual approval for production. Use GitHub Actions.", + "category": "devops" + }, + { + "task": "Write SQL queries for an e-commerce analytics dashboard against this schema:\n- `orders(id, user_id, total, status, created_at)`\n- `order_items(id, order_id, product_id, quantity, price)`\n- `products(id, name, category, price)`\n- `users(id, email, created_at, country)`\n\nQueries needed:\n1. Monthly revenue with month-over-month growth percentage\n2. Top 10 products by revenue with their return rate (status='returned')\n3. Customer cohort retention: for each signup month, what % ordered in months 1-6\n4. Rolling 7-day average order value\n5. 
Products frequently bought together (association rules)", + "category": "data-sql" + }, + { + "task": "Given a CSV dataset of 500K customer support tickets with columns (ticket_id, created_at, resolved_at, category, priority, agent_id, satisfaction_score, first_response_minutes), write a Python analysis script using pandas that: calculates resolution time distributions per category, identifies bottleneck categories, finds the best and worst performing agents by resolution time and satisfaction, and generates a summary report with actionable recommendations.", + "category": "data-analysis" + }, + { + "task": "Write a clear, concise README for a CLI tool called `logslice` that filters and aggregates log files. Features: regex pattern matching, time range filtering, JSON log parsing, output formats (table, CSV, JSON), and pipe-friendly. The README should include: badges, install instructions, quick start, usage examples, configuration, and contributing guide. Make it scannable and developer-friendly.", + "category": "documentation" + }, + { + "task": "Refactor this Python code to improve readability, reduce complexity, and follow SOLID principles:\n\n```python\ndef process_order(order):\n if order['type'] == 'physical':\n if order['weight'] > 50:\n shipping = order['weight'] * 0.5 + 10\n else:\n shipping = order['weight'] * 0.3 + 5\n if order['country'] != 'US':\n shipping *= 2.5\n tax = order['total'] * 0.08\n if order['state'] in ['OR', 'MT', 'NH', 'DE']:\n tax = 0\n total = order['total'] + shipping + tax\n if order.get('coupon'):\n if order['coupon']['type'] == 'percent':\n total -= total * (order['coupon']['value'] / 100)\n else:\n total -= order['coupon']['value']\n send_email(order['email'], f'Order total: ${total:.2f}')\n update_inventory(order['items'])\n return {'total': total, 'shipping': shipping, 'tax': tax}\n elif order['type'] == 'digital':\n tax = order['total'] * 0.08\n if order['state'] in ['OR', 'MT', 'NH', 'DE']:\n tax = 0\n total = order['total'] 
+ tax\n if order.get('coupon'):\n if order['coupon']['type'] == 'percent':\n total -= total * (order['coupon']['value'] / 100)\n else:\n total -= order['coupon']['value']\n send_email(order['email'], f'Download link: {generate_link(order)}')\n return {'total': total, 'tax': tax}\n```\n\nProvide the refactored code with explanation of each improvement.", + "category": "refactoring" + }, + { + "task": "Profile and optimize this Python function that processes a large dataset:\n\n```python\ndef find_duplicates(records):\n duplicates = []\n for i in range(len(records)):\n for j in range(i + 1, len(records)):\n if records[i]['email'].lower().strip() == records[j]['email'].lower().strip():\n if records[i] not in duplicates:\n duplicates.append(records[i])\n if records[j] not in duplicates:\n duplicates.append(records[j])\n return duplicates\n```\n\nThis runs on 100K records and takes over 10 minutes. Identify all performance issues, provide an optimized version with complexity analysis, and explain each optimization.", + "category": "performance" + }, + { + "task": "Implement a Python async web scraper that crawls a paginated API endpoint, respects rate limits (max 10 requests/second), handles retries with exponential backoff, and writes results to a SQLite database. Use aiohttp and include proper error handling, graceful shutdown on SIGINT, and progress reporting.", + "category": "code-generation-python" + }, + { + "task": "Implement a trie (prefix tree) data structure in Python that supports: insert, search, delete, autocomplete (return all words with a given prefix), and wildcard search (where '.' matches any single character). Include full type hints and a clean, well-documented API.", + "category": "algorithms" + }, + { + "task": "You have a PostgreSQL database with a `transactions` table (50M rows) and queries are taking 30+ seconds. The table has columns: id, account_id, amount, type, status, created_at, metadata (JSONB). 
Common query patterns: filter by account_id + date range, aggregate by type + status, and search within metadata. Propose an indexing strategy, query optimizations, and partitioning scheme. Show the exact SQL for index creation and explain the tradeoffs.", + "category": "performance-db" + }, + { + "task": "Write a Python decorator `@retry` that supports: max retries, exponential backoff with jitter, retrying only on specified exception types, a configurable timeout, and an `on_retry` callback. It should work with both sync and async functions. Include comprehensive docstring and usage examples.", + "category": "code-generation-python" + }, + { + "task": "Analyze this Go concurrent code for race conditions, deadlocks, and goroutine leaks:\n\n```go\nfunc ProcessBatch(items []Item) []Result {\n results := make([]Result, len(items))\n var wg sync.WaitGroup\n for i, item := range items {\n wg.Add(1)\n go func() {\n defer wg.Done()\n result, err := process(item)\n if err != nil {\n log.Fatal(err)\n }\n results[i] = result\n }()\n }\n wg.Wait()\n return results\n}\n```\n\nIdentify every concurrency issue, explain why each is dangerous, and provide a corrected version.", + "category": "code-review" + }, + { + "task": "Design a notification system that supports email, SMS, push notifications, and in-app messages. Requirements: user preferences for channel and frequency, batching/digest mode, template system, delivery tracking, retry on failure, and rate limiting per user. Provide the data model, service architecture, and key API endpoints.", + "category": "system-design" + }, + { + "task": "Write a GitHub Actions workflow that runs on every PR and: lints Python code (ruff), runs type checking (mypy), executes pytest with coverage, fails if coverage drops below 80%, posts a coverage summary comment on the PR, and caches dependencies between runs. 
The workflow should be efficient and only run relevant checks based on changed files.", + "category": "devops" + }, + { + "task": "Implement a simple in-memory key-value store in Python that supports: GET, SET with optional TTL, DELETE, KEYS with glob pattern matching, and atomic INCR/DECR operations. It should be thread-safe, clean up expired keys lazily and via a background thread, and include a simple TCP server interface (like Redis RESP protocol).", + "category": "code-generation-python" + } +] diff --git a/scripts/competition_tasks_agentic.json b/scripts/competition_tasks_agentic.json index cac8106..7b23fa5 100644 --- a/scripts/competition_tasks_agentic.json +++ b/scripts/competition_tasks_agentic.json @@ -1,180 +1,157 @@ -commit fe66e75cc7f79b4ed77b2c8490f4e19862924bad -Author: Aaron Goldsmith -Date: Sat Mar 21 09:13:05 2026 -0700 - - Add agentic competition tasks, agent definitions, and skills - - - Agent definitions: competition-tasks, depth-test, tree-solver - - Skills: mobius-evolve (free Opus evolution), tree-solve (recursive decomposition) - - Competition tasks: standard + agentic (tool-heavy, multi-tier) - - Cleanup script for dead-weight agents - - Fix hardcoded paths in agentic tasks to use relative paths - - Make system monitoring task cross-platform (Unix tools) - - Remove unused import in cleanup_agents.py - - Add .tree-workspace/ to .gitignore - - Co-Authored-By: Claude Opus 4.6 - -diff --git a/scripts/competition_tasks_agentic.json b/scripts/competition_tasks_agentic.json -new file mode 100644 -index 0000000..7b23fa5 ---- /dev/null -+++ b/scripts/competition_tasks_agentic.json -@@ -0,0 +1,157 @@ -+[ -+ { -+ "task": "Create a Python CLI tool at /tmp/mobius-task-csvtool/csvtool.py that reads a CSV file, supports --filter COLUMN=VALUE, --sort COLUMN, --group-by COLUMN (with count aggregation), and --output FORMAT (table or json). Generate a test CSV with at least 100 rows of synthetic employee data (name, department, salary, hire_date). 
Run the tool with each flag combination and verify the output is correct. Fix any bugs you find.", -+ "category": "build-and-test", -+ "tier": 1, -+ "tools_required": [ -+ "Bash" -+ ], -+ "verification": "Judge checks: (1) csvtool.py exists and is executable, (2) test CSV has 100+ rows, (3) all four flag combinations produce correct output when re-run, (4) edge cases like missing columns produce helpful errors", -+ "setup": "mkdir -p /tmp/mobius-task-csvtool" -+ }, -+ { -+ "task": "Write a Bash script at /tmp/mobius-task-monitor/sysmon.sh that collects system metrics using cross-platform Unix tools (CPU load via uptime, memory via free -m, disk usage via df -h, process count via ps aux | wc -l) every 2 seconds for 10 seconds, writes each snapshot as a JSON line to metrics.jsonl, then generates a summary report (min/max/avg for each metric) to summary.txt. The script must handle missing commands gracefully (check command availability with which/command -v and skip metrics for unavailable tools). Run it and verify the output files are correct and well-formed JSON.", -+ "category": "infrastructure", -+ "tier": 1, -+ "tools_required": [ -+ "Bash" -+ ], -+ "verification": "Judge checks: (1) sysmon.sh runs without errors, (2) metrics.jsonl has 5+ valid JSON lines, (3) summary.txt has correct min/max/avg calculations, (4) script handles at least one missing-tool fallback", -+ "setup": "mkdir -p /tmp/mobius-task-monitor" -+ }, -+ { -+ "task": "Create a SQLite database at /tmp/mobius-task-sql/analytics.db with tables: users(id, name, email, signup_date, country), orders(id, user_id, amount, status, created_at), products(id, name, category, price), order_items(order_id, product_id, quantity). Populate with at least 200 orders across 50 users and 20 products using realistic data. 
Then write and execute SQL queries that answer: (1) top 5 customers by lifetime value, (2) monthly revenue trend for the last 6 months, (3) most popular product category by country, (4) customers who ordered in their first week but never again. Save all queries and results to queries.sql and results.txt.", -+ "category": "data", -+ "tier": 1, -+ "tools_required": [ -+ "Bash" -+ ], -+ "verification": "Judge checks: (1) database exists with correct schema and 200+ orders, (2) all 4 queries are syntactically valid and return non-empty results, (3) results are plausible given the data, (4) queries use appropriate JOINs/subqueries", -+ "setup": "mkdir -p /tmp/mobius-task-sql" -+ }, -+ { -+ "task": "Build a static site generator at /tmp/mobius-task-ssg/. Write a Python script ssg.py that reads Markdown files from content/, converts them to HTML using only the standard library (regex-based parsing for headers, bold, italic, links, code blocks, lists), wraps them in a template with navigation, and outputs to build/. Create at least 3 sample Markdown pages with varied formatting. Run the generator and verify the output HTML is valid (all tags closed, links work, navigation present on every page).", -+ "category": "build-and-test", -+ "tier": 1, -+ "tools_required": [ -+ "Bash" -+ ], -+ "verification": "Judge checks: (1) ssg.py runs and produces build/ directory, (2) each Markdown file has a corresponding HTML file, (3) HTML contains correctly converted headers/bold/italic/links/code/lists, (4) navigation links between pages work, (5) no unclosed HTML tags", -+ "setup": "mkdir -p /tmp/mobius-task-ssg/content /tmp/mobius-task-ssg/build" -+ }, -+ { -+ "task": "Write a Python script at /tmp/mobius-task-gitanalyze/analyze.py that analyzes the Mobius git repository (at .). 
It should produce a JSON report containing: (1) total commits per author, (2) lines of code per file extension, (3) the 5 most-changed files by commit count, (4) average commits per day over the last 30 days, (5) list of files that exist in the repo but have never been committed (untracked). Run the script and save the report to report.json. Verify the data by spot-checking at least 2 metrics against direct git commands.", -+ "category": "explore-and-analyze", -+ "tier": 1, -+ "tools_required": [ -+ "Bash" -+ ], -+ "verification": "Judge checks: (1) analyze.py runs without errors, (2) report.json contains all 5 sections with plausible data, (3) spot-check verification shows metrics match actual git output, (4) script uses subprocess to call git (not a library)", -+ "setup": "mkdir -p /tmp/mobius-task-gitanalyze" -+ }, -+ { -+ "task": "A Python web scraping pipeline has three bugs preventing it from working. Find all three bugs, fix them, and verify the pipeline produces correct output. The pipeline reads URLs from urls.txt, fetches each page (simulated via local HTML files), extracts titles and links, deduplicates by URL, and writes results to output.json.", -+ "category": "debug-and-fix", -+ "tier": 2, -+ "tools_required": [ -+ "Bash" -+ ], -+ "verification": "Judge checks: (1) all 3 bugs identified and explained, (2) fixes are minimal and correct, (3) pipeline runs end-to-end producing valid output.json, (4) deduplication works correctly", -+ "setup": "mkdir -p /tmp/mobius-task-scraper && cat > /tmp/mobius-task-scraper/pipeline.py << 'PYEOF'\nimport json\nimport re\nimport os\n\ndef read_urls(filepath):\n with open(filepath) as f:\n return f.readlines() # Bug 1: leaves newlines in URLs\n\ndef fetch_page(url):\n # Simulate fetching by reading local HTML files\n filename = url.replace('http://', '').replace('/', '_') + '.html'\n filepath = os.path.join('pages', filename)\n with open(filepath) as f:\n return f.read()\n\ndef extract_data(html):\n title_match = 
re.search(r'(.*)', html)\n title = title_match.group(0) if title_match else 'No title' # Bug 2: group(0) returns full match including tags, should be group(1)\n links = re.findall(r'href=\"([^\"]+)\"', html)\n return {'title': title, 'links': links}\n\ndef deduplicate(results):\n seen = set()\n unique = []\n for r in results:\n if r['url'] not in seen:\n seen.add(r['url'])\n unique.append(r)\n seen.add(r['url']) # Bug 3: this is outside the if block but also redundant; the real bug is that seen is checked but 'url' key doesn't exist yet\n return unique\n\ndef main():\n urls = read_urls('urls.txt')\n results = []\n for url in urls:\n html = fetch_page(url)\n data = extract_data(html)\n data['url'] = url\n results.append(data)\n unique = deduplicate(results)\n with open('output.json', 'w') as f:\n json.dump(unique, f, indent=2)\n print(f'Processed {len(unique)} unique pages')\n\nif __name__ == '__main__':\n main()\nPYEOF\nmkdir -p /tmp/mobius-task-scraper/pages && cat > /tmp/mobius-task-scraper/urls.txt << 'EOF'\nhttp://example.com/page1\nhttp://example.com/page2\nhttp://example.com/page1\nhttp://example.com/page3\nEOF\ncat > /tmp/mobius-task-scraper/pages/example.com_page1.html << 'EOF'\nFirst PageAboutContact\nEOF\ncat > /tmp/mobius-task-scraper/pages/example.com_page2.html << 'EOF'\nSecond PageAboutProducts\nEOF\ncat > /tmp/mobius-task-scraper/pages/example.com_page3.html << 'EOF'\nThird PageHome\nEOF" -+ }, -+ { -+ "task": "Investigate the Mobius codebase at . to answer these questions with evidence (file paths, line numbers, code snippets): (1) What happens when two agents tie in Elo rating during selection? Show the exact code path. (2) Is there a race condition possible when multiple competitions run concurrently? Identify specific shared state. (3) What is the maximum number of API calls a single 'mobius run' invocation can make? Trace through the call chain. (4) Find at least one code path where an exception could be silently swallowed. 
Write your findings to /tmp/mobius-task-audit/audit_report.txt with evidence for each answer.", -+ "category": "explore-and-analyze", -+ "tier": 2, -+ "tools_required": [ -+ "Bash" -+ ], -+ "verification": "Judge checks: (1) all 4 questions answered with specific file:line references, (2) code snippets provided as evidence, (3) analysis is accurate (judge can verify by checking the referenced code), (4) race condition analysis identifies real shared state", -+ "setup": "mkdir -p /tmp/mobius-task-audit" -+ }, -+ { -+ "task": "A Python HTTP server has a performance bottleneck and two security vulnerabilities. The server handles JSON API requests. Profile the server to find the bottleneck, fix it, fix both security issues, and provide before/after timing measurements. Document each finding in findings.txt.", -+ "category": "security", -+ "tier": 2, -+ "tools_required": [ -+ "Bash" -+ ], -+ "verification": "Judge checks: (1) performance bottleneck identified with timing evidence, (2) both security vulnerabilities found and classified (one is command injection, one is path traversal), (3) fixes are correct and don't break functionality, (4) before/after timing shows improvement", -+ "setup": "mkdir -p /tmp/mobius-task-security && cat > /tmp/mobius-task-security/server.py << 'PYEOF'\nimport json\nimport os\nimport subprocess\nimport hashlib\nimport time\nfrom http.server import HTTPServer, BaseHTTPRequestHandler\n\nDB = {}\n\ndef slow_hash(data):\n # Performance bottleneck: unnecessary iterations\n result = data.encode()\n for i in range(100000):\n result = hashlib.sha256(result).digest()\n return result.hex()\n\nclass APIHandler(BaseHTTPRequestHandler):\n def do_POST(self):\n length = int(self.headers.get('Content-Length', 0))\n body = json.loads(self.rfile.read(length))\n \n if self.path == '/api/store':\n key = body['key']\n value = body['value']\n hashed_key = slow_hash(key)\n DB[hashed_key] = value\n self.send_response(200)\n self.end_headers()\n 
self.wfile.write(json.dumps({'hash': hashed_key}).encode())\n \n elif self.path == '/api/exec':\n # Security vuln 1: command injection\n cmd = body.get('command', 'echo hello')\n result = subprocess.check_output(cmd, shell=True).decode()\n self.send_response(200)\n self.end_headers()\n self.wfile.write(json.dumps({'output': result}).encode())\n \n elif self.path == '/api/read':\n # Security vuln 2: path traversal\n filename = body.get('file', 'readme.txt')\n filepath = os.path.join('/tmp/mobius-task-security/data', filename)\n with open(filepath) as f:\n content = f.read()\n self.send_response(200)\n self.end_headers()\n self.wfile.write(json.dumps({'content': content}).encode())\n\nif __name__ == '__main__':\n print('Starting server on :8199')\n HTTPServer(('', 8199), APIHandler).serve_forever()\nPYEOF\nmkdir -p /tmp/mobius-task-security/data && echo 'safe content' > /tmp/mobius-task-security/data/readme.txt" -+ }, -+ { -+ "task": "A pytest test suite for a calculator module has 8 tests but only 3 pass. Without modifying the test file, fix the calculator implementation so all 8 tests pass. You must understand what each test expects, identify the implementation bugs, fix them, and run the full suite green. 
Document each bug you found in /tmp/mobius-task-tdd/buglog.txt.", -+ "category": "debug-and-fix", -+ "tier": 2, -+ "tools_required": [ -+ "Bash" -+ ], -+ "verification": "Judge checks: (1) all 8 tests pass (run pytest), (2) test file was NOT modified, (3) buglog.txt explains each bug clearly, (4) fixes are minimal - no unnecessary changes to calculator.py", -+ "setup": "mkdir -p /tmp/mobius-task-tdd && cat > /tmp/mobius-task-tdd/calculator.py << 'PYEOF'\nclass Calculator:\n def __init__(self):\n self.history = []\n \n def add(self, a, b):\n result = a + b\n self.history.append(('add', a, b, result))\n return result\n \n def subtract(self, a, b):\n result = a + b # Bug: should be a - b\n self.history.append(('subtract', a, b, result))\n return result\n \n def multiply(self, a, b):\n result = a * b\n self.history.append(('multiply', a, b, result))\n return result\n \n def divide(self, a, b):\n if b == 0:\n raise ValueError(\"Cannot divide by zero\")\n result = a // b # Bug: should be true division a / b\n self.history.append(('divide', a, b, result))\n return result\n \n def power(self, base, exp):\n result = base * exp # Bug: should be base ** exp\n self.history.append(('power', base, exp, result))\n return result\n \n def sqrt(self, n):\n if n < 0:\n raise TypeError(\"Cannot take square root of negative number\") # Bug: should be ValueError\n result = n ** 0.5\n self.history.append(('sqrt', n, None, result))\n return result\n \n def get_history(self):\n return list(self.history) # This is correct\n \n def clear_history(self):\n self.history = None # Bug: should be self.history = []\nPYEOF\ncat > /tmp/mobius-task-tdd/test_calculator.py << 'PYEOF'\nimport pytest\nfrom calculator import Calculator\n\n@pytest.fixture\ndef calc():\n return Calculator()\n\ndef test_add(calc):\n assert calc.add(2, 3) == 5\n assert calc.add(-1, 1) == 0\n assert calc.add(0.1, 0.2) == pytest.approx(0.3)\n\ndef test_subtract(calc):\n assert calc.subtract(5, 3) == 2\n assert calc.subtract(0, 
5) == -5\n\ndef test_multiply(calc):\n assert calc.multiply(3, 4) == 12\n assert calc.multiply(-2, 3) == -6\n\ndef test_divide(calc):\n assert calc.divide(10, 3) == pytest.approx(3.333333, rel=1e-4)\n assert calc.divide(1, 4) == 0.25\n with pytest.raises(ValueError):\n calc.divide(1, 0)\n\ndef test_power(calc):\n assert calc.power(2, 3) == 8\n assert calc.power(5, 0) == 1\n assert calc.power(3, 2) == 9\n\ndef test_sqrt(calc):\n assert calc.sqrt(9) == 3.0\n assert calc.sqrt(2) == pytest.approx(1.41421, rel=1e-4)\n with pytest.raises(ValueError):\n calc.sqrt(-1)\n\ndef test_history(calc):\n calc.add(1, 2)\n calc.multiply(3, 4)\n history = calc.get_history()\n assert len(history) == 2\n assert history[0] == ('add', 1, 2, 3)\n assert history[1] == ('multiply', 3, 4, 12)\n\ndef test_clear_history(calc):\n calc.add(1, 2)\n calc.clear_history()\n assert calc.get_history() == []\n calc.add(3, 4)\n assert len(calc.get_history()) == 1\nPYEOF" -+ }, -+ { -+ "task": "Reverse-engineer the Mobius agent selection algorithm by reading the source code at .. Then write a simulation at /tmp/mobius-task-selection/simulate.py that models 100 competitions with 10 agents starting at Elo 1000. Track how Elo ratings drift, identify whether the selection algorithm has any bias (does it favor certain agents unfairly?), and determine after how many rounds the rankings stabilize. Output a report with the Elo trajectory data and your analysis to report.txt. 
The simulation should use the same Elo update formula as the real code.", -+ "category": "explore-and-analyze", -+ "tier": 2, -+ "tools_required": [ -+ "Bash" -+ ], -+ "verification": "Judge checks: (1) simulation uses the actual Elo formula from Mobius source (not a generic one), (2) simulate.py runs and produces trajectory data, (3) report.txt contains analysis of bias and stabilization with supporting data, (4) at least one non-obvious insight about the selection algorithm", -+ "setup": "mkdir -p /tmp/mobius-task-selection" -+ }, -+ { -+ "task": "Build a robust configuration parser and then attack it with your own adversarial tests. Step 1: Write /tmp/mobius-task-adversarial/parser.py that parses a custom configuration format (key=value pairs, sections in [brackets], # comments, multi-line values with backslash continuation, include directives to pull in other files). Step 2: Write /tmp/mobius-task-adversarial/test_parser.py with at least 15 adversarial test cases designed to break the parser - focus on edge cases (empty files, deeply nested includes, circular includes, unicode keys, values with = signs, comments inside values, etc.). Step 3: Run the tests, fix any failures in the parser, then write even harder tests. Iterate until the parser survives all adversarial inputs and all tests pass.", -+ "category": "adversarial", -+ "tier": 3, -+ "tools_required": [ -+ "Bash" -+ ], -+ "note": "Designed for future multi-agent mode (Agent A implements parser, Agent B writes adversarial tests). 
Currently runs as single-agent doing both roles.", -+ "verification": "Judge checks: (1) parser.py handles standard configs correctly, (2) test suite has 15+ tests covering genuine edge cases, (3) all tests pass, (4) test cases are actually adversarial (not trivial), (5) evidence of iteration (fix cycles visible in code or comments)", -+ "setup": "mkdir -p /tmp/mobius-task-adversarial/configs && echo -e '[main]\\nkey1=value1\\n# comment\\nkey2 = multi \\\\\\n line value\\n\\n[section2]\\nkey3=value3' > /tmp/mobius-task-adversarial/configs/sample.conf" -+ }, -+ { -+ "task": "Design, implement, and harden a plugin system at /tmp/mobius-task-plugins/. Step 1: Build the core framework (plugin_loader.py, event_bus.py) that loads Python plugins from a plugins/ directory, where each plugin registers handlers for specific event types. Write 2 example plugins that work correctly. Step 2: Write integration tests in tests/. Step 3: Write at least 3 malicious plugins that attempt to break the system (modify global state, crash the event bus, infinite loops, raise exceptions in handlers). Step 4: Harden the framework so all malicious plugin attempts are caught and handled gracefully without crashing. All integration tests must pass.", -+ "category": "adversarial", -+ "tier": 3, -+ "tools_required": [ -+ "Bash" -+ ], -+ "note": "Designed for future multi-agent mode (Agent A builds framework, Agent B writes malicious plugins). 
Currently runs as single-agent doing both roles.", -+ "verification": "Judge checks: (1) plugin system loads and runs plugins correctly, (2) example plugins work, (3) at least 3 malicious plugin scenarios tested, (4) framework handles malicious plugins gracefully (no crashes), (5) integration tests pass", -+ "setup": "mkdir -p /tmp/mobius-task-plugins/plugins /tmp/mobius-task-plugins/tests" -+ }, -+ { -+ "task": "A SQLite-backed task queue system is provided at /tmp/mobius-task-queue/ with queue.py (enqueue/dequeue/complete/fail with priority, retry up to 3 times, dead-letter queue). The implementation has reliability issues under concurrent access. Your job: (1) Read and understand the queue implementation. (2) Write chaos.py that stress-tests it - concurrent producers/consumers using threading, simulate worker crashes mid-task (kill threads), rapid enqueue/dequeue cycles, and verify invariants (no task loss or duplication, failed tasks retry correctly, dead-letter catches permanent failures). (3) If chaos tests expose bugs in queue.py, fix them. (4) Write a final report to results.txt summarizing what broke, what you fixed, and proof that invariants now hold.", -+ "category": "build-and-test", -+ "tier": 3, -+ "tools_required": [ -+ "Bash" -+ ], -+ "note": "Designed for future multi-agent mode (chaos engineer + implementer). 
Currently runs as single-agent doing both roles.", -+ "verification": "Judge checks: (1) chaos.py runs with concurrent threads, (2) at least 3 distinct chaos scenarios tested, (3) invariant checks are real (count tasks, check for duplicates), (4) any bugs found are fixed in queue.py, (5) results.txt documents findings with evidence", -+ "setup": "mkdir -p /tmp/mobius-task-queue && cat > /tmp/mobius-task-queue/queue.py << 'PYEOF'\nimport sqlite3\nimport time\nimport threading\n\nclass TaskQueue:\n def __init__(self, db_path='queue.db'):\n self.db_path = db_path\n self.lock = threading.Lock()\n conn = sqlite3.connect(db_path)\n conn.execute('''CREATE TABLE IF NOT EXISTS tasks (\n id INTEGER PRIMARY KEY AUTOINCREMENT,\n payload TEXT NOT NULL,\n priority INTEGER DEFAULT 0,\n status TEXT DEFAULT 'pending',\n retries INTEGER DEFAULT 0,\n max_retries INTEGER DEFAULT 3,\n created_at REAL DEFAULT (julianday('now')),\n updated_at REAL DEFAULT (julianday('now'))\n )''')\n conn.execute('''CREATE TABLE IF NOT EXISTS dead_letter (\n id INTEGER PRIMARY KEY AUTOINCREMENT,\n original_id INTEGER,\n payload TEXT,\n reason TEXT,\n created_at REAL DEFAULT (julianday('now'))\n )''')\n conn.commit()\n conn.close()\n\n def _conn(self):\n return sqlite3.connect(self.db_path)\n\n def enqueue(self, payload, priority=0):\n conn = self._conn()\n conn.execute('INSERT INTO tasks (payload, priority) VALUES (?, ?)', (payload, priority))\n conn.commit()\n conn.close()\n\n def dequeue(self):\n # BUG: no lock around read-then-update, race condition possible\n conn = self._conn()\n row = conn.execute(\n \"SELECT id, payload FROM tasks WHERE status='pending' ORDER BY priority DESC, created_at ASC LIMIT 1\"\n ).fetchone()\n if row:\n conn.execute(\"UPDATE tasks SET status='processing', updated_at=julianday('now') WHERE id=?\", (row[0],))\n conn.commit()\n conn.close()\n return row\n\n def complete(self, task_id):\n conn = self._conn()\n conn.execute(\"UPDATE tasks SET status='completed', 
updated_at=julianday('now') WHERE id=?\", (task_id,))\n conn.commit()\n conn.close()\n\n def fail(self, task_id, reason='unknown'):\n conn = self._conn()\n row = conn.execute('SELECT retries, max_retries, payload FROM tasks WHERE id=?', (task_id,)).fetchone()\n if row:\n retries, max_retries, payload = row\n if retries + 1 >= max_retries:\n conn.execute('INSERT INTO dead_letter (original_id, payload, reason) VALUES (?, ?, ?)',\n (task_id, payload, reason))\n conn.execute(\"UPDATE tasks SET status='dead', updated_at=julianday('now') WHERE id=?\", (task_id,))\n else:\n conn.execute(\"UPDATE tasks SET status='pending', retries=retries+1, updated_at=julianday('now') WHERE id=?\", (task_id,))\n conn.commit()\n conn.close()\n\n def stats(self):\n conn = self._conn()\n rows = conn.execute('SELECT status, COUNT(*) FROM tasks GROUP BY status').fetchall()\n conn.close()\n return dict(rows)\nPYEOF" -+ }, -+ { -+ "task": "Audit and validate error handling in the Mobius codebase at .. Step 1: Find and document every place where error handling is missing, inconsistent, or could silently fail (bare except clauses, unchecked return values, missing null checks on database results). Write findings to /tmp/mobius-task-errors/error_audit.json as structured data with file:line references. Step 2: For each finding, write a minimal reproduction script that triggers the error condition. Step 3: Run each reproduction to verify it actually fails - classify each as confirmed bug or false positive. Write results to /tmp/mobius-task-errors/validation.json. Step 4: For confirmed bugs, propose a fix (describe the change, don't modify the Mobius source).", -+ "category": "explore-and-analyze", -+ "tier": 3, -+ "tools_required": [ -+ "Bash" -+ ], -+ "note": "Designed for future multi-agent mode (Agent A audits, Agent B validates and reproduces). 
Currently runs as single-agent doing both roles.", -+ "verification": "Judge checks: (1) error_audit.json has 5+ findings with file:line references, (2) validation.json marks each as confirmed/false-positive with evidence, (3) reproduction scripts actually demonstrate the issues, (4) proposed fixes are reasonable and don't break existing functionality", -+ "setup": "mkdir -p /tmp/mobius-task-errors" -+ }, -+ { -+ "task": "Build a data pipeline with built-in quality validation at /tmp/mobius-task-pipeline/. Step 1: Write pipeline.py that reads raw JSON records from input/, validates them against a schema, transforms them (normalize dates to ISO format, clean/lowercase department strings, reject invalid scores, deduplicate by id), and writes clean records to output/clean.json. Step 2: Write qa_check.py that independently verifies the output - check for missing required fields, type mismatches, remaining duplicates, values outside expected ranges, and that all transformations were applied correctly. Step 3: Run the pipeline, then run QA checks. If QA finds issues, fix the pipeline and re-run until qa_check.py reports zero issues.", -+ "category": "data", -+ "tier": 3, -+ "tools_required": [ -+ "Bash" -+ ], -+ "note": "Designed for future multi-agent mode (Agent A builds ETL, Agent B does data QA). 
Currently runs as single-agent doing both roles.", -+ "verification": "Judge checks: (1) pipeline processes all input records, (2) output records are valid and transformed correctly, (3) QA checker identifies real quality issues, (4) final output passes all quality checks, (5) at least 3 transformation rules are applied correctly", -+ "setup": "mkdir -p /tmp/mobius-task-pipeline/input /tmp/mobius-task-pipeline/output && python3 -c \"\nimport json, random, string\nrecords = []\nfor i in range(50):\n r = {\n 'id': i,\n 'name': ''.join(random.choices(string.ascii_letters, k=8)),\n 'email': f'user{i}@example.com' if random.random() > 0.1 else '',\n 'signup_date': f'2024-{random.randint(1,12):02d}-{random.randint(1,28):02d}' if random.random() > 0.05 else 'invalid',\n 'score': random.randint(0, 100) if random.random() > 0.1 else -999,\n 'department': random.choice(['eng', 'sales', 'marketing', 'ENGINEERING', 'Sales', ''])\n }\n records.append(r)\n# Add some duplicates\nrecords.append(records[0].copy())\nrecords.append(records[5].copy())\nwith open('/tmp/mobius-task-pipeline/input/records.json', 'w') as f:\n json.dump(records, f, indent=2)\n\"" -+ } -+] +[ + { + "task": "Create a Python CLI tool at /tmp/mobius-task-csvtool/csvtool.py that reads a CSV file, supports --filter COLUMN=VALUE, --sort COLUMN, --group-by COLUMN (with count aggregation), and --output FORMAT (table or json). Generate a test CSV with at least 100 rows of synthetic employee data (name, department, salary, hire_date). Run the tool with each flag combination and verify the output is correct. 
Fix any bugs you find.", + "category": "build-and-test", + "tier": 1, + "tools_required": [ + "Bash" + ], + "verification": "Judge checks: (1) csvtool.py exists and is executable, (2) test CSV has 100+ rows, (3) all four flag combinations produce correct output when re-run, (4) edge cases like missing columns produce helpful errors", + "setup": "mkdir -p /tmp/mobius-task-csvtool" + }, + { + "task": "Write a Bash script at /tmp/mobius-task-monitor/sysmon.sh that collects system metrics using cross-platform Unix tools (CPU load via uptime, memory via free -m, disk usage via df -h, process count via ps aux | wc -l) every 2 seconds for 10 seconds, writes each snapshot as a JSON line to metrics.jsonl, then generates a summary report (min/max/avg for each metric) to summary.txt. The script must handle missing commands gracefully (check command availability with which/command -v and skip metrics for unavailable tools). Run it and verify the output files are correct and well-formed JSON.", + "category": "infrastructure", + "tier": 1, + "tools_required": [ + "Bash" + ], + "verification": "Judge checks: (1) sysmon.sh runs without errors, (2) metrics.jsonl has 5+ valid JSON lines, (3) summary.txt has correct min/max/avg calculations, (4) script handles at least one missing-tool fallback", + "setup": "mkdir -p /tmp/mobius-task-monitor" + }, + { + "task": "Create a SQLite database at /tmp/mobius-task-sql/analytics.db with tables: users(id, name, email, signup_date, country), orders(id, user_id, amount, status, created_at), products(id, name, category, price), order_items(order_id, product_id, quantity). Populate with at least 200 orders across 50 users and 20 products using realistic data. Then write and execute SQL queries that answer: (1) top 5 customers by lifetime value, (2) monthly revenue trend for the last 6 months, (3) most popular product category by country, (4) customers who ordered in their first week but never again. 
Save all queries and results to queries.sql and results.txt.", + "category": "data", + "tier": 1, + "tools_required": [ + "Bash" + ], + "verification": "Judge checks: (1) database exists with correct schema and 200+ orders, (2) all 4 queries are syntactically valid and return non-empty results, (3) results are plausible given the data, (4) queries use appropriate JOINs/subqueries", + "setup": "mkdir -p /tmp/mobius-task-sql" + }, + { + "task": "Build a static site generator at /tmp/mobius-task-ssg/. Write a Python script ssg.py that reads Markdown files from content/, converts them to HTML using only the standard library (regex-based parsing for headers, bold, italic, links, code blocks, lists), wraps them in a template with navigation, and outputs to build/. Create at least 3 sample Markdown pages with varied formatting. Run the generator and verify the output HTML is valid (all tags closed, links work, navigation present on every page).", + "category": "build-and-test", + "tier": 1, + "tools_required": [ + "Bash" + ], + "verification": "Judge checks: (1) ssg.py runs and produces build/ directory, (2) each Markdown file has a corresponding HTML file, (3) HTML contains correctly converted headers/bold/italic/links/code/lists, (4) navigation links between pages work, (5) no unclosed HTML tags", + "setup": "mkdir -p /tmp/mobius-task-ssg/content /tmp/mobius-task-ssg/build" + }, + { + "task": "Write a Python script at /tmp/mobius-task-gitanalyze/analyze.py that analyzes the Mobius git repository (at .). It should produce a JSON report containing: (1) total commits per author, (2) lines of code per file extension, (3) the 5 most-changed files by commit count, (4) average commits per day over the last 30 days, (5) list of files that exist in the repo but have never been committed (untracked). Run the script and save the report to report.json. 
Verify the data by spot-checking at least 2 metrics against direct git commands.",
+ "category": "explore-and-analyze",
+ "tier": 1,
+ "tools_required": [
+ "Bash"
+ ],
+ "verification": "Judge checks: (1) analyze.py runs without errors, (2) report.json contains all 5 sections with plausible data, (3) spot-check verification shows metrics match actual git output, (4) script uses subprocess to call git (not a library)",
+ "setup": "mkdir -p /tmp/mobius-task-gitanalyze"
+ },
+ {
+ "task": "A Python web scraping pipeline has three bugs preventing it from working. Find all three bugs, fix them, and verify the pipeline produces correct output. The pipeline reads URLs from urls.txt, fetches each page (simulated via local HTML files), extracts titles and links, deduplicates by URL, and writes results to output.json.",
+ "category": "debug-and-fix",
+ "tier": 2,
+ "tools_required": [
+ "Bash"
+ ],
+ "verification": "Judge checks: (1) all 3 bugs identified and explained, (2) fixes are minimal and correct, (3) pipeline runs end-to-end producing valid output.json, (4) deduplication works correctly",
+ "setup": "mkdir -p /tmp/mobius-task-scraper && cat > /tmp/mobius-task-scraper/pipeline.py << 'PYEOF'\nimport json\nimport re\nimport os\n\ndef read_urls(filepath):\n with open(filepath) as f:\n return f.readlines() # Bug 1: leaves newlines in URLs\n\ndef fetch_page(url):\n # Simulate fetching by reading local HTML files\n filename = url.replace('http://', '').replace('/', '_') + '.html'\n filepath = os.path.join('pages', filename)\n with open(filepath) as f:\n return f.read()\n\ndef extract_data(html):\n title_match = re.search(r'<title>(.*)</title>', html)\n title = title_match.group(0) if title_match else 'No title' # Bug 2: group(0) returns full match including tags, should be group(1)\n links = re.findall(r'href=\"([^\"]+)\"', html)\n return {'title': title, 'links': links}\n\ndef deduplicate(results):\n seen = set()\n unique = []\n for r in results:\n if r['url'] not in seen:\n 
seen.add(r['url'])\n unique.append(r)\n seen.add(r['url']) # Bug 3: this is outside the if block but also redundant; the real bug is that seen is checked but 'url' key doesn't exist yet\n return unique\n\ndef main():\n urls = read_urls('urls.txt')\n results = []\n for url in urls:\n html = fetch_page(url)\n data = extract_data(html)\n data['url'] = url\n results.append(data)\n unique = deduplicate(results)\n with open('output.json', 'w') as f:\n json.dump(unique, f, indent=2)\n print(f'Processed {len(unique)} unique pages')\n\nif __name__ == '__main__':\n main()\nPYEOF\nmkdir -p /tmp/mobius-task-scraper/pages && cat > /tmp/mobius-task-scraper/urls.txt << 'EOF'\nhttp://example.com/page1\nhttp://example.com/page2\nhttp://example.com/page1\nhttp://example.com/page3\nEOF\ncat > /tmp/mobius-task-scraper/pages/example.com_page1.html << 'EOF'\n<html><head><title>First Page</title></head><body><a href=\"/about\">About</a><a href=\"/contact\">Contact</a></body></html>\nEOF\ncat > /tmp/mobius-task-scraper/pages/example.com_page2.html << 'EOF'\n<html><head><title>Second Page</title></head><body><a href=\"/about\">About</a><a href=\"/products\">Products</a></body></html>\nEOF\ncat > /tmp/mobius-task-scraper/pages/example.com_page3.html << 'EOF'\n<html><head><title>Third Page</title></head><body><a href=\"/\">Home</a></body></html>\nEOF"
+ },
+ {
+ "task": "Investigate the Mobius codebase at . to answer these questions with evidence (file paths, line numbers, code snippets): (1) What happens when two agents tie in Elo rating during selection? Show the exact code path. (2) Is there a race condition possible when multiple competitions run concurrently? Identify specific shared state. (3) What is the maximum number of API calls a single 'mobius run' invocation can make? Trace through the call chain. (4) Find at least one code path where an exception could be silently swallowed. 
Write your findings to /tmp/mobius-task-audit/audit_report.txt with evidence for each answer.", + "category": "explore-and-analyze", + "tier": 2, + "tools_required": [ + "Bash" + ], + "verification": "Judge checks: (1) all 4 questions answered with specific file:line references, (2) code snippets provided as evidence, (3) analysis is accurate (judge can verify by checking the referenced code), (4) race condition analysis identifies real shared state", + "setup": "mkdir -p /tmp/mobius-task-audit" + }, + { + "task": "A Python HTTP server has a performance bottleneck and two security vulnerabilities. The server handles JSON API requests. Profile the server to find the bottleneck, fix it, fix both security issues, and provide before/after timing measurements. Document each finding in findings.txt.", + "category": "security", + "tier": 2, + "tools_required": [ + "Bash" + ], + "verification": "Judge checks: (1) performance bottleneck identified with timing evidence, (2) both security vulnerabilities found and classified (one is command injection, one is path traversal), (3) fixes are correct and don't break functionality, (4) before/after timing shows improvement", + "setup": "mkdir -p /tmp/mobius-task-security && cat > /tmp/mobius-task-security/server.py << 'PYEOF'\nimport json\nimport os\nimport subprocess\nimport hashlib\nimport time\nfrom http.server import HTTPServer, BaseHTTPRequestHandler\n\nDB = {}\n\ndef slow_hash(data):\n # Performance bottleneck: unnecessary iterations\n result = data.encode()\n for i in range(100000):\n result = hashlib.sha256(result).digest()\n return result.hex()\n\nclass APIHandler(BaseHTTPRequestHandler):\n def do_POST(self):\n length = int(self.headers.get('Content-Length', 0))\n body = json.loads(self.rfile.read(length))\n \n if self.path == '/api/store':\n key = body['key']\n value = body['value']\n hashed_key = slow_hash(key)\n DB[hashed_key] = value\n self.send_response(200)\n self.end_headers()\n self.wfile.write(json.dumps({'hash': 
hashed_key}).encode())\n \n elif self.path == '/api/exec':\n # Security vuln 1: command injection\n cmd = body.get('command', 'echo hello')\n result = subprocess.check_output(cmd, shell=True).decode()\n self.send_response(200)\n self.end_headers()\n self.wfile.write(json.dumps({'output': result}).encode())\n \n elif self.path == '/api/read':\n # Security vuln 2: path traversal\n filename = body.get('file', 'readme.txt')\n filepath = os.path.join('/tmp/mobius-task-security/data', filename)\n with open(filepath) as f:\n content = f.read()\n self.send_response(200)\n self.end_headers()\n self.wfile.write(json.dumps({'content': content}).encode())\n\nif __name__ == '__main__':\n print('Starting server on :8199')\n HTTPServer(('', 8199), APIHandler).serve_forever()\nPYEOF\nmkdir -p /tmp/mobius-task-security/data && echo 'safe content' > /tmp/mobius-task-security/data/readme.txt" + }, + { + "task": "A pytest test suite for a calculator module has 8 tests but only 3 pass. Without modifying the test file, fix the calculator implementation so all 8 tests pass. You must understand what each test expects, identify the implementation bugs, fix them, and run the full suite green. 
Document each bug you found in /tmp/mobius-task-tdd/buglog.txt.", + "category": "debug-and-fix", + "tier": 2, + "tools_required": [ + "Bash" + ], + "verification": "Judge checks: (1) all 8 tests pass (run pytest), (2) test file was NOT modified, (3) buglog.txt explains each bug clearly, (4) fixes are minimal - no unnecessary changes to calculator.py", + "setup": "mkdir -p /tmp/mobius-task-tdd && cat > /tmp/mobius-task-tdd/calculator.py << 'PYEOF'\nclass Calculator:\n def __init__(self):\n self.history = []\n \n def add(self, a, b):\n result = a + b\n self.history.append(('add', a, b, result))\n return result\n \n def subtract(self, a, b):\n result = a + b # Bug: should be a - b\n self.history.append(('subtract', a, b, result))\n return result\n \n def multiply(self, a, b):\n result = a * b\n self.history.append(('multiply', a, b, result))\n return result\n \n def divide(self, a, b):\n if b == 0:\n raise ValueError(\"Cannot divide by zero\")\n result = a // b # Bug: should be true division a / b\n self.history.append(('divide', a, b, result))\n return result\n \n def power(self, base, exp):\n result = base * exp # Bug: should be base ** exp\n self.history.append(('power', base, exp, result))\n return result\n \n def sqrt(self, n):\n if n < 0:\n raise TypeError(\"Cannot take square root of negative number\") # Bug: should be ValueError\n result = n ** 0.5\n self.history.append(('sqrt', n, None, result))\n return result\n \n def get_history(self):\n return list(self.history) # This is correct\n \n def clear_history(self):\n self.history = None # Bug: should be self.history = []\nPYEOF\ncat > /tmp/mobius-task-tdd/test_calculator.py << 'PYEOF'\nimport pytest\nfrom calculator import Calculator\n\n@pytest.fixture\ndef calc():\n return Calculator()\n\ndef test_add(calc):\n assert calc.add(2, 3) == 5\n assert calc.add(-1, 1) == 0\n assert calc.add(0.1, 0.2) == pytest.approx(0.3)\n\ndef test_subtract(calc):\n assert calc.subtract(5, 3) == 2\n assert calc.subtract(0, 5) == 
-5\n\ndef test_multiply(calc):\n assert calc.multiply(3, 4) == 12\n assert calc.multiply(-2, 3) == -6\n\ndef test_divide(calc):\n assert calc.divide(10, 3) == pytest.approx(3.333333, rel=1e-4)\n assert calc.divide(1, 4) == 0.25\n with pytest.raises(ValueError):\n calc.divide(1, 0)\n\ndef test_power(calc):\n assert calc.power(2, 3) == 8\n assert calc.power(5, 0) == 1\n assert calc.power(3, 2) == 9\n\ndef test_sqrt(calc):\n assert calc.sqrt(9) == 3.0\n assert calc.sqrt(2) == pytest.approx(1.41421, rel=1e-4)\n with pytest.raises(ValueError):\n calc.sqrt(-1)\n\ndef test_history(calc):\n calc.add(1, 2)\n calc.multiply(3, 4)\n history = calc.get_history()\n assert len(history) == 2\n assert history[0] == ('add', 1, 2, 3)\n assert history[1] == ('multiply', 3, 4, 12)\n\ndef test_clear_history(calc):\n calc.add(1, 2)\n calc.clear_history()\n assert calc.get_history() == []\n calc.add(3, 4)\n assert len(calc.get_history()) == 1\nPYEOF" + }, + { + "task": "Reverse-engineer the Mobius agent selection algorithm by reading the source code at .. Then write a simulation at /tmp/mobius-task-selection/simulate.py that models 100 competitions with 10 agents starting at Elo 1000. Track how Elo ratings drift, identify whether the selection algorithm has any bias (does it favor certain agents unfairly?), and determine after how many rounds the rankings stabilize. Output a report with the Elo trajectory data and your analysis to report.txt. 
The simulation should use the same Elo update formula as the real code.", + "category": "explore-and-analyze", + "tier": 2, + "tools_required": [ + "Bash" + ], + "verification": "Judge checks: (1) simulation uses the actual Elo formula from Mobius source (not a generic one), (2) simulate.py runs and produces trajectory data, (3) report.txt contains analysis of bias and stabilization with supporting data, (4) at least one non-obvious insight about the selection algorithm", + "setup": "mkdir -p /tmp/mobius-task-selection" + }, + { + "task": "Build a robust configuration parser and then attack it with your own adversarial tests. Step 1: Write /tmp/mobius-task-adversarial/parser.py that parses a custom configuration format (key=value pairs, sections in [brackets], # comments, multi-line values with backslash continuation, include directives to pull in other files). Step 2: Write /tmp/mobius-task-adversarial/test_parser.py with at least 15 adversarial test cases designed to break the parser - focus on edge cases (empty files, deeply nested includes, circular includes, unicode keys, values with = signs, comments inside values, etc.). Step 3: Run the tests, fix any failures in the parser, then write even harder tests. Iterate until the parser survives all adversarial inputs and all tests pass.", + "category": "adversarial", + "tier": 3, + "tools_required": [ + "Bash" + ], + "note": "Designed for future multi-agent mode (Agent A implements parser, Agent B writes adversarial tests). 
Currently runs as single-agent doing both roles.", + "verification": "Judge checks: (1) parser.py handles standard configs correctly, (2) test suite has 15+ tests covering genuine edge cases, (3) all tests pass, (4) test cases are actually adversarial (not trivial), (5) evidence of iteration (fix cycles visible in code or comments)", + "setup": "mkdir -p /tmp/mobius-task-adversarial/configs && echo -e '[main]\\nkey1=value1\\n# comment\\nkey2 = multi \\\\\\n line value\\n\\n[section2]\\nkey3=value3' > /tmp/mobius-task-adversarial/configs/sample.conf" + }, + { + "task": "Design, implement, and harden a plugin system at /tmp/mobius-task-plugins/. Step 1: Build the core framework (plugin_loader.py, event_bus.py) that loads Python plugins from a plugins/ directory, where each plugin registers handlers for specific event types. Write 2 example plugins that work correctly. Step 2: Write integration tests in tests/. Step 3: Write at least 3 malicious plugins that attempt to break the system (modify global state, crash the event bus, infinite loops, raise exceptions in handlers). Step 4: Harden the framework so all malicious plugin attempts are caught and handled gracefully without crashing. All integration tests must pass.", + "category": "adversarial", + "tier": 3, + "tools_required": [ + "Bash" + ], + "note": "Designed for future multi-agent mode (Agent A builds framework, Agent B writes malicious plugins). 
Currently runs as single-agent doing both roles.", + "verification": "Judge checks: (1) plugin system loads and runs plugins correctly, (2) example plugins work, (3) at least 3 malicious plugin scenarios tested, (4) framework handles malicious plugins gracefully (no crashes), (5) integration tests pass", + "setup": "mkdir -p /tmp/mobius-task-plugins/plugins /tmp/mobius-task-plugins/tests" + }, + { + "task": "A SQLite-backed task queue system is provided at /tmp/mobius-task-queue/ with queue.py (enqueue/dequeue/complete/fail with priority, retry up to 3 times, dead-letter queue). The implementation has reliability issues under concurrent access. Your job: (1) Read and understand the queue implementation. (2) Write chaos.py that stress-tests it - concurrent producers/consumers using threading, simulate worker crashes mid-task (kill threads), rapid enqueue/dequeue cycles, and verify invariants (no task loss or duplication, failed tasks retry correctly, dead-letter catches permanent failures). (3) If chaos tests expose bugs in queue.py, fix them. (4) Write a final report to results.txt summarizing what broke, what you fixed, and proof that invariants now hold.", + "category": "build-and-test", + "tier": 3, + "tools_required": [ + "Bash" + ], + "note": "Designed for future multi-agent mode (chaos engineer + implementer). 
Currently runs as single-agent doing both roles.", + "verification": "Judge checks: (1) chaos.py runs with concurrent threads, (2) at least 3 distinct chaos scenarios tested, (3) invariant checks are real (count tasks, check for duplicates), (4) any bugs found are fixed in queue.py, (5) results.txt documents findings with evidence", + "setup": "mkdir -p /tmp/mobius-task-queue && cat > /tmp/mobius-task-queue/queue.py << 'PYEOF'\nimport sqlite3\nimport time\nimport threading\n\nclass TaskQueue:\n def __init__(self, db_path='queue.db'):\n self.db_path = db_path\n self.lock = threading.Lock()\n conn = sqlite3.connect(db_path)\n conn.execute('''CREATE TABLE IF NOT EXISTS tasks (\n id INTEGER PRIMARY KEY AUTOINCREMENT,\n payload TEXT NOT NULL,\n priority INTEGER DEFAULT 0,\n status TEXT DEFAULT 'pending',\n retries INTEGER DEFAULT 0,\n max_retries INTEGER DEFAULT 3,\n created_at REAL DEFAULT (julianday('now')),\n updated_at REAL DEFAULT (julianday('now'))\n )''')\n conn.execute('''CREATE TABLE IF NOT EXISTS dead_letter (\n id INTEGER PRIMARY KEY AUTOINCREMENT,\n original_id INTEGER,\n payload TEXT,\n reason TEXT,\n created_at REAL DEFAULT (julianday('now'))\n )''')\n conn.commit()\n conn.close()\n\n def _conn(self):\n return sqlite3.connect(self.db_path)\n\n def enqueue(self, payload, priority=0):\n conn = self._conn()\n conn.execute('INSERT INTO tasks (payload, priority) VALUES (?, ?)', (payload, priority))\n conn.commit()\n conn.close()\n\n def dequeue(self):\n # BUG: no lock around read-then-update, race condition possible\n conn = self._conn()\n row = conn.execute(\n \"SELECT id, payload FROM tasks WHERE status='pending' ORDER BY priority DESC, created_at ASC LIMIT 1\"\n ).fetchone()\n if row:\n conn.execute(\"UPDATE tasks SET status='processing', updated_at=julianday('now') WHERE id=?\", (row[0],))\n conn.commit()\n conn.close()\n return row\n\n def complete(self, task_id):\n conn = self._conn()\n conn.execute(\"UPDATE tasks SET status='completed', 
updated_at=julianday('now') WHERE id=?\", (task_id,))\n conn.commit()\n conn.close()\n\n def fail(self, task_id, reason='unknown'):\n conn = self._conn()\n row = conn.execute('SELECT retries, max_retries, payload FROM tasks WHERE id=?', (task_id,)).fetchone()\n if row:\n retries, max_retries, payload = row\n if retries + 1 >= max_retries:\n conn.execute('INSERT INTO dead_letter (original_id, payload, reason) VALUES (?, ?, ?)',\n (task_id, payload, reason))\n conn.execute(\"UPDATE tasks SET status='dead', updated_at=julianday('now') WHERE id=?\", (task_id,))\n else:\n conn.execute(\"UPDATE tasks SET status='pending', retries=retries+1, updated_at=julianday('now') WHERE id=?\", (task_id,))\n conn.commit()\n conn.close()\n\n def stats(self):\n conn = self._conn()\n rows = conn.execute('SELECT status, COUNT(*) FROM tasks GROUP BY status').fetchall()\n conn.close()\n return dict(rows)\nPYEOF" + }, + { + "task": "Audit and validate error handling in the Mobius codebase at .. Step 1: Find and document every place where error handling is missing, inconsistent, or could silently fail (bare except clauses, unchecked return values, missing null checks on database results). Write findings to /tmp/mobius-task-errors/error_audit.json as structured data with file:line references. Step 2: For each finding, write a minimal reproduction script that triggers the error condition. Step 3: Run each reproduction to verify it actually fails - classify each as confirmed bug or false positive. Write results to /tmp/mobius-task-errors/validation.json. Step 4: For confirmed bugs, propose a fix (describe the change, don't modify the Mobius source).", + "category": "explore-and-analyze", + "tier": 3, + "tools_required": [ + "Bash" + ], + "note": "Designed for future multi-agent mode (Agent A audits, Agent B validates and reproduces). 
Currently runs as single-agent doing both roles.", + "verification": "Judge checks: (1) error_audit.json has 5+ findings with file:line references, (2) validation.json marks each as confirmed/false-positive with evidence, (3) reproduction scripts actually demonstrate the issues, (4) proposed fixes are reasonable and don't break existing functionality", + "setup": "mkdir -p /tmp/mobius-task-errors" + }, + { + "task": "Build a data pipeline with built-in quality validation at /tmp/mobius-task-pipeline/. Step 1: Write pipeline.py that reads raw JSON records from input/, validates them against a schema, transforms them (normalize dates to ISO format, clean/lowercase department strings, reject invalid scores, deduplicate by id), and writes clean records to output/clean.json. Step 2: Write qa_check.py that independently verifies the output - check for missing required fields, type mismatches, remaining duplicates, values outside expected ranges, and that all transformations were applied correctly. Step 3: Run the pipeline, then run QA checks. If QA finds issues, fix the pipeline and re-run until qa_check.py reports zero issues.", + "category": "data", + "tier": 3, + "tools_required": [ + "Bash" + ], + "note": "Designed for future multi-agent mode (Agent A builds ETL, Agent B does data QA). 
Currently runs as single-agent doing both roles.", + "verification": "Judge checks: (1) pipeline processes all input records, (2) output records are valid and transformed correctly, (3) QA checker identifies real quality issues, (4) final output passes all quality checks, (5) at least 3 transformation rules are applied correctly", + "setup": "mkdir -p /tmp/mobius-task-pipeline/input /tmp/mobius-task-pipeline/output && python3 -c \"\nimport json, random, string\nrecords = []\nfor i in range(50):\n r = {\n 'id': i,\n 'name': ''.join(random.choices(string.ascii_letters, k=8)),\n 'email': f'user{i}@example.com' if random.random() > 0.1 else '',\n 'signup_date': f'2024-{random.randint(1,12):02d}-{random.randint(1,28):02d}' if random.random() > 0.05 else 'invalid',\n 'score': random.randint(0, 100) if random.random() > 0.1 else -999,\n 'department': random.choice(['eng', 'sales', 'marketing', 'ENGINEERING', 'Sales', ''])\n }\n records.append(r)\n# Add some duplicates\nrecords.append(records[0].copy())\nrecords.append(records[5].copy())\nwith open('/tmp/mobius-task-pipeline/input/records.json', 'w') as f:\n json.dump(records, f, indent=2)\n\"" + } +]