Sentinel/debug_response_agent.py at master · Skumarr53/Sentinel · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
#!/usr/bin/env python3
"""Debug script to trace Response Agent LLM setup line-by-line.

Run with: python debug_response_agent.py
Set breakpoints in your IDE to step through each line.

This script walks through:
1. Loading configuration
2. Creating test state with analysis result
3. Building response context
4. Creating LLM messages
5. Token counting
6. LLM client creation
7. Making LLM API call (optional)
8. Parsing remediation plan
"""

import asyncio
import json
import logging
import sys
from pathlib import Path

# Setup logging to see what's happening
logging.basicConfig(
    level=logging.DEBUG, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)

# Add project root
project_root = Path(__file__).parent
sys.path.insert(0, str(project_root))

from config import get_config
from src.agents.response_builder import (
    build_response_context,
    build_response_messages,
    generate_placeholder_plan,
    parse_response_output,
    ResponseContext,
)
from src.agents.llm_client import LLMClientFactory, count_tokens
from src.agents.schemas import (
    AgentState,
    AnalysisResult,
    DetectionResult,
    IncidentMetrics,
    create_initial_state,
)

print("=" * 80)
print("RESPONSE AGENT DEBUG SCRIPT")
print("=" * 80)
print("\nThis script walks through the Response Agent flow step-by-step.")
print("Set breakpoints in your IDE to debug line-by-line.\n")

print("=" * 80)
print("STEP 1: Loading Configuration")
print("=" * 80)

# Breakpoint here: Step into get_config()
config = get_config()
print(f"✅ Config loaded:")
print(f"   Environment: {config.env}")
print(f"   LLM Provider: {config.llm.provider}")
print(f"   Response Model: {config.llm.response_model}")
print(f"   Max Tokens: {config.llm.max_tokens_per_analysis}")
print(f"   Response Timeout: {config.agent.response_timeout_seconds}s")
print(f"   Groq API Key Set: {config.api_keys.groq_api_key is not None}")
print(f"   OpenAI API Key Set: {config.api_keys.openai_api_key is not None}")

print("\n" + "=" * 80)
print("STEP 2: Creating Test Incident State with Analysis Result")
print("=" * 80)

# Breakpoint here: Step into create_initial_state()
metrics = [
    IncidentMetrics(
        metric_name="memory_usage_percent",
        current_value=95.5,  # High memory usage
        baseline_value=70.0,  # Normal baseline
        deviation_score=3.5,  # 3.5 sigma deviation
        labels={"service": "cache-service", "host": "cache-01"},
    )
]

state = create_initial_state("debug-response-001", metrics)
print(f"✅ Initial state created:")
print(f"   Incident ID: {state['incident_id']}")
print(f"   Metrics: {len(state['metrics'])} metric(s)")

# Add detection result (simulating Detection Agent output)
state["detection_result"] = DetectionResult(
    is_anomaly=True,
    confidence=0.90,
    detection_method="z_score",
    threshold_used=3.0,
)
print(f"   Detection Result: Anomaly={state['detection_result'].is_anomaly}")

# Add analysis result (simulating Analysis Agent output)
state["analysis_result"] = AnalysisResult(
    root_cause="Memory leak in cache service due to unbounded cache growth",
    confidence=0.85,
    affected_services=["cache-service", "web-server-01"],
    evidence=[
        "Memory usage spiked from 70% to 95% over 5 minutes",
        "No recent deployments or config changes",
        "Cache hit rate dropped from 85% to 65%",
        "GC frequency increased 3x",
    ],
    recommendations=["Restart cache service", "Investigate cache eviction policy"],
    token_usage=450,
)
print(f"   Analysis Result:")
print(f"      Root Cause: {state['analysis_result'].root_cause[:60]}...")
print(f"      Confidence: {state['analysis_result'].confidence:.1%}")
print(f"      Affected Services: {state['analysis_result'].affected_services}")

print("\n" + "=" * 80)
print("STEP 3: Building Response Context")
print("=" * 80)

# Breakpoint here: Step into build_response_context()
try:
    context = build_response_context(state)
    print(f"✅ Response context built:")
    print(f"   Incident ID: {context.incident_id}")
    print(f"   Root Cause: {context.root_cause[:60]}...")
    print(f"   Confidence: {context.confidence:.1%}")
    print(f"   Evidence Items: {len(context.evidence)}")
    print(f"   Affected Services: {context.affected_services}")
    print(f"   Metric Name: {context.metric_name}")
    print(f"   Metric Deviation: {context.metric_deviation:.2f} sigma")
    print(f"   Detection Method: {context.detection_method}")

    # Show the prompt text
    print(f"\n   Context prompt preview:")
    prompt_text = context.to_prompt_text()
    print(f"   {prompt_text[:300]}...")
except Exception as e:
    print(f"❌ Context building failed: {e}")
    import traceback

    traceback.print_exc()
    sys.exit(1)

print("\n" + "=" * 80)
print("STEP 4: Building LLM Messages")
print("=" * 80)

# Breakpoint here: Step into build_response_messages()
messages = build_response_messages(context)
print(f"✅ Messages built:")
print(f"   Number of messages: {len(messages)}")
print(f"   System prompt length: {len(messages[0]['content'])} chars")
print(f"   User prompt length: {len(messages[1]['content'])} chars")

# Show system prompt (first 400 chars)
print(f"\n   System prompt preview:")
print(f"   {messages[0]['content'][:400]}...")

# Show user prompt (first 400 chars)
print(f"\n   User prompt preview:")
print(f"   {messages[1]['content'][:400]}...")

print("\n" + "=" * 80)
print("STEP 5: Token Counting")
print("=" * 80)

# Breakpoint here: Step into count_tokens()
system_tokens = count_tokens(messages[0]["content"])
user_tokens = count_tokens(messages[1]["content"])
total_tokens = system_tokens + user_tokens

print(f"✅ Token count:")
print(f"   System prompt tokens: {system_tokens}")
print(f"   User prompt tokens: {user_tokens}")
print(f"   Total input tokens: {total_tokens}")
print(f"   Max tokens allowed: {config.llm.max_tokens_per_analysis}")
print(f"   Under budget: {total_tokens <= config.llm.max_tokens_per_analysis}")

# Estimate output tokens (remediation plan is typically 200-500 tokens)
estimated_output = 300
estimated_total = total_tokens + estimated_output
print(f"   Estimated output tokens: ~{estimated_output}")
print(f"   Estimated total tokens: ~{estimated_total}")

print("\n" + "=" * 80)
print("STEP 6: Creating LLM Client Factory")
print("=" * 80)

# Breakpoint here: Step into LLMClientFactory()
try:
    factory = LLMClientFactory(config.api_keys, config.llm)
    print(f"✅ Factory created")

    # Breakpoint here: Step into get_provider()
    provider = factory.get_provider()
    print(f"✅ Provider obtained:")
    print(f"   Provider name: {provider.provider_name}")
    print(f"   Provider type: {type(provider).__name__}")
    print(f"   Model: {config.llm.response_model}")
except ValueError as e:
    print(f"❌ Provider creation failed: {e}")
    print(f"\n💡 TIP: Set your API key:")
    print(f"   export SENTINEL_GROQ_API_KEY=your_key_here")
    print(f"\n   Or get a free key at: https://console.groq.com")
    print(f"\n   Continuing with placeholder plan generation...")
    provider = None
    factory = None

print("\n" + "=" * 80)
print("STEP 7: Making LLM API Call (Optional - requires API key)")
print("=" * 80)

# Only proceed if API key is set
if factory and config.api_keys.groq_api_key:
    print("✅ API key found, making test call...")

    async def test_llm_call():
        # Breakpoint here: Step into complete_with_retry()
        try:
            if provider is None:
                raise ValueError(
                    "Provider is None - API key not set or provider creation failed"
                )

            response = await factory.complete_with_retry(
                messages=messages,
                model=config.llm.response_model,
                max_tokens=500,  # Reasonable for remediation plan
                temperature=config.llm.temperature,
            )

            print(f"✅ LLM Response received:")
            print(f"   Content length: {len(response.content)} chars")
            print(f"   Input tokens: {response.input_tokens}")
            print(f"   Output tokens: {response.output_tokens}")
            print(f"   Total tokens: {response.total_tokens}")
            print(f"   Latency: {response.latency_seconds:.2f}s")
            print(f"   Cost: ${response.cost_usd:.6f}")
            print(f"\n   Raw response preview:")
            print(f"   {response.content[:500]}...")

            return response
        except Exception as e:
            print(f"❌ LLM call failed: {e}")
            print(f"\n💡 Debugging tips:")
            print(f"   1. Check API key: echo $SENTINEL_GROQ_API_KEY")
            print(f"   2. Check network connectivity")
            print(f"   3. Check rate limits (Groq: ~6000 req/min)")
            print(f"   4. Check model availability")
            import traceback

            traceback.print_exc()
            raise

    # Run async test
    try:
        llm_response = asyncio.run(test_llm_call())

        print("\n" + "=" * 80)
        print("STEP 8: Parsing LLM Response")
        print("=" * 80)

        # Breakpoint here: Step into parse_response_output()
        try:
            remediation_plan = parse_response_output(llm_response.content)
            print(f"✅ Remediation plan parsed:")
            print(f"   Number of actions: {len(remediation_plan.actions)}")
            print(
                f"   Total estimated downtime: {remediation_plan.total_estimated_downtime_seconds}s"
            )

            for i, action in enumerate(remediation_plan.actions, 1):
                print(f"\n   Action {i}:")
                print(f"      ID: {action.action_id}")
                print(f"      Type: {action.action_type}")
                print(f"      Target: {action.target}")
                print(f"      Risk Level: {action.risk_level}")
                print(f"      Estimated Downtime: {action.estimated_downtime_seconds}s")
                print(f"      Approved: {action.approved}")
                print(f"      Parameters: {action.parameters}")
        except Exception as e:
            print(f"❌ Parsing failed: {e}")
            print(f"\n   Raw response that failed to parse:")
            print(f"   {llm_response.content}")
            import traceback

            traceback.print_exc()
    except Exception as e:
        print(f"\n⚠️  LLM call skipped due to error (this is OK for debugging)")
        print(f"   Error: {e}")
else:
    print("⚠️  No API key set - skipping actual LLM call")
    print(f"\n💡 To test LLM call:")
    print(f"   1. Get free API key: https://console.groq.com")
    print(f"   2. Set: export SENTINEL_GROQ_API_KEY=your_key")
    print(f"   3. Re-run this script")

print("\n" + "=" * 80)
print("STEP 9: Placeholder Plan Generation (Fallback)")
print("=" * 80)

# Breakpoint here: Step into generate_placeholder_plan()
placeholder_plan = generate_placeholder_plan(state, "LLM unavailable for testing")
print(f"✅ Placeholder plan generated:")
print(f"   Number of actions: {len(placeholder_plan.actions)}")
print(
    f"   Total estimated downtime: {placeholder_plan.total_estimated_downtime_seconds}s"
)

for i, action in enumerate(placeholder_plan.actions, 1):
    print(f"\n   Action {i}:")
    print(f"      ID: {action.action_id}")
    print(f"      Type: {action.action_type}")
    print(f"      Target: {action.target}")
    print(f"      Risk Level: {action.risk_level}")
    print(f"      Estimated Downtime: {action.estimated_downtime_seconds}s")
    print(f"      Approved: {action.approved}")

print("\n" + "=" * 80)
print("✅ Debug script completed!")
print("=" * 80)
print("\nNext steps:")
print("1. Set breakpoints in your IDE at each STEP marker")
print("2. Run this script in debug mode")
print("3. Step through each function call")
print("4. Inspect variables at each step")
print("\nKey files to debug:")
print("  - src/agents/response_builder.py (context & parsing)")
print("  - src/agents/nodes/respond.py (main node logic)")
print("  - src/agents/llm_client.py (LLM API calls)")
print("  - config/settings.py (configuration loading)")