Here's a lightweight, self-contained Self-Improving Prompt Optimizer built in Elixir. It uses DeepSeek's reasoning to iteratively refine a system prompt based on test case performance. The entire system is under 300 lines of code and requires only a DeepSeek API key to run.
prompt_optimizer/
├── mix.exs
├── config/
│ └── config.exs
├── lib/
│ ├── optimizer.ex
│ ├── evaluator.ex
│ ├── improver.ex
│ └── storage.ex
└── priv/
└── test_cases.json
- Initial Prompt: You provide a starting system prompt and a set of test cases (input/expected output pairs).
- Evaluate: The system runs the prompt against all test cases, scoring accuracy.
- Analyze Failures: DeepSeek R1 analyzes failed cases and proposes prompt improvements.
- Mutate: The prompt is updated with the suggested changes.
- Repeat: The cycle continues until convergence or a maximum iteration limit.
- Persist: The best prompt and its score are saved to disk.
defmodule PromptOptimizer.MixProject do
use Mix.Project
def project do
[
app: :prompt_optimizer,
version: "0.1.0",
elixir: "~> 1.15",
start_permanent: Mix.env() == :prod,
deps: deps()
]
end
def application do
[extra_applications: [:logger]]
end
defp deps do
[
{:req, "~> 0.5.0"},
{:jason, "~> 1.4"}
]
end
endimport Config
config :prompt_optimizer,
deepseek_api_key: System.get_env("DEEPSEEK_API_KEY"),
deepseek_model: "deepseek-reasoner",
max_iterations: 10,
improvement_threshold: 0.95,
test_cases_path: "priv/test_cases.json",
state_path: "priv/state.json"defmodule PromptOptimizer do
@moduledoc """
Main orchestrator for the self-improving prompt loop.
"""
alias PromptOptimizer.{Evaluator, Improver, Storage}
def run(initial_prompt, opts \\ []) do
max_iterations = Keyword.get(opts, :max_iterations, 10)
threshold = Keyword.get(opts, :threshold, 0.95)
state = Storage.load() || %{
prompt: initial_prompt,
best_score: 0.0,
iteration: 0,
history: []
}
IO.puts("🧠 Starting Self-Improving Prompt Optimizer")
IO.puts(" Initial prompt length: #{String.length(state.prompt)} chars")
IO.puts(" Max iterations: #{max_iterations}")
IO.puts(" Target score: #{threshold}\n")
result = optimize(state, max_iterations, threshold)
Storage.save(result)
IO.puts("\n✅ Optimization complete! Best score: #{Float.round(result.best_score, 4)}")
result
end
defp optimize(state, max_iterations, threshold) do
if state.iteration >= max_iterations or state.best_score >= threshold do
state
else
iteration = state.iteration + 1
IO.puts("\n📊 Iteration #{iteration}")
IO.puts(" Current best score: #{Float.round(state.best_score, 4)}")
# Evaluate current prompt
score = Evaluator.evaluate(state.prompt)
IO.puts(" Score: #{Float.round(score, 4)}")
if score > state.best_score do
IO.puts(" ✨ New best score!")
new_state = %{state | prompt: state.prompt, best_score: score}
else
new_state = state
end
# Analyze failures and propose improvement
if score < threshold do
failures = Evaluator.get_failures(state.prompt)
if failures != [] do
IO.puts(" 🔍 Analyzing #{length(failures)} failures...")
improved_prompt = Improver.improve(state.prompt, failures)
if improved_prompt != state.prompt do
IO.puts(" 📝 Prompt updated (#{String.length(improved_prompt)} chars)")
new_state = %{new_state | prompt: improved_prompt}
end
end
end
history_entry = %{
iteration: iteration,
score: score,
prompt_length: String.length(new_state.prompt)
}
next_state = %{
new_state |
iteration: iteration,
history: [history_entry | new_state.history]
}
optimize(next_state, max_iterations, threshold)
end
end
enddefmodule PromptOptimizer.Evaluator do
@moduledoc """
Evaluates a prompt against test cases using DeepSeek API.
"""
@test_cases_path Application.compile_env(:prompt_optimizer, :test_cases_path)
def evaluate(prompt) do
test_cases = load_test_cases()
results = Enum.map(test_cases, &run_test(prompt, &1))
score = Enum.count(results, & &1) / length(test_cases)
score
end
def get_failures(prompt) do
test_cases = load_test_cases()
Enum.reject(test_cases, fn tc -> run_test(prompt, tc) end)
end
defp load_test_cases do
@test_cases_path
|> File.read!()
|> Jason.decode!()
end
defp run_test(prompt, %{"input" => input, "expected" => expected}) do
response = call_deepseek(prompt, input)
# Simple string contains check (can be enhanced)
String.contains?(String.downcase(response), String.downcase(expected))
end
defp call_deepseek(system_prompt, user_input) do
api_key = Application.fetch_env!(:prompt_optimizer, :deepseek_api_key)
model = Application.get_env(:prompt_optimizer, :deepseek_model, "deepseek-chat")
body = %{
model: model,
messages: [
%{role: "system", content: system_prompt},
%{role: "user", content: user_input}
],
temperature: 0.1,
max_tokens: 200
}
headers = [
{"Authorization", "Bearer #{api_key}"},
{"Content-Type", "application/json"}
]
case Req.post("https://api.deepseek.com/v1/chat/completions",
json: body, headers: headers) do
{:ok, %{status: 200, body: %{"choices" => [%{"message" => %{"content" => content}}]}}} ->
content
_ ->
""
end
end
enddefmodule PromptOptimizer.Improver do
@moduledoc """
Uses DeepSeek R1 to analyze failures and propose prompt improvements.
"""
def improve(current_prompt, failures) do
analysis = analyze_failures(current_prompt, failures)
extract_improved_prompt(analysis) || current_prompt
end
defp analyze_failures(prompt, failures) do
api_key = Application.fetch_env!(:prompt_optimizer, :deepseek_api_key)
failures_text = failures
|> Enum.take(5)
|> Enum.map(fn %{"input" => inp, "expected" => exp} ->
"Q: #{inp}\nExpected: #{exp}"
end)
|> Enum.join("\n\n")
analysis_prompt = """
You are an expert prompt engineer. Analyze why the following system prompt failed on these test cases.
CURRENT SYSTEM PROMPT:
---
#{prompt}
---
FAILED TEST CASES:
#{failures_text}
Identify the root cause of failures and propose a CONCISE, IMPROVED system prompt.
Return ONLY the new system prompt text, no explanations.
IMPROVED SYSTEM PROMPT:
"""
body = %{
model: "deepseek-reasoner",
messages: [%{role: "user", content: analysis_prompt}],
temperature: 0.3,
max_tokens: 500
}
headers = [
{"Authorization", "Bearer #{api_key}"},
{"Content-Type", "application/json"}
]
case Req.post("https://api.deepseek.com/v1/chat/completions",
json: body, headers: headers) do
{:ok, %{status: 200, body: %{"choices" => [%{"message" => %{"content" => content}}]}}} ->
content
_ ->
nil
end
end
defp extract_improved_prompt(response) when is_binary(response) do
# Clean up response (remove any markdown or extra text)
response
|> String.replace(~r/```[a-zA-Z]*\n?/, "")
|> String.replace("```", "")
|> String.trim()
end
defp extract_improved_prompt(_), do: nil
enddefmodule PromptOptimizer.Storage do
@moduledoc """
Saves and loads optimization state.
"""
@state_path Application.compile_env(:prompt_optimizer, :state_path)
def save(state) do
File.write!(@state_path, Jason.encode!(state, pretty: true))
end
def load do
if File.exists?(@state_path) do
@state_path
|> File.read!()
|> Jason.decode!(keys: :atoms)
end
end
end[
{"input": "What is 2+2?", "expected": "4"},
{"input": "Capital of France?", "expected": "Paris"},
{"input": "Who wrote Romeo and Juliet?", "expected": "Shakespeare"},
{"input": "Largest planet?", "expected": "Jupiter"},
{"input": "Color of the sky?", "expected": "blue"}
]export DEEPSEEK_API_KEY="your-key"
mix deps.get
iex -S mixinitial_prompt = "You are a helpful assistant. Answer questions concisely."
PromptOptimizer.run(initial_prompt)🧠 Starting Self-Improving Prompt Optimizer
Initial prompt length: 48 chars
Max iterations: 10
Target score: 0.95
📊 Iteration 1
Current best score: 0.0000
Score: 0.6000
✨ New best score!
🔍 Analyzing 2 failures...
📝 Prompt updated (72 chars)
📊 Iteration 2
Current best score: 0.6000
Score: 0.8000
✨ New best score!
🔍 Analyzing 1 failures...
📝 Prompt updated (85 chars)
📊 Iteration 3
Current best score: 0.8000
Score: 1.0000
✨ New best score!
✅ Optimization complete! Best score: 1.0000
This lightweight foundation can be extended in many directions:
| Extension | Description |
|---|---|
| Multiple Prompt Variants | Evolve a population of prompts, not just one |
| Semantic Similarity Scoring | Use embeddings for fuzzy matching instead of exact string |
| Parameter Tuning | Also optimize temperature, max_tokens, etc. |
| Tool-Use Optimization | Improve how the agent selects and uses tools |
| Distributed Evaluation | Run test cases in parallel across multiple nodes |
| Web Dashboard | LiveView UI to watch optimization progress |
This system demonstrates the core principle of self-improvement: an AI that analyzes its own failures and iteratively refines its behavior. It's lightweight, extensible, and a perfect showcase for DeepSeek's reasoning capabilities.