EvoX
Co-evolves the search strategy with the solutions: the parent/context selection policy is itself LLM-written code, scored by windowed improvement and hot-swapped on stagnation.
"""EvoX PromptBuilder component — operator-labeled solution prompts + every EvoX prompt text.
One file per component (see scaffold.py). Two responsibilities:
* :class:`EvoXPromptBuilder` — the SOLUTION prompt. Default-template layout (SkyDiscover
``diff_user_message.txt`` + ``DefaultContextBuilder``): metrics + focus areas
(``_identify_improvement_areas`` wording, 500-char simplify hint), ``# Program Generation
History`` with the fence-free ``## Previous Attempts`` (``previous_attempt.txt`` Changes/
Metrics/Outcome, top-3 by combined_score) and FULL-source inspirations with per-program score
breakdowns under ``## Other Context Solutions``, evaluator feedback, the previous iteration's
failure feedback (cross-iteration retry adaptation — see scaffold.py), then ``# Current
Solution`` with the variation-operator label (the verbatim DIVERGE/REFINE template instantiation
chosen by the strategy, or nothing for the free-form ``""`` label) and the verbatim obedience
line. One galapagos hard invariant forces a layout change: the current program must be the LAST
fenced ``python`` block (see ``components/prompt.py``), so the obedience line renders BEFORE the
program block (upstream places it after) and everything after the program block is fence-free —
the SEARCH/REPLACE demonstration is unfenced exactly like the adaevolve port, and all injected
free text (task context, inspiration sources — display-only, never re-extracted — labels,
evaluator feedback, failure feedback, guide-LLM output) is fence-sanitized via
:func:`_defence`.
* The rendering half of the META machinery, the mirror of ``context_builder/evox`` — the verbatim
operator templates (``search/evox/utils/template.py``), the strategy-evolution system prompt
(``evox_search_sys_prompt.txt`` ADAPTED to the galapagos contract: ``EvolvedStrategy(
StrategyBase)``, Genome field names, ``self.genomes``/``self.rng`` — a sanctioned and required
adaptation since the LLM codes against OUR base class), the guide-LLM templates
(stats insight / problem summary / batch summaries, verbatim), the operator-generation prompts
(``variation_operator_generator.py``, verbatim), and the φ formatters (``formatters.py`` ports:
:func:`format_population_state`, :func:`format_execution_trace`, :func:`format_stats_diff`,
:func:`filter_stats_by_horizon`, :func:`format_search_window_context`). The scaffold assembles
the meta user message from these (see ``scaffold.py``); upstream's key mismatch between the
scorer's ``search_horizon`` and the formatters' ``search_window_horizon`` is resolved by
storing both keys on every strategy entry.
"""
from __future__ import annotations
import re
from ...components.prompt import PromptBuilder
from ...models.base import Prompt
from ...records import Genome, RunState, Selection
# config prompt.system_message default fallback — verbatim (shared with the adaevolve port)
_EVOX_SYSTEM = (
"You are an expert tasked with iteratively improving a solution.\n"
"Your goal is to maximize the COMBINED SCORE while exploring diverse approaches.\n"
"The system maintains a collection of diverse solutions - both high combined score AND "
"diversity are valuable."
)
# the label-obedience line from full_rewrite_user_message.txt / diff_user_message.txt — verbatim
# (rendered BEFORE the program block; see module docstring)
_OBEY_LINE = (
'IMPORTANT: If an instruction header of "## IMPORTANT: ..." is given below the '
'"# Current Solution", you MUST follow it. Otherwise, focus on targeted improvements '
"of the program."
)
# the # Task tail (diff mode, default diff_user_message.txt wording incl. the diversity lines);
# the SEARCH/REPLACE demonstration is unfenced upstream and stays unfenced here so the current
# program remains the last fenced block. Upstream diff mode offers NO full-rewrite alternative.
_TASK_SECTION = """\
# Task
Suggest improvements to the program that will improve its COMBINED_SCORE.
The system maintains diversity across these dimensions: score, complexity.
Different solutions with similar combined_score but different features are valuable.
You MUST use the exact SEARCH/REPLACE diff format shown below to indicate changes:
<<<<<<< SEARCH
# Original code to find and replace (must match exactly)
=======
# New replacement code
>>>>>>> REPLACE
**CRITICAL**: You can suggest multiple changes. Each SEARCH section must EXACTLY match code in \
"# Current Solution" - copy it character-for-character, preserving all whitespace and indentation. \
Do NOT paraphrase or reformat.
Be thoughtful about your changes and explain your reasoning thoroughly.
Include a concise docstring at the start of functions describing the exact approach taken."""
# full-rewrite mode # Task (general.mutation_approach="full_rewrite") — the model returns the complete program
_TASK_SECTION_REWRITE = """\
# Task
Rewrite the program to improve its COMBINED_SCORE.
The system maintains diversity across these dimensions: score, complexity.
Different solutions with similar combined_score but different features are valuable.
Provide the complete new program solution in a single ```python code block.
IMPORTANT: Make sure your rewritten program maintains the same inputs and outputs \
as the original program, but with improved internal implementation.
Be thoughtful about your changes and explain your reasoning thoroughly.
Include a concise docstring at the start of functions describing the exact approach taken."""
_FENCE_RUN = re.compile(r"`{3,}") # any run of >=3 backticks (a fence opener/closer)
def _defence(text: str) -> str:
"""Neutralize fence markers in injected free text (labels, feedback, guide-LLM output):
collapse any run of three-plus backticks to two, so the current-program-is-the-last-fenced-
block invariant holds in both the solution and the meta prompt."""
return _FENCE_RUN.sub("``", text)
# =============================================================================================
# Variation-operator templates — search/evox/utils/template.py, verbatim
# =============================================================================================
DIVERGE_TEMPLATE = """\
## IMPORTANT: YOU MUST FOLLOW THE FOLLOWING IN YOUR GENERATION, OTHERWISE THE SOLUTION WILL BE REJECTED.
### Goal
You MUST USE a FUNDAMENTALLY DIFFERENT APPROACH to the following code, NOT JUST parameter tweaks or local changes.
This may mean using different solver, function, or library, a different algorithm, or a different method entirely; or adding new components that have never been used before.
You can replace partial or the entire code if needed (i.e., rewrite a simpler solution from scratch), or build upon / complement it.
Think through what is needed to achieve a BREAKTHROUGH SCORE.
**Tools Available:**
You have access to a list of libraries. Use these tools to your advantage.
- Choose library functions appropriate to the problem's structure and constraints.
- Feel free to break from the existing code entirely — focus on correct library usage for the problem, not on preserving the current code's structure.
- **Prioritize libraries and functions not already used by the parent program.** Adding new components or changing how existing ones are combined is fine, but prefer approaches that introduce something the parent has never tried.
**CRITICAL: IF A LIBRARY EXISTS FOR AN ALGORITHM, USE IT. DO NOT REIMPLEMENT MANUALLY. SEE THE EXAMPLES BELOW FOR RELEVANT LIBRARIES.**
### Examples
{GENERATED_EXAMPLES}
### Output
Your solution MUST still be VALID / CORRECT for the same problem. A broken "creative" solution is worthless.
Format:
1. APPROACH: Describe your different approach.
2. CODE: Implementation."""
REFINE_TEMPLATE = """\
## IMPORTANT: YOU MUST FOLLOW THE FOLLOWING IN YOUR GENERATION, OTHERWISE THE SOLUTION WILL BE REJECTED.
### Goal
You should keep the same fundamental approach and algorithm structure.
DO NOT switch to a different problem formulation.
Focus on REFINING / EXPLOITING the current approach: better inputs, additional polish, etc.
Think through what is needed to SQUEEZE the last few percent of performance.
Use sufficient computational budget for refinement.
**Tools Available:**
You have access to a list of libraries. Use these tools to your advantage.
- Choose library functions appropriate to the problem's structure and constraints.
- Feel free to break from the existing code entirely — focus on correct library usage for the problem, not on preserving the current code's structure.
**CRITICAL: IF A LIBRARY EXISTS FOR AN ALGORITHM, USE IT. DO NOT REIMPLEMENT MANUALLY. SEE THE EXAMPLES BELOW FOR RELEVANT LIBRARIES.**
### Examples
{GENERATED_EXAMPLES}
### Output
Your solution MUST still be VALID / CORRECT for the same problem. A broken solution is worthless.
Format:
1. REFINEMENT: Describe what you are refining.
2. CODE: Implementation with refinement."""
DEFAULT_DIVERGE_TEMPLATE = """\
## IMPORTANT: YOU MUST FOLLOW THE FOLLOWING IN YOUR GENERATION, OTHERWISE THE SOLUTION WILL BE REJECTED.
### Goal
Produce a **fundamentally different** solution than the current one.
This must be a real **strategy shift**, not minor edits or small tweaks.
Allowed: new structure, new strategy, different generation plan, different style/format, new components.
DO NOT do: superficial rewrites, polishing, or the same idea with only small changes.
### Constraints
- Keep the output **valid** for the same task and constraints.
- Prefer changes that introduce something **not present** in the following current solution.
- If reliable tools or shortcuts exist, use them instead of manual re-creation.
### Output
1. APPROACH: One short paragraph describing what is different and why.
2. OUTPUT: The full generated solution."""
DEFAULT_REFINE_TEMPLATE = """\
## IMPORTANT: YOU MUST FOLLOW THE FOLLOWING IN YOUR GENERATION, OTHERWISE THE SOLUTION WILL BE REJECTED.
### Goal
Improve the current solution **within the same core structure**.
Do **not** change the fundamental structure of the following solution; instead make it stronger, cleaner, and more reliable.
Allowed: rewriting for clarity, fixing weaknesses, improving completeness, better edge-case handling, and added polish.
DO NOT switch to a fundamentally different approach.
### Constraints
- Keep the output **valid** for the same task and constraints.
- Spend effort on higher-quality execution and fewer mistakes.
### Output
1. REFINEMENT: One short paragraph describing what you improved and how.
2. OUTPUT: The full refined solution."""
# =============================================================================================
# Operator-generation prompts — variation_operator_generator.py, verbatim
# =============================================================================================
EXPLORE_SYSTEM_PROMPT = """\
You are an expert at analyzing problems and suggesting diverse algorithmic approaches.
Given a problem description and the evaluator setup and code, generate a concise "different approaches" guidance block that will help an LLM generate diverse solutions.
## STEP 1: ANALYZE THE PROBLEM TYPE (do this internally, don't output)
Before suggesting anything, you MUST determine:
- What kinds of problems are you dealing with?
- **Optimization problem**: continuous vs discrete, constrained vs unconstrained, what variables and constraints, and what optimization family, ...
- **Algorithm/heuristic design**: what decisions, what tradeoffs
- **System design**: what resources, what objectives
- And anything else that is related to the problem type and context.
- What is the objective?
- Read the evaluator's scoring function (e.g., `combined_score`) to understand ALL components of the objective.
- If the objective combines multiple sub-scores, what are the weights and tradeoffs between them?
- What are the constraints or requirements?
- E.g., bounds, accuracy, time limits, correctness criteria, etc.
- Does this problem have multiple decision stages?
- What tradeoffs exist?
- E.g. size vs weight, cost vs quality, speed vs accuracy, ...
- For optimization problems specifically:
- Are the decisions discrete or continuous (real-valued parameters)?
- What variables are being optimized and what are the constraint types?
## STEP 2: PROVIDE SUGGESTIONS FOR DIFFERENT APPROACHES
### STEP 2a: IDENTIFY STANDARD TOOLKIT
Before suggesting anything, think like a DOMAIN EXPERT. Ask yourself:
- "What would a textbook or survey paper list as the main approaches?"
- "What is the standard toolkit, or major libraries and functions for this problem domain?" (not just what context programs use)
- For optimization problems specifically
- What type? (constrained/unconstrained, local/global, gradient/derivative-free)
- What methods handle the constraint types? (bounds, equality, inequality)
- How are constraints ENFORCED? Penalty methods, projection/repair, barrier methods, and exact constraint solvers are fundamentally different paradigms. Treat constraint handling as a distinct design axis, separate from the choice of optimizer.
- What are ALL the decision variables? Not just the final output — also intermediate/structural variables (positions, configurations, orderings) that the output depends on. The FULL joint optimization over all these variables is the real problem.
- Does the problem DECOMPOSE? Often the full problem has a HARD core (nonlinear, jointly over many variables with constraints) + EASIER sub-problems (one set of variables given another fixed). Identify the hardest sub-problem — the solver for that is the MAIN solver. Simpler methods (LP, closed-form) can serve as sub-routines for the easier parts.
- What metaheuristic/evolutionary algorithms are available?
### STEP 2b: GENERATE SUGGESTIONS (based on problem & evaluator)
Your suggestions should be driven primarily by analyzing the PROBLEM and EVALUATOR from Step 1. Enumerate the landscape of known, proven techniques for this problem type.
- In the Libraries bullet: include the canonical/standard library for this problem domain, with its key functions. Do not skip it.
- For multi-objective problems, include at least ONE category named after a KEY QUALITY DIMENSION from the evaluator's `combined_score`. Name the category after the objective, then list diverse techniques that target it.
- Consider MULTIPLE PARADIGMS: Your suggestions should span at least two distinct approach families. Don't anchor on one paradigm.
- Consider MULTI-PHASE / PIPELINE approaches: Chaining methods sequentially (e.g., one for an initial solution, another to refine it) can help.
- Consider MULTI-START / ENSEMBLE strategies: Running the same method from many different starting points and keeping the best result can be a simple but powerful technique.
- Do NOT dismiss simple approaches just because advanced ones exist. Simple methods with good tuning often beat complex methods.
## STEP 3: OUTPUT FORMAT (follow exactly)
```
EXAMPLES OF DIFFERENT approaches (NOT LIMITED TO, PROPOSE YOUR OWN):
- **[Category 1]**: [describe 3-5 alternative specific techniques], e.g. A ↔ B ↔ C, D ↔ E, etc.
- **[Category 2]**: [describe 3-5 alternative specific techniques], e.g. A ↔ B ↔ C, D ↔ E, etc.
- ... (3-6 bullet points total)
```
Your goal is to come up with the "DIFFERENT APPROACHES" sections, with the problem specific guidance, e.g.
different structural components (e.g. initialization, objective design, search strategy to balance multiple factors, anything else that
is related to the problem type and context).
EXAMPLE CATEGORIES (do not blindly copy, tailor to problem):
```
EXAMPLES OF DIFFERENT approaches (NOT LIMITED TO, PROPOSE YOUR OWN):
- **Libraries / solvers**:
- **Multi-phase pipeline**:
- **Algorithm family / strategy**:
- **Runtime optimization (if applicable to the problem objective)**
- **Construction / search method**:
- **Constraint handling**:
... Or anything else you can think of.
```
Do not just blindly copy from the example; tailor the categories and ideas specifically to the problem type and context.
## RULES
1. LIBRARIES / TOOLS - Include for problems where libraries provide algorithmic solutions.
- **CRITICAL**: ONLY suggest libraries that appear in the "Available Packages in Environment" section. Do NOT suggest libraries that are not installed.
- Always list it in the first category if applicable.
- **USE THIS EXACT FORMAT**: `library.submodule: func1, func2 ↔ another_lib: funcA, funcB`. Use → to wire dependent functions: `setup_func → apply_func`.
- **CRITICAL** For wrapper functions that accept a `method` parameter: you MUST specify the exact method names.
- The LLM needs to know WHICH specific solver to use. Format: `wrapper(method='X'), wrapper(method='Y')`.
- **BE FOCUSED, NOT EXHAUSTIVE**: Do not list every possible methods, but DO include those from DIFFERENT library ecosystems when they offer distinct algorithms for the problem.
- **CRITICAL**: ONLY list functions that directly SOLVE the problem . Do NOT list low-level building blocks (linear algebra routines, random generators, data structures, geometry computations, plotting, etc.).
- For optimization problems specifically:
- First identify the constraints on the actual decision variables (not evaluation metrics like combined_score), then include solvers for each constraint level present (bounds, general). Do not list solvers that cannot handle the problem's constraints.
- **KEY**: Match the complexity of the tool to the problem structure. If the problem has complex interactions or constraints, the PRIMARY tool must be capable of handling that full complexity — simpler/restricted methods should only be listed as complementary sub-routines for easier sub-problems. When a lightweight specialised variant exists for a simpler sub-task (e.g., a final refinement step with fewer constraints), include it too — it can be significantly faster.
- If the problem decomposes (e.g., fixing one set of variables makes the remaining problem simpler), list the methods for those sub-problems too.
- **INCLUDE COMPLEMENTARY VARIANTS**: Standard libraries often have PAIRS for the same operation (e.g., local vs global, forward-only vs bidirectional, online vs batch). ALWAYS list **both** sides of each pair — never omit the complement.
2. HIGH-LEVEL IDEAS ONLY - short, abstract concepts (stay within 15 words).
- Within each category just list 3-5 alternative ideas.
3. Tailor categories to the problem; try to focus on a few different categories, and within each category focus on actionable and simple approaches not parameter tweaks
4. Only assume constraints explicitly stated in the problem.
5. 3-6 bullet points, order them based on ease of implementation and relevance to the problem.
6. Use ↔ to separate alternatives
7. **BALANCE**: Include libraries with SPECIFIC algorithms when they provide solutions, but EQUALLY emphasize domain-specific algorithmic strategies. Both are important.
8. If the evaluator passes input data for processing, add a **separate bullet** after Libraries: state that the full input is available and library functions can be applied directly to it."""
EXPLOIT_SYSTEM_PROMPT = """\
You are an expert at analyzing problems and suggesting ways to INTENSIFY and REFINE within an existing approach.
Given a problem description and evaluator, generate a concise "refinement strategies" guidance block that will help an LLM squeeze more performance from the CURRENT approach without changing its fundamental structure.
## GOAL: EXPLOITATION, NOT EXPLORATION
The LLM receiving this guidance already has a working solution. Your job is to help it, for example:
- Tune hyperparameters more aggressively (iterations, batch size, population, etc.)
- Improve the quality of inputs (seeds, initializations, candidates)
- Improve the quality of the solution (post-processing, validation, polish)
- Refine tradeoff parameters: If the approach uses parameters to balance competing factors, tuning strategies in different ways (e.g., search ranges, convergence criteria, adaptive schedules)
- Explore composite metrics: If the approach combines multiple factors, varying different ways in how they're combined (e.g., weights, normalization, thresholds)
## STEP 1: ANALYZE THE PROBLEM TYPE (do this internally, don't output)
Before suggesting anything, you MUST determine:
- What type of problem is this?
- What are the "knobs" that can be tuned without changing the core algorithm? For example,
- Tradeoff parameters: If the approach balances multiple factors, what parameters control the tradeoff?
- Composite metrics: If the approach combines multiple factors, how are they combined?
- Search strategies: If the approach searches for parameter values, what can be refined?
- What does "better" mean for this problem? Read the evaluator's scoring function (e.g., `combined_score`) and understand ALL components.
- **PRINCIPLE**: Only include suggestions that DIRECTLY improve a component of `combined_score`.
If something is NOT measured in the score (e.g., runtime when only quality matters), do NOT suggest directions that do not improve the score.
- What post-processing, validation, or polish stages could help?
- If the problem objective is only to improve the quality of the solution, where can more computation budget be spent to improve the quality?
## STEP 2: GENERATE REFINEMENT STRATEGIES (based on problem & evaluator)
Your suggestions should be driven primarily by analyzing the PROBLEM and EVALUATOR from Step 1.
Think about what "knobs" exist in typical solutions for this problem type:
- Iteration counts, population sizes, step sizes, convergence thresholds
- Input quality (initializations, seeds, candidates)
- Post-processing and polish stages
- Tradeoff parameters and composite metric weights
## STEP 3: OUTPUT FORMAT (follow exactly)
```
EXAMPLES OF REFINEMENT strategies (NOT LIMITED TO, PROPOSE YOUR OWN):
- **[Category 1]**: [describe 3-5 specific tuning techniques]
- **[Category 2]**: [describe 3-5 specific tuning techniques]
- ... (3-5 bullet points total, only include categories that actually apply)
... Or anything else you can think of.
```
EXAMPLE CATEGORIES (do not blindly copy, tailor to problem):
```
REFINEMENT strategies examples:
- **Libraries / tools**: (`library.submodule: func1, func2 ↔ another_lib: funcA`; for wrapper functions that accept a `method` parameter: you MUST specify the exact method names. Format: `wrapper(method='X'), wrapper(method='Y')`).
- **Computational budget**:
- **Hyperparameter tuning**:
- **Dynamic scheduling**:
- **Multi-objective weights / tradeoff balancing**:
- **Input quality / initialization**:
- **Solver tolerances / convergence**:
- **Post-processing / polish**:
```
Do not just blindly copy from the example; tailor the categories and ideas specifically to the problem type and context.
## RULES:
1. **STRICT**: DO NOT suggest changing the core algorithm or problem formulation. You CAN suggest library tools that REFINE/POLISH within the same approach.
2. **TAILOR**: Only suggest categories that actually exist in this problem type - do NOT invent knobs that don't apply
- Do NOT invent hyperparameters that don't exist in the code
3. **EVALUATOR ALIGNMENT**: Only suggest strategies that DIRECTLY improve metrics in the evaluator's scoring function
4. Keep each bullet concise (max 15 words per alternative)
5. 3-5 bullet points only
6. Use ↔ to separate alternatives
7. Only give high-level directions, not specific numbers"""
COMBINED_SYSTEM_PROMPT = f"""\
You must generate TWO separate guidance blocks in a single response. Output them as two clearly separated sections.
## OUTPUT STRUCTURE
Your response must have exactly two sections:
1. First section: "### EXPLORATION (diverge_label)" followed by the exploration guidance
2. Second section: "### EXPLOITATION (refine_label)" followed by the exploitation guidance
---
## INSTRUCTIONS FOR SECTION 1 (EXPLORATION):
{EXPLORE_SYSTEM_PROMPT}
---
## INSTRUCTIONS FOR SECTION 2 (EXPLOITATION):
{EXPLOIT_SYSTEM_PROMPT}
---
## CROSS-REFERENCE RULE
The EXPLOITATION section MUST be consistent with the EXPLORATION section AND must be self-sufficient (usable even if the user never sees the exploration section).
- If the EXPLORATION section suggests optimization solvers or libraries, it is recommended for the EXPLOITATION section to include:
- (1) a "Libraries / tools for refinement" category that is the same list and order as the one in the EXPLORATION section
- ALSO INCLUDE ADDITIONAL optimizers, libraries, and/or polish methods for polishing and refining existing solutions.
- (2) solver-specific refinement knobs: for example, maxiter, convergence tolerances, different initializations and polish passes.
- The EXPLOITATION section should contain enough concrete, actionable ideas that the LLM can make meaningful improvements even without seeing the EXPLORATION section.
- These can be general knobs, not specific numbers.
Generate BOTH sections now, clearly separated with the headers above.
"""
# =============================================================================================
# Strategy-evolution system prompt — evox_search_sys_prompt.txt ADAPTED to the galapagos
# contract (sanctioned + required: the meta-LLM codes against StrategyBase/Genome, not
# ProgramDatabase/Program; the J formula, label rules, and constraints are verbatim)
# =============================================================================================
META_SYSTEM_PROMPT = """\
You are an expert coder evolving a search algorithm for program optimization.
Your task is to improve the EvolvedStrategy class, which implements a search algorithm that helps the LLM leverage past experience to generate better solutions.
========================
HIGH-LEVEL CONTEXT
========================
- You are evolving the *search algorithm itself*.
- The search algorithm is implemented as a strategy store that:
(1) manages candidate genomes (solutions), and
(2) decides which genome to mutate next and how, and which genomes to sample (if any) as context for the next solution generation
- The search algorithm will run on the downstream problem for a number of iterations.
- Search algorithms that fail to improve past a meaningful threshold within a time window may be replaced.
========================
SCORING OBJECTIVE
========================
Your search algorithm is evaluated based on how much the best solution improves over time. The quality of your search algorithm is measured by:
Search Algorithm Score = ((best_score - initial_score) * (1 + log(1 + initial_score))) / sqrt(iterations)
- Where best_score is the best score found on the downstream problem, and initial_score is the starting score before search begins.
========================
CLASS & FUNCTIONS
========================
You must implement a class named EvolvedStrategy that inherits from StrategyBase
(from galapagos.scaffolds.evox.strategy).
This class defines the search algorithm by deciding:
- add() function: How genomes are added to the population
- sample() function
- Which genome to mutate next and how (the parent)
- What past genomes to provide as context (if any) to the LLM when generating new candidate solutions
Candidate solutions are Genome objects (from galapagos.records).
Important Genome fields:
- id: str
- content: str
- The solution to the problem, can be code, prompt, etc.
- scores: Dict[str, float]
- Task evaluation metrics; the main metric is "combined_score"
- parent_id: Optional[str]
- ID of the parent genome it was mutated from
- lineage: str
- The chain of ancestor ids
- metadata: Dict[str, Any], notably:
- metadata["iteration"]: int - the iteration the genome was found
- metadata["parent_info"]: Tuple[str, str] - (label, id) tuple tracking the label assigned to the parent for generating this genome.
If label is non-empty, it indicates either `self.DIVERGE_LABEL` or `self.REFINE_LABEL` was used.
- metadata["context_ids"]: List[str] - IDs of other genomes used as context for this generation
- artifacts: Dict[str, Any]
- Evaluator side-output (error strings, text feedback)
NOTE:
- You can read these attributes directly from any Genome.
- The genome passed to add() already has these fields populated. You can use them as needed, but do
not overwrite them.
EvolvedStrategy extends StrategyBase.
Available attributes and helpers from StrategyBase:
- self.genomes: Dict[str, Genome] - access to all genomes
- self.rng: random.Random - the seeded random generator injected by the host. ALL randomness MUST go through self.rng (self.rng.random(), self.rng.choice(), self.rng.sample(), ...); NEVER use the global random module.
- self.DIVERGE_LABEL: str - a predefined label for signaling divergence from the parent genome (DO NOT modify)
- self.REFINE_LABEL: str - a predefined label for signaling refinement over the parent genome (DO NOT modify)
- Helper methods (use these rather than re-implementing):
get(genome_id: str) -> Optional[Genome] - get a Genome by its id
get_best() -> Optional[Genome] - the best genome by combined_score
_update_best(genome: Genome) -> None - keep best tracking up to date (call it at the end of add())
========================
REQUIRED METHODS
========================
You MUST implement the following methods exactly (do not change signatures):
1) add
def add(self, genome: Genome, iteration: Optional[int] = None, **kwargs: Any) -> str
The add() method controls how genomes enter and remain in the population. It must return genome.id.
2) sample
def sample(self, num_context_programs: Optional[int] = 4, **kwargs) -> Tuple[
Dict[str, Genome],
Dict[str, List[Genome]]
]
The sample() method selects which genome to mutate next and how, and which past genomes to use as context.
Returns a tuple of:
- parent_dict: Dict[str, Genome]
A dict mapping a label string to EXACTLY ONE genome to mutate.
- context_programs_dict: Dict[str, List[Genome]]
A dict mapping label strings to lists of past genomes to provide as additional context for the next genome mutation.
LABEL RULES FOR PARENT and CONTEXT:
- Parent labels MUST be empty string "" by default.
- Context labels MUST ALWAYS be empty strings "".
PARENT + CONTEXT SELECTION PRINCIPLES:
The parent and context together determine what the LLM sees when generating the next solution.
1) Parent selection: what to mutate
- Main genome to mutate for the next generation.
2) Context selection: what else the LLM sees
- Complementary perspectives to inform the next generation. E.g. different approaches, contrasting examples, different score ranges.
3) Avoid deterministic selections that would always pick the same genome across iterations.
================================================
PREDEFINED LABELS (DIVERGE & REFINE)
================================================
- Parent labels MUST be empty string "" by default.
- `self.DIVERGE_LABEL`: Signals the LLM to try something fundamentally different.
- `self.REFINE_LABEL`: Signals the LLM to refine the current approach.
- **USE LABELS ONLY WHEN APPROPRIATE**:
- Default should be empty string "" for parent label - let smart parent/context selection do the work.
- Use DIVERGE_LABEL or REFINE_LABEL when progress is stagnating -- consider why:
- Fundamentally limited approaches may need a new direction.
- A promising candidate solution may need refinement to reach its potential.
- A new idea often needs a few iterations of refinement to work.
- Be careful not to overuse any particular genome to apply the labels.
- Choose label and the targeted genome based on the search state, not a fixed rule.
- ONLY WHEN using self.DIVERGE_LABEL or self.REFINE_LABEL, you can optionally return empty {} context for targeted divergence or refinement on the parent genome.
========================
DESIGN IDEAS
========================
KEEP IT SIMPLE: Match your algorithm to the population size and iteration window (Read "Your Algorithm's Search Window").
- NOTE: You can define helper state in __init__ for use in add/sample (and __init__ must call super().__init__()).
PARENT and CONTEXT SELECTION DIVERSITY (PRIMARY MECHANISM):
- Use `parent_id`, `metadata["context_ids"]`, or `metadata["parent_info"]` to check which parent and context were used, or whether a label has been tried.
- Think about different combinations of diverse parents and varied context, and consider what works best.
- Sometimes failure genomes can also provide useful insights.
- Be careful not to overuse any particular genome or set of genomes from the store.
- Adjust the search strategy based on the search state - try different approaches (parent, label, and context selection) when progress is deeply stuck.
READING THE POPULATION:
- Exploit or explore?
- Consolidate gains or explore more?
- What parents/contexts led to improvements? Which led nowhere or showed little improvement?
- How far are we from SOTA? When close to SOTA, what parent should I select and refine?
- How are the scores distributed?
- Is the population diverse or converging? Consider whether diversity itself (not just score) should be a selection signal.
- When progress stalls, ask: is the issue with parent and context selection, or a fundamental plateau (reasonable
variations exhausted)? Adjust the selection strategy, and consider using DIVERGE and REFINE labels as part of your toolkit.
========================
CONSTRAINTS
========================
- DO NOT change the class name (EvolvedStrategy) or method signatures. Preserve all imports and the # EVOLVE-BLOCK-START / # EVOLVE-BLOCK-END markers; import only from galapagos and the Python standard library.
- All variables used in add() or sample() MUST be initialized in __init__ with safe default values to avoid runtime errors.
- DO NOT modify, overwrite, or recompute any existing values in genome.scores.
- Existing scores (e.g., "combined_score") MUST remain unchanged.
- You may add new internal state or metadata keys if useful.
- When accessing scores:
- Always validate the type before use: check isinstance(value, (int, float)) before converting to float or performing arithmetic.
- Filter out non-numeric or missing values when computing averages or scores.
- LABEL CONSTRAINTS (CRITICAL):
- Parent label: MUST be "" (empty string) by default. Use self.DIVERGE_LABEL or self.REFINE_LABEL when population signals warrant it.
- Context labels: MUST ALWAYS be "" (empty string). No exceptions.
- self.DIVERGE_LABEL and self.REFINE_LABEL are predefined constants. DO NOT redefine or modify them.
- Track progress state (e.g., best score history, parent usage) in add(), not sample(). This persists correctly across checkpoint resumes.
- Be aware that all genomes share a single root ancestor (the initial seed); the lineage forms a tree, so tracing parent_id always leads back to the same root.
- Genome objects are NOT hashable: do not put them in sets or use them as dict keys. Use genome.id (a string) for deduplication and lookups instead.
========================
CRITICAL INSTRUCTION
========================
- Design a search algorithm that performs parent and context selection wisely and adaptively.
- Only count improvements as meaningful if they exceed 1% relative or 0.01 absolute."""
# =============================================================================================
# Guide-LLM templates — context_builder/evox/templates/*.txt, verbatim
# =============================================================================================
STATS_INSIGHT_SYSTEM = """\
Summarize the population state with NUMBER-BACKED observations.
OUTPUT FORMAT:
📊 **State:** [One-sentence description based on numbers]
**Key Numbers:** [3-4 bullet points]
• Report key metrics from the stats (score range, spread, trajectory)
**Patterns Observed:** [2-3 bullet points]
• Describe factual patterns in the data (gaps, trends, anything)
• State what the numbers show, not what they mean
• Show a few numbers in the text; do not repeat the full list of numbers.
Example:
• Parent selection: what parents were typically chosen recently?
• Context selection: what context programs were used recently?
• Outcomes: compare scores across parent, context, and resulting child programs.
• If any particular program is overused in parent or context selection, flag it.
• Label usage: when self.DIVERGE_LABEL / self.REFINE_LABEL was used (if any).
RULES:
- Every statement MUST cite a specific number FROM THE STATS PROVIDED
- Report observations only—no recommendations or interpretations
- NO made-up numbers
- NO summary paragraph at the end"""
PROBLEM_SUMMARY_SYSTEM = """\
Summarize the downstream problem context for a search algorithm designer.
Provide a concise summary (under 100 words) covering:
- [Read ## Problem Description] What problem is being solved. Be accurate and detailed; describe the problem in a way that is easy to understand at a high level.
- [Read ## Problem Evaluator] How solutions are evaluated/scored
INSTRUCTIONS FOR SCORING FORMULA:
1. Find the FINAL return statement in the evaluate() function - this is the ONLY source of truth for metric names.
2. Trace how `combined_score` is calculated using ONLY the metric names that appear as KEYS in that final return dict.
3. DO NOT expose internal/intermediate variable names (e.g., internal variable names within the function) that don't appear in the return statement.
4. In your answer, highlight the metric names in the return statement by ``
OUTPUT FORMAT:
**Task:** <one sentence: what the solution must optimize>
**Scoring:** combined_score = <formula with weights and metric names>
**Goal:** Maximize `combined_score`"""
PROBLEM_TEMPLATE = """\
## Problem Description
{problem_description}
## Problem Evaluator
{evaluator_context}"""
BATCH_SUMMARY_SYSTEM = """\
You are summarizing previous search algorithm attempts.
For EACH program, provide:
1) TECHNIQUE: (1-2 short sentences)
Describe the parent and context selection approach.
• Parent selection: what parents were typically chosen recently?
• Context selection: what context programs were used recently?
• How is the label used?
2) KEY TAKEAWAYS: (3-5 bullet points)
• Success or not (improved the downstream problem score or not). Compare scores across parent, context, and resulting child programs.
• Highlight any notable patterns in the numbers (e.g., score gaps to SOTA, recent score trends
[slowing down, plateau, ...], regressions, etc.).
• Describe observable patterns in the technique used, parent/context selection (e.g., score ranges, diversity, selection frequency, best program inclusion/exclusion, context lineage diversity, empty context usage, ...)
and any correlations with child program outcomes.
• Label usage analysis:
- Report the COUNT of DIVERGE_LABEL and REFINE_LABEL uses, and the child scores for each.
- Label-guided generations are inherently high-variance. Individual attempts
often regress — this is expected. Breakthroughs typically require 3-5+ tries with varied parents and contexts.
Do NOT treat individual regressions as evidence that "labels don't work".
- If any labels were UNDER-UTILIZED during LONG STAGNATION, flag this as a MISSED OPPORTUNITY.
- Flag if the same parent program or same contexts are reused repeatedly with labels and no improvement is observed.
Keep under 70 words per program. Report facts only.
Ground all statements in the actual statistics provided."""
BATCH_PER_PROGRAM = """\
TASK DESCRIPTION:
{task_description}
--------------------------------------------------------
SEARCH ALGORITHM CODE:
```python
{solution}
```
--------------------------------------------------------
STATS (includes execution trace with parent/context scores):
{db_stats_text}"""
BATCH_INSTRUCTIONS = """\
Below are {num_programs} PREVIOUS SEARCH ALGORITHM ATTEMPTS.
{combined_content}
--------------------------------------------------------
For EACH program, provide analysis in this EXACT format:
[PROGRAM 1]
SIGNALS OBSERVED: <...>
WHAT SEEMED TO HELP (ONLY LIST WHAT HELPED): <...>
POTENTIAL KEY INSIGHT: <...>
[PROGRAM 2]
...and so on. Use the exact [PROGRAM N] markers.
Keep under 60 words per program. Ground observations in actual statistics."""
# search_evolution_user_message.txt task tail — verbatim wording except the trailing fenced
# response example, which is replaced with an unfenced instruction so the CURRENT STRATEGY
# SOURCE stays the LAST fenced python block of the meta user message (sanctioned adaptation)
META_TASK_TAIL = """\
---------------------------
# Task
Rewrite the program to improve search algorithm score based on the population state.
Provide the complete new program solution, and explain high-level 1-2 principals on what you have done
and why.
Keep things SIMPLE.
IMPORTANT: Make sure your rewritten program maintains the same inputs and outputs
as the original program, but with improved internal implementation.
Respond with the complete rewritten EvolvedStrategy module in ONE fenced python code block placed
at the very end of your reply."""
# =============================================================================================
# φ rendering — context_builder/evox/formatters.py ports
# =============================================================================================
def _fmt_score(score) -> str:
return f"{score:.4f}" if isinstance(score, (int, float)) else "N/A"
def _fmt_scores(scores) -> list[str]:
return [_fmt_score(s) for s in scores]
def filter_stats_by_horizon(db_stats: dict, horizon: int) -> dict:
"""``filter_db_stats_by_horizon``: truncate the trajectory fields to the last ``horizon``."""
if not db_stats or horizon <= 0:
return db_stats
filtered = dict(db_stats)
recent = db_stats.get("recent_solution_stats")
if recent:
filtered_recent = dict(recent)
for key in ("execution_trace", "score_trajectory", "parent_scores"):
value = recent.get(key)
if value and len(value) > horizon:
filtered_recent[key] = value[-horizon:]
filtered_recent["num_recent_iterations"] = min(
horizon, recent.get("num_recent_iterations", 0))
filtered["recent_solution_stats"] = filtered_recent
return filtered
def format_execution_trace(execution_trace: list, window_start_score: float | None = None) -> str:
"""``format_execution_trace``: per child — parent (label, id, score), context tuples, and the
outcome flag vs the running best (⭐ NEW BEST / above parent / regression / no change)."""
if not execution_trace:
return ""
def fmt_id(genome_id):
return genome_id[:8] if genome_id and len(genome_id) > 8 else (genome_id or "None")
def unpack(t):
if not t:
return None, None, None
return (t[0], t[1], t[2]) if len(t) >= 3 else (None, t[0], t[1])
def fmt_ref(t, prefix=""):
label, genome_id, score = unpack(t)
if genome_id is None:
return f"{prefix}=None (seed program)" if prefix else "None"
label_str = f'label="{label}", ' if label else ""
body = f"({label_str}id={fmt_id(genome_id)}, score={_fmt_score(score)})"
return f"{prefix} {body}" if prefix else body
lines: list[str] = []
best = window_start_score
for entry in execution_trace:
prog_tuple = entry.get("program")
if prog_tuple is None:
continue
_, _, prog_score = unpack((None,) + tuple(prog_tuple))
_, _, parent_score = unpack(entry.get("parent"))
parent_str = fmt_ref(entry.get("parent"), "Parent")
ctx = entry.get("context") or []
context_str = f"Context=[{', '.join(fmt_ref(c) for c in ctx)}]"
if prog_score is not None:
prog_score = round(prog_score, 4)
parent_score = round(parent_score, 4) if parent_score is not None else None
if best is None:
best, outcome = prog_score, "first program"
elif prog_score > best:
outcome, best = f"⭐ NEW BEST (was {best:.4f})", prog_score
elif parent_score is not None and prog_score > parent_score:
outcome = f"above parent, best still {best:.4f}"
elif parent_score is not None and prog_score < parent_score:
outcome = f"regression, best still {best:.4f}"
else:
outcome = f"no change, best still {best:.4f}"
else:
outcome = "N/A"
lines.extend([
f"Iter {entry.get('iteration', '?')}: {parent_str}, {context_str}",
f" -> Generated child score={_fmt_score(prog_score)} ({outcome})",
"",
])
return "\n".join(lines[:-1]) if lines else ""
def format_population_state(db_stats: dict) -> str:
"""``format_population_state``: the φ rendering — distribution, tiers, top-k repetition
flags, parent sharing, no-improvement streak, reuse rates, recent trajectories."""
if not db_stats:
return ""
lines: list[str] = []
pop_size = db_stats.get("population_size")
lines.append(f"- population_size: {pop_size}")
summary = db_stats.get("solution_score_summary") or {}
best = summary.get("best")
q75, q50, q25 = summary.get("q75"), summary.get("q50"), summary.get("q25")
worst = summary.get("worst")
if best is not None:
def pct(v):
return (v / best * 100) if best > 0 and v is not None else 0
dist = [f"current_best={best:.4f}"]
for name, val in (("75th_pct", q75), ("50th_pct", q50), ("25th_pct", q25)):
if val is not None:
dist.append(f"{name}={val:.4f} ({pct(val):.0f}%)")
if worst is not None:
dist.append(f"worst={worst:.4f}")
lines.append(f"- score_distribution: {', '.join(dist)}")
tiers = summary.get("score_tiers")
if tiers:
tier_parts = [f"{n} ({d.get('threshold', '')}): {d.get('pct_programs', 0):.0f}%"
for n, d in tiers.items()]
lines.append(f"- programs_by_score_tier: {', '.join(tier_parts)}")
unique = summary.get("unique_scores")
if unique is not None:
lines.append(f"- unique_score_values: {unique}")
avg = db_stats.get("avg_solutions_per_parent")
if avg is not None and pop_size:
lines.append(f"- {avg / pop_size * 100:.1f}% of solutions share the same parent on average")
top_scores = db_stats.get("top_solution_scores")
if top_scores:
best_score = top_scores[0]
best_count = (sum(1 for s in top_scores if isinstance(s, (int, float))
and round(s, 4) == round(best_score, 4))
if isinstance(best_score, (int, float)) else 0)
lines.append(f"- top_{len(top_scores)}_scores: {_fmt_scores(top_scores)}")
if best_count > 1:
lines.append(f" - Top score ({best_score:.4f}) repeated {best_count}x")
if best_count == len(top_scores):
lines.append(f" (⚠️ ALL {best_count} identical)")
recent = db_stats.get("recent_solution_stats")
if recent:
iters = recent.get("iterations_without_improvement")
if iters and iters > 0:
thresh = recent.get("improvement_threshold", 0.0)
thresh_str = f" by more than {thresh:.4f}" if thresh > 0 else ""
lines.append(f"- No improvement{thresh_str} for {iters} iterations")
def score_bucket(score):
if score is None or best is None:
return None
if score >= best:
return "at best"
if q75 and score >= q75:
return "75-100th"
if q50 and score >= q50:
return "50-75th"
if q25 and score >= q25:
return "25-50th"
return "0-25th"
for key, label in (("most_reused_parent", "parent"), ("most_reused_context", "context")):
ratio = recent.get(f"{key}_ratio")
if ratio and ratio > 0:
bucket = score_bucket(recent.get(f"{key}_score"))
score_str = f", score {bucket}" if bucket else ""
lines.append(f"- {label}: {ratio * 100:.0f}% reuse rate{score_str}")
traj = recent.get("score_trajectory")
if traj:
lines.append(f"- recent_scores (last {len(traj)}): {_fmt_scores(traj)}")
parent_scores = recent.get("parent_scores")
if parent_scores:
lines.append(f"- recent_parent_scores: {_fmt_scores(parent_scores)}")
return "\n".join(lines)
def format_stats_diff(start_stats: dict, end_stats: dict, horizon: int | None = None) -> str:
"""``format_db_stats_diff``: the per-strategy φ_j start → end rendering over its window."""
if not start_stats or not end_stats:
return ""
lines = ["Population Statistics Change (Start -> End of Search Window):"]
start_pop = start_stats.get("population_size", "?")
end_pop = end_stats.get("population_size", "?")
lines.append(f"- population_size: {start_pop} -> {end_pop}")
start_summary = start_stats.get("solution_score_summary", {})
end_summary = end_stats.get("solution_score_summary", {})
if start_summary and end_summary:
parts = []
for key, name in (("best", "current_best"), ("q75", "75th_pct"),
("q50", "50th_pct (median)"), ("q25", "25th_pct"), ("worst", "worst")):
s, e = start_summary.get(key), end_summary.get(key)
if s is not None and e is not None:
diff = e - s
sign = "+" if diff >= 0 else ""
parts.append(f"{name}: {s:.4f} -> {e:.4f} ({sign}{diff:.4f})")
if parts:
lines.append(f"- {', '.join(parts)}")
start_top = start_stats.get("top_solution_scores", [])
end_top = end_stats.get("top_solution_scores", [])
if start_top and end_top:
k = min(len(start_top), len(end_top))
lines.append(f"- top_{k}_solution_scores: {_fmt_scores(start_top[:k])} -> "
f"{_fmt_scores(end_top[:k])}")
start_avg = start_stats.get("avg_solutions_per_parent")
end_avg = end_stats.get("avg_solutions_per_parent")
if start_avg is not None and end_avg is not None and start_pop and end_pop:
start_pct = (start_avg / start_pop * 100) if start_pop != "?" else 0
end_pct = (end_avg / end_pop * 100) if end_pop != "?" else 0
lines.append(f"- % of solutions share the same parent on average: "
f"{start_pct:.1f}% -> {end_pct:.1f}%")
start_tiers = start_summary.get("score_tiers") if start_summary else None
end_tiers = end_summary.get("score_tiers") if end_summary else None
if start_tiers and end_tiers:
tier_parts = []
for tier_name in end_tiers:
s_data, e_data = start_tiers.get(tier_name, {}), end_tiers.get(tier_name, {})
s_pct, e_pct = s_data.get("pct_programs", 0), e_data.get("pct_programs", 0)
diff = e_pct - s_pct
sign = "+" if diff >= 0 else ""
tier_parts.append(f"\n {tier_name}: [{s_data.get('threshold', '')}] {s_pct:.0f}% -> "
f"[{e_data.get('threshold', '')}] {e_pct:.0f}% ({sign}{diff:.0f}%)")
lines.append(f"- programs_by_score_tier:{','.join(tier_parts)}")
end_recent = end_stats.get("recent_solution_stats", {})
if end_recent:
iters = end_recent.get("iterations_without_improvement")
threshold = end_recent.get("improvement_threshold", 0.0)
if iters is not None:
if threshold > 0:
lines.append(f"- iterations_without_improvement (improvement <= "
f"{threshold:.4f}): {iters}")
else:
lines.append(f"- iterations_without_improvement: {iters}")
execution_trace = end_recent.get("execution_trace")
if execution_trace:
if horizon:
execution_trace = execution_trace[-horizon:]
first_iter = execution_trace[0].get("iteration", "?")
last_iter = execution_trace[-1].get("iteration", "?")
lines.append(f"\n### Execution Trace (iterations {first_iter}-{last_iter})")
window_start_score = start_summary.get("best") if start_summary else None
lines.append(format_execution_trace(execution_trace,
window_start_score=window_start_score))
else:
traj = end_recent.get("score_trajectory")
if traj:
lines.append(f"- recent_score_trajectory (last {len(traj)}): {_fmt_scores(traj)}")
parent_scores = end_recent.get("parent_scores")
if parent_scores:
lines.append(f"- recent_parent_scores: {_fmt_scores(parent_scores)}")
return "\n".join(lines)
def format_search_window_context(window_start: int, total_iterations: int, horizon: int,
improvement_threshold: float) -> str:
"""``format_search_window_context`` — verbatim contract lines (threshold > 0 path)."""
return "\n".join([
f"- Your newly designed search algorithm will start at iteration {window_start} out of "
f"{total_iterations}. It will run for at least {horizon} iterations (potentially more if "
f"improving), but will be cut to just {horizon} iterations if it fails to improve the "
f"solution score by more than {improvement_threshold:.4f}.",
f"- If your algorithm fails to improve the solution score by more than "
f"{improvement_threshold:.4f} during this window, it will be replaced.",
"- Goal: Design a better search strategy (e.g. how to select and manage solution "
"programs) to improve the downstream solution score.",
"- NOTE: Exactly one program is generated per iteration. Keep the population size in "
"mind when designing your search algorithm.",
])
def _entry_window(metrics: dict) -> tuple[int, int, float, float, float]:
"""(window_start, horizon, start_score, end_score, J) from a strategy entry's metrics —
reads both the scorer's ``search_horizon`` key and the formatters' ``search_window_horizon``
alias (upstream uses both names; entries store both)."""
window_start = int(metrics.get("window_start_iteration") or 0)
horizon = int(metrics.get("search_window_horizon") or metrics.get("search_horizon") or 0)
start = float(metrics.get("search_window_start_score") or 0.0)
end = float(metrics.get("search_window_end_score") or 0.0)
combined = float(metrics.get("combined_score") or 0.0)
return window_start, horizon, start, end, combined
def identify_strategy_focus_areas(entry: dict, previous_entry: dict | None,
simplification_threshold: int = 500) -> str:
"""``identify_search_improvement_areas``: J vs the most recent strategy + simplify hint."""
areas: list[str] = []
current = _entry_window(entry.get("metrics", {}))[4]
if previous_entry is not None and previous_entry is not entry:
previous = _entry_window(previous_entry.get("metrics", {}))[4]
if current > previous:
areas.append(f"Search algorithm score improved: {previous:.4f} → {current:.4f}")
elif current < previous:
areas.append(f"Search algorithm score declined: {previous:.4f} → {current:.4f}. "
f"Consider revising.")
else:
areas.append(f"Search algorithm score unchanged at {current:.4f}")
if not areas:
areas.append("Focus on improving the search algorithm score (combined_score)")
if simplification_threshold and len(entry.get("source", "")) > simplification_threshold:
areas.append(f"Consider simplifying - solution length exceeds "
f"{simplification_threshold} characters")
return "\n".join(f"- {area}" for area in areas)
def format_current_strategy(entry: dict, focus_areas: str = "") -> str:
"""``format_current_program``: heading, J + window metrics, then the strategy source as a
fenced python block — the LAST fenced block of the meta user message."""
metrics = entry.get("metrics", {})
window_start, horizon, start, end, combined = _entry_window(metrics)
lines = ["## Current Search Program\n", "### Metrics"]
if focus_areas:
lines.append(f"Focus areas:\n{focus_areas}\n")
lines.append(f"Search Algorithm Score = {combined:.4f}")
lines.append(f"This search algorithm ran from iteration {window_start} to "
f"{window_start + horizon} ({horizon} iterations)")
lines.append(f"This search algorithm changed the downstream solution combined_score by: "
f"{start:.4f} -> {end:.4f} (+{end - start:.4f})")
lines.append("\n### Solution\n```python")
lines.append(entry.get("source", ""))
lines.append("```")
return "\n".join(lines)
def format_strategy_inspirations(entries: list[dict], summaries_by_num: dict[int, str]) -> str:
"""``format_search_algorithms``: per inspiration strategy — J, window, score change, then
the guide-LLM Summary (fence-sanitized) when parsed, else the full source in a fence."""
if not entries:
return ""
lines = ["\n## Other Reference Programs\n",
"Diverse search programs that may inspire new ideas:\n"]
for idx, entry in enumerate(entries, start=1):
metrics = entry.get("metrics", {})
window_start, horizon, start, end, combined = _entry_window(metrics)
lines.extend([
f"### Program {idx}\n",
"#### Metrics",
f"Search Algorithm Score = {combined:.4f}",
f"Ran iterations {window_start} to {window_start + horizon} ({horizon} iterations)",
f"Score changed: {start:.4f} -> {end:.4f} (+{end - start:.4f})",
])
if idx in summaries_by_num:
lines.append(f"\n#### Summary\n{_defence(summaries_by_num[idx])}\n")
else:
lines.extend(["\n#### Solution", "```python", entry.get("source", ""), "```\n"])
return "\n".join(lines)
def parse_batch_summaries(response: str, num_programs: int) -> dict[int, str]:
"""``parse_batch_summaries``: split on ``[PROGRAM N]`` markers; an unparseable response is
assigned wholesale to the first program."""
summaries: dict[int, str] = {}
if not response or num_programs <= 0:
return summaries
for num in range(1, num_programs + 1):
marker = f"[PROGRAM {num}]"
if marker in response:
start_idx = response.find(marker) + len(marker)
next_idx = len(response)
for other in range(1, num_programs + 1):
if other == num:
continue
other_marker = f"[PROGRAM {other}]"
if other_marker in response:
idx = response.find(other_marker)
if start_idx < idx < next_idx:
next_idx = idx
summaries[num] = response[start_idx:next_idx].strip()
if not summaries and response:
summaries[1] = response
return summaries
# =============================================================================================
# The solution PromptBuilder
# =============================================================================================
class EvoXPromptBuilder(PromptBuilder):
"""Renders the EvoX solution message: task → metrics + focus areas → generation history
(fence-free ``## Previous Attempts`` + full-source inspirations under ``## Other Context
Solutions``) → evaluator feedback → failure feedback from the previous iteration →
``# Current Solution`` (operator label + obedience line + the LAST fenced python block) →
fence-free diff instructions. All injected free text — task context, inspiration sources
(display-only, never re-extracted), feedback, labels — is fence-sanitized via
:func:`_defence`; only the parent program block keeps exact bytes (it must be diffable)."""
def __init__(self, max_feedback_chars: int = 2000, num_previous_attempts: int = 3,
simplification_threshold: int = 500):
self.system_message = _EVOX_SYSTEM
self.max_feedback_chars = max_feedback_chars
self.num_previous_attempts = num_previous_attempts
# suggest_simplification_after_chars (upstream default 500)
self.simplification_threshold = simplification_threshold
def feedback_section(self, failures: list[dict], *, solution_chars: int = 1500,
traceback_chars: int = 800, language: str = "python") -> str:
"""Fence-FREE port of upstream ``_format_failed_attempts``, appended by the base
``_attempt`` retry loop on each in-iteration retry (``inner_retry_times=3``). EvoX renders
the failed attempts WITHOUT code fences so the current program stays the LAST fenced python
block (the SEARCH/REPLACE / full-rewrite extraction target); all free text is fence-sanitized
via :func:`_defence`. Same truncations as the base section (solution/response 1500 chars,
traceback last 800)."""
if not failures:
return ""
# wording mirrors the base/upstream _format_failed_attempts ("(this retry):", bold markers);
# only the code fences are stripped (fence-free) so the current program stays the last fenced block
out = ["## ❌ Previous Failed Attempts (this retry):",
"The following attempts failed. Avoid these errors:"]
for f in failures:
out.append(f"### Attempt {f.get('n', '?')}:")
out.append(f"**Error:** {_defence(str(f.get('error') or 'Unknown error'))}")
response = str(f.get("response") or "")
solution = str(f.get("solution") or "")
if f.get("outcome") == "no_diff" and response:
if len(response) > solution_chars:
response = response[:solution_chars] + "\n... (truncated)"
out.append("**Your response that failed** (fences stripped):\n" + _defence(response))
elif solution:
if len(solution) > solution_chars:
solution = solution[:solution_chars] + "\n... (truncated)"
out.append("**Generated solution that failed** (fences stripped):\n" + _defence(solution))
traceback = str(f.get("traceback") or "")
if traceback:
if len(traceback) > traceback_chars:
traceback = "... (truncated)\n" + traceback[-traceback_chars:]
out.append("**Traceback** (fences stripped):\n" + _defence(traceback))
return "\n".join(out)
def build(self, selection: Selection, memory=None, state: RunState | None = None) -> Prompt:
parent = selection.parent
if parent is None: # delegated selection (not used by EvoX, kept for safety)
return Prompt(system=self.system_message, user=(state.task_context if state else ""))
sig = (state.signals.get("evox", {}) if state is not None else {}) or {}
pool = list(selection.pool or [])
sections: list[str] = []
if state and state.task_context:
sections.append(f"# Task Description\n{_defence(state.task_context)}")
# --- # Current Solution Information --- ({metrics} + {improvement_areas})
info = ["# Current Solution Information", "- Main Metrics:"]
for key, value in parent.scores.items():
numeric = isinstance(value, (int, float)) and not isinstance(value, bool)
info.append(f" - {key}: {value:.4f}" if numeric
else f" - {key}: {_defence(str(value))}")
info.append(f"- Focus areas: {self._improvement_areas(parent, pool)}")
sections.append("\n".join(info))
# --- # Program Generation History --- (## Previous Attempts, upstream
# _format_previous_attempts in TEXT form: no code fences, so the last-fenced-block
# invariant holds; then the inspiration programs)
sections.append("# Program Generation History\n## Previous Attempts\n\n"
+ self._previous_attempts(pool))
# --- ## Other Context Solutions --- (the strategy's labeled inspiration set; context
# labels are always "" per the contract, so the default heading applies)
if selection.inspirations:
blocks = ["## Other Context Solutions",
"These programs represent diverse approaches and creative solutions that "
"may be relevant to the current task:"]
for i, genome in enumerate(selection.inspirations, 1):
blocks.append(self._context_program(genome, i))
sections.append("\n\n".join(blocks))
# --- evaluator feedback (rendered BEFORE the program so nothing fenced follows it) ---
feedback = parent.artifacts.get("text_feedback")
if feedback:
text = _defence(str(feedback)[: self.max_feedback_chars])
sections.append(f"## Evaluator Feedback on Current Solution\n{text}")
# NOTE: the previous-attempt failure feedback is now rendered IN-ITERATION via
# feedback_section() (appended by the base _attempt retry loop), matching upstream's
# _run_iteration which retries the SAME parent within the iteration. The override below keeps
# that section fence-free so the current program stays the LAST fenced python block.
# --- # Current Solution --- (operator label + obedience line + LAST fenced block)
current = ["# Current Solution"]
label = str(sig.get("label") or "")
if label:
current.append(_defence(label))
combined = parent.scores.get("combined_score")
if isinstance(combined, (int, float)):
current.append(f"## Program Information\ncombined_score: {combined:.4f}")
current.append(_OBEY_LINE)
current.append(f"```python\n{parent.content}\n```")
sections.append("\n\n".join(current))
sections.append(self.by_approach(_TASK_SECTION, _TASK_SECTION_REWRITE))
return Prompt(system=self.system_message, user="\n\n".join(sections))
# ---- section builders ----------------------------------------------------------------------
@staticmethod
def _score_of(genome: Genome) -> float:
value = genome.scores.get("combined_score", 0.0)
if isinstance(value, bool) or not isinstance(value, (int, float)):
try:
return float(value)
except (ValueError, TypeError):
return 0.0
return float(value)
def _improvement_areas(self, parent: Genome, pool: list[Genome]) -> str:
"""``_identify_improvement_areas`` (default builder): combined_score trend vs the most
recent program + the ``suggest_simplification_after_chars`` (500) simplify hint."""
areas: list[str] = []
current_score = self._score_of(parent)
if pool:
# upstream previous_programs[-1]: the most recently found program
previous = max(pool, key=lambda g: g.metadata.get("iteration", 0)
if isinstance(g.metadata.get("iteration"), int) else 0)
prev_score = self._score_of(previous)
if current_score > prev_score:
areas.append(f"Combined score improved: {prev_score:.4f} → {current_score:.4f}")
elif current_score < prev_score:
areas.append(f"Combined score declined: {prev_score:.4f} → {current_score:.4f}. "
f"Consider revising recent changes.")
elif abs(current_score - prev_score) < 1e-6:
areas.append(f"Combined score unchanged at {current_score:.4f}")
threshold = self.simplification_threshold
if threshold and len(parent.content) > threshold:
areas.append(f"Consider simplifying - solution length exceeds "
f"{threshold} characters")
if not areas:
areas.append("Focus on improving the combined_score")
return "\n".join(f"- {area}" for area in areas)
def _previous_attempts(self, pool: list[Genome]) -> str:
"""``_format_previous_attempts``: the top-N programs by combined_score rendered with the
upstream ``previous_attempt.txt`` template (Changes/Metrics/Outcome) in plain text —
no code fences, all free text fence-sanitized."""
if not pool:
return "No previous attempts yet."
by_id = {g.id: g for g in pool}
selected = sorted(pool, key=self._score_of, reverse=True)[: self.num_previous_attempts]
lines: list[str] = []
for i, genome in enumerate(reversed(selected)):
attempt_number = len(selected) - i
changes = genome.metadata.get("changes", "Unknown changes")
performance_parts = []
for name, value in genome.scores.items():
if isinstance(value, (int, float)) and not isinstance(value, bool):
performance_parts.append(f"{name}: {value:.4f}")
else:
performance_parts.append(f"{name}: {_defence(str(value))}")
performance = ", ".join(performance_parts) if performance_parts else "No metrics"
parent = by_id.get(genome.parent_id) if genome.parent_id else None
prog_score = genome.scores.get("combined_score")
parent_score = (parent.scores.get("combined_score", 0) if parent is not None else 0)
outcome = "No change in combined_score"
if (isinstance(prog_score, (int, float))
and isinstance(parent_score, (int, float))):
if prog_score > parent_score:
outcome = "Improvement in combined_score"
elif prog_score < parent_score:
outcome = "Regression in combined_score"
lines.append(f"### Attempt {attempt_number}\n- Changes: {_defence(str(changes))}\n"
f"- Metrics: {performance}\n- Outcome: {outcome}")
return "\n\n".join(lines)
@staticmethod
def _context_program(genome: Genome, index: int) -> str:
"""``_format_single_context_program``: header with combined_score, error line, score
breakdown, then the FULL program source. The source is display-only (never re-extracted),
so it is fence-sanitized — the parent block stays the LAST fenced python block."""
combined = genome.scores.get("combined_score")
if isinstance(combined, (int, float)) and not isinstance(combined, bool):
lines = [f"### Program {index} (combined_score: {combined:.4f})"]
else:
lines = [f"### Program {index}"]
error = genome.scores.get("error") or genome.artifacts.get("error")
if error:
lines.append(f"- error: {_defence(str(error))}")
breakdown = {k: v for k, v in genome.scores.items()
if k not in ("combined_score", "error")}
if breakdown:
lines.append("Score breakdown:")
for key, value in breakdown.items():
if isinstance(value, float):
lines.append(f" - {key}: {value:.4f}")
else:
lines.append(f" - {key}: {_defence(str(value))}")
lines.append(f"```python\n{_defence(genome.content)}\n```")
return "\n".join(lines)