EvoX
Co-evolves the search strategy with the solutions: the parent/context selection policy is itself LLM-written code, scored by windowed improvement and hot-swapped on stagnation.
"""EvoX strategy infrastructure — the executable-strategy contract the meta-LLM codes against.
NOT one of the six components: this module is the galapagos mirror of SkyDiscover's
``search/base_database.py`` (the surface generated strategies program against) plus
``search/evox/database/search_strategy_evaluator.py`` (the ``Valid(·)`` test). Three exports:
* :class:`StrategyBase` — the ``ProgramDatabase`` equivalent. A search strategy is a Python class
``EvolvedStrategy(StrategyBase)`` whose ``add()``/``sample()`` methods ARE the selection policy.
It exposes ``self.genomes`` (the candidate store, the mirror of ``self.programs``), ``self.rng``
(host-injected seeded RNG — generated code must route ALL randomness through it),
``DIVERGE_LABEL``/``REFINE_LABEL`` (the variation-operator texts assigned by the scaffold) and
the ``get``/``get_best``/``_update_best`` helpers documented in the meta system prompt.
* :func:`load_strategy_from_source` — ``load_database_from_file`` mirror: exec the source in a
fresh module namespace and return the class named exactly ``EvolvedStrategy``.
* :func:`validate_strategy` — faithful port of ``search_strategy_evaluator.evaluate`` (full mode):
structural checks, add() metric preservation (tolerance 1e-10, stored AND original object),
bulk add + sample contract (exactly-one-parent, total context <= min(requested, store)), the
error-string/mixed-type robustness tests (mixed-type junk in ``artifacts``/``metadata`` AND one
round with an actual string value directly in ``Genome.scores`` — the dict is not type-enforced,
so the upstream type-checking pressure is preserved), base-``Genome`` migration compatibility,
and the ``num_context_programs`` re-check with n=3. Returns ``(ok, error, cls)`` with the
already-loaded class on success, so the deployed object is the validated object (no second
exec). Deterministic: all strategy randomness flows through the caller-supplied ``rng``.
"""
from __future__ import annotations

import inspect
import itertools
import math
import random
import traceback
import types
import typing

from ...records import Genome
# Monotonic counter (1, 2, 3, ...) consumed by ``load_strategy_from_source`` so that every exec'd
# strategy source gets its own uniquely named synthetic module (``_evox_strategy_<n>``).
_module_ids = itertools.count(1)
class StrategyBase:
    """Parent class of every evolved strategy (the ``ProgramDatabase`` mirror).

    A subclass supplies the selection policy by implementing :meth:`add` and :meth:`sample`.
    It may keep extra state of its own, set up in an ``__init__`` that calls
    ``super().__init__()`` first. The host replaces ``self.rng`` after construction.
    """

    # Texts describing the two variation operators. Empty here; the scaffold assigns the real
    # (possibly generated) texts.
    DIVERGE_LABEL: str = ""
    REFINE_LABEL: str = ""

    def __init__(self) -> None:
        # Id of the tracked best genome (``best_program_id``); ``None`` until one is promoted.
        self.best_id: str | None = None
        # Placeholder seeded RNG; the host injects its own and ALL randomness must go through it.
        self.rng: random.Random = random.Random(0)
        # The candidate store (``self.programs``), looked up by genome id in ``get``.
        self.genomes: dict[str, Genome] = {}

    # ---- helpers documented in the meta system prompt --------------------------------------
    def get(self, genome_id: str) -> Genome | None:
        """Look up a stored genome by id (``ProgramDatabase.get``); ``None`` when absent."""
        return self.genomes.get(genome_id)

    def get_best(self) -> Genome | None:
        """Return the fittest stored genome (``get_best_program``), or ``None`` if the store
        is empty.

        The tracked ``best_id`` wins whenever it still names a stored genome. Otherwise the
        store is scanned for the highest ``fitness`` and ``best_id`` is re-pointed at it.
        """
        store = self.genomes
        tracked_id = self.best_id
        if tracked_id is None or tracked_id not in store:
            if not store:
                return None
            champion = max(store.values(), key=lambda candidate: candidate.fitness)
            self.best_id = champion.id
            return champion
        return store[tracked_id]

    def _update_best(self, genome: Genome) -> None:
        """``_update_best_program``: make ``genome`` the tracked best when there is no current
        best or when it is strictly fitter than the current one."""
        incumbent = self.get_best()
        if incumbent is not None and not (genome.fitness > incumbent.fitness):
            return
        self.best_id = genome.id

    # ---- the strategy surface (implemented by EvolvedStrategy) ------------------------------
    def add(self, genome: Genome, iteration: int | None = None, **kwargs) -> str:
        """Decide how ``genome`` enters (and stays in) the population; must return
        ``genome.id``. Abstract here."""
        raise NotImplementedError

    def sample(self, num_context_programs: int | None = 4, **kwargs):
        """Choose ``({label: parent}, {label: [context genomes]})`` for the next generation.
        Abstract here."""
        raise NotImplementedError
def load_strategy_from_source(source: str) -> type[StrategyBase]:
    """Execute ``source`` in a brand-new module namespace and hand back the class it defines
    under the exact name ``EvolvedStrategy`` (the ``load_database_from_file`` mirror).

    The source is expected to carry its own imports and to import only from galapagos and the
    Python standard library.

    Raises:
        ValueError: no class named ``EvolvedStrategy`` is defined, or it does not derive from
            :class:`StrategyBase`.
        Exception: whatever compiling or running the module level of ``source`` raises.
    """
    scratch_module = types.ModuleType(f"_evox_strategy_{next(_module_ids)}")
    scratch_module.__file__ = "<evox-strategy>"
    namespace = scratch_module.__dict__
    code = compile(source, "<evox-strategy>", "exec")
    # In-process exec of LLM-written code. Valid(·) is a behavioral test suite, NOT a sandbox:
    # the module level runs with full interpreter access. Every call here runs that module level
    # again, so validated classes must be deployed directly (see validate_strategy).
    exec(code, namespace)  # noqa: S102
    candidate = namespace.get("EvolvedStrategy")
    # inspect.isclass(None) is False, so a missing name and a non-class share this branch.
    if not inspect.isclass(candidate):
        raise ValueError("source defines no class named exactly 'EvolvedStrategy'")
    if issubclass(candidate, StrategyBase):
        return candidate
    raise ValueError("EvolvedStrategy must inherit from StrategyBase")
# -------------------------------------------------------------------------------------------
# Valid(·) — port of search_strategy_evaluator.evaluate (full mode counts: 10/10/5/5/3)
# -------------------------------------------------------------------------------------------
# Genomes bulk-added to the main store in step 5 (also the minimum store size required there).
_NUM_PROGRAMS_TO_ADD = 10
# sample() rounds (num_context_programs=4) run right after the step-5 bulk add.
_NUM_SAMPLE_ITERATIONS = 10
# sample() rounds (num_context_programs=3) run after the error-artifact genome is added (step 6).
_NUM_ERROR_SAMPLE_ITERATIONS = 5
# Plain Genome instances added to the fresh store of the migration test (step 8).
_NUM_MIGRATION_PROGRAMS = 5
# sample() rounds (num_context_programs=3) run against the migration store (step 8).
_NUM_MIGRATION_SAMPLES = 3
def _verify_scores_preserved(original: dict, stored: dict, operation: str, genome_id: str) -> str:
"""Every original score key/value must survive unchanged (numeric tolerance 1e-10).
Returns an error message, or ``""`` when OK (``_verify_metrics_preserved`` port)."""
for key, original_value in original.items():
if key not in stored:
return (f"Score '{key}' was deleted from genome '{genome_id}' during {operation}. "
f"Original scores: {sorted(original)}, stored scores: {sorted(stored)}")
stored_value = stored[key]
if isinstance(original_value, (int, float)) and isinstance(stored_value, (int, float)):
if abs(float(original_value) - float(stored_value)) > 1e-10:
return (f"Score '{key}' was modified in genome '{genome_id}' during {operation}: "
f"original={original_value}, stored={stored_value}. "
f"Score values must remain unchanged.")
elif original_value != stored_value:
return (f"Score '{key}' was modified in genome '{genome_id}' during {operation}: "
f"original={original_value!r}, stored={stored_value!r}. "
f"Score values must remain unchanged.")
return ""
def _check_sample(db: StrategyBase, requested: int, originals: dict[str, dict],
                  phase: str, sample_iter: int) -> str:
    """Run a single ``sample()`` call and check it against the whole sampling contract.

    Args:
        db: the strategy instance under test.
        requested: value passed as ``num_context_programs``.
        originals: score snapshots by genome id, taken when the test created each genome.
        phase: free-text tag appended to error messages (may be empty).
        sample_iter: round index, reported in some of the messages.

    Returns:
        ``""`` when the round satisfies the contract, else the first violation found.
    """
    outcome = db.sample(num_context_programs=requested)
    # Suffix shared by the messages that report the round index.
    where = f"{phase} (iteration {sample_iter})"

    # --- shape of the return value ---------------------------------------------------------
    if not isinstance(outcome, tuple):
        return f"sample() must return a tuple, got {type(outcome)} {where}"
    if len(outcome) != 2:
        return (f"sample() must return a tuple of 2 elements (parent_dict, context_programs_dict), "
                f"got {len(outcome)} elements {where}")
    parents, contexts = outcome

    # --- exactly one parent, and it must live in the store ---------------------------------
    if not isinstance(parents, dict):
        return f"sample() first element must be a Dict[str, Genome], got {type(parents)} {where}"
    if len(parents) != 1:
        return (f"sample() must return exactly one parent genome in parent_dict, got "
                f"{len(parents)} parents with keys {list(parents)} {where}")
    (parent,) = parents.values()
    if not isinstance(parent, Genome):  # also covers a ``None`` parent
        return f"sample() parent_dict value must be a Genome, got {type(parent)} {phase}"
    if parent.id not in db.genomes:
        return f"Sampled parent (id={parent.id}) not found in the store {phase}"

    # --- context: dict of lists of Genome, bounded in total size ---------------------------
    if not isinstance(contexts, dict):
        return (f"sample() second element must be a Dict[str, List[Genome]], got "
                f"{type(contexts)} {where}")
    flat_context: list[Genome] = []
    for label, bucket in contexts.items():
        if not isinstance(bucket, list):
            return (f"sample() context_programs_dict[{label!r}] must be a list, got "
                    f"{type(bucket)} {phase}")
        strangers = [member for member in bucket if not isinstance(member, Genome)]
        if strangers:
            return (f"sample() context_programs_dict[{label!r}] contains a non-Genome object: "
                    f"{type(strangers[0])} {phase}")
        flat_context += bucket
    store_size = len(db.genomes)
    ceiling = min(requested, store_size)
    if len(flat_context) > ceiling:
        return (f"sample() returned {len(flat_context)} total context genomes, which exceeds the "
                f"maximum possible ({ceiling} given {store_size} genomes in the store) "
                f"for num_context_programs={requested} {phase}")

    # --- scores must be intact on the returned AND the stored objects ----------------------
    def audit(genome: Genome, role: str) -> str:
        if genome.id not in originals:
            return ""  # genomes the test didn't create (none expected)
        baseline = originals[genome.id]
        problem = _verify_scores_preserved(baseline, genome.scores,
                                           f"sample() ({role} returned) {phase}", genome.id)
        if problem:
            return problem
        kept = db.get(genome.id)
        if kept is None:
            return f"Sampled {role} (id={genome.id}) not found in the store via get() {phase}"
        return _verify_scores_preserved(baseline, kept.scores,
                                        f"sample() ({role} stored) {phase}", genome.id)

    audited = [(parent, "parent")]
    audited.extend((member, "context program") for member in flat_context)
    for genome, role in audited:
        problem = audit(genome, role)
        if problem:
            return problem
    return ""
def validate_strategy(
        source: str, rng: random.Random) -> tuple[bool, str, type[StrategyBase] | None]:
    """``Valid(S')``: load + structural checks + behavioral test-suite.

    Args:
        source: Python source of the candidate strategy (must define ``EvolvedStrategy``).
        rng: seeded RNG injected into every strategy instance the checks create, which keeps
            validation deterministic.

    Returns:
        ``(ok, error, cls)``. On success ``cls`` is the ALREADY-loaded ``EvolvedStrategy`` class
        and callers must deploy exactly that object: re-exec'ing the source via
        ``load_strategy_from_source`` would run the candidate's module level a SECOND time,
        unguarded (validation is the only sanctioned exec of meta-generated code). On failure
        ``cls`` is ``None`` and ``error`` explains why.

    Never raises for anything the candidate does, including an attempt to terminate the
    interpreter (``sys.exit()`` / ``raise SystemExit``). Only ``KeyboardInterrupt`` (a user
    abort) is deliberately left to propagate.
    """
    try:
        # 1. load (the ONE exec of the candidate source)
        try:
            cls = load_strategy_from_source(source)
        except Exception as e:  # noqa: BLE001 — any load failure is a validation verdict
            return False, f"Failed to load strategy class: {e}", None
        ok, error = _behavioral_checks(cls, rng)
    except SystemExit as e:
        # SystemExit derives from BaseException, so the ``except Exception`` handlers here and in
        # the behavioral checks do not see it. Without this guard, LLM-written code calling
        # sys.exit() at module level or inside add()/sample() would take the host down.
        return False, f"Strategy code attempted to exit the interpreter (SystemExit: {e.code!r})", None
    return ok, error, (cls if ok else None)
def _behavioral_checks(cls: type[StrategyBase], rng: random.Random) -> tuple[bool, str]:
    """Steps 2-9 of the ``search_strategy_evaluator.evaluate`` port, run against the
    already-loaded class.

    The steps share state and run in a fixed order: one main store (``db``) accumulates genomes
    across steps 4-7 and 9, step 8 uses a second fresh store (``db2``), and both are driven by
    the same ``rng``. ``originals`` maps each test genome id to the score snapshot taken before
    it was added.

    Args:
        cls: the strategy class to exercise.
        rng: RNG assigned to ``.rng`` of every instance created here.

    Returns:
        ``(True, "")`` when every step passes, else ``(False, message)`` for the first failure.
        Every ``Exception`` is converted into a failed verdict; ``BaseException`` subclasses
        that are not ``Exception`` (e.g. ``SystemExit``) are not intercepted here.
    """
    try:
        # 2. structural checks: sample()'s second parameter name and, when a non-string return
        #    annotation is present, that a tuple annotation has exactly two arguments
        try:
            sig = inspect.signature(cls.sample)
            params = list(sig.parameters.values())
            if len(params) < 2 or params[1].name != "num_context_programs":
                return False, ("sample() must have signature sample(self, num_context_programs: "
                               "Optional[int] = 4, **kwargs). Expected second parameter named "
                               "'num_context_programs'.")
            annotation = sig.return_annotation
            # string annotations (e.g. under ``from __future__ import annotations``) are skipped
            if annotation is not inspect.Signature.empty and not isinstance(annotation, str):
                origin = typing.get_origin(annotation)
                args = typing.get_args(annotation)
                if origin is tuple and len(args) != 2:
                    return False, (f"sample() return type must be a 2-tuple "
                                   f"(parent_dict, context_programs_dict), got {annotation}")
        except Exception as e:  # noqa: BLE001
            return False, f"Failed structural checks for EvolvedStrategy: {e}"
        # 3. labels exist (inherited empty defaults are fine), instantiation, rng injection
        # NOTE(review): StrategyBase defines both labels, so for classes that came through
        # load_strategy_from_source (which enforces the subclass relation) these two defaults
        # look unreachable — kept as a safety net for other callers.
        if not hasattr(cls, "DIVERGE_LABEL"):
            cls.DIVERGE_LABEL = ""
        if not hasattr(cls, "REFINE_LABEL"):
            cls.REFINE_LABEL = ""
        try:
            db = cls()
            db.rng = rng
        except Exception as e:  # noqa: BLE001
            return False, f"Failed to initialize strategy: {e}"
        # id -> snapshot of the scores each test genome had before being handed to add()
        originals: dict[str, dict] = {}
        # 4. add() metric preservation (stored AND original object); also the only step that
        #    checks add()'s return value
        try:
            g1 = Genome(content="def test(): return 1", id="test_program_1",
                        scores={"score": 0.5, "combined_score": 0.5}, metadata={"iteration": 0})
            originals[g1.id] = dict(g1.scores)
            returned = db.add(g1, iteration=0)
            if returned != "test_program_1":
                return False, f"add() returned unexpected id: {returned!r}"
            stored = db.get("test_program_1")
            if stored is None:
                return False, "Genome not found in the store after add()"
            for target, op in ((stored.scores, "add()"), (g1.scores, "add() (original object)")):
                err = _verify_scores_preserved(originals[g1.id], target, op, g1.id)
                if err:
                    return False, err
        except Exception as e:  # noqa: BLE001
            return False, f"Failed to add or verify genome: {e}"
        # 5. bulk add + sample contract (10 genomes, 10 rounds, n=4); every added genome must be
        #    retrievable via get() immediately after its add()
        try:
            for i in range(_NUM_PROGRAMS_TO_ADD):
                g = Genome(content=f"def func_{i}(): return {i}", id=f"program_{i}",
                           scores={"score": i / 10.0, "combined_score": i / 10.0},
                           metadata={"iteration": i})
                originals[g.id] = dict(g.scores)
                db.add(g, iteration=i)
                stored = db.get(g.id)
                if stored is None:
                    return False, f"Genome '{g.id}' not found in the store after add()"
                err = _verify_scores_preserved(originals[g.id], stored.scores, "add()", g.id)
                if err:
                    return False, err
            if len(db.genomes) < _NUM_PROGRAMS_TO_ADD:
                return False, (f"Expected at least {_NUM_PROGRAMS_TO_ADD} genomes, "
                               f"found {len(db.genomes)}")
            for sample_iter in range(_NUM_SAMPLE_ITERATIONS):
                # empty phase tag: these are the plain, untagged sampling rounds
                err = _check_sample(db, 4, originals, "", sample_iter)
                if err:
                    return False, err
        except Exception as e:  # noqa: BLE001
            return False, f"Failed to add {_NUM_PROGRAMS_TO_ADD} genomes and sample: {e}"
        # 6. error/zero-score robustness — galapagos adaptation: scores stay float-typed, the
        #    mixed-type junk lives in artifacts/metadata; the strategy must survive zero scores and
        #    non-numeric side data without arithmetic crashes
        try:
            error_genome = Genome(
                content="def error_func(): return 0", id="error_program",
                scores={"combined_score": 0.0, "runs_successfully": 0.0},
                metadata={"iteration": 10, "note": None, "trace": ["mixed", 1, {"k": "v"}]},
                artifacts={"error": ("Stage 1 error: cannot access local variable 'r' where it "
                                     "is not associated with a value"),
                           "text_feedback": "Stage 1 error"})
            originals[error_genome.id] = dict(error_genome.scores)
            db.add(error_genome, iteration=10)
            stored = db.get("error_program")
            if stored is None:
                return False, "Genome with an error artifact not found after add()"
            err = _verify_scores_preserved(originals["error_program"], stored.scores,
                                           "add()", "error_program")
            if err:
                return False, err
            for sample_iter in range(_NUM_ERROR_SAMPLE_ITERATIONS):
                err = _check_sample(db, 3, originals, "when testing error genomes", sample_iter)
                if err:
                    return False, err
        except Exception as e:  # noqa: BLE001
            return False, f"Failed to handle genomes with error artifacts: {e}"
        # 6b. string-valued score robustness — the upstream type-checking pressure: reference
        #     tests add programs whose metrics dicts hold actual STRINGS ('error': 'Stage 1 error:
        #     ...'). Genome.scores is a plain dict, so a custom Evaluator CAN place strings in it;
        #     strategies must isinstance-check score values before arithmetic (per the meta prompt)
        try:
            sv = Genome(content="def error_func_2(): return 0", id="error_string_program",
                        scores={"combined_score": 0.0,
                                "error": ("Stage 1 error: cannot access local variable 'r' "
                                          "where it is not associated with a value")},
                        metadata={"iteration": 11})
            originals[sv.id] = dict(sv.scores)
            db.add(sv, iteration=11)
            stored = db.get(sv.id)
            if stored is None:
                return False, "Genome with a string score value not found after add()"
            err = _verify_scores_preserved(originals[sv.id], stored.scores, "add()", sv.id)
            if err:
                return False, err
            # a single sampling round with the string-scored genome in the store
            err = _check_sample(db, 3, originals, "when testing string-valued scores", 0)
            if err:
                return False, err
        except Exception as e:  # noqa: BLE001
            return False, f"Failed to handle genomes with string score values: {e}"
        # 7. score immutability on a many-key genome (stored AND original object)
        # NOTE(review): this genome reuses iteration=11, the same value step 6b passes — confirm
        # whether a distinct iteration (12) was intended.
        try:
            mt = Genome(content="def test_metrics(): return 42", id="metrics_test_program",
                        scores={"combined_score": 0.85, "correlation": 0.92, "success_rate": 1.0,
                                "execution_time": 0.123},
                        metadata={"iteration": 11}, artifacts={"error": "No error"})
            originals[mt.id] = dict(mt.scores)
            db.add(mt, iteration=11)
            stored = db.get(mt.id)
            if stored is None:
                return False, "Score-immutability test genome not found after add()"
            for target, op in ((stored.scores, "add()"), (mt.scores, "add() (original object)")):
                err = _verify_scores_preserved(originals[mt.id], target, op, mt.id)
                if err:
                    return False, err
        except Exception as e:  # noqa: BLE001
            return False, f"Failed to verify score immutability: {e}"
        # 8. migration compatibility: a fresh store must accept plain Genome instances and sample
        #    (guards against strategies that filter by an isinstance check on a custom subclass)
        try:
            db2 = cls()
            db2.rng = rng  # same RNG object as the main store, so draws continue one sequence
            for i in range(_NUM_MIGRATION_PROGRAMS):
                g = Genome(content=f"def migrated_func_{i}(): return {i * 10}",
                           id=f"migrated_program_{i}",
                           scores={"combined_score": i / 5.0, "score": i / 5.0},
                           metadata={"iteration": i})
                originals[g.id] = dict(g.scores)
                db2.add(g, iteration=i)
            if len(db2.genomes) < _NUM_MIGRATION_PROGRAMS:
                return False, (f"Migration test: expected at least {_NUM_MIGRATION_PROGRAMS} "
                               f"genomes after adding plain Genome instances, "
                               f"found {len(db2.genomes)}")
            for sample_iter in range(_NUM_MIGRATION_SAMPLES):
                try:
                    err = _check_sample(db2, 3, originals, "in the migration test", sample_iter)
                except ValueError as e:
                    # a 'No candidates' ValueError gets a targeted diagnosis; any other
                    # ValueError is re-raised into the generic handler below
                    if "No candidates" in str(e):
                        return False, (
                            "Migration test FAILED: sample() raised 'No candidates' when the "
                            f"store contains {len(db2.genomes)} genomes. This typically happens "
                            "when the implementation filters genomes by a custom isinstance "
                            "check, which fails for migrated plain Genome instances. Fix: use "
                            "self.genomes.values() directly without isinstance filtering.")
                    raise
                if err:
                    return False, err
        except Exception as e:  # noqa: BLE001
            return False, f"Migration test failed: {e}"
        # 9. num_context_programs contract re-check with n=3 on the main store (which by now
        #    also holds the genomes added in steps 6, 6b and 7)
        err = _check_sample(db, 3, originals, "in the n=3 re-check", 0)
        if err:
            return False, err
        return True, ""
    except Exception as e:  # noqa: BLE001 — the validator must never raise
        return False, f"Unexpected error: {e}\n{traceback.format_exc()}"