SkyDiscover/evox

EvoX

Co-evolves the search strategy with the solutions: the parent/context selection policy is itself LLM-written code, scored by windowed improvement and hot-swapped on stagnation.

Test-time search · Apache-2.0

"""EvoX strategy infrastructure — the executable-strategy contract the meta-LLM codes against.

NOT one of the six components: this module is the galapagos mirror of SkyDiscover's
``search/base_database.py`` (the surface generated strategies program against) plus
``search/evox/database/search_strategy_evaluator.py`` (the ``Valid(·)`` test).

Three exports:

* :class:`StrategyBase` — the ``ProgramDatabase`` equivalent. A search strategy is a Python
  class ``EvolvedStrategy(StrategyBase)`` whose ``add()``/``sample()`` methods ARE the selection
  policy. It exposes ``self.genomes`` (the candidate store, the mirror of ``self.programs``),
  ``self.rng`` (host-injected seeded RNG — generated code must route ALL randomness through
  it), ``DIVERGE_LABEL``/``REFINE_LABEL`` (the variation-operator texts assigned by the
  scaffold) and the ``get``/``get_best``/``_update_best`` helpers documented in the meta system
  prompt.
* :func:`load_strategy_from_source` — ``load_database_from_file`` mirror: exec the source in a
  fresh module namespace and return the class named exactly ``EvolvedStrategy``.
* :func:`validate_strategy` — faithful port of ``search_strategy_evaluator.evaluate`` (full
  mode): structural checks, add() metric preservation (tolerance 1e-10, stored AND original
  object), bulk add + sample contract (exactly-one-parent, total context
  <= min(requested, store)), the error-string/mixed-type robustness tests (mixed-type junk in
  ``artifacts``/``metadata`` AND one round with an actual string value directly in
  ``Genome.scores`` — the dict is not type-enforced, so the upstream type-checking pressure is
  preserved), base-``Genome`` migration compatibility, and the ``num_context_programs``
  re-check with n=3. Returns ``(ok, error, cls)`` with the already-loaded class on success, so
  the deployed object is the validated object (no second exec).

Deterministic: all strategy randomness flows through the caller-supplied ``rng``.
"""

from __future__ import annotations

import inspect
import itertools
import random
import traceback
import types
import typing

from ...records import Genome

# Monotonic counter giving every exec'd candidate module a unique name.
_module_ids = itertools.count(1)


class StrategyBase:
    """The base class evolved strategies inherit from (the ``ProgramDatabase`` mirror).

    Generated code may add helper state in ``__init__`` (which must call
    ``super().__init__()``) and must implement ``add``/``sample``. The host injects
    ``self.rng`` after construction.
    """

    # variation-operator label texts; the scaffold assigns the real (possibly generated) texts
    DIVERGE_LABEL: str = ""
    REFINE_LABEL: str = ""

    def __init__(self) -> None:
        self.genomes: dict[str, Genome] = {}  # the candidate store (``self.programs``)
        self.rng: random.Random = random.Random(0)  # host-injected; ALL randomness through it
        self.best_id: str | None = None  # ``best_program_id``

    # ---- helpers documented in the meta system prompt --------------------------------------

    def get(self, genome_id: str) -> Genome | None:
        """Get a Genome by its id (``ProgramDatabase.get``); ``None`` when it is not stored."""
        return self.genomes.get(genome_id)

    def get_best(self) -> Genome | None:
        """Best genome by ``combined_score`` (``get_best_program``).

        Returns the tracked ``best_id`` genome when it is still stored; otherwise recomputes
        the maximum of ``Genome.fitness`` over the store and caches its id. Returns ``None``
        for an empty store.
        """
        if self.best_id is not None and self.best_id in self.genomes:
            return self.genomes[self.best_id]
        if not self.genomes:
            return None
        best = max(self.genomes.values(), key=lambda g: g.fitness)
        self.best_id = best.id
        return best

    def _update_best(self, genome: Genome) -> None:
        """``_update_best_program``: promote ``genome`` to best when strictly fitter."""
        best = self.get_best()
        if best is None or genome.fitness > best.fitness:
            self.best_id = genome.id

    # ---- the strategy surface (implemented by EvolvedStrategy) ------------------------------

    def add(self, genome: Genome, iteration: int | None = None, **kwargs) -> str:
        """How genomes enter and remain in the population.

        Must return ``genome.id``.
        """
        raise NotImplementedError

    def sample(self, num_context_programs: int | None = 4, **kwargs):
        """Pick ``({label: parent}, {label: [context genomes]})`` for the next generation."""
        raise NotImplementedError


def load_strategy_from_source(source: str) -> type[StrategyBase]:
    """Exec ``source`` in a fresh module namespace and return its ``EvolvedStrategy`` class
    (the ``load_database_from_file`` mirror).

    The source must import only from galapagos and the Python standard library; it carries
    its own imports.

    Raises:
        ValueError: the source defines no class named exactly ``EvolvedStrategy``, or that
            class does not inherit from :class:`StrategyBase`.
        Exception: anything ``compile``/``exec`` of the source raises propagates unchanged
            (including ``SystemExit`` from a module-level ``sys.exit()``).
    """
    module = types.ModuleType(f"_evox_strategy_{next(_module_ids)}")
    module.__file__ = "<evox-strategy>"
    # noqa: S102 — in-process exec of LLM-written code; Valid(·) is a behavioral test suite, NOT
    # a sandbox: module-level code runs with full interpreter access. Each exec also runs the
    # module level again, so validated classes must be deployed directly (see validate_strategy).
    exec(compile(source, "<evox-strategy>", "exec"), module.__dict__)  # noqa: S102
    cls = module.__dict__.get("EvolvedStrategy")
    if cls is None or not inspect.isclass(cls):
        raise ValueError("source defines no class named exactly 'EvolvedStrategy'")
    if not issubclass(cls, StrategyBase):
        raise ValueError("EvolvedStrategy must inherit from StrategyBase")
    return cls


# -------------------------------------------------------------------------------------------
# Valid(·) — port of search_strategy_evaluator.evaluate (full mode counts: 10/10/5/5/3)
# -------------------------------------------------------------------------------------------

_NUM_PROGRAMS_TO_ADD = 10
_NUM_SAMPLE_ITERATIONS = 10
_NUM_ERROR_SAMPLE_ITERATIONS = 5
_NUM_MIGRATION_PROGRAMS = 5
_NUM_MIGRATION_SAMPLES = 3


def _verify_scores_preserved(original: dict, stored: dict, operation: str,
                             genome_id: str) -> str:
    """Every original score key/value must survive unchanged (numeric tolerance 1e-10).

    Numeric pairs are compared as floats with the tolerance; everything else (e.g. a string
    score value) is compared with ``!=``. Extra keys in ``stored`` are allowed.

    Returns an error message, or ``""`` when OK (``_verify_metrics_preserved`` port).
    """
    for key, original_value in original.items():
        if key not in stored:
            return (f"Score '{key}' was deleted from genome '{genome_id}' during {operation}. "
                    f"Original scores: {sorted(original)}, stored scores: {sorted(stored)}")
        stored_value = stored[key]
        if isinstance(original_value, (int, float)) and isinstance(stored_value, (int, float)):
            if abs(float(original_value) - float(stored_value)) > 1e-10:
                return (f"Score '{key}' was modified in genome '{genome_id}' during {operation}: "
                        f"original={original_value}, stored={stored_value}. "
                        f"Score values must remain unchanged.")
        elif original_value != stored_value:
            return (f"Score '{key}' was modified in genome '{genome_id}' during {operation}: "
                    f"original={original_value!r}, stored={stored_value!r}. "
                    f"Score values must remain unchanged.")
    return ""


def _check_sample(db: StrategyBase, requested: int, originals: dict[str, dict], phase: str,
                  sample_iter: int) -> str:
    """One ``sample()`` round against the full contract; ``""`` when OK.

    Checks, in order: the result is a 2-tuple; the first element is a dict holding exactly one
    parent ``Genome`` that is present in ``db.genomes``; the second element is a dict of lists
    of ``Genome``; the total context count does not exceed ``min(requested, len(db.genomes))``;
    and the scores of every returned genome (and of its stored counterpart) still match the
    snapshot in ``originals``. ``phase`` and ``sample_iter`` only decorate error messages.
    Exceptions raised by ``db.sample`` propagate to the caller.
    """
    result = db.sample(num_context_programs=requested)
    if not isinstance(result, tuple):
        return f"sample() must return a tuple, got {type(result)} {phase} (iteration {sample_iter})"
    if len(result) != 2:
        return (f"sample() must return a tuple of 2 elements (parent_dict, context_programs_dict), "
                f"got {len(result)} elements {phase} (iteration {sample_iter})")
    parent_dict, context_dict = result

    if not isinstance(parent_dict, dict):
        return (f"sample() first element must be a Dict[str, Genome], got {type(parent_dict)} "
                f"{phase} (iteration {sample_iter})")
    if len(parent_dict) != 1:
        return (f"sample() must return exactly one parent genome in parent_dict, got "
                f"{len(parent_dict)} parents with keys {list(parent_dict)} {phase} "
                f"(iteration {sample_iter})")
    parent = next(iter(parent_dict.values()))
    # isinstance() is already False for None, so no separate None test is needed
    if not isinstance(parent, Genome):
        return f"sample() parent_dict value must be a Genome, got {type(parent)} {phase}"
    if parent.id not in db.genomes:
        return f"Sampled parent (id={parent.id}) not found in the store {phase}"

    if not isinstance(context_dict, dict):
        return (f"sample() second element must be a Dict[str, List[Genome]], got "
                f"{type(context_dict)} {phase} (iteration {sample_iter})")
    all_context: list[Genome] = []
    for label, genome_list in context_dict.items():
        if not isinstance(genome_list, list):
            return (f"sample() context_programs_dict[{label!r}] must be a list, got "
                    f"{type(genome_list)} {phase}")
        for g in genome_list:
            if not isinstance(g, Genome):
                return (f"sample() context_programs_dict[{label!r}] contains a non-Genome object: "
                        f"{type(g)} {phase}")
        all_context.extend(genome_list)

    max_possible = min(requested, len(db.genomes))
    if len(all_context) > max_possible:
        return (f"sample() returned {len(all_context)} total context genomes, which exceeds the "
                f"maximum possible ({max_possible} given {len(db.genomes)} genomes in the store) "
                f"for num_context_programs={requested} {phase}")

    # metric preservation on the returned AND stored objects (parent + every context genome)
    for g, role in [(parent, "parent")] + [(c, "context program") for c in all_context]:
        if g.id not in originals:
            continue  # genomes the test didn't create (none expected)
        err = _verify_scores_preserved(originals[g.id], g.scores,
                                       f"sample() ({role} returned) {phase}", g.id)
        if err:
            return err
        stored = db.get(g.id)
        if stored is None:
            return f"Sampled {role} (id={g.id}) not found in the store via get() {phase}"
        err = _verify_scores_preserved(originals[g.id], stored.scores,
                                       f"sample() ({role} stored) {phase}", g.id)
        if err:
            return err
    return ""


def validate_strategy(
        source: str, rng: random.Random) -> tuple[bool, str, type[StrategyBase] | None]:
    """``Valid(S')``: load + structural checks + behavioral test-suite.

    Returns ``(ok, error, cls)`` — on success ``cls`` is the ALREADY-loaded ``EvolvedStrategy``
    class and callers must deploy exactly that object: re-exec'ing the source via
    ``load_strategy_from_source`` would run the candidate's module level a SECOND time,
    unguarded (validation is the only sanctioned exec of meta-generated code). On failure
    ``cls`` is ``None`` and ``error`` describes the first violated check.

    Never raises for anything the candidate does, including a ``sys.exit()`` call
    (``SystemExit`` is not an ``Exception`` subclass, so it is caught explicitly);
    ``KeyboardInterrupt`` is deliberately left to propagate. Deterministic — the candidate
    strategy's ``rng`` is the one passed in.
    """
    # 1. load (the ONE exec of the candidate source)
    try:
        cls = load_strategy_from_source(source)
    except SystemExit as e:
        # module-level sys.exit()/exit() in generated code must not terminate the host
        return False, (f"Failed to load strategy class: module-level code called "
                       f"sys.exit({e.code!r})"), None
    except Exception as e:  # noqa: BLE001 — any load failure is a validation verdict
        return False, f"Failed to load strategy class: {e}", None

    ok, error = _behavioral_checks(cls, rng)
    return ok, error, (cls if ok else None)


def _behavioral_checks(cls: type[StrategyBase], rng: random.Random) -> tuple[bool, str]:
    """Steps 2-9 of the ``search_strategy_evaluator.evaluate`` port, run against the
    already-loaded class.

    Returns ``(ok, error)``; never raises for candidate failures — ``Exception`` and
    ``SystemExit`` raised by the strategy are both converted to a failed verdict.
    """
    try:
        # 2. structural checks
        try:
            sig = inspect.signature(cls.sample)
            params = list(sig.parameters.values())
            if len(params) < 2 or params[1].name != "num_context_programs":
                return False, ("sample() must have signature sample(self, num_context_programs: "
                               "Optional[int] = 4, **kwargs). Expected second parameter named "
                               "'num_context_programs'.")
            annotation = sig.return_annotation
            # string annotations (e.g. under ``from __future__ import annotations``) are skipped
            if annotation is not inspect.Signature.empty and not isinstance(annotation, str):
                origin = typing.get_origin(annotation)
                args = typing.get_args(annotation)
                if origin is tuple and len(args) != 2:
                    return False, (f"sample() return type must be a 2-tuple "
                                   f"(parent_dict, context_programs_dict), got {annotation}")
        except Exception as e:  # noqa: BLE001
            return False, f"Failed structural checks for EvolvedStrategy: {e}"

        # 3. labels exist (inherited empty defaults are fine), instantiation, rng injection
        if not hasattr(cls, "DIVERGE_LABEL"):
            cls.DIVERGE_LABEL = ""
        if not hasattr(cls, "REFINE_LABEL"):
            cls.REFINE_LABEL = ""
        try:
            db = cls()
            db.rng = rng
        except Exception as e:  # noqa: BLE001
            return False, f"Failed to initialize strategy: {e}"

        # id -> snapshot of the scores each test genome was created with
        originals: dict[str, dict] = {}

        # 4. add() metric preservation (stored AND original object)
        try:
            g1 = Genome(content="def test(): return 1", id="test_program_1",
                        scores={"score": 0.5, "combined_score": 0.5},
                        metadata={"iteration": 0})
            originals[g1.id] = dict(g1.scores)
            returned = db.add(g1, iteration=0)
            if returned != "test_program_1":
                return False, f"add() returned unexpected id: {returned!r}"
            stored = db.get("test_program_1")
            if stored is None:
                return False, "Genome not found in the store after add()"
            for target, op in ((stored.scores, "add()"),
                               (g1.scores, "add() (original object)")):
                err = _verify_scores_preserved(originals[g1.id], target, op, g1.id)
                if err:
                    return False, err
        except Exception as e:  # noqa: BLE001
            return False, f"Failed to add or verify genome: {e}"

        # 5. bulk add + sample contract (10 genomes, 10 rounds, n=4)
        try:
            for i in range(_NUM_PROGRAMS_TO_ADD):
                g = Genome(content=f"def func_{i}(): return {i}", id=f"program_{i}",
                           scores={"score": i / 10.0, "combined_score": i / 10.0},
                           metadata={"iteration": i})
                originals[g.id] = dict(g.scores)
                db.add(g, iteration=i)
                stored = db.get(g.id)
                if stored is None:
                    return False, f"Genome '{g.id}' not found in the store after add()"
                err = _verify_scores_preserved(originals[g.id], stored.scores, "add()", g.id)
                if err:
                    return False, err
            if len(db.genomes) < _NUM_PROGRAMS_TO_ADD:
                return False, (f"Expected at least {_NUM_PROGRAMS_TO_ADD} genomes, "
                               f"found {len(db.genomes)}")
            for sample_iter in range(_NUM_SAMPLE_ITERATIONS):
                err = _check_sample(db, 4, originals, "", sample_iter)
                if err:
                    return False, err
        except Exception as e:  # noqa: BLE001
            return False, f"Failed to add {_NUM_PROGRAMS_TO_ADD} genomes and sample: {e}"

        # 6. error/zero-score robustness — galapagos adaptation: scores stay float-typed, the
        # mixed-type junk lives in artifacts/metadata; the strategy must survive zero scores and
        # non-numeric side data without arithmetic crashes
        try:
            error_genome = Genome(
                content="def error_func(): return 0", id="error_program",
                scores={"combined_score": 0.0, "runs_successfully": 0.0},
                metadata={"iteration": 10, "note": None, "trace": ["mixed", 1, {"k": "v"}]},
                artifacts={"error": ("Stage 1 error: cannot access local variable 'r' where it "
                                     "is not associated with a value"),
                           "text_feedback": "Stage 1 error"})
            originals[error_genome.id] = dict(error_genome.scores)
            db.add(error_genome, iteration=10)
            stored = db.get("error_program")
            if stored is None:
                return False, "Genome with an error artifact not found after add()"
            err = _verify_scores_preserved(originals["error_program"], stored.scores,
                                           "add()", "error_program")
            if err:
                return False, err
            for sample_iter in range(_NUM_ERROR_SAMPLE_ITERATIONS):
                err = _check_sample(db, 3, originals, "when testing error genomes", sample_iter)
                if err:
                    return False, err
        except Exception as e:  # noqa: BLE001
            return False, f"Failed to handle genomes with error artifacts: {e}"

        # 6b. string-valued score robustness — the upstream type-checking pressure: reference
        # tests add programs whose metrics dicts hold actual STRINGS ('error': 'Stage 1 error:
        # ...'). Genome.scores is a plain dict, so a custom Evaluator CAN place strings in it;
        # strategies must isinstance-check score values before arithmetic (per the meta prompt)
        try:
            sv = Genome(content="def error_func_2(): return 0", id="error_string_program",
                        scores={"combined_score": 0.0,
                                "error": ("Stage 1 error: cannot access local variable 'r' "
                                          "where it is not associated with a value")},
                        metadata={"iteration": 11})
            originals[sv.id] = dict(sv.scores)
            db.add(sv, iteration=11)
            stored = db.get(sv.id)
            if stored is None:
                return False, "Genome with a string score value not found after add()"
            err = _verify_scores_preserved(originals[sv.id], stored.scores, "add()", sv.id)
            if err:
                return False, err
            err = _check_sample(db, 3, originals, "when testing string-valued scores", 0)
            if err:
                return False, err
        except Exception as e:  # noqa: BLE001
            return False, f"Failed to handle genomes with string score values: {e}"

        # 7. score immutability on a many-key genome
        try:
            mt = Genome(content="def test_metrics(): return 42", id="metrics_test_program",
                        scores={"combined_score": 0.85, "correlation": 0.92,
                                "success_rate": 1.0, "execution_time": 0.123},
                        metadata={"iteration": 11}, artifacts={"error": "No error"})
            originals[mt.id] = dict(mt.scores)
            db.add(mt, iteration=11)
            stored = db.get(mt.id)
            if stored is None:
                return False, "Score-immutability test genome not found after add()"
            for target, op in ((stored.scores, "add()"),
                               (mt.scores, "add() (original object)")):
                err = _verify_scores_preserved(originals[mt.id], target, op, mt.id)
                if err:
                    return False, err
        except Exception as e:  # noqa: BLE001
            return False, f"Failed to verify score immutability: {e}"

        # 8. migration compatibility: a fresh store must accept plain Genome instances and sample
        # (guards against strategies that filter by an isinstance check on a custom subclass)
        try:
            db2 = cls()
            db2.rng = rng
            for i in range(_NUM_MIGRATION_PROGRAMS):
                g = Genome(content=f"def migrated_func_{i}(): return {i * 10}",
                           id=f"migrated_program_{i}",
                           scores={"combined_score": i / 5.0, "score": i / 5.0},
                           metadata={"iteration": i})
                originals[g.id] = dict(g.scores)
                db2.add(g, iteration=i)
            if len(db2.genomes) < _NUM_MIGRATION_PROGRAMS:
                return False, (f"Migration test: expected at least {_NUM_MIGRATION_PROGRAMS} "
                               f"genomes after adding plain Genome instances, "
                               f"found {len(db2.genomes)}")
            for sample_iter in range(_NUM_MIGRATION_SAMPLES):
                try:
                    err = _check_sample(db2, 3, originals, "in the migration test", sample_iter)
                except ValueError as e:
                    if "No candidates" in str(e):
                        return False, (
                            "Migration test FAILED: sample() raised 'No candidates' when the "
                            f"store contains {len(db2.genomes)} genomes. This typically happens "
                            "when the implementation filters genomes by a custom isinstance "
                            "check, which fails for migrated plain Genome instances. Fix: use "
                            "self.genomes.values() directly without isinstance filtering.")
                    raise  # any other ValueError is reported by the handler below
                if err:
                    return False, err
        except Exception as e:  # noqa: BLE001
            return False, f"Migration test failed: {e}"

        # 9. num_context_programs contract re-check with n=3 on the main store
        err = _check_sample(db, 3, originals, "in the n=3 re-check", 0)
        if err:
            return False, err

        return True, ""
    except SystemExit as e:
        # sys.exit() inside __init__/add()/sample() bypasses every ``except Exception`` above;
        # turn it into a verdict so generated code cannot terminate the host process
        return False, (f"Unexpected error: strategy code called sys.exit({e.code!r})\n"
                       f"{traceback.format_exc()}")
    except Exception as e:  # noqa: BLE001 — the validator must never raise
        return False, f"Unexpected error: {e}\n{traceback.format_exc()}"