SkyDiscover/evox

EvoX

Co-evolves the search strategy with the solutions: the parent/context selection policy is itself LLM-written code, scored by windowed improvement and hot-swapped on stagnation.

Test-time searchApache-2.0

"""EvoX Population component — the host of the ACTIVE evolved search strategy. One file per component (see scaffold.py). Faithful port of the database half of SkyDiscover's ``CoEvolutionController``: the candidate store IS the current ``EvolvedStrategy`` instance (the ``EvolvedProgramDatabase`` mirror), and this Population owns the lifecycle around it — the eval-failure admission gate, the always-run best-tracking wrapper (``_wrap_add_method``), the execution trace consumed by the φ descriptor, the hot-swap with full migration (``_switch_to_new_search_algorithm`` / ``_migrate_to_db``), the runtime fallback (``_restore_fallback_database``), and the ``get_statistics`` port (:meth:`statistics`). Admission mirrors the upstream gate exactly as in the adaevolve port: children whose scores mark them invalid (``validity`` exactly 0 or -1, or — when no ``validity`` key exists — the two ``SubprocessEvaluator`` hard-failure shapes: ``combined_score == 0.0`` with ``text_feedback`` starting ``"evaluator error:"`` or ``"timeout after"``) are rejected outright and never reach the strategy, exactly like upstream children that error out and are retried instead of added. The bootstrap add (the task seed into an empty store) is exempt. ``add`` always stamps ``metadata["admitted"]`` (and ``metadata["eval_failed"]`` for gated children). Determinism: the store draws no randomness of its own; the active strategy's ``rng`` is the policy's seeded ``rng``, injected via :meth:`inject_rng` / :meth:`swap_strategy`. """ from __future__ import annotations import random import statistics as _stats from collections import Counter from ...components.population import Population from ...records import Genome from .strategy import StrategyBase, load_strategy_from_source class EvoXPopulation(Population): """Hosts the active :class:`~galapagos.scaffolds.evox.strategy.StrategyBase` instance; the strategy's ``add``/``sample`` are the LLM-written search policy, this class is the harness.""" def __init__(self, seed_source: str, rng: random.Random | None = None, statistics_k: int = 20, improvement_threshold: float = 0.01): self.seed_source = seed_source self.statistics_k = int(statistics_k) self.improvement_threshold = float(improvement_threshold) self._rng = rng or random.Random(0) strategy_class = load_strategy_from_source(seed_source) self._strategy: StrategyBase = strategy_class() self._strategy.rng = self._rng self._strategy_source = seed_source self._fallback_strategy: StrategyBase | None = None self._fallback_source: str | None = None self._trace: list[dict] = [] # execution trace (φ input); carried across swaps self.runtime_fallbacks = 0 # failed evolutions caused by runtime errors self.last_runtime_error: str | None = None # ---- strategy access ------------------------------------------------------------------ @property def active_strategy(self) -> StrategyBase: return self._strategy @property def strategy_source(self) -> str: return self._strategy_source def inject_rng(self, rng: random.Random) -> None: """Route the active strategy's randomness through the policy's seeded rng.""" self._rng = rng self._strategy.rng = rng def assign_labels(self, diverge_label: str, refine_label: str) -> None: """``_assign_labels_to_db``: stamp the operator texts on the active strategy.""" self._strategy.DIVERGE_LABEL = diverge_label self._strategy.REFINE_LABEL = refine_label # ---- admission -------------------------------------------------------------------------- @staticmethod def _is_eval_failure(genome: Genome) -> bool: """The upstream eval-failure gate (``validity in (0, -1)`` OR a zero ``combined_score`` with an evaluator error): such children are retried upstream, never added. The upstream OR-clause (zero score AND an ``error`` key) applies regardless of validity, so a custom Evaluator emitting ``{combined_score: 0, validity: 1}`` with an error artifact is rejected exactly like upstream. Ordinary text feedback is NOT gated — only the two ``SubprocessEvaluator`` hard-failure shapes.""" # upstream: combined_score == 0 with an 'error' key is a failure even when validity is # truthy or absent if genome.fitness == 0.0 and "error" in genome.artifacts: return True validity = genome.scores.get("validity") if validity is not None: try: return float(validity) in (0.0, -1.0) except (TypeError, ValueError): return True feedback = genome.artifacts.get("text_feedback") return (genome.fitness == 0.0 and isinstance(feedback, str) and (feedback.startswith("evaluator error:") or feedback.startswith("timeout after"))) def add(self, genome: Genome) -> bool: # strip inherited verdict flags (Genome.child copies the parent's metadata wholesale; # parent_info/context_ids/iteration are RE-stamped by the proposer each generation) for stale in ("admitted", "eval_failed"): genome.metadata.pop(stale, None) bootstrap = len(self._strategy.genomes) == 0 if not bootstrap and self._is_eval_failure(genome): genome.metadata.update(admitted=False, eval_failed=True) return False iteration = genome.metadata.get("iteration") if not isinstance(iteration, int): iteration = 0 genome.metadata["iteration"] = 0 for attempt in range(3): # broken evolved add(): restore + retry (upstream); the second try: # restore reloads the seed strategy (see restore_fallback) self._strategy.add(genome, iteration=iteration) break except Exception as e: # noqa: BLE001 self.last_runtime_error = str(e) if attempt == 2: raise self.restore_fallback() # _wrap_add_method port: best-tracking ALWAYS runs, even if the LLM's add() forgot it self._strategy._update_best(genome) self._record_trace(genome, iteration) admitted = genome.id in self._strategy.genomes # evolved add() may prune/reject genome.metadata["admitted"] = admitted return admitted def _record_trace(self, genome: Genome, iteration: int) -> None: """One execution-trace entry per added child (φ's ``execution_trace`` source).""" parent_tuple = None if genome.parent_id: info = genome.metadata.get("parent_info") label = info[0] if isinstance(info, (tuple, list)) and info else None parent = self._strategy.genomes.get(genome.parent_id) parent_score = parent.scores.get("combined_score") if parent is not None else None parent_tuple = (label, genome.parent_id, parent_score) context = [] for ctx_id in genome.metadata.get("context_ids") or []: ctx = self._strategy.genomes.get(ctx_id) context.append(("", ctx_id, ctx.scores.get("combined_score") if ctx else None)) self._trace.append({ "iteration": iteration, "program": (genome.id, genome.scores.get("combined_score")), "parent": parent_tuple, "context": context or None, }) # ---- queries ------------------------------------------------------------------------------ def query(self, spec: dict | None = None) -> list[Genome]: members = sorted(self.all(), key=lambda g: g.fitness, reverse=True) top = (spec or {}).get("top") return members[:top] if top else members def all(self) -> list[Genome]: return list(self._strategy.genomes.values()) # ---- hot-swap + fallback (the co-evolution database lifecycle) ------------------------------ def swap_strategy(self, new_class: type[StrategyBase], source: str, rng: random.Random, diverge_label: str = "", refine_label: str = "") -> bool: """``_switch_to_new_search_algorithm``: instantiate, inject rng + labels, migrate ALL genomes in iteration order through the new ``add()``, keep the old instance + source as fallback. Any exception → ``False`` (a validation-stage failure; nothing changes).""" try: new = new_class() new.rng = rng new.DIVERGE_LABEL = diverge_label new.REFINE_LABEL = refine_label for genome in sorted(self._strategy.genomes.values(), key=lambda g: g.metadata.get("iteration", 0)): new.add(genome, iteration=genome.metadata.get("iteration", 0)) new._update_best(genome) # wrapped-add semantics during migration too new.get_best() # sets best_id (None -> actual best) except Exception: # noqa: BLE001 — swap failure leaves the active strategy untouched return False self._fallback_strategy = self._strategy self._fallback_source = self._strategy_source self._strategy = new self._strategy_source = source return True def restore_fallback(self) -> bool: """``_restore_fallback_database``: copy genomes that only exist in the broken strategy back into the restore target (per-genome exceptions swallowed), restore it, clear the fallback, count the failed evolution. When NO fallback remains (e.g. the restored strategy broke too), the always-valid seed strategy is reloaded from source instead — a sanctioned galapagos adaptation: upstream marks such iterations as failed and burns budget until the next evolution event, but the galapagos loop cannot fail an iteration mid-``select``/``add``, so reverting to S0 is the honest never-crash mirror.""" broken = self._strategy target, source = self._fallback_strategy, self._fallback_source if target is None: strategy_class = load_strategy_from_source(self.seed_source) target = strategy_class() target.rng = self._rng target.DIVERGE_LABEL = broken.DIVERGE_LABEL target.REFINE_LABEL = broken.REFINE_LABEL source = self.seed_source target_ids = set(target.genomes) for genome_id, genome in broken.genomes.items(): if genome_id in target_ids: continue try: target.add(genome, iteration=genome.metadata.get("iteration", 0)) target._update_best(genome) except Exception: # noqa: BLE001 — preserve as many new genomes as possible continue self._strategy = target self._strategy_source = source or self.seed_source self._fallback_strategy = None self._fallback_source = None self.runtime_fallbacks += 1 return True # ---- φ — the population-state descriptor (base_database.get_statistics port) ---------------- def statistics(self, improvement_threshold: float | None = None, k: int | None = None, num_recent: int = 100) -> dict: """``get_statistics(num_recent_iterations=100, k=20, improvement_threshold=0.01)``: score quartiles + tiers + unique-score count, top-k scores, avg children per parent, iterations-without-improvement, and the recent execution trace with parent/context reuse ratios. Rendering lives in :mod:`.prompt_builder` (``format_population_state``).""" threshold = self.improvement_threshold if improvement_threshold is None else improvement_threshold k = self.statistics_k if k is None else int(k) genomes = self.all() population_size = len(genomes) def cscore(g: Genome): v = g.scores.get("combined_score") return float(v) if isinstance(v, (int, float)) else None def iter_of(g: Genome) -> int: it = g.metadata.get("iteration", 0) return it if isinstance(it, int) else 0 scores = [s for s in (cscore(g) for g in genomes) if s is not None] last_iter = max((iter_of(g) for g in genomes), default=0) if scores: scores_sorted = sorted(scores, reverse=True) n = len(scores) if n >= 4: q25, q50, q75 = _stats.quantiles(scores, n=4) else: q25 = q50 = q75 = _stats.median(scores) pct_top = sum(1 for s in scores if s >= q75) / n * 100 pct_upper = sum(1 for s in scores if q50 <= s < q75) / n * 100 pct_lower = sum(1 for s in scores if q25 <= s < q50) / n * 100 pct_bottom = sum(1 for s in scores if s < q25) / n * 100 score_stats = { "best": scores_sorted[0], "q75": q75, "q50": q50, "q25": q25, "worst": scores_sorted[-1], "score_tiers": { "top": {"threshold": f"score >= {q75:.4f}", "pct_programs": pct_top}, "upper_mid": {"threshold": f"{q50:.4f} <= score < {q75:.4f}", "pct_programs": pct_upper}, "lower_mid": {"threshold": f"{q25:.4f} <= score < {q50:.4f}", "pct_programs": pct_lower}, "bottom": {"threshold": f"score < {q25:.4f}", "pct_programs": pct_bottom}, }, "unique_scores": len(set(round(s, 4) for s in scores)), } top_scores = scores_sorted[:k] else: score_stats = {"best": None, "q75": None, "q50": None, "q25": None, "worst": None} top_scores = [] with_parents = [g for g in genomes if g.parent_id is not None] unique_parents = len({g.parent_id for g in with_parents}) avg_per_parent = len(with_parents) / unique_parents if unique_parents > 0 else 0.0 iterations_without_improvement = 0 if scores: best_score = max(scores) near_best = [iter_of(g) for g in genomes if cscore(g) is not None and cscore(g) >= best_score - threshold] if near_best: iterations_without_improvement = last_iter - min(near_best) recent = self._trace[-num_recent:] most_parent_ratio, most_parent_score = 0.0, None most_context_ratio, most_context_score = 0.0, None if recent: with_parent = [e for e in recent if e.get("parent")] if with_parent: counts = Counter(e["parent"][1] for e in with_parent) top_parent_id, count = counts.most_common(1)[0] most_parent_ratio = count / len(with_parent) top_parent = self._strategy.genomes.get(top_parent_id) if top_parent is not None: most_parent_score = top_parent.scores.get("combined_score") context_counts: Counter = Counter() programs_with_context = 0 for e in recent: ctx = e.get("context") or [] if ctx: programs_with_context += 1 for _label, ctx_id, _score in ctx: context_counts[ctx_id] += 1 if context_counts and programs_with_context > 0: top_ctx_id, count = context_counts.most_common(1)[0] most_context_ratio = count / programs_with_context top_ctx = self._strategy.genomes.get(top_ctx_id) if top_ctx is not None: most_context_score = top_ctx.scores.get("combined_score") recent_stats = { "num_recent_iterations": len(recent), "execution_trace": list(recent), "score_trajectory": [e["program"][1] for e in recent], "parent_scores": [e["parent"][2] if e.get("parent") else None for e in recent], "iterations_without_improvement": iterations_without_improvement, "improvement_threshold": threshold, "most_reused_parent_ratio": most_parent_ratio, "most_reused_parent_score": most_parent_score, "most_reused_context_ratio": most_context_ratio, "most_reused_context_score": most_context_score, } if recent else {} return { "population_size": population_size, "solution_score_summary": score_stats, "avg_solutions_per_parent": avg_per_parent, "top_solution_scores": top_scores, "recent_solution_stats": recent_stats, }