EvoX
Co-evolves the search strategy with the solutions: the parent/context selection policy is itself LLM-written code, scored by windowed improvement and hot-swapped on stagnation.
"""EvoX Population component — the host of the ACTIVE evolved search strategy.
One file per component (see scaffold.py). Faithful port of the database half of SkyDiscover's
``CoEvolutionController``: the candidate store IS the current ``EvolvedStrategy`` instance (the
``EvolvedProgramDatabase`` mirror), and this Population owns the lifecycle around it — the
eval-failure admission gate, the always-run best-tracking wrapper (``_wrap_add_method``), the
execution trace consumed by the φ descriptor, the hot-swap with full migration
(``_switch_to_new_search_algorithm`` / ``_migrate_to_db``), the runtime fallback
(``_restore_fallback_database``), and the ``get_statistics`` port (:meth:`statistics`).
Admission mirrors the upstream gate exactly as in the adaevolve port: children whose scores mark
them invalid (``validity`` exactly 0 or -1, or — when no ``validity`` key exists — the two
``SubprocessEvaluator`` hard-failure shapes: ``combined_score == 0.0`` with ``text_feedback``
starting ``"evaluator error:"`` or ``"timeout after"``) are rejected outright and never reach the
strategy, exactly like upstream children that error out and are retried instead of added. The
bootstrap add (the task seed into an empty store) is exempt. ``add`` always stamps
``metadata["admitted"]`` (and ``metadata["eval_failed"]`` for gated children).
Determinism: the store draws no randomness of its own; the active strategy's ``rng`` is the
policy's seeded ``rng``, injected via :meth:`inject_rng` / :meth:`swap_strategy`.
"""
from __future__ import annotations
import random
import statistics as _stats
from collections import Counter
from ...components.population import Population
from ...records import Genome
from .strategy import StrategyBase, load_strategy_from_source
class EvoXPopulation(Population):
"""Hosts the active :class:`~galapagos.scaffolds.evox.strategy.StrategyBase` instance; the
strategy's ``add``/``sample`` are the LLM-written search policy, this class is the harness."""
def __init__(self, seed_source: str, rng: random.Random | None = None,
statistics_k: int = 20, improvement_threshold: float = 0.01):
self.seed_source = seed_source
self.statistics_k = int(statistics_k)
self.improvement_threshold = float(improvement_threshold)
self._rng = rng or random.Random(0)
strategy_class = load_strategy_from_source(seed_source)
self._strategy: StrategyBase = strategy_class()
self._strategy.rng = self._rng
self._strategy_source = seed_source
self._fallback_strategy: StrategyBase | None = None
self._fallback_source: str | None = None
self._trace: list[dict] = [] # execution trace (φ input); carried across swaps
self.runtime_fallbacks = 0 # failed evolutions caused by runtime errors
self.last_runtime_error: str | None = None
# ---- strategy access ------------------------------------------------------------------
@property
def active_strategy(self) -> StrategyBase:
return self._strategy
@property
def strategy_source(self) -> str:
return self._strategy_source
def inject_rng(self, rng: random.Random) -> None:
"""Route the active strategy's randomness through the policy's seeded rng."""
self._rng = rng
self._strategy.rng = rng
def assign_labels(self, diverge_label: str, refine_label: str) -> None:
"""``_assign_labels_to_db``: stamp the operator texts on the active strategy."""
self._strategy.DIVERGE_LABEL = diverge_label
self._strategy.REFINE_LABEL = refine_label
# ---- admission --------------------------------------------------------------------------
@staticmethod
def _is_eval_failure(genome: Genome) -> bool:
"""The upstream eval-failure gate (``validity in (0, -1)`` OR a zero ``combined_score``
with an evaluator error): such children are retried upstream, never added. The upstream
OR-clause (zero score AND an ``error`` key) applies regardless of validity, so a custom
Evaluator emitting ``{combined_score: 0, validity: 1}`` with an error artifact is rejected
exactly like upstream. Ordinary text feedback is NOT gated — only the two
``SubprocessEvaluator`` hard-failure shapes."""
# upstream: combined_score == 0 with an 'error' key is a failure even when validity is
# truthy or absent
if genome.fitness == 0.0 and "error" in genome.artifacts:
return True
validity = genome.scores.get("validity")
if validity is not None:
try:
return float(validity) in (0.0, -1.0)
except (TypeError, ValueError):
return True
feedback = genome.artifacts.get("text_feedback")
return (genome.fitness == 0.0 and isinstance(feedback, str)
and (feedback.startswith("evaluator error:")
or feedback.startswith("timeout after")))
def add(self, genome: Genome) -> bool:
# strip inherited verdict flags (Genome.child copies the parent's metadata wholesale;
# parent_info/context_ids/iteration are RE-stamped by the proposer each generation)
for stale in ("admitted", "eval_failed"):
genome.metadata.pop(stale, None)
bootstrap = len(self._strategy.genomes) == 0
if not bootstrap and self._is_eval_failure(genome):
genome.metadata.update(admitted=False, eval_failed=True)
return False
iteration = genome.metadata.get("iteration")
if not isinstance(iteration, int):
iteration = 0
genome.metadata["iteration"] = 0
for attempt in range(3): # broken evolved add(): restore + retry (upstream); the second
try: # restore reloads the seed strategy (see restore_fallback)
self._strategy.add(genome, iteration=iteration)
break
except Exception as e: # noqa: BLE001
self.last_runtime_error = str(e)
if attempt == 2:
raise
self.restore_fallback()
# _wrap_add_method port: best-tracking ALWAYS runs, even if the LLM's add() forgot it
self._strategy._update_best(genome)
self._record_trace(genome, iteration)
admitted = genome.id in self._strategy.genomes # evolved add() may prune/reject
genome.metadata["admitted"] = admitted
return admitted
def _record_trace(self, genome: Genome, iteration: int) -> None:
"""One execution-trace entry per added child (φ's ``execution_trace`` source)."""
parent_tuple = None
if genome.parent_id:
info = genome.metadata.get("parent_info")
label = info[0] if isinstance(info, (tuple, list)) and info else None
parent = self._strategy.genomes.get(genome.parent_id)
parent_score = parent.scores.get("combined_score") if parent is not None else None
parent_tuple = (label, genome.parent_id, parent_score)
context = []
for ctx_id in genome.metadata.get("context_ids") or []:
ctx = self._strategy.genomes.get(ctx_id)
context.append(("", ctx_id, ctx.scores.get("combined_score") if ctx else None))
self._trace.append({
"iteration": iteration,
"program": (genome.id, genome.scores.get("combined_score")),
"parent": parent_tuple,
"context": context or None,
})
# ---- queries ------------------------------------------------------------------------------
def query(self, spec: dict | None = None) -> list[Genome]:
members = sorted(self.all(), key=lambda g: g.fitness, reverse=True)
top = (spec or {}).get("top")
return members[:top] if top else members
def all(self) -> list[Genome]:
return list(self._strategy.genomes.values())
# ---- hot-swap + fallback (the co-evolution database lifecycle) ------------------------------
def swap_strategy(self, new_class: type[StrategyBase], source: str, rng: random.Random,
diverge_label: str = "", refine_label: str = "") -> bool:
"""``_switch_to_new_search_algorithm``: instantiate, inject rng + labels, migrate ALL
genomes in iteration order through the new ``add()``, keep the old instance + source as
fallback. Any exception → ``False`` (a validation-stage failure; nothing changes)."""
try:
new = new_class()
new.rng = rng
new.DIVERGE_LABEL = diverge_label
new.REFINE_LABEL = refine_label
for genome in sorted(self._strategy.genomes.values(),
key=lambda g: g.metadata.get("iteration", 0)):
new.add(genome, iteration=genome.metadata.get("iteration", 0))
new._update_best(genome) # wrapped-add semantics during migration too
new.get_best() # sets best_id (None -> actual best)
except Exception: # noqa: BLE001 — swap failure leaves the active strategy untouched
return False
self._fallback_strategy = self._strategy
self._fallback_source = self._strategy_source
self._strategy = new
self._strategy_source = source
return True
def restore_fallback(self) -> bool:
"""``_restore_fallback_database``: copy genomes that only exist in the broken strategy
back into the restore target (per-genome exceptions swallowed), restore it, clear the
fallback, count the failed evolution.
When NO fallback remains (e.g. the restored strategy broke too), the always-valid seed
strategy is reloaded from source instead — a sanctioned galapagos adaptation: upstream
marks such iterations as failed and burns budget until the next evolution event, but the
galapagos loop cannot fail an iteration mid-``select``/``add``, so reverting to S0 is the
honest never-crash mirror."""
broken = self._strategy
target, source = self._fallback_strategy, self._fallback_source
if target is None:
strategy_class = load_strategy_from_source(self.seed_source)
target = strategy_class()
target.rng = self._rng
target.DIVERGE_LABEL = broken.DIVERGE_LABEL
target.REFINE_LABEL = broken.REFINE_LABEL
source = self.seed_source
target_ids = set(target.genomes)
for genome_id, genome in broken.genomes.items():
if genome_id in target_ids:
continue
try:
target.add(genome, iteration=genome.metadata.get("iteration", 0))
target._update_best(genome)
except Exception: # noqa: BLE001 — preserve as many new genomes as possible
continue
self._strategy = target
self._strategy_source = source or self.seed_source
self._fallback_strategy = None
self._fallback_source = None
self.runtime_fallbacks += 1
return True
# ---- φ — the population-state descriptor (base_database.get_statistics port) ----------------
def statistics(self, improvement_threshold: float | None = None, k: int | None = None,
num_recent: int = 100) -> dict:
"""``get_statistics(num_recent_iterations=100, k=20, improvement_threshold=0.01)``:
score quartiles + tiers + unique-score count, top-k scores, avg children per parent,
iterations-without-improvement, and the recent execution trace with parent/context
reuse ratios. Rendering lives in :mod:`.prompt_builder` (``format_population_state``)."""
threshold = self.improvement_threshold if improvement_threshold is None else improvement_threshold
k = self.statistics_k if k is None else int(k)
genomes = self.all()
population_size = len(genomes)
def cscore(g: Genome):
v = g.scores.get("combined_score")
return float(v) if isinstance(v, (int, float)) else None
def iter_of(g: Genome) -> int:
it = g.metadata.get("iteration", 0)
return it if isinstance(it, int) else 0
scores = [s for s in (cscore(g) for g in genomes) if s is not None]
last_iter = max((iter_of(g) for g in genomes), default=0)
if scores:
scores_sorted = sorted(scores, reverse=True)
n = len(scores)
if n >= 4:
q25, q50, q75 = _stats.quantiles(scores, n=4)
else:
q25 = q50 = q75 = _stats.median(scores)
pct_top = sum(1 for s in scores if s >= q75) / n * 100
pct_upper = sum(1 for s in scores if q50 <= s < q75) / n * 100
pct_lower = sum(1 for s in scores if q25 <= s < q50) / n * 100
pct_bottom = sum(1 for s in scores if s < q25) / n * 100
score_stats = {
"best": scores_sorted[0], "q75": q75, "q50": q50, "q25": q25,
"worst": scores_sorted[-1],
"score_tiers": {
"top": {"threshold": f"score >= {q75:.4f}", "pct_programs": pct_top},
"upper_mid": {"threshold": f"{q50:.4f} <= score < {q75:.4f}",
"pct_programs": pct_upper},
"lower_mid": {"threshold": f"{q25:.4f} <= score < {q50:.4f}",
"pct_programs": pct_lower},
"bottom": {"threshold": f"score < {q25:.4f}", "pct_programs": pct_bottom},
},
"unique_scores": len(set(round(s, 4) for s in scores)),
}
top_scores = scores_sorted[:k]
else:
score_stats = {"best": None, "q75": None, "q50": None, "q25": None, "worst": None}
top_scores = []
with_parents = [g for g in genomes if g.parent_id is not None]
unique_parents = len({g.parent_id for g in with_parents})
avg_per_parent = len(with_parents) / unique_parents if unique_parents > 0 else 0.0
iterations_without_improvement = 0
if scores:
best_score = max(scores)
near_best = [iter_of(g) for g in genomes
if cscore(g) is not None and cscore(g) >= best_score - threshold]
if near_best:
iterations_without_improvement = last_iter - min(near_best)
recent = self._trace[-num_recent:]
most_parent_ratio, most_parent_score = 0.0, None
most_context_ratio, most_context_score = 0.0, None
if recent:
with_parent = [e for e in recent if e.get("parent")]
if with_parent:
counts = Counter(e["parent"][1] for e in with_parent)
top_parent_id, count = counts.most_common(1)[0]
most_parent_ratio = count / len(with_parent)
top_parent = self._strategy.genomes.get(top_parent_id)
if top_parent is not None:
most_parent_score = top_parent.scores.get("combined_score")
context_counts: Counter = Counter()
programs_with_context = 0
for e in recent:
ctx = e.get("context") or []
if ctx:
programs_with_context += 1
for _label, ctx_id, _score in ctx:
context_counts[ctx_id] += 1
if context_counts and programs_with_context > 0:
top_ctx_id, count = context_counts.most_common(1)[0]
most_context_ratio = count / programs_with_context
top_ctx = self._strategy.genomes.get(top_ctx_id)
if top_ctx is not None:
most_context_score = top_ctx.scores.get("combined_score")
recent_stats = {
"num_recent_iterations": len(recent),
"execution_trace": list(recent),
"score_trajectory": [e["program"][1] for e in recent],
"parent_scores": [e["parent"][2] if e.get("parent") else None for e in recent],
"iterations_without_improvement": iterations_without_improvement,
"improvement_threshold": threshold,
"most_reused_parent_ratio": most_parent_ratio,
"most_reused_parent_score": most_parent_score,
"most_reused_context_ratio": most_context_ratio,
"most_reused_context_score": most_context_score,
} if recent else {}
return {
"population_size": population_size,
"solution_score_summary": score_stats,
"avg_solutions_per_parent": avg_per_parent,
"top_solution_scores": top_scores,
"recent_solution_stats": recent_stats,
}