Meta-Harness
A minimal outer loop that delegates selection AND mutation to a skill-steered proposer over an append-only candidate history, returning a (score x cost) Pareto frontier.
"""Meta-Harness Population component — the append-only filesystem-``D`` analogue with a
strict-dominance Pareto frontier.
One file per component (see scaffold.py). Faithful port of the reference's candidate store: the
``agents/`` directory plus the per-candidate ``val.json`` results (``meta_harness.py`` +
``benchmark.py``). The reference deliberately has NO archive policy — ``D`` only grows, nothing
is pruned or evicted; the only structure is the Pareto frontier that ``benchmark.py --frontier``
recomputes from every result on disk after each iteration. This store mirrors both facts:
``add`` is append-only (no capacity, no eviction) and the frontier is recomputed after every add.
Frontier criterion — the exact port of ``compute_pareto_frontier`` (benchmark.py:228-244):
maximize ``combined_score``, minimize cost; sort by ``(-score, cost)`` then sweep keeping every
point whose cost is ``<=`` the running minimum. Dominance is strict on the cost axis only — a
point is dropped exactly when some point with an equal or higher score has strictly lower cost;
points at equal cost are all kept, including exact ties on both axes (and, exactly like the
reference's sweep, an equal-score-but-strictly-costlier point is dropped in favour of the
cheaper one). ``best()`` is the frontier's top entry —
the open mirror of ``pareto[0]["system"]`` (highest score, ties broken by lower cost) — which is
what the run reports as ``state.best``.
Cost axis: the reference measures injected context length in characters
(``memory_context_chars``); the universal galapagos analogue is ``genome_chars`` =
``len(genome.content)``. ``pareto.cost_metric`` may instead name any task metric key (falling
back to ``genome_chars`` for genomes that lack the key).
Admission mirrors the eval-failure gate (the open analogue of a crashed benchmark job): children
whose scores mark them invalid (``validity`` exactly 0 or -1, or — when no ``validity`` key
exists — the two ``SubprocessEvaluator`` hard-failure shapes: ``combined_score == 0.0`` with
``text_feedback`` starting ``"evaluator error:"`` or ``"timeout after"``) are rejected outright
and update *nothing*; the scaffold records them as ``outcome: "failed"`` evolution-summary rows,
exactly like the reference's crashed candidates (score 0 → ``"failed"``). The very FIRST add —
the task seed, Phase 0's baseline — bypasses the gate and is admitted unconditionally: reference
baselines enter ``D`` regardless of score (a crashed baseline benchmark just scores 0 and stays
in ``agents/``), and an empty store would leave nothing to select. ``add`` always stamps
``metadata["admitted"]`` (and ``metadata["eval_failed"]`` for gated children).
Determinism: this store draws no randomness; the frontier is a pure function of the members
(insertion order breaks exact ``(score, cost)`` ties in the sort, deterministically).
"""
from __future__ import annotations

import math

from ...components.population import Population
from ...records import Genome
def display_name(genome: Genome) -> str:
    """Return the label a candidate is reported under.

    A truthy ``metadata["candidate_name"]`` (the proposer-given name) is returned converted
    with ``str``; a genome without one — the task seed — reports as ``"seed"``.
    """
    label = genome.metadata.get("candidate_name")
    if not label:
        return "seed"
    return str(label)
class MetaHarnessPopulation(Population):
    """Append-only candidate store + Pareto frontier over (maximize score, minimize cost).

    Members are only ever appended by ``add`` — nothing is pruned or evicted — and the
    frontier is rebuilt from every member after each successful add. The store draws no
    randomness.

    Args:
        cost_metric: name of the cost objective. ``"genome_chars"`` (the default) measures
            ``len(genome.content)``; any other name is read from ``genome.scores``, falling
            back to ``genome_chars`` when that key holds no usable number.
    """

    def __init__(self, cost_metric: str = "genome_chars"):
        self.cost_metric = str(cost_metric)
        self._members: list[Genome] = []   # every admitted genome, in insertion order
        self._frontier: list[Genome] = []  # result of the most recent frontier recompute

    # ---- objectives ----------------------------------------------------------------------------
    def cost_of(self, genome: Genome) -> float:
        """Objective 2 (minimize).

        ``genome_chars`` → ``len(content)`` (the chars analogue of the reference's
        ``memory_context_chars``); any other name → that score key, falling back to
        ``genome_chars`` when the genome lacks it, when the value is not a real number
        (bools are excluded), or when the value is NaN.
        """
        if self.cost_metric != "genome_chars":
            value = genome.scores.get(self.cost_metric)
            # NaN is rejected here because ``cost <= min_cost`` is always False for NaN, which
            # would silently bar the genome from the frontier no matter how good its score is.
            if (isinstance(value, (int, float)) and not isinstance(value, bool)
                    and not math.isnan(value)):
                return float(value)
        return float(len(genome.content))

    @staticmethod
    def _score_of(genome: Genome) -> float:
        """Objective 1 (maximize): ``genome.fitness``, with ``-inf`` and NaN both read as 0.0.

        NaN gets the same treatment as the ``-inf`` sentinel because NaN compares False against
        everything, which would leave the frontier sort order ill-defined.
        """
        fitness = genome.fitness
        if fitness == float("-inf") or math.isnan(fitness):
            return 0.0
        return fitness

    # ---- admission ------------------------------------------------------------------------------
    @staticmethod
    def _is_eval_failure(genome: Genome) -> bool:
        """The open mirror of a crashed benchmark job.

        With a ``validity`` score present: a failure iff it converts to exactly 0 or -1 (a value
        that cannot be converted to ``float`` also counts as a failure). Without a ``validity``
        key the fallback fires ONLY on the two ``SubprocessEvaluator`` hard-failure shapes —
        fitness 0.0 with ``text_feedback`` starting ``"evaluator error:"`` or
        ``"timeout after"`` — so a legitimate 0.0-score child with ordinary diagnostic feedback
        IS admitted.
        """
        validity = genome.scores.get("validity")
        if validity is not None:
            try:
                return float(validity) in (0.0, -1.0)
            except (TypeError, ValueError):
                return True
        feedback = genome.artifacts.get("text_feedback")
        if genome.fitness != 0.0 or not isinstance(feedback, str):
            return False
        return feedback.startswith(("evaluator error:", "timeout after"))

    def add(self, genome: Genome) -> bool:
        """Try to admit ``genome``; return True when it was stored, False when it was gated.

        Always stamps ``metadata["admitted"]``; a gated genome additionally gets
        ``metadata["eval_failed"] = True`` and leaves the members and the frontier untouched.
        """
        # strip inherited bookkeeping flags (Genome.child copies the parent's metadata wholesale)
        for stale in ("admitted", "eval_failed"):
            genome.metadata.pop(stale, None)
        # The eval gate applies only once the store is non-empty: the very first add (the task
        # seed) is admitted UNCONDITIONALLY — in reference Phase 0 the baselines enter D
        # regardless of score, and an empty store would leave the policy nothing to select.
        if self._members and self._is_eval_failure(genome):
            genome.metadata.update(admitted=False, eval_failed=True)
            return False
        genome.metadata["admitted"] = True
        self._members.append(genome)  # append-only: D only grows, no pruning, no eviction
        self._recompute_frontier()    # the per-iteration `--frontier` recompute, per add
        return True

    # ---- the Pareto frontier (compute_pareto_frontier, ported exactly) ---------------------------
    def _recompute_frontier(self) -> None:
        """Rebuild ``self._frontier`` from every member.

        Members are sorted score-descending, then cost-ascending, then by insertion index; the
        sweep keeps each point whose cost is ``<=`` the lowest cost kept so far.
        """
        points = sorted(
            ((self._score_of(g), self.cost_of(g), i, g) for i, g in enumerate(self._members)),
            key=lambda t: (-t[0], t[1], t[2]),  # accuracy-desc, then cost-asc (index = stable ties)
        )
        pareto: list[Genome] = []
        min_cost = float("inf")
        for _score, cost, _i, genome in points:
            if cost <= min_cost:  # `<=` keeps equal-cost ties, as in the reference
                pareto.append(genome)
                min_cost = cost
        self._frontier = pareto

    def frontier(self) -> list[Genome]:
        """The current Pareto frontier, sorted score-desc (``frontier_val.json["_pareto"]``).

        Returns a fresh list, so callers may mutate it without touching the store.
        """
        return list(self._frontier)

    def best(self) -> Genome | None:
        """``pareto[0]`` — the frontier's highest-score member (ties → lowest cost).

        Returns ``None`` while the frontier is empty.
        """
        return self._frontier[0] if self._frontier else None

    # ---- queries ---------------------------------------------------------------------------------
    def query(self, spec: dict | None = None) -> list[Genome]:
        """Select members according to ``spec``.

        ``{"frontier": True}`` → the Pareto frontier; default → all members sorted by fitness,
        highest first (a NaN fitness sorts last, alongside ``-inf``). ``{"top": n, ...}``
        truncates to the first ``n``; a falsy ``top`` (absent, ``None``, ``0``) means no
        truncation.
        """
        spec = spec or {}
        if spec.get("frontier"):
            members = self.frontier()
        else:
            def rank(genome: Genome) -> float:
                # NaN has no defined order; rank it with the worst possible fitness
                fitness = genome.fitness
                return float("-inf") if math.isnan(fitness) else fitness

            members = sorted(self._members, key=rank, reverse=True)
        top = spec.get("top")
        return members[:top] if top else members

    def all(self) -> list[Genome]:
        """Every admitted member in insertion order, as a fresh list."""
        return list(self._members)