Meta-Harness
A minimal outer loop that delegates selection AND mutation to a skill-steered proposer over an append-only candidate history, returning a (score x cost) Pareto frontier.
"""Meta-Harness Proposer component — ONE model call → k candidates, dispensed one per iteration.
One file per component (see scaffold.py). The reference proposer session writes k candidate
``.py`` files and declares them in ``pending_eval.json``; the outer loop import-checks each and
benchmarks the valid ones. The chat port makes ONE model call per proposal round whose response
carries all k candidates as ``### CANDIDATE <i>: <snake_case_name>`` sections (each: the
<=30-line report, then ONE fenced python block holding the full program). k is enforced socially
by the steering — exactly like the reference, where the outer loop "imposes no k; it evaluates
however many entries pending_eval.json contains" — so parsing is lenient and accepts any count.
**Queue semantics (sanctioned adaptation, also documented in scaffold.py):** the reference
evaluates all k candidates of an iteration before the next proposer session; the galapagos base
loop is strictly one-child-per-iteration. The proposer therefore keeps an internal FIFO of parsed
candidates and dispenses ONE per loop iteration, making a new model call only when the queue is
empty — reference N iterations x k candidates maps to N*k galapagos iterations (the bundled
budget 60 = 20 x 3).
**Interface validation:** the reference import-checks each candidate in a 30 s subprocess
(``from text_classification.agents.<name> import *``). The safe in-process analogue for arbitrary
task programs is ``compile(source, "<candidate>", "exec")`` — syntax-valid candidates pass;
invalid ones are NOT evaluated and are recorded as ``outcome: "failed"`` rows in Memory at
proposal time, with the compile error as their trace (the reference's failed-row behaviour; this
port records the row even when sibling candidates were valid — the reference silently drops the
failed rows of partially-invalid iterations, an information-loss quirk we do not reproduce).
**Failure semantics:** a response from which no candidate survives parsing and validation (headed
sections without a fenced block, programs that fail ``compile``, or a reply with no fenced block
at all) yields zero queued candidates → the child is returned with
``metadata["changed"] = False`` (a NO_DIFF wasted step), mirroring the reference's abandoned
iteration when the proposer exits non-zero or writes no ``pending_eval.json``.
**Free-form degradation:** a reply with no ``### CANDIDATE`` headers but at least one fenced block
yields exactly one parsed candidate — the LAST fenced python block, name ``candidate_1``, empty
report — so even an unstructured reply still exercises the full loop.
Cost is recorded via ``env.state.record_cost`` on every model call. ``Genome.child`` copies the
parent's metadata wholesale, so dispensing strips the bookkeeping/transient keys (``admitted``,
``eval_failed``, ``seed``, ``candidate_name``, ``report``, ``iteration``, ``outcome``) before
stamping the fresh candidate fields (``candidate_name``/``report`` here; ``iteration``/``outcome``
in the scaffold's ``after_step``, once the eval verdict is known).
"""
from __future__ import annotations
import re
from ...components.proposer import Env, Proposer
from ...records import Genome
# Fenced block: opening backticks with an optional ``python``/``py`` tag; group 1 is the body
# up to the next triple backtick (DOTALL lets the body span lines).
_FENCE = re.compile(
    r"```(?:python|py)?\s*\n(.*?)```",
    flags=re.DOTALL,
)

# Candidate header line, e.g. "### CANDIDATE <i>: <snake_case_name>". Deliberately lenient:
# 2-4 hashes, optional index, optional colon; whatever follows on the line is group ``name``.
_HEADER = re.compile(
    r"^[ \t]*#{2,4}\s*CANDIDATE\b[^:\n]*:?[ \t]*(?P<name>[^\n]*)$",
    flags=re.IGNORECASE | re.MULTILINE,
)

# One or more characters that are not ASCII letters, digits or underscores.
_NAME_CLEAN = re.compile(r"[^A-Za-z0-9_]+")

# Metadata keys removed from every child before fresh candidate fields are stamped
# (the child starts from a copy of the parent's metadata).
_STALE_KEYS = (
    "admitted",
    "eval_failed",
    "seed",
    "candidate_name",
    "report",
    "iteration",
    "outcome",
)

# Upper bound on the number of report lines kept per candidate.
_MAX_REPORT_LINES = 30
class MetaHarnessProposer(Proposer):
    """One call → k parsed candidates → an internal FIFO dispensing one child per iteration.

    A model call is made only when the internal queue is empty. Its response is split into
    candidate sections, every candidate is syntax-checked with ``compile`` and the valid ones
    are queued; each ``propose`` call then pops exactly one queued candidate.
    """

    modality = "prompt_call"

    def __init__(self, candidates_per_proposal: int = 3):
        """Create a proposer with an empty candidate queue.

        Args:
            candidates_per_proposal: Requested candidates per model call, coerced to ``int``
                and clamped to at least 1. The parsing and queueing code in this class never
                reads it — any number of parsed candidates is accepted.
        """
        self.candidates_per_proposal = max(1, int(candidates_per_proposal))  # steering-only
        self._queue: list[dict] = []  # parsed {name, report, source} awaiting dispatch (FIFO)

    # ---- parsing ---------------------------------------------------------------------------------
    @staticmethod
    def _clean_name(raw: str, index: int) -> str:
        """Normalise header name text into a lowercase, underscore-separated name.

        Each run of characters outside ``[A-Za-z0-9_]`` becomes one underscore; leading and
        trailing underscores are stripped and the result is lower-cased.

        Args:
            raw: The text captured after the candidate header.
            index: 1-based position of the candidate, used for the fallback name.

        Returns:
            The cleaned name, or ``candidate_<index>`` when nothing is left after cleaning.
        """
        name = _NAME_CLEAN.sub("_", raw).strip("_").lower()
        return name or f"candidate_{index}"

    def _parse(self, text: str) -> list[dict]:
        """Split a model response into candidate dicts.

        With headers present, each candidate is the header name, the report (text before the
        section's first fence, capped at ``_MAX_REPORT_LINES`` lines) and the LAST fenced block
        of its section. Without headers the response degrades to a single candidate built from
        the last fenced block of the whole text.

        Args:
            text: The raw model response.

        Returns:
            A list of ``{"name", "report", "source"}`` dicts in response order. ``source`` is
            ``None`` for a headed section with no fenced block (a parse failure). The list is
            empty when the text has neither headers nor fenced blocks.
        """
        headers = list(_HEADER.finditer(text))
        if not headers:  # free-form degradation: one candidate from the last fence
            bodies = _FENCE.findall(text)
            if not bodies:
                return []
            return [{"name": "candidate_1", "report": "", "source": bodies[-1].strip("\n")}]

        out: list[dict] = []
        for index, header in enumerate(headers, 1):
            # ``index`` is 1-based, so ``headers[index]`` is the NEXT header when one exists;
            # a section runs from the end of its header line to the start of the next header.
            end = headers[index].start() if index < len(headers) else len(text)
            section = text[header.end():end]
            name = self._clean_name(header.group("name"), index)
            fences = list(_FENCE.finditer(section))
            if not fences:
                out.append({"name": name, "report": "", "source": None})
                continue
            report_lines = section[:fences[0].start()].strip().splitlines()
            out.append({
                "name": name,
                "report": "\n".join(report_lines[:_MAX_REPORT_LINES]),
                "source": fences[-1].group(1).strip("\n"),
            })
        return out

    # ---- the propose step --------------------------------------------------------------------------
    @staticmethod
    def _strip_stale(child: Genome) -> None:
        """Remove every ``_STALE_KEYS`` entry from ``child.metadata`` (missing keys are ignored)."""
        for stale in _STALE_KEYS:
            child.metadata.pop(stale, None)

    def _refill(self, prompt, env: Env) -> None:
        """One model call → parse → interface-validate → queue (the proposer session analogue).

        Cost is recorded on ``env.state`` when it is present. Candidates without a program or
        whose program fails ``compile`` are not queued; when ``env.memory`` is present each is
        written as a ``kind="failed"`` row carrying the validation error as its trace.

        Args:
            prompt: Passed unchanged to ``env.model.generate``.
            env: Supplies ``model`` and the optional ``state`` and ``memory``.
        """
        gen = env.model.generate(prompt)
        if env.state is not None:
            env.state.record_cost(gen.cost_usd, gen.prompt_tokens, gen.completion_tokens)
        iteration = env.state.iteration if env.state is not None else 0
        for cand in self._parse(gen.text or ""):
            source, error = cand["source"], ""
            if source is None:
                error = "no fenced python block in the candidate section"
            else:
                try:  # the 30 s import-check analogue, safe for arbitrary task programs
                    # dont_inherit=True: without it compile() applies the future flags of the
                    # calling module (this file enables ``annotations``), so the check would not
                    # match how the candidate compiles as a standalone program.
                    compile(source, "<candidate>", "exec", dont_inherit=True)
                except Exception as exc:  # noqa: BLE001 — SyntaxError/ValueError etc.
                    error = f"{type(exc).__name__}: {exc}"
            if error:  # failed validation: never evaluated, recorded as a failed row NOW
                if env.memory is not None:
                    # cost is the character length of the rejected source (0.0 when missing)
                    env.memory.write("", kind="failed", name=cand["name"], iteration=iteration,
                                     cost=float(len(source or "")),
                                     trace=f"interface validation failed: {error}")
                continue
            self._queue.append(cand)

    def propose(self, prompt, env: Env) -> Genome:
        """Dispense one child genome, refilling the queue with a model call when it is empty.

        Args:
            prompt: Forwarded to ``_refill`` when a model call is needed.
            env: Supplies ``selection.parent`` plus whatever ``_refill`` reads.

        Returns:
            A child built from the next queued candidate, with ``changed``, ``candidate_name``
            and ``report`` set in its metadata. If the queue is still empty after a refill, a
            child repeating the parent's content (or an empty ``Genome`` when there is no
            parent) with ``metadata["changed"] = False``.
        """
        parent = env.selection.parent
        if not self._queue:
            self._refill(prompt, env)
        if not self._queue:  # abandoned proposal round (reference: propose_claude → continue)
            child = parent.child(parent.content) if parent is not None else Genome(content="")
            self._strip_stale(child)
            child.metadata["changed"] = False
            return child

        cand = self._queue.pop(0)  # FIFO: dispense exactly one candidate per loop iteration
        source = cand["source"]
        if parent is None:  # delegated selection (not used by Meta-Harness; safety)
            child = Genome(content=source)
        else:
            child = parent.child(source,
                                 generation=parent.metadata.get("generation", 0) + 1)
        self._strip_stale(child)  # Genome.child copies metadata — strip the bookkeeping
        child.metadata.update(
            # whitespace-only differences from the parent do not count as a change
            changed=(parent is None or source.strip() != parent.content.strip()),
            candidate_name=cand["name"],
            report=cand["report"],
        )
        return child