Experiment

Bundles an ExperimentConfig together with dashboard writes and per-arm orchestration.

Attributes

attributeconfig

= config

attributestore

= store

attributetrain_fn

= train_fn or _default_train_fn

attributeinference_factory

= inference_factory

Functions

func__init__(self, config, *, store=None, train_fn=None, benchmark=None, inference_factory=None) -> None

paramself

paramconfigExperimentConfig

paramstoreAny | None

= None

paramtrain_fnTrainFn | None

= None

parambenchmarkBenchmark | None

= None

paraminference_factoryInferenceFactory | None

= None

Returns

None

funcfrom_yaml(cls, path, **kwargs) -> Experiment

paramcls

parampathstr | Path

paramkwargsAny

= {}

Returns

evsys_sdk.experiment.Experiment

funcrun(self) -> ExperimentResult

paramself

Returns

evsys_sdk.experiment.ExperimentResult

func_iter_runs(self) -> list[RunConfig]

Expand sweep matrix / single-run / multi-run to a flat list of primaries.

Each entry is a "group" when `n_repeats > 1`. See `_replicates_for` for the per-primary seeded replicates.

paramself

Returns

list[evsys_sdk.config.RunConfig]

func_replicates_for(self, primary) -> list[tuple[RunConfig, str | None]]

Per-primary seed replicates.

For `n_repeats == 1`: returns `[(primary, None)]` (no group). For `n_repeats > 1`: returns N tuples of `(<RunConfig with name=<primary.name>__s<seed> and seed=<seed>>, primary.name)`. Seeds run `[base_seed, base_seed+1, ...]` when `base_seed` is set, else `[primary.seed, primary.seed+1, ...]`.

paramself

paramprimaryRunConfig

Returns

list[tuple[evsys_sdk.config.RunConfig, str | None]]

func_resolve_benchmarks(self, raw) -> list[tuple[Benchmark, dict]]

Normalize metadata.benchmark to a list of (Benchmark, spec).

Accepts three shapes:

- `None` / empty → `[]` (no eval).
- A single dict (legacy single-benchmark form) → wrapped into a one-element list, with name defaulting to "benchmark" (or the spec's own name if present).
- A `list[dict]` (new multi-benchmark form) → each entry must carry a name and may carry tags and run_every.

`self._benchmark_override` (a test seam) bypasses all of the normalization above and returns a single-entry list.

paramself

paramrawdict | list | None

Returns

list[tuple[evsys_sdk.benchmark.Benchmark, dict]]

func_create_experiment(self, hypothesis, tags, meta) -> str | None

paramself

paramhypothesisstr | None

paramtagslist[str]

parammetadict

Returns

str | None

func_execute_arm(self, experiment_id, run_cfg, benchmarks, meta, *, group_id=None, group_name=None, score_api_models=True) -> ArmResult

paramself

paramexperiment_idstr | None

paramrun_cfgRunConfig

parambenchmarkslist[tuple[Benchmark, dict]]

parammetadict

paramgroup_idstr | None

= None

paramgroup_namestr | None

= None

paramscore_api_modelsbool

= True

Returns

evsys_sdk.experiment.ArmResult

func_train_arm(self, arm, run_cfg) -> ArmResult

paramself

paramarmArmResult

paramrun_cfgRunConfig

Returns

evsys_sdk.experiment.ArmResult

func_forward_step_metrics(self, arm) -> None

Push the arm's local metrics.jsonl rows to the store.

Runner-time logging writes locally; this method batch-forwards those rows to the dashboard so the script doesn't have to call backfill_step_metrics manually after training.

paramself

paramarmArmResult

Returns

None

func_resolve_run_dir(self, arm) -> Path | None

Reconstruct the run output dir the runner wrote into.

paramself

paramarmArmResult

Returns

pathlib.Path | None

func_resolve_inference_factory(self, run_cfg) -> InferenceFactory | None

User-supplied factory wins; otherwise pick a default by backend kind.

Falls back to the registry's get_default_inference_factory so that backend-specific inference modules don't have to be imported here. For example, tinker registers its own default at module load.

paramself

paramrun_cfgRunConfig

Returns

evsys_sdk.experiment.InferenceFactory | None

func_eval_arm(self, arm, run_cfg, benchmarks, meta, *, score_api_models=True) -> ArmResult

Score each post-training benchmark and attach an EvalResult per entry.

`score_api_models=False` skips the closed/API-model (`benchmark.models`) evals. It is used for continual stages after the first: a closed model's weights are fixed across stages, so scoring it once suffices, and only the trained checkpoint, which changes per stage, is re-scored.

Entries flagged with `run_every` are scored in-loop and are skipped here (their scoring happens during training, in the algorithm wrapper — task commit 2). Entries without `run_every` get a single post-training EvalResult appended to `arm.evals`.

After all benchmarks have been scored, the flat back-compat fields (arm.eval_metrics / eval_breakdowns / eval_seconds) mirror the primary eval: the first test-tagged post-training row, else the first post-training row, else nothing.

paramself

paramarmArmResult

paramrun_cfgRunConfig

parambenchmarkslist[tuple[Benchmark, dict]]

parammetadict

paramscore_api_modelsbool

= True

Returns

evsys_sdk.experiment.ArmResult

func_eval_arm_harbor(self, arm, run_cfg, bench, bench_meta, *, api_model=None) -> None

Score one benchmark through harbor's rollout engine and upload the eval rollouts (kind='eval'). Opt-in via benchmark.engine: harbor.

`api_model` (a litellm string) scores a closed / API model instead of the trained checkpoint: it goes through the same rollout path with model_client='litellm' and is recorded as its own per-model eval. `None` → the trained checkpoint is scored.

paramself

paramarmArmResult

paramrun_cfgRunConfig

parambenchBenchmark

parambench_metadict

paramapi_modelstr | None

= None

Returns

None

func_final_checkpoint(arm) -> str | None

The trained sampler checkpoint URI from the arm's artifacts.

paramarmArmResult

Returns

str | None

func_run_continual(self, experiment_id, benchmarks, meta) -> list[ArmResult]

Train the base run once per dataset in continual.datasets, in order, chaining each stage's final weights (with a fresh optimizer) into the next. Each completed stage is scored on every benchmark via `_execute_arm`; a chain stops at the first stage that does not complete.

With n_repeats > 1 the whole chain is replicated once per seed ([base_seed, base_seed+1, ...] or [run.seed, ...]). Stage i across all repeats shares one dashboard group, so variance is aggregated per stage. Within a chain, weights are chained only between that chain's own stages.

paramself

paramexperiment_idstr | None

parambenchmarkslist[tuple[Benchmark, dict]]

parammetadict

Returns

list[evsys_sdk.experiment.ArmResult]

func_final_state_checkpoint(arm) -> str | None

The full training-state path (weights + optimizer) of an arm's final checkpoint. Continual learning loads only the weights from this path into the next stage. Distinct from `_final_checkpoint`, which returns the inference-only sampler path.

paramarmArmResult

Returns

str | None

func_create_group(self, experiment_id, name) -> str | None

paramself

paramexperiment_idstr | None

paramnamestr

Returns

str | None

func_create_run(self, experiment_id, run_cfg, *, group_id=None) -> str | None

paramself

paramexperiment_idstr | None

paramrun_cfgRunConfig

paramgroup_idstr | None

= None

Returns

str | None

func_mark_run_completed(self, run_id, arm) -> None

paramself

paramrun_idstr | None

paramarmArmResult

Returns

None

func_mark_run_failed(self, run_id, error) -> None

paramself

paramrun_idstr | None

paramerrorstr

Returns

None

func_record_eval(self, arm, benchmark, bench_meta, score) -> None

paramself

paramarmArmResult

parambenchmarkBenchmark

parambench_metadict

paramscoreBenchmarkScore

Returns

None

func_finalize_experiment(self, experiment_id, status, best_score, conclusion) -> None

paramself

paramexperiment_idstr | None

paramstatusstr

parambest_scorefloat | None

paramconclusionstr

Returns

None

func_pick_best(self, arms, metric) -> ArmResult | None

paramself

paramarmslist[ArmResult]

parammetricstr

Returns

evsys_sdk.experiment.ArmResult | None

func_build_conclusion(self, arms, best_arm, success_metric) -> str

paramself

paramarmslist[ArmResult]

parambest_armArmResult | None

paramsuccess_metricstr | None

Returns

str

Experiment

Attributes

Functions

On this page