Migrate Track 1 dataset layer into renunney

Codex 2026-04-11 06:50:53 -04:00
parent 7ea94aa7fd
commit aefd4e4ccb
7 changed files with 180 additions and 10 deletions

View File

@@ -27,6 +27,7 @@ plane and the Track 1 runner/API boundary are now local to `renunney`.
 - a local Track 1 simulation kernel,
 - a local Track 1 report generator,
 - a local Track 1 extinction-model data layer,
+- a local Track 1 dataset generator,
 - a Makefile for common tasks,
 - migration notes for pulling code into this repo in stages.
@@ -92,7 +93,8 @@ The current state is split:
 - Track 1 simulation kernel: local to `renunney`
 - Track 1 report generator: local to `renunney`
 - Track 1 extinction-model data layer: local to `renunney`
-- Track 1 dataset and fit helpers: still imported
+- Track 1 dataset generator: local to `renunney`
+- Track 1 fit helper: still imported
   from the older `cost_of_substitution` directory through the local
   compatibility layer

View File

@@ -37,11 +37,12 @@ Operational code still lives in:
    - `src/renunney/track1_report.py`
 8. Track 1 extinction-model data layer has been migrated locally:
    - `src/renunney/track1_extinction.py`
-9. Migrate dataset and fit modules next:
-   - `python/track1_dataset.py`
+9. Track 1 dataset generator has been migrated locally:
+   - `src/renunney/track1_dataset.py`
+10. Migrate the fit module next:
    - `python/track1_fit.py`
-10. Reduce or remove the remaining compatibility-layer imports after those modules are local.
-11. Migrate docs and example configs last, after path references are updated.
+11. Reduce or remove the remaining compatibility-layer imports after those modules are local.
+12. Migrate docs and example configs last, after path references are updated.
 
 ## Constraint

View File

@@ -49,7 +49,8 @@ make status
 The Makefile now drives the local orchestration code in `renunney`, while the
 Track 1 runner/API boundary, analysis layer, threshold/search layer, and
 simulation kernel, report generator, and extinction-model data layer are also
-local to `renunney`. The remaining Track 1 dataset/fit helpers are still
-imported from the legacy `cost_of_substitution` directory through the
-compatibility layer in `src/renunney/legacy.py`. The paper-scale Figure 1
-configs used for submission are now local to `renunney/config`.
+local to `renunney`, and the dataset generator is now local as well. The
+remaining Track 1 fit helper is still imported from the legacy
+`cost_of_substitution` directory through the compatibility layer in
+`src/renunney/legacy.py`. The paper-scale Figure 1 configs used for submission
+are now local to `renunney/config`.
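
For orientation, the compatibility layer itself is not touched by this commit. A minimal sketch of what `src/renunney/legacy.py` presumably does, based on how `ensure_legacy_python_path()` is used in the run-config module below (the real module may differ):

# Hypothetical sketch of src/renunney/legacy.py; the actual implementation is not part of this diff.
import sys
from pathlib import Path

# Assumed location of the legacy checkout; the real relative path may differ.
_LEGACY_PYTHON_DIR = Path(__file__).resolve().parents[2] / "cost_of_substitution" / "python"


def ensure_legacy_python_path() -> None:
    """Make the legacy cost_of_substitution/python modules importable."""
    legacy = str(_LEGACY_PYTHON_DIR)
    if legacy not in sys.path:
        sys.path.insert(0, legacy)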

View File

@@ -28,6 +28,7 @@ from .track1_analysis import (
     sweep_number_of_loci,
 )
 from .track1_api import Track1RunConfig, config_from_mapping, load_config, run_config, save_payload
+from .track1_dataset import GRID_KEYS, generate_extinction_dataset
 from .track1_extinction import (
     ExtinctionGenerationRow,
     ExtinctionRunRow,
@@ -113,6 +114,7 @@ __all__ = [
     "expected_female_productivity",
     "expected_mutations_for_population",
     "evaluate_threshold_candidate",
+    "generate_extinction_dataset",
     "ExtinctionGenerationRow",
     "ExtinctionRunRow",
     "female_fecundity",
@@ -121,6 +123,7 @@ __all__ = [
     "generation_metrics",
     "genotype_fitness",
     "generate_report_bundle",
+    "GRID_KEYS",
     "initialize_population",
     "is_extinct",
     "build_extinction_generation_rows",

View File

@@ -16,13 +16,13 @@ from typing import Any, Optional
 from .legacy import ensure_legacy_python_path
 from .track1_analysis import summarize_tracking, sweep_number_of_loci
+from .track1_dataset import generate_extinction_dataset
 from .track1_reference import Track1Parameters, simulate_run
 from .track1_report import generate_report_bundle
 from .track1_threshold import evaluate_threshold_candidate, search_threshold_over_candidates
 
 ensure_legacy_python_path()
-from track1_dataset import generate_extinction_dataset
 from track1_fit import class_balance, fit_payload_from_jsonl, load_jsonl
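
The `extinction_dataset` dispatch inside `run_config` is outside this hunk. A rough sketch of how that mode presumably routes `Track1RunConfig` fields into the now-local generator; the field names are taken from the tests below, and the actual wiring may differ:

# Hypothetical sketch only; the real run_config dispatch is not shown in this diff.
from renunney.track1_dataset import generate_extinction_dataset
from renunney.track1_reference import Track1Parameters


def _run_extinction_dataset(config) -> dict:
    params = Track1Parameters(
        K=config.K, N0=config.N0, n=config.n, u=config.u,
        R=config.R, T=config.T, epochs=config.epochs,
    )
    return generate_extinction_dataset(
        params=params,
        runs=config.runs,
        seed_start=config.seed,
        dataset_dir=config.dataset_dir,
        grid=config.grid,
    )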

View File

@@ -0,0 +1,107 @@
"""
track1_dataset.py

Dataset generation for extinction-risk analysis on top of Track 1 simulations.
"""

from __future__ import annotations

from dataclasses import asdict
import itertools
import json
from pathlib import Path
from typing import Any

from .track1_extinction import build_extinction_generation_rows, build_extinction_run_row, save_jsonl
from .track1_reference import Track1Parameters, simulate_run

GRID_KEYS = ("K", "N0", "n", "u", "R", "T", "epochs", "p", "a_max")

def _grid_axes_from_config(params: Track1Parameters, grid: dict[str, list[Any]] | None) -> dict[str, list[Any]]:
    # Start from a degenerate grid: one value per axis, taken from the base parameters.
    base = {
        "K": [params.K],
        "N0": [params.N0],
        "n": [params.n],
        "u": [params.u],
        "R": [params.R],
        "T": [params.T],
        "epochs": [params.epochs],
        "p": [params.p],
        "a_max": [params.a_max],
    }
    if not grid:
        return base
    # Requested axes override the defaults; unknown keys and empty lists are rejected.
    for key, values in grid.items():
        if key not in GRID_KEYS:
            raise ValueError(f"Unsupported extinction dataset grid key: {key}")
        if not isinstance(values, list) or len(values) == 0:
            raise ValueError(f"Grid key {key} must map to a non-empty list.")
        base[key] = values
    return base


def generate_extinction_dataset(
    params: Track1Parameters,
    runs: int,
    seed_start: int,
    dataset_dir: str | Path,
    grid: dict[str, list[Any]] | None = None,
) -> dict[str, Any]:
    outdir = Path(dataset_dir)
    outdir.mkdir(parents=True, exist_ok=True)
    # The Cartesian product over all grid axes (in GRID_KEYS order) defines the treatments.
    axes = _grid_axes_from_config(params, grid)
    combinations = list(itertools.product(*(axes[key] for key in GRID_KEYS)))
    generation_rows = []
    run_rows = []
    treatment_rows = []
    for treatment_index, values in enumerate(combinations):
        combo = dict(zip(GRID_KEYS, values))
        run_params = Track1Parameters(
            K=int(combo["K"]),
            N0=int(combo["N0"]),
            n=int(combo["n"]),
            u=float(combo["u"]),
            R=float(combo["R"]),
            T=int(combo["T"]),
            epochs=int(combo["epochs"]),
            p=float(combo["p"]),
            a_max=None if combo["a_max"] is None else int(combo["a_max"]),
        )
        treatment_rows.append(
            {
                "treatment_index": treatment_index,
                **asdict(run_params),
                "M": run_params.M,
                "runs": runs,
            }
        )
        for run_offset in range(runs):
            # Each treatment gets a contiguous block of `runs` deterministic seeds.
            seed = seed_start + (treatment_index * runs) + run_offset
            summaries = simulate_run(run_params, seed=seed)
            generation_rows.extend(build_extinction_generation_rows(run_params, summaries, seed=seed))
            run_rows.append(build_extinction_run_row(run_params, summaries, seed=seed))
    save_jsonl(generation_rows, outdir / "generation_rows.jsonl")
    save_jsonl(run_rows, outdir / "run_rows.jsonl")
    (outdir / "treatments.json").write_text(
        json.dumps(treatment_rows, indent=2, sort_keys=True) + "\n",
        encoding="utf-8",
    )
    metadata = {
        "dataset_dir": str(outdir),
        "generation_rows_path": str(outdir / "generation_rows.jsonl"),
        "run_rows_path": str(outdir / "run_rows.jsonl"),
        "treatments_path": str(outdir / "treatments.json"),
        "treatment_count": len(treatment_rows),
        "run_row_count": len(run_rows),
        "generation_row_count": len(generation_rows),
        "runs_per_treatment": runs,
        "seed_start": seed_start,
        "grid": axes,
    }
    (outdir / "metadata.json").write_text(json.dumps(metadata, indent=2, sort_keys=True) + "\n", encoding="utf-8")
    return metadata
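
A minimal usage sketch of the new module (illustrative values and output path; assumes the package is importable):

from renunney.track1_dataset import generate_extinction_dataset
from renunney.track1_reference import Track1Parameters

# Two grid axes with two values each expand to 2 * 2 = 4 treatments; with runs=3
# the seeds are seed_start + treatment_index * runs + run_offset, i.e. 100..111.
params = Track1Parameters(K=500, N0=20, n=1, u=0.001, R=10.0, T=10, epochs=1)
metadata = generate_extinction_dataset(
    params=params,
    runs=3,
    seed_start=100,
    dataset_dir="out/extinction_dataset",  # illustrative output directory
    grid={"N0": [20, 500], "u": [0.001, 0.005]},
)
print(metadata["treatment_count"])  # 4
print(metadata["run_row_count"])    # 12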

View File

@@ -0,0 +1,56 @@
import json
import sys
from pathlib import Path

# Make the local src/ layout importable without installing the package.
ROOT = Path(__file__).resolve().parents[1]
SRC_DIR = ROOT / "src"
if str(SRC_DIR) not in sys.path:
    sys.path.insert(0, str(SRC_DIR))

import renunney.track1_api as api
import renunney.track1_dataset as ds
import renunney.track1_reference as ref


def test_generate_extinction_dataset_writes_expected_files(tmp_path: Path):
    params = ref.Track1Parameters(K=500, N0=20, n=1, u=0.001, R=10.0, T=10, epochs=1)
    dataset_dir = tmp_path / "dataset"
    payload = ds.generate_extinction_dataset(
        params=params,
        runs=1,
        seed_start=1,
        dataset_dir=dataset_dir,
        grid={"N0": [20, 500], "u": [0.001, 0.005]},
    )
    # Two axes with two values each expand to 2 * 2 = 4 treatments, one run apiece.
    assert payload["treatment_count"] == 4
    assert payload["run_row_count"] == 4
    assert Path(payload["generation_rows_path"]).exists()
    assert Path(payload["run_rows_path"]).exists()
    assert Path(payload["treatments_path"]).exists()
    metadata = json.loads((dataset_dir / "metadata.json").read_text(encoding="utf-8"))
    assert metadata["treatment_count"] == 4


def test_run_config_extinction_dataset_mode(tmp_path: Path):
    dataset_dir = tmp_path / "dataset"
    config = api.Track1RunConfig(
        mode="extinction_dataset",
        K=500,
        N0=20,
        n=1,
        u=0.001,
        R=10.0,
        T=10,
        epochs=1,
        runs=1,
        seed=1,
        dataset_dir=str(dataset_dir),
        grid={"u": [0.001, 0.005]},
    )
    payload = api.run_config(config)
    assert payload["mode"] == "extinction_dataset"
    assert payload["parameters"]["u"] == 0.001
    assert payload["parameters"]["M"] == 1.0
    assert payload["treatment_count"] == 2
    assert (dataset_dir / "run_rows.jsonl").exists()
    assert (dataset_dir / "generation_rows.jsonl").exists()