Migrate Track 1 dataset layer into renunney

This commit is contained in:
Codex 2026-04-11 06:50:53 -04:00
parent 7ea94aa7fd
commit aefd4e4ccb
7 changed files with 180 additions and 10 deletions

View File

@ -27,6 +27,7 @@ plane and the Track 1 runner/API boundary are now local to `renunney`.
- a local Track 1 simulation kernel,
- a local Track 1 report generator,
- a local Track 1 extinction-model data layer,
- a local Track 1 dataset generator,
- a Makefile for common tasks,
- migration notes for pulling code into this repo in stages.
@ -92,7 +93,8 @@ The current state is split:
- Track 1 simulation kernel: local to `renunney`
- Track 1 report generator: local to `renunney`
- Track 1 extinction-model data layer: local to `renunney`
- Track 1 dataset and fit helpers: still imported
- Track 1 dataset generator: local to `renunney`
- Track 1 fit helper: still imported
from the older `cost_of_substitution` directory through the local
compatibility layer

View File

@ -37,11 +37,12 @@ Operational code still lives in:
- `src/renunney/track1_report.py`
8. Track 1 extinction-model data layer has been migrated locally:
- `src/renunney/track1_extinction.py`
9. Migrate dataset and fit modules next:
- `python/track1_dataset.py`
9. Track 1 dataset generator has been migrated locally:
- `src/renunney/track1_dataset.py`
10. Migrate the fit module next:
- `python/track1_fit.py`
10. Reduce or remove the remaining compatibility-layer imports after those modules are local.
11. Migrate docs and example configs last, after path references are updated.
11. Reduce or remove the remaining compatibility-layer imports after that module is local.
12. Migrate docs and example configs last, after path references are updated.
## Constraint

View File

@ -49,7 +49,8 @@ make status
The Makefile now drives the local orchestration code in `renunney`, while the
Track 1 runner/API boundary, analysis layer, threshold/search layer, and
simulation kernel, report generator, and extinction-model data layer are also
local to `renunney`. The remaining Track 1 dataset/fit helpers are still
imported from the legacy `cost_of_substitution` directory through the
compatibility layer in `src/renunney/legacy.py`. The paper-scale Figure 1
configs used for submission are now local to `renunney/config`.
local to `renunney`, and the dataset generator is now local as well. The
remaining Track 1 fit helper is still imported from the legacy
`cost_of_substitution` directory through the compatibility layer in
`src/renunney/legacy.py`. The paper-scale Figure 1 configs used for submission
are now local to `renunney/config`.

View File

@ -28,6 +28,7 @@ from .track1_analysis import (
sweep_number_of_loci,
)
from .track1_api import Track1RunConfig, config_from_mapping, load_config, run_config, save_payload
from .track1_dataset import GRID_KEYS, generate_extinction_dataset
from .track1_extinction import (
ExtinctionGenerationRow,
ExtinctionRunRow,
@ -113,6 +114,7 @@ __all__ = [
"expected_female_productivity",
"expected_mutations_for_population",
"evaluate_threshold_candidate",
"generate_extinction_dataset",
"ExtinctionGenerationRow",
"ExtinctionRunRow",
"female_fecundity",
@ -121,6 +123,7 @@ __all__ = [
"generation_metrics",
"genotype_fitness",
"generate_report_bundle",
"GRID_KEYS",
"initialize_population",
"is_extinct",
"build_extinction_generation_rows",

View File

@ -16,13 +16,13 @@ from typing import Any, Optional
from .legacy import ensure_legacy_python_path
from .track1_analysis import summarize_tracking, sweep_number_of_loci
from .track1_dataset import generate_extinction_dataset
from .track1_reference import Track1Parameters, simulate_run
from .track1_report import generate_report_bundle
from .track1_threshold import evaluate_threshold_candidate, search_threshold_over_candidates
ensure_legacy_python_path()
from track1_dataset import generate_extinction_dataset
from track1_fit import class_balance, fit_payload_from_jsonl, load_jsonl

View File

@ -0,0 +1,107 @@
"""
track1_dataset.py
Dataset generation for extinction-risk analysis on top of Track 1 simulations.
"""
from __future__ import annotations
from dataclasses import asdict
import itertools
import json
from pathlib import Path
from typing import Any
from .track1_extinction import build_extinction_generation_rows, build_extinction_run_row, save_jsonl
from .track1_reference import Track1Parameters, simulate_run
GRID_KEYS = ("K", "N0", "n", "u", "R", "T", "epochs", "p", "a_max")


def _grid_axes_from_config(params: Track1Parameters, grid: dict[str, list[Any]] | None) -> dict[str, list[Any]]:
    """Build the per-parameter value axes for the treatment grid.

    Each key in ``GRID_KEYS`` starts as a single-value axis taken from
    *params*; entries in *grid* override the corresponding axis.

    Raises:
        ValueError: if *grid* contains an unknown key, or a value that is
            not a non-empty list.
    """
    # Default: every axis is the singleton [value-from-params].
    axes = {name: [getattr(params, name)] for name in GRID_KEYS}
    if grid:
        for key, override in grid.items():
            if key not in GRID_KEYS:
                raise ValueError(f"Unsupported extinction dataset grid key: {key}")
            # Only genuine non-empty lists are accepted as sweep axes.
            if not isinstance(override, list) or len(override) == 0:
                raise ValueError(f"Grid key {key} must map to a non-empty list.")
            axes[key] = override
    return axes
def generate_extinction_dataset(
    params: Track1Parameters,
    runs: int,
    seed_start: int,
    dataset_dir: str | Path,
    grid: dict[str, list[Any]] | None = None,
) -> dict[str, Any]:
    """Run the Track 1 simulation over a parameter grid and write a dataset.

    For every combination of grid axes (the cartesian product over
    ``GRID_KEYS``), this simulates *runs* replicates with sequential seeds
    starting at *seed_start*, then writes generation rows, run rows, a
    treatment table, and a metadata file into *dataset_dir*.

    Returns:
        The metadata mapping that is also persisted as ``metadata.json``.
    """
    out_path = Path(dataset_dir)
    out_path.mkdir(parents=True, exist_ok=True)

    axes = _grid_axes_from_config(params, grid)

    gen_rows: list[Any] = []
    per_run_rows: list[Any] = []
    treatments: list[dict[str, Any]] = []

    # One treatment per point of the cartesian product over the grid axes.
    for idx, point in enumerate(itertools.product(*(axes[key] for key in GRID_KEYS))):
        combo = dict(zip(GRID_KEYS, point))
        treatment_params = Track1Parameters(
            K=int(combo["K"]),
            N0=int(combo["N0"]),
            n=int(combo["n"]),
            u=float(combo["u"]),
            R=float(combo["R"]),
            T=int(combo["T"]),
            epochs=int(combo["epochs"]),
            p=float(combo["p"]),
            a_max=None if combo["a_max"] is None else int(combo["a_max"]),
        )
        treatment_row: dict[str, Any] = {"treatment_index": idx}
        treatment_row.update(asdict(treatment_params))
        treatment_row["M"] = treatment_params.M
        treatment_row["runs"] = runs
        treatments.append(treatment_row)

        # Seeds are globally unique: each treatment owns a contiguous
        # block of `runs` seeds offset from seed_start.
        base_seed = seed_start + idx * runs
        for offset in range(runs):
            run_seed = base_seed + offset
            summaries = simulate_run(treatment_params, seed=run_seed)
            gen_rows.extend(build_extinction_generation_rows(treatment_params, summaries, seed=run_seed))
            per_run_rows.append(build_extinction_run_row(treatment_params, summaries, seed=run_seed))

    gen_path = out_path / "generation_rows.jsonl"
    run_path = out_path / "run_rows.jsonl"
    treatments_path = out_path / "treatments.json"
    save_jsonl(gen_rows, gen_path)
    save_jsonl(per_run_rows, run_path)
    treatments_path.write_text(
        json.dumps(treatments, indent=2, sort_keys=True) + "\n",
        encoding="utf-8",
    )

    metadata: dict[str, Any] = {
        "dataset_dir": str(out_path),
        "generation_rows_path": str(gen_path),
        "run_rows_path": str(run_path),
        "treatments_path": str(treatments_path),
        "treatment_count": len(treatments),
        "run_row_count": len(per_run_rows),
        "generation_row_count": len(gen_rows),
        "runs_per_treatment": runs,
        "seed_start": seed_start,
        "grid": axes,
    }
    (out_path / "metadata.json").write_text(json.dumps(metadata, indent=2, sort_keys=True) + "\n", encoding="utf-8")
    return metadata

View File

@ -0,0 +1,56 @@
import json
import sys
from pathlib import Path
ROOT = Path(__file__).resolve().parents[1]
SRC_DIR = ROOT / "src"
if str(SRC_DIR) not in sys.path:
sys.path.insert(0, str(SRC_DIR))
import renunney.track1_api as api
import renunney.track1_dataset as ds
import renunney.track1_reference as ref
def test_generate_extinction_dataset_writes_expected_files(tmp_path: Path):
    """A 2x2 grid with one run per treatment writes every artifact file."""
    out_dir = tmp_path / "dataset"
    base_params = ref.Track1Parameters(K=500, N0=20, n=1, u=0.001, R=10.0, T=10, epochs=1)
    result = ds.generate_extinction_dataset(
        params=base_params,
        runs=1,
        seed_start=1,
        dataset_dir=out_dir,
        grid={"N0": [20, 500], "u": [0.001, 0.005]},
    )
    # 2 N0 values x 2 u values -> 4 treatments, one run row each.
    assert result["treatment_count"] == 4
    assert result["run_row_count"] == 4
    for path_key in ("generation_rows_path", "run_rows_path", "treatments_path"):
        assert Path(result[path_key]).exists()
    persisted = json.loads((out_dir / "metadata.json").read_text(encoding="utf-8"))
    assert persisted["treatment_count"] == 4
def test_run_config_extinction_dataset_mode(tmp_path: Path):
    """The API-level runner dispatches extinction_dataset mode end to end."""
    out_dir = tmp_path / "dataset"
    cfg = api.Track1RunConfig(
        mode="extinction_dataset",
        K=500,
        N0=20,
        n=1,
        u=0.001,
        R=10.0,
        T=10,
        epochs=1,
        runs=1,
        seed=1,
        dataset_dir=str(out_dir),
        grid={"u": [0.001, 0.005]},
    )
    result = api.run_config(cfg)
    assert result["mode"] == "extinction_dataset"
    assert result["parameters"]["u"] == 0.001
    assert result["parameters"]["M"] == 1.0
    # One grid axis with two values -> two treatments.
    assert result["treatment_count"] == 2
    assert (out_dir / "run_rows.jsonl").exists()
    assert (out_dir / "generation_rows.jsonl").exists()