diff --git a/README.md b/README.md index c9d8f5f..7ca54f6 100644 --- a/README.md +++ b/README.md @@ -27,6 +27,7 @@ plane and the Track 1 runner/API boundary are now local to `renunney`. - a local Track 1 simulation kernel, - a local Track 1 report generator, - a local Track 1 extinction-model data layer, +- a local Track 1 dataset generator, - a Makefile for common tasks, - migration notes for pulling code into this repo in stages. @@ -92,7 +93,8 @@ The current state is split: - Track 1 simulation kernel: local to `renunney` - Track 1 report generator: local to `renunney` - Track 1 extinction-model data layer: local to `renunney` -- Track 1 dataset and fit helpers: still imported +- Track 1 dataset generator: local to `renunney` +- Track 1 fit helper: still imported from the older `cost_of_substitution` directory through the local compatibility layer diff --git a/docs/MIGRATION.md b/docs/MIGRATION.md index e462efc..f291e89 100644 --- a/docs/MIGRATION.md +++ b/docs/MIGRATION.md @@ -37,11 +37,12 @@ Operational code still lives in: - `src/renunney/track1_report.py` 8. Track 1 extinction-model data layer has been migrated locally: - `src/renunney/track1_extinction.py` -9. Migrate dataset and fit modules next: - - `python/track1_dataset.py` +9. Track 1 dataset generator has been migrated locally: + - `src/renunney/track1_dataset.py` +10. Migrate the fit module next: - `python/track1_fit.py` -10. Reduce or remove the remaining compatibility-layer imports after those modules are local. -11. Migrate docs and example configs last, after path references are updated. +11. Reduce or remove the remaining compatibility-layer imports after those modules are local. +12. Migrate docs and example configs last, after path references are updated. ## Constraint diff --git a/docs/WORKFLOW.md b/docs/WORKFLOW.md index 9a680f2..2421c47 100644 --- a/docs/WORKFLOW.md +++ b/docs/WORKFLOW.md @@ -49,7 +49,8 @@ make status The Makefile now drives the local orchestration code in `renunney`, while the Track 1 runner/API boundary, analysis layer, threshold/search layer, and simulation kernel, report generator, and extinction-model data layer are also -local to `renunney`. The remaining Track 1 dataset/fit helpers are still -imported from the legacy `cost_of_substitution` directory through the -compatibility layer in `src/renunney/legacy.py`. The paper-scale Figure 1 -configs used for submission are now local to `renunney/config`. +local to `renunney`, and the dataset generator is now local as well. The +remaining Track 1 fit helper is still imported from the legacy +`cost_of_substitution` directory through the compatibility layer in +`src/renunney/legacy.py`. The paper-scale Figure 1 configs used for submission +are now local to `renunney/config`. diff --git a/src/renunney/__init__.py b/src/renunney/__init__.py index 107a7d5..d43de59 100644 --- a/src/renunney/__init__.py +++ b/src/renunney/__init__.py @@ -28,6 +28,7 @@ from .track1_analysis import ( sweep_number_of_loci, ) from .track1_api import Track1RunConfig, config_from_mapping, load_config, run_config, save_payload +from .track1_dataset import GRID_KEYS, generate_extinction_dataset from .track1_extinction import ( ExtinctionGenerationRow, ExtinctionRunRow, @@ -113,6 +114,7 @@ __all__ = [ "expected_female_productivity", "expected_mutations_for_population", "evaluate_threshold_candidate", + "generate_extinction_dataset", "ExtinctionGenerationRow", "ExtinctionRunRow", "female_fecundity", @@ -121,6 +123,7 @@ __all__ = [ "generation_metrics", "genotype_fitness", "generate_report_bundle", + "GRID_KEYS", "initialize_population", "is_extinct", "build_extinction_generation_rows", diff --git a/src/renunney/track1_api.py b/src/renunney/track1_api.py index 9055577..d6805ff 100644 --- a/src/renunney/track1_api.py +++ b/src/renunney/track1_api.py @@ -16,13 +16,13 @@ from typing import Any, Optional from .legacy import ensure_legacy_python_path from .track1_analysis import summarize_tracking, sweep_number_of_loci +from .track1_dataset import generate_extinction_dataset from .track1_reference import Track1Parameters, simulate_run from .track1_report import generate_report_bundle from .track1_threshold import evaluate_threshold_candidate, search_threshold_over_candidates ensure_legacy_python_path() -from track1_dataset import generate_extinction_dataset from track1_fit import class_balance, fit_payload_from_jsonl, load_jsonl diff --git a/src/renunney/track1_dataset.py b/src/renunney/track1_dataset.py new file mode 100644 index 0000000..fa4544e --- /dev/null +++ b/src/renunney/track1_dataset.py @@ -0,0 +1,107 @@ +""" +track1_dataset.py + +Dataset generation for extinction-risk analysis on top of Track 1 simulations. +""" + +from __future__ import annotations + +from dataclasses import asdict +import itertools +import json +from pathlib import Path +from typing import Any + +from .track1_extinction import build_extinction_generation_rows, build_extinction_run_row, save_jsonl +from .track1_reference import Track1Parameters, simulate_run + + +GRID_KEYS = ("K", "N0", "n", "u", "R", "T", "epochs", "p", "a_max") + + +def _grid_axes_from_config(params: Track1Parameters, grid: dict[str, list[Any]] | None) -> dict[str, list[Any]]: + base = { + "K": [params.K], + "N0": [params.N0], + "n": [params.n], + "u": [params.u], + "R": [params.R], + "T": [params.T], + "epochs": [params.epochs], + "p": [params.p], + "a_max": [params.a_max], + } + if not grid: + return base + for key, values in grid.items(): + if key not in GRID_KEYS: + raise ValueError(f"Unsupported extinction dataset grid key: {key}") + if not isinstance(values, list) or len(values) == 0: + raise ValueError(f"Grid key {key} must map to a non-empty list.") + base[key] = values + return base + + +def generate_extinction_dataset( + params: Track1Parameters, + runs: int, + seed_start: int, + dataset_dir: str | Path, + grid: dict[str, list[Any]] | None = None, +) -> dict[str, Any]: + outdir = Path(dataset_dir) + outdir.mkdir(parents=True, exist_ok=True) + + axes = _grid_axes_from_config(params, grid) + combinations = list(itertools.product(*(axes[key] for key in GRID_KEYS))) + generation_rows = [] + run_rows = [] + treatment_rows = [] + + for treatment_index, values in enumerate(combinations): + combo = dict(zip(GRID_KEYS, values)) + run_params = Track1Parameters( + K=int(combo["K"]), + N0=int(combo["N0"]), + n=int(combo["n"]), + u=float(combo["u"]), + R=float(combo["R"]), + T=int(combo["T"]), + epochs=int(combo["epochs"]), + p=float(combo["p"]), + a_max=None if combo["a_max"] is None else int(combo["a_max"]), + ) + treatment_rows.append( + { + "treatment_index": treatment_index, + **asdict(run_params), + "M": run_params.M, + "runs": runs, + } + ) + for run_offset in range(runs): + seed = seed_start + (treatment_index * runs) + run_offset + summaries = simulate_run(run_params, seed=seed) + generation_rows.extend(build_extinction_generation_rows(run_params, summaries, seed=seed)) + run_rows.append(build_extinction_run_row(run_params, summaries, seed=seed)) + + save_jsonl(generation_rows, outdir / "generation_rows.jsonl") + save_jsonl(run_rows, outdir / "run_rows.jsonl") + (outdir / "treatments.json").write_text( + json.dumps(treatment_rows, indent=2, sort_keys=True) + "\n", + encoding="utf-8", + ) + metadata = { + "dataset_dir": str(outdir), + "generation_rows_path": str(outdir / "generation_rows.jsonl"), + "run_rows_path": str(outdir / "run_rows.jsonl"), + "treatments_path": str(outdir / "treatments.json"), + "treatment_count": len(treatment_rows), + "run_row_count": len(run_rows), + "generation_row_count": len(generation_rows), + "runs_per_treatment": runs, + "seed_start": seed_start, + "grid": axes, + } + (outdir / "metadata.json").write_text(json.dumps(metadata, indent=2, sort_keys=True) + "\n", encoding="utf-8") + return metadata diff --git a/tests/test_track1_dataset.py b/tests/test_track1_dataset.py new file mode 100644 index 0000000..3e97c4e --- /dev/null +++ b/tests/test_track1_dataset.py @@ -0,0 +1,56 @@ +import json +import sys +from pathlib import Path + +ROOT = Path(__file__).resolve().parents[1] +SRC_DIR = ROOT / "src" +if str(SRC_DIR) not in sys.path: + sys.path.insert(0, str(SRC_DIR)) + +import renunney.track1_api as api +import renunney.track1_dataset as ds +import renunney.track1_reference as ref + + +def test_generate_extinction_dataset_writes_expected_files(tmp_path: Path): + params = ref.Track1Parameters(K=500, N0=20, n=1, u=0.001, R=10.0, T=10, epochs=1) + dataset_dir = tmp_path / "dataset" + payload = ds.generate_extinction_dataset( + params=params, + runs=1, + seed_start=1, + dataset_dir=dataset_dir, + grid={"N0": [20, 500], "u": [0.001, 0.005]}, + ) + assert payload["treatment_count"] == 4 + assert payload["run_row_count"] == 4 + assert Path(payload["generation_rows_path"]).exists() + assert Path(payload["run_rows_path"]).exists() + assert Path(payload["treatments_path"]).exists() + metadata = json.loads((dataset_dir / "metadata.json").read_text(encoding="utf-8")) + assert metadata["treatment_count"] == 4 + + +def test_run_config_extinction_dataset_mode(tmp_path: Path): + dataset_dir = tmp_path / "dataset" + config = api.Track1RunConfig( + mode="extinction_dataset", + K=500, + N0=20, + n=1, + u=0.001, + R=10.0, + T=10, + epochs=1, + runs=1, + seed=1, + dataset_dir=str(dataset_dir), + grid={"u": [0.001, 0.005]}, + ) + payload = api.run_config(config) + assert payload["mode"] == "extinction_dataset" + assert payload["parameters"]["u"] == 0.001 + assert payload["parameters"]["M"] == 1.0 + assert payload["treatment_count"] == 2 + assert (dataset_dir / "run_rows.jsonl").exists() + assert (dataset_dir / "generation_rows.jsonl").exists()