From 8a21044d1f6a9bb7c6b82e5db55cd338b2a29eeb Mon Sep 17 00:00:00 2001
From: welsberr <welsberr@gmail.com>
Date: Fri, 20 Mar 2026 15:52:57 -0400
Subject: [PATCH] Report progress for long-running CLI tasks

---
 README.md            |  2 ++
 src/citegeist/cli.py | 56 +++++++++++++++++++++++++++++++++++++-------
 tests/test_cli.py    | 49 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 99 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index 5196ee9..82d73d7 100644
--- a/README.md
+++ b/README.md
@@ -157,6 +157,8 @@ For a fuller option-by-option CLI cookbook, see [examples/cli/README.md](./examp
 
 Broad BibTeX exports skip DOI-only placeholder records such as `Referenced work N` by default. Use `--include-stubs` on `export` or `export-topic` if you want those entries included anyway.
 
+Long-running CLI commands report progress on `stderr` so `stdout` remains clean for JSON, BibTeX, or tabular output.
+
 For live-source development, prefer fixture-backed or cache-backed source clients so resolver and expansion work can be exercised repeatedly without re-hitting upstream APIs on every run.
 
 ## Example Application
diff --git a/src/citegeist/cli.py b/src/citegeist/cli.py
index 4cd3092..a2a66bd 100644
--- a/src/citegeist/cli.py
+++ b/src/citegeist/cli.py
@@ -783,10 +783,23 @@ def _run_extract(input_path: Path, output: str | None) -> int:
     return 0
 
 
+def _print_progress(label: str, index: int, total: int, detail: str | None = None) -> None:
+    """Emit one ``[index/total] label[: detail]`` progress line on stderr, flushed immediately."""
+    suffix = f": {detail}" if detail else ""  # a None or empty detail adds nothing to the line
+    line = f"[{index}/{total}] {label}{suffix}"
+    print(line, file=sys.stderr, flush=True)
+
+
+def _print_phase(message: str) -> None:  # one-off status line for commands that have no per-item loop here
+    print(message, file=sys.stderr, flush=True)  # written to stderr and flushed at once, so stdout carries only command output
+
+
 def _run_resolve(store: BibliographyStore, citation_keys: list[str]) -> int:
     resolver = MetadataResolver()
     exit_code = 0
-    for citation_key in citation_keys:
+    total = len(citation_keys)  # denominator for the "[index/total]" progress prefix
+    for index, citation_key in enumerate(citation_keys, start=1):  # 1-based, so the final line reads [total/total]
+        _print_progress("resolving", index, total, citation_key)  # announced on stderr before the lookup runs
         if not _resolve_one(store, resolver, citation_key):
             exit_code = 1
     return exit_code
@@ -846,7 +859,9 @@ def _run_resolve_stubs(
 
     resolver = MetadataResolver()
     exit_code = 0
-    for candidate in candidates:
+    total = len(candidates)
+    for index, candidate in enumerate(candidates, start=1):
+        _print_progress("resolving candidate", index, total, str(candidate["citation_key"]))
         if not _resolve_one(store, resolver, str(candidate["citation_key"])):
             exit_code = 1
     return exit_code
@@ -1240,7 +1255,9 @@ def _run_expand(
         return 1
 
     all_results = []
-    for citation_key in citation_keys:
+    total = len(citation_keys)
+    for index, citation_key in enumerate(citation_keys, start=1):
+        _print_progress("expanding seed", index, total, citation_key)
         all_results.extend(expand_fn(citation_key))
     print(json.dumps([asdict(result) for result in all_results], indent=2))
     return 0
@@ -1259,6 +1276,7 @@ def _run_expand_topic(
     preview: bool,
 ) -> int:
     expander = TopicExpander()
+    _print_phase(f"Loading topic expansion for {topic_slug}")
     stored_topic = store.get_topic(topic_slug)
     effective_phrase = topic_phrase
     if effective_phrase is None and stored_topic is not None:
@@ -1309,6 +1327,7 @@ def _run_harvest_oai(
     review_status: str,
 ) -> int:
     harvester = OaiPmhHarvester()
+    _print_phase(f"Harvesting OAI-PMH records from {base_url}")
     harvested = harvester.list_records(
         base_url,
         metadata_prefix=metadata_prefix,
@@ -1317,7 +1336,9 @@ def _run_harvest_oai(
         date_until=date_until,
         limit=limit,
     )
-    for result in harvested:
+    total = len(harvested)
+    for index, result in enumerate(harvested, start=1):
+        _print_progress("ingesting harvested record", index, total, result.entry.citation_key)
         store.upsert_entry(
             result.entry,
             raw_bibtex=render_bibtex([result.entry]),
@@ -1332,6 +1353,7 @@ def _run_harvest_oai(
 
 def _run_discover_oai(base_url: str) -> int:
     harvester = OaiPmhHarvester()
+    _print_phase(f"Inspecting OAI-PMH repository {base_url}")
     payload = {
         "identify": harvester.identify(base_url),
         "metadata_formats": [asdict(result) for result in harvester.list_metadata_formats(base_url)],
@@ -1358,6 +1380,7 @@ def _run_bootstrap(
         print("bootstrap requires --seed-bib, --topic, or both", file=sys.stderr)
         return 1
 
+    _print_phase("Running bootstrap")
     seed_bibtex = Path(seed_bib).read_text(encoding="utf-8") if seed_bib else None
     bootstrapper = Bootstrapper()
     results = bootstrapper.bootstrap(
@@ -1380,9 +1403,12 @@ def _run_bootstrap(
 def _run_bootstrap_batch(store: BibliographyStore, input_path: Path) -> int:
     jobs = load_batch_jobs(input_path)
     runner = BatchBootstrapRunner()
+    _print_phase(f"Running bootstrap batch with {len(jobs)} jobs")
     results = runner.run(store, jobs)
     payload = []
-    for job_result in results:
+    total = len(results)
+    for index, job_result in enumerate(results, start=1):
+        _print_progress("completed bootstrap job", index, total, job_result.job_name)
         payload.append(
             {
                 "job_name": job_result.job_name,
@@ -1409,6 +1435,7 @@ def _run_scrape_talkorigins(
     review_status: str,
 ) -> int:
     scraper = TalkOriginsScraper()
+    _print_phase(f"Scraping TalkOrigins example corpus from {base_url}")
     export = scraper.scrape_to_directory(
         base_url=base_url,
         output_dir=output_dir,
@@ -1428,6 +1455,7 @@ def _run_scrape_talkorigins(
 
 def _run_validate_talkorigins(manifest_path: Path) -> int:
     scraper = TalkOriginsScraper()
+    _print_phase(f"Validating TalkOrigins manifest {manifest_path}")  # status goes to stderr; the JSON report below is printed to stdout
     report = scraper.validate_export(manifest_path)
     print(json.dumps(asdict(report), indent=2))
     return 0
@@ -1440,6 +1468,7 @@ def _run_suggest_talkorigins_phrases(
     output: str | None,
 ) -> int:
     scraper = TalkOriginsScraper()
+    _print_phase(f"Generating TalkOrigins topic phrase suggestions from {manifest_path}")
     suggestions = scraper.suggest_topic_phrases(manifest_path, limit=limit, topic_slug=topic_slug)
     payload = json.dumps([asdict(item) for item in suggestions], indent=2)
     if output:
@@ -1461,7 +1490,8 @@ def _run_apply_topic_phrases(store: BibliographyStore, input_path: Path) -> int:
 
     results: list[dict[str, object]] = []
     exit_code = 0
-    for item in items:
+    total = len(items)
+    for index, item in enumerate(items, start=1):
         if not isinstance(item, dict):
             continue
         slug = str(item.get("slug") or "")
@@ -1471,6 +1501,7 @@ def _run_apply_topic_phrases(store: BibliographyStore, input_path: Path) -> int:
         if phrase is not None:
             phrase = str(phrase)
         applied = store.set_topic_expansion_phrase(slug, phrase)
+        _print_progress("applying topic phrase", index, total, slug or "<missing-slug>")
         if not applied:
             exit_code = 1
         results.append(
@@ -1496,7 +1527,8 @@ def _run_stage_topic_phrases(store: BibliographyStore, input_path: Path) -> int:
 
     results: list[dict[str, object]] = []
     exit_code = 0
-    for item in items:
+    total = len(items)
+    for index, item in enumerate(items, start=1):
         if not isinstance(item, dict):
             continue
         slug = str(item.get("slug") or "")
@@ -1514,6 +1546,7 @@ def _run_stage_topic_phrases(store: BibliographyStore, input_path: Path) -> int:
             review_status="pending",
             review_notes=notes,
         )
+        _print_progress("staging topic phrase", index, total, slug or "<missing-slug>")
         if not staged:
             exit_code = 1
         results.append(
@@ -1560,7 +1593,8 @@ def _run_review_topic_phrases(store: BibliographyStore, input_path: Path) -> int
 
     results: list[dict[str, object]] = []
     exit_code = 0
-    for item in items:
+    total = len(items)
+    for index, item in enumerate(items, start=1):
         if not isinstance(item, dict):
             continue
         slug = str(item.get("slug") or "")
@@ -1579,6 +1613,7 @@ def _run_review_topic_phrases(store: BibliographyStore, input_path: Path) -> int
             review_notes=notes,
             applied_phrase=phrase,
         )
+        _print_progress("reviewing topic phrase", index, total, slug or "<missing-slug>")
         if not reviewed:
             exit_code = 1
         results.append(
@@ -1603,6 +1638,7 @@ def _run_duplicates_talkorigins(
     weak_only: bool,
 ) -> int:
     scraper = TalkOriginsScraper()
+    _print_phase(f"Inspecting TalkOrigins duplicate clusters from {manifest_path}")
     clusters = scraper.inspect_duplicate_clusters(
         manifest_path,
         limit=limit,
@@ -1623,6 +1659,7 @@ def _run_ingest_talkorigins(
     dedupe: bool,
 ) -> int:
     scraper = TalkOriginsScraper()
+    _print_phase(f"Ingesting TalkOrigins export from {manifest_path}")
     report = scraper.ingest_export(
         manifest_path,
         store,
@@ -1645,6 +1682,7 @@ def _run_enrich_talkorigins(
     allow_unsafe_matches: bool,
 ) -> int:
     scraper = TalkOriginsScraper()
+    _print_phase(f"Enriching weak TalkOrigins canonicals from {manifest_path}")
     results = scraper.enrich_weak_canonicals(
         manifest_path,
         store,
@@ -1670,6 +1708,7 @@ def _run_review_talkorigins(
     output: str | None,
 ) -> int:
     scraper = TalkOriginsScraper()
+    _print_phase(f"Building TalkOrigins review export from {manifest_path}")
     review = scraper.build_review_export(
         manifest_path,
         store,
@@ -1693,6 +1732,7 @@ def _run_apply_talkorigins_corrections(
     review_status: str,
 ) -> int:
     scraper = TalkOriginsScraper()
+    _print_phase(f"Applying TalkOrigins corrections from {corrections_path}")
     results = scraper.apply_review_corrections(
         manifest_path,
         corrections_path,
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 857c0f8..c514936 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -1,8 +1,10 @@
 from __future__ import annotations
 
+import io
 import json
 import subprocess
 import sys
+from contextlib import redirect_stderr, redirect_stdout
 from pathlib import Path
 from unittest.mock import patch
 
@@ -356,6 +358,53 @@ def test_cli_resolve_stubs_can_enrich_all_misc_entries(tmp_path: Path):
     assert "title" in {item["field_name"] for item in payload["field_conflicts"]}
 
 
+def test_cli_resolve_stubs_reports_progress_on_stderr(tmp_path: Path):  # resolve-stubs: progress on stderr, result rows on stdout
+    bib_path = tmp_path / "input.bib"
+    bib_path.write_text(
+        """
+@misc{stubdoi,
+  title = {Referenced work 6},
+  doi = {10.1200/JCO.2002.04.117},
+  url = {https://doi.org/10.1200/JCO.2002.04.117}
+}
+""",
+        encoding="utf-8",
+    )
+    assert run_cli(tmp_path, "ingest", str(bib_path)).returncode == 0  # the stub entry must be ingested before it can be resolved
+
+    from citegeist.bibtex import BibEntry
+    from citegeist.resolve import Resolution
+
+    database = tmp_path / "library.sqlite3"  # NOTE(review): presumably the database run_cli ingested into; confirm against the helper
+    with patch("citegeist.cli.MetadataResolver.resolve_entry") as mocked_resolve:  # replace the resolver call with a canned Resolution
+        mocked_resolve.return_value = Resolution(
+            entry=BibEntry(
+                entry_type="article",
+                citation_key="resolvedkey",
+                fields={"title": "Resolved Work", "year": "2002", "doi": "10.1200/JCO.2002.04.117"},
+            ),
+            source_type="resolver",
+            source_label="crossref:doi:10.1200/JCO.2002.04.117",
+        )
+        stdout_buffer = io.StringIO()
+        stderr_buffer = io.StringIO()
+        with redirect_stdout(stdout_buffer), redirect_stderr(stderr_buffer):  # capture the two streams separately, calling main() in-process
+            exit_code = main(
+                [
+                    "--db",
+                    str(database),
+                    "resolve-stubs",
+                    "--doi-only",
+                    "--limit",
+                    "10",
+                ]
+            )
+
+    assert exit_code == 0
+    assert "[1/1] resolving candidate: stubdoi" in stderr_buffer.getvalue()  # progress line must appear on stderr
+    assert "stubdoi\tcrossref:doi:10.1200/JCO.2002.04.117" in stdout_buffer.getvalue()  # tab-separated result row must appear on stdout
+
+
 def test_cli_resolve_conflicts_updates_status(tmp_path: Path):
     bib_path = tmp_path / "input.bib"
     bib_path.write_text(