From 8a21044d1f6a9bb7c6b82e5db55cd338b2a29eeb Mon Sep 17 00:00:00 2001 From: welsberr Date: Fri, 20 Mar 2026 15:52:57 -0400 Subject: [PATCH] Report progress for long-running CLI tasks --- README.md | 2 ++ src/citegeist/cli.py | 56 +++++++++++++++++++++++++++++++++++++------- tests/test_cli.py | 49 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 99 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 5196ee9..82d73d7 100644 --- a/README.md +++ b/README.md @@ -157,6 +157,8 @@ For a fuller option-by-option CLI cookbook, see [examples/cli/README.md](./examp Broad BibTeX exports skip DOI-only placeholder records such as `Referenced work N` by default. Use `--include-stubs` on `export` or `export-topic` if you want those entries included anyway. +Long-running CLI commands report progress on `stderr` so `stdout` remains clean for JSON, BibTeX, or tabular output. + For live-source development, prefer fixture-backed or cache-backed source clients so resolver and expansion work can be exercised repeatedly without re-hitting upstream APIs on every run. ## Example Application diff --git a/src/citegeist/cli.py b/src/citegeist/cli.py index 4cd3092..a2a66bd 100644 --- a/src/citegeist/cli.py +++ b/src/citegeist/cli.py @@ -783,10 +783,23 @@ def _run_extract(input_path: Path, output: str | None) -> int: return 0 +def _print_progress(label: str, index: int, total: int, detail: str | None = None) -> None: + message = f"[{index}/{total}] {label}" + if detail: + message = f"{message}: {detail}" + print(message, file=sys.stderr, flush=True) + + +def _print_phase(message: str) -> None: + print(message, file=sys.stderr, flush=True) + + def _run_resolve(store: BibliographyStore, citation_keys: list[str]) -> int: resolver = MetadataResolver() exit_code = 0 - for citation_key in citation_keys: + total = len(citation_keys) + for index, citation_key in enumerate(citation_keys, start=1): + _print_progress("resolving", index, total, citation_key) if not _resolve_one(store, resolver, citation_key): exit_code = 1 return exit_code @@ -846,7 +859,9 @@ def _run_resolve_stubs( resolver = MetadataResolver() exit_code = 0 - for candidate in candidates: + total = len(candidates) + for index, candidate in enumerate(candidates, start=1): + _print_progress("resolving candidate", index, total, str(candidate["citation_key"])) if not _resolve_one(store, resolver, str(candidate["citation_key"])): exit_code = 1 return exit_code @@ -1240,7 +1255,9 @@ def _run_expand( return 1 all_results = [] - for citation_key in citation_keys: + total = len(citation_keys) + for index, citation_key in enumerate(citation_keys, start=1): + _print_progress("expanding seed", index, total, citation_key) all_results.extend(expand_fn(citation_key)) print(json.dumps([asdict(result) for result in all_results], indent=2)) return 0 @@ -1259,6 +1276,7 @@ def _run_expand_topic( preview: bool, ) -> int: expander = TopicExpander() + _print_phase(f"Loading topic expansion for {topic_slug}") stored_topic = store.get_topic(topic_slug) effective_phrase = topic_phrase if effective_phrase is None and stored_topic is not None: @@ -1309,6 +1327,7 @@ def _run_harvest_oai( review_status: str, ) -> int: harvester = OaiPmhHarvester() + _print_phase(f"Harvesting OAI-PMH records from {base_url}") harvested = harvester.list_records( base_url, metadata_prefix=metadata_prefix, @@ -1317,7 +1336,9 @@ def _run_harvest_oai( date_until=date_until, limit=limit, ) - for result in harvested: + total = len(harvested) + for index, result in enumerate(harvested, start=1): + _print_progress("ingesting harvested record", index, total, result.entry.citation_key) store.upsert_entry( result.entry, raw_bibtex=render_bibtex([result.entry]), @@ -1332,6 +1353,7 @@ def _run_harvest_oai( def _run_discover_oai(base_url: str) -> int: harvester = OaiPmhHarvester() + _print_phase(f"Inspecting OAI-PMH repository {base_url}") payload = { "identify": harvester.identify(base_url), "metadata_formats": [asdict(result) for result in harvester.list_metadata_formats(base_url)], @@ -1358,6 +1380,7 @@ def _run_bootstrap( print("bootstrap requires --seed-bib, --topic, or both", file=sys.stderr) return 1 + _print_phase("Running bootstrap") seed_bibtex = Path(seed_bib).read_text(encoding="utf-8") if seed_bib else None bootstrapper = Bootstrapper() results = bootstrapper.bootstrap( @@ -1380,9 +1403,12 @@ def _run_bootstrap( def _run_bootstrap_batch(store: BibliographyStore, input_path: Path) -> int: jobs = load_batch_jobs(input_path) runner = BatchBootstrapRunner() + _print_phase(f"Running bootstrap batch with {len(jobs)} jobs") results = runner.run(store, jobs) payload = [] - for job_result in results: + total = len(results) + for index, job_result in enumerate(results, start=1): + _print_progress("completed bootstrap job", index, total, job_result.job_name) payload.append( { "job_name": job_result.job_name, @@ -1409,6 +1435,7 @@ def _run_scrape_talkorigins( review_status: str, ) -> int: scraper = TalkOriginsScraper() + _print_phase(f"Scraping TalkOrigins example corpus from {base_url}") export = scraper.scrape_to_directory( base_url=base_url, output_dir=output_dir, @@ -1428,6 +1455,7 @@ def _run_scrape_talkorigins( def _run_validate_talkorigins(manifest_path: Path) -> int: scraper = TalkOriginsScraper() + _print_phase(f"Validating TalkOrigins manifest {manifest_path}") report = scraper.validate_export(manifest_path) print(json.dumps(asdict(report), indent=2)) return 0 @@ -1440,6 +1468,7 @@ def _run_suggest_talkorigins_phrases( output: str | None, ) -> int: scraper = TalkOriginsScraper() + _print_phase(f"Generating TalkOrigins topic phrase suggestions from {manifest_path}") suggestions = scraper.suggest_topic_phrases(manifest_path, limit=limit, topic_slug=topic_slug) payload = json.dumps([asdict(item) for item in suggestions], indent=2) if output: @@ -1461,7 +1490,8 @@ def _run_apply_topic_phrases(store: BibliographyStore, input_path: Path) -> int: results: list[dict[str, object]] = [] exit_code = 0 - for item in items: + total = len(items) + for index, item in enumerate(items, start=1): if not isinstance(item, dict): continue slug = str(item.get("slug") or "") @@ -1471,6 +1501,7 @@ def _run_apply_topic_phrases(store: BibliographyStore, input_path: Path) -> int: if phrase is not None: phrase = str(phrase) applied = store.set_topic_expansion_phrase(slug, phrase) + _print_progress("applying topic phrase", index, total, slug or "") if not applied: exit_code = 1 results.append( @@ -1496,7 +1527,8 @@ def _run_stage_topic_phrases(store: BibliographyStore, input_path: Path) -> int: results: list[dict[str, object]] = [] exit_code = 0 - for item in items: + total = len(items) + for index, item in enumerate(items, start=1): if not isinstance(item, dict): continue slug = str(item.get("slug") or "") @@ -1514,6 +1546,7 @@ def _run_stage_topic_phrases(store: BibliographyStore, input_path: Path) -> int: review_status="pending", review_notes=notes, ) + _print_progress("staging topic phrase", index, total, slug or "") if not staged: exit_code = 1 results.append( @@ -1560,7 +1593,8 @@ def _run_review_topic_phrases(store: BibliographyStore, input_path: Path) -> int results: list[dict[str, object]] = [] exit_code = 0 - for item in items: + total = len(items) + for index, item in enumerate(items, start=1): if not isinstance(item, dict): continue slug = str(item.get("slug") or "") @@ -1579,6 +1613,7 @@ def _run_review_topic_phrases(store: BibliographyStore, input_path: Path) -> int review_notes=notes, applied_phrase=phrase, ) + _print_progress("reviewing topic phrase", index, total, slug or "") if not reviewed: exit_code = 1 results.append( @@ -1603,6 +1638,7 @@ def _run_duplicates_talkorigins( weak_only: bool, ) -> int: scraper = TalkOriginsScraper() + _print_phase(f"Inspecting TalkOrigins duplicate clusters from {manifest_path}") clusters = scraper.inspect_duplicate_clusters( manifest_path, limit=limit, @@ -1623,6 +1659,7 @@ def _run_ingest_talkorigins( dedupe: bool, ) -> int: scraper = TalkOriginsScraper() + _print_phase(f"Ingesting TalkOrigins export from {manifest_path}") report = scraper.ingest_export( manifest_path, store, @@ -1645,6 +1682,7 @@ def _run_enrich_talkorigins( allow_unsafe_matches: bool, ) -> int: scraper = TalkOriginsScraper() + _print_phase(f"Enriching weak TalkOrigins canonicals from {manifest_path}") results = scraper.enrich_weak_canonicals( manifest_path, store, @@ -1670,6 +1708,7 @@ def _run_review_talkorigins( output: str | None, ) -> int: scraper = TalkOriginsScraper() + _print_phase(f"Building TalkOrigins review export from {manifest_path}") review = scraper.build_review_export( manifest_path, store, @@ -1693,6 +1732,7 @@ def _run_apply_talkorigins_corrections( review_status: str, ) -> int: scraper = TalkOriginsScraper() + _print_phase(f"Applying TalkOrigins corrections from {corrections_path}") results = scraper.apply_review_corrections( manifest_path, corrections_path, diff --git a/tests/test_cli.py b/tests/test_cli.py index 857c0f8..c514936 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1,8 +1,10 @@ from __future__ import annotations +import io import json import subprocess import sys +from contextlib import redirect_stderr, redirect_stdout from pathlib import Path from unittest.mock import patch @@ -356,6 +358,53 @@ def test_cli_resolve_stubs_can_enrich_all_misc_entries(tmp_path: Path): assert "title" in {item["field_name"] for item in payload["field_conflicts"]} +def test_cli_resolve_stubs_reports_progress_on_stderr(tmp_path: Path): + bib_path = tmp_path / "input.bib" + bib_path.write_text( + """ +@misc{stubdoi, + title = {Referenced work 6}, + doi = {10.1200/JCO.2002.04.117}, + url = {https://doi.org/10.1200/JCO.2002.04.117} +} +""", + encoding="utf-8", + ) + assert run_cli(tmp_path, "ingest", str(bib_path)).returncode == 0 + + from citegeist.bibtex import BibEntry + from citegeist.resolve import Resolution + + database = tmp_path / "library.sqlite3" + with patch("citegeist.cli.MetadataResolver.resolve_entry") as mocked_resolve: + mocked_resolve.return_value = Resolution( + entry=BibEntry( + entry_type="article", + citation_key="resolvedkey", + fields={"title": "Resolved Work", "year": "2002", "doi": "10.1200/JCO.2002.04.117"}, + ), + source_type="resolver", + source_label="crossref:doi:10.1200/JCO.2002.04.117", + ) + stdout_buffer = io.StringIO() + stderr_buffer = io.StringIO() + with redirect_stdout(stdout_buffer), redirect_stderr(stderr_buffer): + exit_code = main( + [ + "--db", + str(database), + "resolve-stubs", + "--doi-only", + "--limit", + "10", + ] + ) + + assert exit_code == 0 + assert "[1/1] resolving candidate: stubdoi" in stderr_buffer.getvalue() + assert "stubdoi\tcrossref:doi:10.1200/JCO.2002.04.117" in stdout_buffer.getvalue() + + def test_cli_resolve_conflicts_updates_status(tmp_path: Path): bib_path = tmp_path / "input.bib" bib_path.write_text(