Report progress for long-running CLI tasks

This commit is contained in:
welsberr 2026-03-20 15:52:57 -04:00
parent ae68ceaa3c
commit 8a21044d1f
3 changed files with 99 additions and 8 deletions

View File

@ -157,6 +157,8 @@ For a fuller option-by-option CLI cookbook, see [examples/cli/README.md](./examp
Broad BibTeX exports skip DOI-only placeholder records such as `Referenced work N` by default. Use `--include-stubs` on `export` or `export-topic` if you want those entries included anyway.
Long-running CLI commands report progress on `stderr` so `stdout` remains clean for JSON, BibTeX, or tabular output.
For live-source development, prefer fixture-backed or cache-backed source clients so resolver and expansion work can be exercised repeatedly without re-hitting upstream APIs on every run.
## Example Application

View File

@ -783,10 +783,23 @@ def _run_extract(input_path: Path, output: str | None) -> int:
return 0 return 0
def _print_progress(label: str, index: int, total: int, detail: str | None = None) -> None:
message = f"[{index}/{total}] {label}"
if detail:
message = f"{message}: {detail}"
print(message, file=sys.stderr, flush=True)
def _print_phase(message: str) -> None:
print(message, file=sys.stderr, flush=True)
def _run_resolve(store: BibliographyStore, citation_keys: list[str]) -> int: def _run_resolve(store: BibliographyStore, citation_keys: list[str]) -> int:
resolver = MetadataResolver() resolver = MetadataResolver()
exit_code = 0 exit_code = 0
for citation_key in citation_keys: total = len(citation_keys)
for index, citation_key in enumerate(citation_keys, start=1):
_print_progress("resolving", index, total, citation_key)
if not _resolve_one(store, resolver, citation_key): if not _resolve_one(store, resolver, citation_key):
exit_code = 1 exit_code = 1
return exit_code return exit_code
@ -846,7 +859,9 @@ def _run_resolve_stubs(
resolver = MetadataResolver() resolver = MetadataResolver()
exit_code = 0 exit_code = 0
for candidate in candidates: total = len(candidates)
for index, candidate in enumerate(candidates, start=1):
_print_progress("resolving candidate", index, total, str(candidate["citation_key"]))
if not _resolve_one(store, resolver, str(candidate["citation_key"])): if not _resolve_one(store, resolver, str(candidate["citation_key"])):
exit_code = 1 exit_code = 1
return exit_code return exit_code
@ -1240,7 +1255,9 @@ def _run_expand(
return 1 return 1
all_results = [] all_results = []
for citation_key in citation_keys: total = len(citation_keys)
for index, citation_key in enumerate(citation_keys, start=1):
_print_progress("expanding seed", index, total, citation_key)
all_results.extend(expand_fn(citation_key)) all_results.extend(expand_fn(citation_key))
print(json.dumps([asdict(result) for result in all_results], indent=2)) print(json.dumps([asdict(result) for result in all_results], indent=2))
return 0 return 0
@ -1259,6 +1276,7 @@ def _run_expand_topic(
preview: bool, preview: bool,
) -> int: ) -> int:
expander = TopicExpander() expander = TopicExpander()
_print_phase(f"Loading topic expansion for {topic_slug}")
stored_topic = store.get_topic(topic_slug) stored_topic = store.get_topic(topic_slug)
effective_phrase = topic_phrase effective_phrase = topic_phrase
if effective_phrase is None and stored_topic is not None: if effective_phrase is None and stored_topic is not None:
@ -1309,6 +1327,7 @@ def _run_harvest_oai(
review_status: str, review_status: str,
) -> int: ) -> int:
harvester = OaiPmhHarvester() harvester = OaiPmhHarvester()
_print_phase(f"Harvesting OAI-PMH records from {base_url}")
harvested = harvester.list_records( harvested = harvester.list_records(
base_url, base_url,
metadata_prefix=metadata_prefix, metadata_prefix=metadata_prefix,
@ -1317,7 +1336,9 @@ def _run_harvest_oai(
date_until=date_until, date_until=date_until,
limit=limit, limit=limit,
) )
for result in harvested: total = len(harvested)
for index, result in enumerate(harvested, start=1):
_print_progress("ingesting harvested record", index, total, result.entry.citation_key)
store.upsert_entry( store.upsert_entry(
result.entry, result.entry,
raw_bibtex=render_bibtex([result.entry]), raw_bibtex=render_bibtex([result.entry]),
@ -1332,6 +1353,7 @@ def _run_harvest_oai(
def _run_discover_oai(base_url: str) -> int: def _run_discover_oai(base_url: str) -> int:
harvester = OaiPmhHarvester() harvester = OaiPmhHarvester()
_print_phase(f"Inspecting OAI-PMH repository {base_url}")
payload = { payload = {
"identify": harvester.identify(base_url), "identify": harvester.identify(base_url),
"metadata_formats": [asdict(result) for result in harvester.list_metadata_formats(base_url)], "metadata_formats": [asdict(result) for result in harvester.list_metadata_formats(base_url)],
@ -1358,6 +1380,7 @@ def _run_bootstrap(
print("bootstrap requires --seed-bib, --topic, or both", file=sys.stderr) print("bootstrap requires --seed-bib, --topic, or both", file=sys.stderr)
return 1 return 1
_print_phase("Running bootstrap")
seed_bibtex = Path(seed_bib).read_text(encoding="utf-8") if seed_bib else None seed_bibtex = Path(seed_bib).read_text(encoding="utf-8") if seed_bib else None
bootstrapper = Bootstrapper() bootstrapper = Bootstrapper()
results = bootstrapper.bootstrap( results = bootstrapper.bootstrap(
@ -1380,9 +1403,12 @@ def _run_bootstrap(
def _run_bootstrap_batch(store: BibliographyStore, input_path: Path) -> int: def _run_bootstrap_batch(store: BibliographyStore, input_path: Path) -> int:
jobs = load_batch_jobs(input_path) jobs = load_batch_jobs(input_path)
runner = BatchBootstrapRunner() runner = BatchBootstrapRunner()
_print_phase(f"Running bootstrap batch with {len(jobs)} jobs")
results = runner.run(store, jobs) results = runner.run(store, jobs)
payload = [] payload = []
for job_result in results: total = len(results)
for index, job_result in enumerate(results, start=1):
_print_progress("completed bootstrap job", index, total, job_result.job_name)
payload.append( payload.append(
{ {
"job_name": job_result.job_name, "job_name": job_result.job_name,
@ -1409,6 +1435,7 @@ def _run_scrape_talkorigins(
review_status: str, review_status: str,
) -> int: ) -> int:
scraper = TalkOriginsScraper() scraper = TalkOriginsScraper()
_print_phase(f"Scraping TalkOrigins example corpus from {base_url}")
export = scraper.scrape_to_directory( export = scraper.scrape_to_directory(
base_url=base_url, base_url=base_url,
output_dir=output_dir, output_dir=output_dir,
@ -1428,6 +1455,7 @@ def _run_scrape_talkorigins(
def _run_validate_talkorigins(manifest_path: Path) -> int: def _run_validate_talkorigins(manifest_path: Path) -> int:
scraper = TalkOriginsScraper() scraper = TalkOriginsScraper()
_print_phase(f"Validating TalkOrigins manifest {manifest_path}")
report = scraper.validate_export(manifest_path) report = scraper.validate_export(manifest_path)
print(json.dumps(asdict(report), indent=2)) print(json.dumps(asdict(report), indent=2))
return 0 return 0
@ -1440,6 +1468,7 @@ def _run_suggest_talkorigins_phrases(
output: str | None, output: str | None,
) -> int: ) -> int:
scraper = TalkOriginsScraper() scraper = TalkOriginsScraper()
_print_phase(f"Generating TalkOrigins topic phrase suggestions from {manifest_path}")
suggestions = scraper.suggest_topic_phrases(manifest_path, limit=limit, topic_slug=topic_slug) suggestions = scraper.suggest_topic_phrases(manifest_path, limit=limit, topic_slug=topic_slug)
payload = json.dumps([asdict(item) for item in suggestions], indent=2) payload = json.dumps([asdict(item) for item in suggestions], indent=2)
if output: if output:
@ -1461,7 +1490,8 @@ def _run_apply_topic_phrases(store: BibliographyStore, input_path: Path) -> int:
results: list[dict[str, object]] = [] results: list[dict[str, object]] = []
exit_code = 0 exit_code = 0
for item in items: total = len(items)
for index, item in enumerate(items, start=1):
if not isinstance(item, dict): if not isinstance(item, dict):
continue continue
slug = str(item.get("slug") or "") slug = str(item.get("slug") or "")
@ -1471,6 +1501,7 @@ def _run_apply_topic_phrases(store: BibliographyStore, input_path: Path) -> int:
if phrase is not None: if phrase is not None:
phrase = str(phrase) phrase = str(phrase)
applied = store.set_topic_expansion_phrase(slug, phrase) applied = store.set_topic_expansion_phrase(slug, phrase)
_print_progress("applying topic phrase", index, total, slug or "<missing-slug>")
if not applied: if not applied:
exit_code = 1 exit_code = 1
results.append( results.append(
@ -1496,7 +1527,8 @@ def _run_stage_topic_phrases(store: BibliographyStore, input_path: Path) -> int:
results: list[dict[str, object]] = [] results: list[dict[str, object]] = []
exit_code = 0 exit_code = 0
for item in items: total = len(items)
for index, item in enumerate(items, start=1):
if not isinstance(item, dict): if not isinstance(item, dict):
continue continue
slug = str(item.get("slug") or "") slug = str(item.get("slug") or "")
@ -1514,6 +1546,7 @@ def _run_stage_topic_phrases(store: BibliographyStore, input_path: Path) -> int:
review_status="pending", review_status="pending",
review_notes=notes, review_notes=notes,
) )
_print_progress("staging topic phrase", index, total, slug or "<missing-slug>")
if not staged: if not staged:
exit_code = 1 exit_code = 1
results.append( results.append(
@ -1560,7 +1593,8 @@ def _run_review_topic_phrases(store: BibliographyStore, input_path: Path) -> int
results: list[dict[str, object]] = [] results: list[dict[str, object]] = []
exit_code = 0 exit_code = 0
for item in items: total = len(items)
for index, item in enumerate(items, start=1):
if not isinstance(item, dict): if not isinstance(item, dict):
continue continue
slug = str(item.get("slug") or "") slug = str(item.get("slug") or "")
@ -1579,6 +1613,7 @@ def _run_review_topic_phrases(store: BibliographyStore, input_path: Path) -> int
review_notes=notes, review_notes=notes,
applied_phrase=phrase, applied_phrase=phrase,
) )
_print_progress("reviewing topic phrase", index, total, slug or "<missing-slug>")
if not reviewed: if not reviewed:
exit_code = 1 exit_code = 1
results.append( results.append(
@ -1603,6 +1638,7 @@ def _run_duplicates_talkorigins(
weak_only: bool, weak_only: bool,
) -> int: ) -> int:
scraper = TalkOriginsScraper() scraper = TalkOriginsScraper()
_print_phase(f"Inspecting TalkOrigins duplicate clusters from {manifest_path}")
clusters = scraper.inspect_duplicate_clusters( clusters = scraper.inspect_duplicate_clusters(
manifest_path, manifest_path,
limit=limit, limit=limit,
@ -1623,6 +1659,7 @@ def _run_ingest_talkorigins(
dedupe: bool, dedupe: bool,
) -> int: ) -> int:
scraper = TalkOriginsScraper() scraper = TalkOriginsScraper()
_print_phase(f"Ingesting TalkOrigins export from {manifest_path}")
report = scraper.ingest_export( report = scraper.ingest_export(
manifest_path, manifest_path,
store, store,
@ -1645,6 +1682,7 @@ def _run_enrich_talkorigins(
allow_unsafe_matches: bool, allow_unsafe_matches: bool,
) -> int: ) -> int:
scraper = TalkOriginsScraper() scraper = TalkOriginsScraper()
_print_phase(f"Enriching weak TalkOrigins canonicals from {manifest_path}")
results = scraper.enrich_weak_canonicals( results = scraper.enrich_weak_canonicals(
manifest_path, manifest_path,
store, store,
@ -1670,6 +1708,7 @@ def _run_review_talkorigins(
output: str | None, output: str | None,
) -> int: ) -> int:
scraper = TalkOriginsScraper() scraper = TalkOriginsScraper()
_print_phase(f"Building TalkOrigins review export from {manifest_path}")
review = scraper.build_review_export( review = scraper.build_review_export(
manifest_path, manifest_path,
store, store,
@ -1693,6 +1732,7 @@ def _run_apply_talkorigins_corrections(
review_status: str, review_status: str,
) -> int: ) -> int:
scraper = TalkOriginsScraper() scraper = TalkOriginsScraper()
_print_phase(f"Applying TalkOrigins corrections from {corrections_path}")
results = scraper.apply_review_corrections( results = scraper.apply_review_corrections(
manifest_path, manifest_path,
corrections_path, corrections_path,

View File

@ -1,8 +1,10 @@
from __future__ import annotations from __future__ import annotations
import io
import json import json
import subprocess import subprocess
import sys import sys
from contextlib import redirect_stderr, redirect_stdout
from pathlib import Path from pathlib import Path
from unittest.mock import patch from unittest.mock import patch
@ -356,6 +358,53 @@ def test_cli_resolve_stubs_can_enrich_all_misc_entries(tmp_path: Path):
assert "title" in {item["field_name"] for item in payload["field_conflicts"]} assert "title" in {item["field_name"] for item in payload["field_conflicts"]}
def test_cli_resolve_stubs_reports_progress_on_stderr(tmp_path: Path):
    """End-to-end check that ``resolve-stubs`` reports progress on stderr.

    Ingests one DOI-only stub entry, mocks the metadata resolver so no
    network is hit, runs ``resolve-stubs`` through ``main``, and asserts
    the ``[i/n]`` progress marker lands on stderr while the tabular
    result stays on stdout.
    """
    bib_path = tmp_path / "input.bib"
    # Minimal BibTeX fixture: a placeholder-title entry identified only by DOI,
    # which is exactly what --doi-only candidate selection should pick up.
    bib_path.write_text(
        """
@misc{stubdoi,
title = {Referenced work 6},
doi = {10.1200/JCO.2002.04.117},
url = {https://doi.org/10.1200/JCO.2002.04.117}
}
""",
        encoding="utf-8",
    )
    assert run_cli(tmp_path, "ingest", str(bib_path)).returncode == 0

    # Imported lazily so the mocked Resolution payload uses the project's
    # real dataclasses.
    from citegeist.bibtex import BibEntry
    from citegeist.resolve import Resolution

    database = tmp_path / "library.sqlite3"
    # Stub out the resolver so the test is deterministic and offline.
    with patch("citegeist.cli.MetadataResolver.resolve_entry") as mocked_resolve:
        mocked_resolve.return_value = Resolution(
            entry=BibEntry(
                entry_type="article",
                citation_key="resolvedkey",
                fields={"title": "Resolved Work", "year": "2002", "doi": "10.1200/JCO.2002.04.117"},
            ),
            source_type="resolver",
            source_label="crossref:doi:10.1200/JCO.2002.04.117",
        )
        # Capture both streams: progress must be stderr-only so stdout stays
        # machine-readable.
        stdout_buffer = io.StringIO()
        stderr_buffer = io.StringIO()
        with redirect_stdout(stdout_buffer), redirect_stderr(stderr_buffer):
            exit_code = main(
                [
                    "--db",
                    str(database),
                    "resolve-stubs",
                    "--doi-only",
                    "--limit",
                    "10",
                ]
            )
    assert exit_code == 0
    # One candidate total, so the progress line is [1/1] with its citation key.
    assert "[1/1] resolving candidate: stubdoi" in stderr_buffer.getvalue()
    # stdout keeps the tab-separated key/source-label result.
    assert "stubdoi\tcrossref:doi:10.1200/JCO.2002.04.117" in stdout_buffer.getvalue()
def test_cli_resolve_conflicts_updates_status(tmp_path: Path): def test_cli_resolve_conflicts_updates_status(tmp_path: Path):
bib_path = tmp_path / "input.bib" bib_path = tmp_path / "input.bib"
bib_path.write_text( bib_path.write_text(