Report progress for long-running CLI tasks
This commit is contained in:
parent
ae68ceaa3c
commit
8a21044d1f
|
|
@ -157,6 +157,8 @@ For a fuller option-by-option CLI cookbook, see [examples/cli/README.md](./examp
|
||||||
|
|
||||||
Broad BibTeX exports skip DOI-only placeholder records such as `Referenced work N` by default. Use `--include-stubs` on `export` or `export-topic` if you want those entries included anyway.
|
Broad BibTeX exports skip DOI-only placeholder records such as `Referenced work N` by default. Use `--include-stubs` on `export` or `export-topic` if you want those entries included anyway.
|
||||||
|
|
||||||
|
Long-running CLI commands report progress on `stderr` so `stdout` remains clean for JSON, BibTeX, or tabular output.
|
||||||
|
|
||||||
For live-source development, prefer fixture-backed or cache-backed source clients so resolver and expansion work can be exercised repeatedly without re-hitting upstream APIs on every run.
|
For live-source development, prefer fixture-backed or cache-backed source clients so resolver and expansion work can be exercised repeatedly without re-hitting upstream APIs on every run.
|
||||||
|
|
||||||
## Example Application
|
## Example Application
|
||||||
|
|
|
||||||
|
|
@ -783,10 +783,23 @@ def _run_extract(input_path: Path, output: str | None) -> int:
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
|
||||||
|
def _print_progress(label: str, index: int, total: int, detail: str | None = None) -> None:
    """Write one ``[index/total] label`` progress line to stderr.

    Progress is sent to ``sys.stderr`` so that ``stdout`` carries only the
    command's real output.

    Args:
        label: Short description of the step being performed.
        index: Position of the current item, as supplied by the caller.
        total: Number of items the caller is iterating over.
        detail: Optional identifier appended after ``": "``. It is left out
            when ``None`` or empty.
    """
    message = f"[{index}/{total}] {label}"
    if detail:
        message = f"{message}: {detail}"
    # Flush on every call so the line is visible while the task is still running.
    print(message, file=sys.stderr, flush=True)
|
||||||
|
|
||||||
|
|
||||||
|
def _print_phase(message: str) -> None:
    """Write a free-form status line to stderr and flush it immediately."""
    print(message, file=sys.stderr, flush=True)
|
||||||
|
|
||||||
|
|
||||||
def _run_resolve(store: BibliographyStore, citation_keys: list[str]) -> int:
    """Resolve each citation key in turn, reporting progress on stderr.

    Args:
        store: Bibliography store handed through to ``_resolve_one``.
        citation_keys: Keys to resolve, processed in the order given.

    Returns:
        0 when every ``_resolve_one`` call returns a truthy result, 1 if any
        of them does not. Remaining keys are still attempted after a failure.
    """
    resolver = MetadataResolver()
    exit_code = 0
    total = len(citation_keys)
    for index, citation_key in enumerate(citation_keys, start=1):
        # Announce the key before the resolver call so the line appears first.
        _print_progress("resolving", index, total, citation_key)
        if not _resolve_one(store, resolver, citation_key):
            exit_code = 1
    return exit_code
|
||||||
|
|
@ -846,7 +859,9 @@ def _run_resolve_stubs(
|
||||||
|
|
||||||
resolver = MetadataResolver()
|
resolver = MetadataResolver()
|
||||||
exit_code = 0
|
exit_code = 0
|
||||||
for candidate in candidates:
|
total = len(candidates)
|
||||||
|
for index, candidate in enumerate(candidates, start=1):
|
||||||
|
_print_progress("resolving candidate", index, total, str(candidate["citation_key"]))
|
||||||
if not _resolve_one(store, resolver, str(candidate["citation_key"])):
|
if not _resolve_one(store, resolver, str(candidate["citation_key"])):
|
||||||
exit_code = 1
|
exit_code = 1
|
||||||
return exit_code
|
return exit_code
|
||||||
|
|
@ -1240,7 +1255,9 @@ def _run_expand(
|
||||||
return 1
|
return 1
|
||||||
|
|
||||||
all_results = []
|
all_results = []
|
||||||
for citation_key in citation_keys:
|
total = len(citation_keys)
|
||||||
|
for index, citation_key in enumerate(citation_keys, start=1):
|
||||||
|
_print_progress("expanding seed", index, total, citation_key)
|
||||||
all_results.extend(expand_fn(citation_key))
|
all_results.extend(expand_fn(citation_key))
|
||||||
print(json.dumps([asdict(result) for result in all_results], indent=2))
|
print(json.dumps([asdict(result) for result in all_results], indent=2))
|
||||||
return 0
|
return 0
|
||||||
|
|
@ -1259,6 +1276,7 @@ def _run_expand_topic(
|
||||||
preview: bool,
|
preview: bool,
|
||||||
) -> int:
|
) -> int:
|
||||||
expander = TopicExpander()
|
expander = TopicExpander()
|
||||||
|
_print_phase(f"Loading topic expansion for {topic_slug}")
|
||||||
stored_topic = store.get_topic(topic_slug)
|
stored_topic = store.get_topic(topic_slug)
|
||||||
effective_phrase = topic_phrase
|
effective_phrase = topic_phrase
|
||||||
if effective_phrase is None and stored_topic is not None:
|
if effective_phrase is None and stored_topic is not None:
|
||||||
|
|
@ -1309,6 +1327,7 @@ def _run_harvest_oai(
|
||||||
review_status: str,
|
review_status: str,
|
||||||
) -> int:
|
) -> int:
|
||||||
harvester = OaiPmhHarvester()
|
harvester = OaiPmhHarvester()
|
||||||
|
_print_phase(f"Harvesting OAI-PMH records from {base_url}")
|
||||||
harvested = harvester.list_records(
|
harvested = harvester.list_records(
|
||||||
base_url,
|
base_url,
|
||||||
metadata_prefix=metadata_prefix,
|
metadata_prefix=metadata_prefix,
|
||||||
|
|
@ -1317,7 +1336,9 @@ def _run_harvest_oai(
|
||||||
date_until=date_until,
|
date_until=date_until,
|
||||||
limit=limit,
|
limit=limit,
|
||||||
)
|
)
|
||||||
for result in harvested:
|
total = len(harvested)
|
||||||
|
for index, result in enumerate(harvested, start=1):
|
||||||
|
_print_progress("ingesting harvested record", index, total, result.entry.citation_key)
|
||||||
store.upsert_entry(
|
store.upsert_entry(
|
||||||
result.entry,
|
result.entry,
|
||||||
raw_bibtex=render_bibtex([result.entry]),
|
raw_bibtex=render_bibtex([result.entry]),
|
||||||
|
|
@ -1332,6 +1353,7 @@ def _run_harvest_oai(
|
||||||
|
|
||||||
def _run_discover_oai(base_url: str) -> int:
|
def _run_discover_oai(base_url: str) -> int:
|
||||||
harvester = OaiPmhHarvester()
|
harvester = OaiPmhHarvester()
|
||||||
|
_print_phase(f"Inspecting OAI-PMH repository {base_url}")
|
||||||
payload = {
|
payload = {
|
||||||
"identify": harvester.identify(base_url),
|
"identify": harvester.identify(base_url),
|
||||||
"metadata_formats": [asdict(result) for result in harvester.list_metadata_formats(base_url)],
|
"metadata_formats": [asdict(result) for result in harvester.list_metadata_formats(base_url)],
|
||||||
|
|
@ -1358,6 +1380,7 @@ def _run_bootstrap(
|
||||||
print("bootstrap requires --seed-bib, --topic, or both", file=sys.stderr)
|
print("bootstrap requires --seed-bib, --topic, or both", file=sys.stderr)
|
||||||
return 1
|
return 1
|
||||||
|
|
||||||
|
_print_phase("Running bootstrap")
|
||||||
seed_bibtex = Path(seed_bib).read_text(encoding="utf-8") if seed_bib else None
|
seed_bibtex = Path(seed_bib).read_text(encoding="utf-8") if seed_bib else None
|
||||||
bootstrapper = Bootstrapper()
|
bootstrapper = Bootstrapper()
|
||||||
results = bootstrapper.bootstrap(
|
results = bootstrapper.bootstrap(
|
||||||
|
|
@ -1380,9 +1403,12 @@ def _run_bootstrap(
|
||||||
def _run_bootstrap_batch(store: BibliographyStore, input_path: Path) -> int:
|
def _run_bootstrap_batch(store: BibliographyStore, input_path: Path) -> int:
|
||||||
jobs = load_batch_jobs(input_path)
|
jobs = load_batch_jobs(input_path)
|
||||||
runner = BatchBootstrapRunner()
|
runner = BatchBootstrapRunner()
|
||||||
|
_print_phase(f"Running bootstrap batch with {len(jobs)} jobs")
|
||||||
results = runner.run(store, jobs)
|
results = runner.run(store, jobs)
|
||||||
payload = []
|
payload = []
|
||||||
for job_result in results:
|
total = len(results)
|
||||||
|
for index, job_result in enumerate(results, start=1):
|
||||||
|
_print_progress("completed bootstrap job", index, total, job_result.job_name)
|
||||||
payload.append(
|
payload.append(
|
||||||
{
|
{
|
||||||
"job_name": job_result.job_name,
|
"job_name": job_result.job_name,
|
||||||
|
|
@ -1409,6 +1435,7 @@ def _run_scrape_talkorigins(
|
||||||
review_status: str,
|
review_status: str,
|
||||||
) -> int:
|
) -> int:
|
||||||
scraper = TalkOriginsScraper()
|
scraper = TalkOriginsScraper()
|
||||||
|
_print_phase(f"Scraping TalkOrigins example corpus from {base_url}")
|
||||||
export = scraper.scrape_to_directory(
|
export = scraper.scrape_to_directory(
|
||||||
base_url=base_url,
|
base_url=base_url,
|
||||||
output_dir=output_dir,
|
output_dir=output_dir,
|
||||||
|
|
@ -1428,6 +1455,7 @@ def _run_scrape_talkorigins(
|
||||||
|
|
||||||
def _run_validate_talkorigins(manifest_path: Path) -> int:
    """Validate a TalkOrigins export manifest and print the report as JSON.

    A phase message naming the manifest is emitted first via ``_print_phase``.
    The report returned by ``validate_export`` is converted with ``asdict``
    and printed as indented JSON.

    Args:
        manifest_path: Path of the manifest passed to ``validate_export``.

    Returns:
        Always 0.
    """
    scraper = TalkOriginsScraper()
    _print_phase(f"Validating TalkOrigins manifest {manifest_path}")
    report = scraper.validate_export(manifest_path)
    print(json.dumps(asdict(report), indent=2))
    return 0
|
||||||
|
|
@ -1440,6 +1468,7 @@ def _run_suggest_talkorigins_phrases(
|
||||||
output: str | None,
|
output: str | None,
|
||||||
) -> int:
|
) -> int:
|
||||||
scraper = TalkOriginsScraper()
|
scraper = TalkOriginsScraper()
|
||||||
|
_print_phase(f"Generating TalkOrigins topic phrase suggestions from {manifest_path}")
|
||||||
suggestions = scraper.suggest_topic_phrases(manifest_path, limit=limit, topic_slug=topic_slug)
|
suggestions = scraper.suggest_topic_phrases(manifest_path, limit=limit, topic_slug=topic_slug)
|
||||||
payload = json.dumps([asdict(item) for item in suggestions], indent=2)
|
payload = json.dumps([asdict(item) for item in suggestions], indent=2)
|
||||||
if output:
|
if output:
|
||||||
|
|
@ -1461,7 +1490,8 @@ def _run_apply_topic_phrases(store: BibliographyStore, input_path: Path) -> int:
|
||||||
|
|
||||||
results: list[dict[str, object]] = []
|
results: list[dict[str, object]] = []
|
||||||
exit_code = 0
|
exit_code = 0
|
||||||
for item in items:
|
total = len(items)
|
||||||
|
for index, item in enumerate(items, start=1):
|
||||||
if not isinstance(item, dict):
|
if not isinstance(item, dict):
|
||||||
continue
|
continue
|
||||||
slug = str(item.get("slug") or "")
|
slug = str(item.get("slug") or "")
|
||||||
|
|
@ -1471,6 +1501,7 @@ def _run_apply_topic_phrases(store: BibliographyStore, input_path: Path) -> int:
|
||||||
if phrase is not None:
|
if phrase is not None:
|
||||||
phrase = str(phrase)
|
phrase = str(phrase)
|
||||||
applied = store.set_topic_expansion_phrase(slug, phrase)
|
applied = store.set_topic_expansion_phrase(slug, phrase)
|
||||||
|
_print_progress("applying topic phrase", index, total, slug or "<missing-slug>")
|
||||||
if not applied:
|
if not applied:
|
||||||
exit_code = 1
|
exit_code = 1
|
||||||
results.append(
|
results.append(
|
||||||
|
|
@ -1496,7 +1527,8 @@ def _run_stage_topic_phrases(store: BibliographyStore, input_path: Path) -> int:
|
||||||
|
|
||||||
results: list[dict[str, object]] = []
|
results: list[dict[str, object]] = []
|
||||||
exit_code = 0
|
exit_code = 0
|
||||||
for item in items:
|
total = len(items)
|
||||||
|
for index, item in enumerate(items, start=1):
|
||||||
if not isinstance(item, dict):
|
if not isinstance(item, dict):
|
||||||
continue
|
continue
|
||||||
slug = str(item.get("slug") or "")
|
slug = str(item.get("slug") or "")
|
||||||
|
|
@ -1514,6 +1546,7 @@ def _run_stage_topic_phrases(store: BibliographyStore, input_path: Path) -> int:
|
||||||
review_status="pending",
|
review_status="pending",
|
||||||
review_notes=notes,
|
review_notes=notes,
|
||||||
)
|
)
|
||||||
|
_print_progress("staging topic phrase", index, total, slug or "<missing-slug>")
|
||||||
if not staged:
|
if not staged:
|
||||||
exit_code = 1
|
exit_code = 1
|
||||||
results.append(
|
results.append(
|
||||||
|
|
@ -1560,7 +1593,8 @@ def _run_review_topic_phrases(store: BibliographyStore, input_path: Path) -> int
|
||||||
|
|
||||||
results: list[dict[str, object]] = []
|
results: list[dict[str, object]] = []
|
||||||
exit_code = 0
|
exit_code = 0
|
||||||
for item in items:
|
total = len(items)
|
||||||
|
for index, item in enumerate(items, start=1):
|
||||||
if not isinstance(item, dict):
|
if not isinstance(item, dict):
|
||||||
continue
|
continue
|
||||||
slug = str(item.get("slug") or "")
|
slug = str(item.get("slug") or "")
|
||||||
|
|
@ -1579,6 +1613,7 @@ def _run_review_topic_phrases(store: BibliographyStore, input_path: Path) -> int
|
||||||
review_notes=notes,
|
review_notes=notes,
|
||||||
applied_phrase=phrase,
|
applied_phrase=phrase,
|
||||||
)
|
)
|
||||||
|
_print_progress("reviewing topic phrase", index, total, slug or "<missing-slug>")
|
||||||
if not reviewed:
|
if not reviewed:
|
||||||
exit_code = 1
|
exit_code = 1
|
||||||
results.append(
|
results.append(
|
||||||
|
|
@ -1603,6 +1638,7 @@ def _run_duplicates_talkorigins(
|
||||||
weak_only: bool,
|
weak_only: bool,
|
||||||
) -> int:
|
) -> int:
|
||||||
scraper = TalkOriginsScraper()
|
scraper = TalkOriginsScraper()
|
||||||
|
_print_phase(f"Inspecting TalkOrigins duplicate clusters from {manifest_path}")
|
||||||
clusters = scraper.inspect_duplicate_clusters(
|
clusters = scraper.inspect_duplicate_clusters(
|
||||||
manifest_path,
|
manifest_path,
|
||||||
limit=limit,
|
limit=limit,
|
||||||
|
|
@ -1623,6 +1659,7 @@ def _run_ingest_talkorigins(
|
||||||
dedupe: bool,
|
dedupe: bool,
|
||||||
) -> int:
|
) -> int:
|
||||||
scraper = TalkOriginsScraper()
|
scraper = TalkOriginsScraper()
|
||||||
|
_print_phase(f"Ingesting TalkOrigins export from {manifest_path}")
|
||||||
report = scraper.ingest_export(
|
report = scraper.ingest_export(
|
||||||
manifest_path,
|
manifest_path,
|
||||||
store,
|
store,
|
||||||
|
|
@ -1645,6 +1682,7 @@ def _run_enrich_talkorigins(
|
||||||
allow_unsafe_matches: bool,
|
allow_unsafe_matches: bool,
|
||||||
) -> int:
|
) -> int:
|
||||||
scraper = TalkOriginsScraper()
|
scraper = TalkOriginsScraper()
|
||||||
|
_print_phase(f"Enriching weak TalkOrigins canonicals from {manifest_path}")
|
||||||
results = scraper.enrich_weak_canonicals(
|
results = scraper.enrich_weak_canonicals(
|
||||||
manifest_path,
|
manifest_path,
|
||||||
store,
|
store,
|
||||||
|
|
@ -1670,6 +1708,7 @@ def _run_review_talkorigins(
|
||||||
output: str | None,
|
output: str | None,
|
||||||
) -> int:
|
) -> int:
|
||||||
scraper = TalkOriginsScraper()
|
scraper = TalkOriginsScraper()
|
||||||
|
_print_phase(f"Building TalkOrigins review export from {manifest_path}")
|
||||||
review = scraper.build_review_export(
|
review = scraper.build_review_export(
|
||||||
manifest_path,
|
manifest_path,
|
||||||
store,
|
store,
|
||||||
|
|
@ -1693,6 +1732,7 @@ def _run_apply_talkorigins_corrections(
|
||||||
review_status: str,
|
review_status: str,
|
||||||
) -> int:
|
) -> int:
|
||||||
scraper = TalkOriginsScraper()
|
scraper = TalkOriginsScraper()
|
||||||
|
_print_phase(f"Applying TalkOrigins corrections from {corrections_path}")
|
||||||
results = scraper.apply_review_corrections(
|
results = scraper.apply_review_corrections(
|
||||||
manifest_path,
|
manifest_path,
|
||||||
corrections_path,
|
corrections_path,
|
||||||
|
|
|
||||||
|
|
@ -1,8 +1,10 @@
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import io
|
||||||
import json
|
import json
|
||||||
import subprocess
|
import subprocess
|
||||||
import sys
|
import sys
|
||||||
|
from contextlib import redirect_stderr, redirect_stdout
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from unittest.mock import patch
|
from unittest.mock import patch
|
||||||
|
|
||||||
|
|
@ -356,6 +358,53 @@ def test_cli_resolve_stubs_can_enrich_all_misc_entries(tmp_path: Path):
|
||||||
assert "title" in {item["field_name"] for item in payload["field_conflicts"]}
|
assert "title" in {item["field_name"] for item in payload["field_conflicts"]}
|
||||||
|
|
||||||
|
|
||||||
|
def test_cli_resolve_stubs_reports_progress_on_stderr(tmp_path: Path):
    """Check that ``resolve-stubs`` sends progress to stderr and results to stdout."""
    bib_path = tmp_path / "input.bib"
    bib_path.write_text(
        """
@misc{stubdoi,
title = {Referenced work 6},
doi = {10.1200/JCO.2002.04.117},
url = {https://doi.org/10.1200/JCO.2002.04.117}
}
""",
        encoding="utf-8",
    )
    assert run_cli(tmp_path, "ingest", str(bib_path)).returncode == 0

    from citegeist.bibtex import BibEntry
    from citegeist.resolve import Resolution

    database = tmp_path / "library.sqlite3"
    # The resolver method is patched, so the canned Resolution below is what
    # the command receives instead of a real lookup.
    with patch("citegeist.cli.MetadataResolver.resolve_entry") as mocked_resolve:
        mocked_resolve.return_value = Resolution(
            entry=BibEntry(
                entry_type="article",
                citation_key="resolvedkey",
                fields={"title": "Resolved Work", "year": "2002", "doi": "10.1200/JCO.2002.04.117"},
            ),
            source_type="resolver",
            source_label="crossref:doi:10.1200/JCO.2002.04.117",
        )
        stdout_buffer = io.StringIO()
        stderr_buffer = io.StringIO()
        # Call main() in-process so both streams can be captured separately.
        with redirect_stdout(stdout_buffer), redirect_stderr(stderr_buffer):
            exit_code = main(
                [
                    "--db",
                    str(database),
                    "resolve-stubs",
                    "--doi-only",
                    "--limit",
                    "10",
                ]
            )

    assert exit_code == 0
    assert "[1/1] resolving candidate: stubdoi" in stderr_buffer.getvalue()
    assert "stubdoi\tcrossref:doi:10.1200/JCO.2002.04.117" in stdout_buffer.getvalue()
|
||||||
|
|
||||||
|
|
||||||
def test_cli_resolve_conflicts_updates_status(tmp_path: Path):
|
def test_cli_resolve_conflicts_updates_status(tmp_path: Path):
|
||||||
bib_path = tmp_path / "input.bib"
|
bib_path = tmp_path / "input.bib"
|
||||||
bib_path.write_text(
|
bib_path.write_text(
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue