"""Command-line interface for citegeist.

Builds the argument parser and dispatches each subcommand to a runner
function; runners print their results (mostly JSON) and return an exit code.
"""

from __future__ import annotations

import argparse
import json
import sys
from dataclasses import asdict
from pathlib import Path

from .batch import BatchBootstrapRunner, load_batch_jobs
from .bibtex import parse_bibtex, render_bibtex
from .bootstrap import Bootstrapper
from .expand import CrossrefExpander, OpenAlexExpander, TopicExpander
from .extract import extract_references
from .harvest import OaiPmhHarvester
from .resolve import MetadataResolver, merge_entries_with_conflicts
from .storage import BibliographyStore
from .talkorigins import TalkOriginsScraper


def build_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(prog="citegeist")
    parser.add_argument("--db", default="library.sqlite3", help="Path to the SQLite database")
    subparsers = parser.add_subparsers(dest="command", required=True)

    ingest_parser = subparsers.add_parser("ingest", help="Ingest BibTeX into the database")
    ingest_parser.add_argument("input", help="BibTeX file to ingest")
    ingest_parser.add_argument("--status", default="draft", help="Initial review status")
    ingest_parser.add_argument("--source-label", help="Provenance label for this ingest run")

    search_parser = subparsers.add_parser("search", help="Search titles, abstracts, and fulltext")
    search_parser.add_argument("query", help="Search query")
    search_parser.add_argument("--limit", type=int, default=10, help="Maximum number of results")
    search_parser.add_argument("--topic", help="Optional topic slug to filter search results")

    show_parser = subparsers.add_parser("show", help="Show one entry or list entries")
    show_parser.add_argument("citation_key", nargs="?", help="Citation key to show")
    show_parser.add_argument("--limit", type=int, default=20, help="Maximum entries when listing")
    show_parser.add_argument("--provenance", action="store_true", help="Include field provenance")
    show_parser.add_argument("--conflicts", action="store_true", help="Include field conflicts")

    export_parser = subparsers.add_parser("export", help="Export entries as BibTeX")
    export_parser.add_argument("citation_keys", nargs="*", help="Optional citation keys to export")
    export_parser.add_argument("--output", help="Write BibTeX to a file instead of stdout")

    status_parser = subparsers.add_parser("set-status", help="Set the review status for one entry")
    status_parser.add_argument("citation_key", help="Citation key to update")
    status_parser.add_argument("review_status", help="New review status")

    conflict_parser = subparsers.add_parser("resolve-conflicts", help="Update conflict review status for one field")
    conflict_parser.add_argument("citation_key", help="Citation key to update")
    conflict_parser.add_argument("field_name", help="Field name whose open conflicts should be updated")
    conflict_parser.add_argument("status", choices=["accepted", "rejected"], help="New conflict status")

    apply_conflict_parser = subparsers.add_parser(
        "apply-conflict",
        help="Accept the proposed value for the latest open conflict on a field",
    )
    apply_conflict_parser.add_argument("citation_key", help="Citation key to update")
    apply_conflict_parser.add_argument("field_name", help="Field name whose proposed value should be applied")

    extract_parser = subparsers.add_parser("extract", help="Extract draft BibTeX from plaintext references")
    extract_parser.add_argument("input", help="Plaintext file containing bibliography-style references")
    extract_parser.add_argument("--output", help="Write extracted BibTeX to a file instead of stdout")

    resolve_parser = subparsers.add_parser("resolve", help="Enrich stored entries from external metadata sources")
    resolve_parser.add_argument("citation_keys", nargs="+", help="Citation keys to enrich")

    graph_parser = subparsers.add_parser("graph", help="Traverse citation relations from one or more seed entries")
    graph_parser.add_argument("citation_keys", nargs="+", help="Seed citation keys")
    graph_parser.add_argument(
        "--relation",
        action="append",
        dest="relations",
        choices=["cites", "cited_by", "crossref"],
        help="Relation type to traverse; may be passed multiple times",
    )
    graph_parser.add_argument("--depth", type=int, default=1, help="Maximum traversal depth")
    graph_parser.add_argument("--review-status", help="Filter results by target review status")
    graph_parser.add_argument(
        "--missing-only",
        action="store_true",
        help="Show only unresolved target nodes that are not yet present in the database",
    )

    expand_parser = subparsers.add_parser("expand", help="Expand graph edges from external metadata sources")
    expand_parser.add_argument("citation_keys", nargs="+", help="Seed citation keys to expand")
    expand_parser.add_argument(
        "--source",
        choices=["crossref", "openalex"],
        default="crossref",
        help="External source used for graph expansion",
    )
    expand_parser.add_argument(
        "--relation",
        choices=["cites", "cited_by"],
        default="cites",
        help="Graph direction to expand for sources that support it",
    )
    expand_parser.add_argument("--limit", type=int, default=25, help="Maximum related works to fetch per seed")

    expand_topic_parser = subparsers.add_parser(
        "expand-topic",
        help="Expand one topic from its existing seed entries and assign only relevant discoveries back to that topic",
    )
    expand_topic_parser.add_argument("topic_slug", help="Topic slug to expand from")
    expand_topic_parser.add_argument(
        "--topic-phrase",
        help="Optional phrase used for relevance gating; defaults to the stored topic name",
    )
    expand_topic_parser.add_argument(
        "--source",
        choices=["crossref", "openalex"],
        default="openalex",
        help="External source used for topic expansion",
    )
    expand_topic_parser.add_argument(
        "--relation",
        choices=["cites", "cited_by"],
        default="cites",
        help="Graph direction to expand for sources that support it",
    )
    expand_topic_parser.add_argument("--seed-limit", type=int, default=25, help="Maximum topic seed entries to expand from")
    expand_topic_parser.add_argument("--per-seed-limit", type=int, default=25, help="Maximum discovered works to fetch per seed")
    expand_topic_parser.add_argument(
        "--seed-key",
        action="append",
        dest="seed_keys",
        help="Restrict expansion to one trusted seed entry; may be passed multiple times",
    )
    expand_topic_parser.add_argument(
        "--min-relevance",
        type=float,
        default=0.2,
        help="Minimum topic-term overlap score required to assign a discovered work back to the topic",
    )
    expand_topic_parser.add_argument(
        "--preview",
        action="store_true",
        help="Discover and score candidate expansions without writing entries, relations, or topic assignments",
    )

    set_topic_phrase_parser = subparsers.add_parser(
        "set-topic-phrase",
        help="Set or clear the stored expansion phrase for one topic",
    )
    set_topic_phrase_parser.add_argument("topic_slug", help="Topic slug to update")
    set_topic_phrase_parser.add_argument(
        "phrase",
        nargs="?",
        help="Expansion phrase to store; omit with --clear to remove it",
    )
    set_topic_phrase_parser.add_argument(
        "--clear",
        action="store_true",
        help="Clear the stored expansion phrase for this topic",
    )

    harvest_parser = subparsers.add_parser("harvest-oai", help="Harvest draft entries from an OAI-PMH repository")
    harvest_parser.add_argument("base_url", help="OAI-PMH base URL")
    harvest_parser.add_argument("--metadata-prefix", default="oai_dc", help="OAI-PMH metadataPrefix to harvest")
    harvest_parser.add_argument("--set", dest="set_spec", help="Optional OAI-PMH set spec")
    harvest_parser.add_argument("--from", dest="date_from", help="Optional OAI-PMH lower date bound")
    harvest_parser.add_argument("--until", dest="date_until", help="Optional OAI-PMH upper date bound")
    harvest_parser.add_argument("--limit", type=int, default=20, help="Maximum harvested records to ingest")
    harvest_parser.add_argument("--status", default="draft", help="Initial review status")

    discover_parser = subparsers.add_parser("discover-oai", help="Inspect OAI-PMH repository identity and sets")
    discover_parser.add_argument("base_url", help="OAI-PMH base URL")

    bootstrap_parser = subparsers.add_parser(
        "bootstrap",
        help="Start bibliography expansion from a seed BibTeX file, a topic phrase, or both",
    )
    bootstrap_parser.add_argument("--seed-bib", help="Optional seed BibTeX file")
    bootstrap_parser.add_argument("--topic", help="Optional topic phrase")
    bootstrap_parser.add_argument("--topic-slug", help="Optional stored topic slug for this bootstrap topic")
    bootstrap_parser.add_argument("--topic-name", help="Optional stored topic name for this bootstrap topic")
    bootstrap_parser.add_argument(
        "--store-topic-phrase",
        help="Optional stored expansion phrase to save with the bootstrap topic; defaults to --topic when topic metadata is provided",
    )
    bootstrap_parser.add_argument("--topic-limit", type=int, default=5, help="Maximum topic-search seed candidates")
    bootstrap_parser.add_argument(
        "--topic-commit-limit",
        type=int,
        help="Maximum ranked topic candidates to actually commit and expand",
    )
    bootstrap_parser.add_argument(
        "--no-expand",
        action="store_true",
        help="Do not run immediate graph expansion after seeding",
    )
    bootstrap_parser.add_argument(
        "--preview",
        action="store_true",
        help="Preview ranked bootstrap candidates without writing to the database or expanding",
    )
    bootstrap_parser.add_argument("--status", default="draft", help="Initial review status for imported entries")

    batch_parser = subparsers.add_parser(
        "bootstrap-batch",
        help="Run multiple bootstrap jobs from a JSON specification file",
    )
    batch_parser.add_argument("input", help="Path to batch JSON file")

    talkorigins_parser = subparsers.add_parser(
        "scrape-talkorigins",
        help="Scrape TalkOrigins into per-topic seed BibTeX files and a bootstrap-batch JSON file",
    )
    talkorigins_parser.add_argument(
        "output_dir",
        help="Directory where seed BibTeX files, manifest, and batch JSON should be written",
    )
    talkorigins_parser.add_argument(
        "--base-url",
        default="https://www.talkorigins.org/origins/biblio/",
        help="TalkOrigins bibliography index URL",
    )
    talkorigins_parser.add_argument("--limit-topics", type=int, help="Limit the number of scraped topic pages")
    talkorigins_parser.add_argument(
        "--limit-entries-per-topic",
        type=int,
        help="Limit the number of parsed references per topic page",
    )
    talkorigins_parser.add_argument(
        "--resolve-seeds",
        action="store_true",
        help="Attempt metadata resolution on parsed seed entries before writing BibTeX",
    )
    talkorigins_parser.add_argument(
        "--ingest",
        action="store_true",
        help="Also ingest the generated seed BibTeX into the configured database",
    )
    talkorigins_parser.add_argument(
        "--no-expand",
        action="store_true",
        help="Write generated batch jobs with graph expansion disabled",
    )
    talkorigins_parser.add_argument(
        "--no-resume",
        action="store_true",
        help="Do not reuse saved TalkOrigins topic snapshots from a prior run",
    )
    talkorigins_parser.add_argument(
        "--topic-limit",
        type=int,
        default=5,
        help="Default bootstrap topic-search limit to include in generated jobs",
    )
    talkorigins_parser.add_argument(
        "--topic-commit-limit",
        type=int,
        help="Default bootstrap topic commit limit to include in generated jobs",
    )
    talkorigins_parser.add_argument("--status", default="draft", help="Review status for generated seed jobs")

    validate_talkorigins_parser = subparsers.add_parser(
        "validate-talkorigins",
        help="Validate a generated TalkOrigins manifest and report parse coverage and suspicious entries",
    )
    validate_talkorigins_parser.add_argument("manifest", help="Path to talkorigins_manifest.json")

    suggest_talkorigins_parser = subparsers.add_parser(
        "suggest-talkorigins-phrases",
        help="Suggest stored topic expansion phrases from a TalkOrigins manifest",
    )
    suggest_talkorigins_parser.add_argument("manifest", help="Path to talkorigins_manifest.json")
    suggest_talkorigins_parser.add_argument("--topic", help="Optional topic slug to restrict suggestions")
    suggest_talkorigins_parser.add_argument("--limit", type=int, help="Maximum topics to include")
    suggest_talkorigins_parser.add_argument("--output", help="Write suggestions JSON to a file instead of stdout")

    apply_topic_phrases_parser = subparsers.add_parser(
        "apply-topic-phrases",
        help="Apply stored topic expansion phrases from a JSON suggestion or patch file",
    )
    apply_topic_phrases_parser.add_argument("input", help="Path to JSON file containing topic phrase records")

    stage_topic_phrases_parser = subparsers.add_parser(
        "stage-topic-phrases",
        help="Stage topic phrase suggestions from JSON for later review in the database",
    )
    stage_topic_phrases_parser.add_argument("input", help="Path to JSON file containing topic phrase records")

    review_topic_phrase_parser = subparsers.add_parser(
        "review-topic-phrase",
        help="Accept or reject one staged topic phrase suggestion",
    )
    review_topic_phrase_parser.add_argument("topic_slug", help="Topic slug to review")
    review_topic_phrase_parser.add_argument("status", choices=["accepted", "rejected"], help="Review decision")
    review_topic_phrase_parser.add_argument(
        "--notes",
        help="Optional review notes to store with the decision",
    )
    review_topic_phrase_parser.add_argument(
        "--phrase",
        help="Optional expansion phrase override to apply with the review decision",
    )

    review_topic_phrases_parser = subparsers.add_parser(
        "review-topic-phrases",
        help="Apply topic phrase review decisions in bulk from JSON",
    )
    review_topic_phrases_parser.add_argument("input", help="Path to JSON file containing topic phrase review records")

    duplicates_talkorigins_parser = subparsers.add_parser(
        "duplicates-talkorigins",
        help="Inspect duplicate clusters in a generated TalkOrigins manifest",
    )
    duplicates_talkorigins_parser.add_argument("manifest", help="Path to talkorigins_manifest.json")
    duplicates_talkorigins_parser.add_argument("--limit", type=int, default=20, help="Maximum clusters to show")
    duplicates_talkorigins_parser.add_argument(
        "--min-count",
        type=int,
        default=2,
        help="Minimum cluster size to include",
    )
    duplicates_talkorigins_parser.add_argument("--match", help="Optional text filter for duplicate clusters")
    duplicates_talkorigins_parser.add_argument("--topic", help="Optional topic slug to restrict inspection")
    duplicates_talkorigins_parser.add_argument(
        "--preview",
        action="store_true",
        help="Include the canonical merged entry that ingest-talkorigins would choose",
    )
    duplicates_talkorigins_parser.add_argument(
        "--weak-only",
        action="store_true",
        help="Show only clusters whose canonical preview still looks weak",
    )

    ingest_talkorigins_parser = subparsers.add_parser(
        "ingest-talkorigins",
        help="Ingest a TalkOrigins manifest into the database with duplicate consolidation and topic membership",
    )
    ingest_talkorigins_parser.add_argument("manifest", help="Path to talkorigins_manifest.json")
    ingest_talkorigins_parser.add_argument("--status", default="draft", help="Review status for imported entries")
    ingest_talkorigins_parser.add_argument(
        "--no-dedupe",
        action="store_true",
        help="Disable duplicate consolidation and import each parsed entry separately",
    )

    enrich_talkorigins_parser = subparsers.add_parser(
        "enrich-talkorigins",
        help="Attempt metadata enrichment for weak TalkOrigins canonical entries",
    )
    enrich_talkorigins_parser.add_argument("manifest", help="Path to talkorigins_manifest.json")
    enrich_talkorigins_parser.add_argument("--limit", type=int, default=20, help="Maximum weak clusters to inspect")
    enrich_talkorigins_parser.add_argument(
        "--min-count",
        type=int,
        default=2,
        help="Minimum duplicate-cluster size to include",
    )
    enrich_talkorigins_parser.add_argument("--match", help="Optional text filter for weak canonical clusters")
    enrich_talkorigins_parser.add_argument("--topic", help="Optional topic slug to restrict enrichment")
    enrich_talkorigins_parser.add_argument(
        "--apply",
        action="store_true",
        help="Write successful enrichments back into the configured database",
    )
    enrich_talkorigins_parser.add_argument(
        "--allow-unsafe-search-matches",
        action="store_true",
        help="Allow low-trust title-search resolver matches for bounded experiments on copied databases",
    )
    enrich_talkorigins_parser.add_argument(
        "--status",
        default="enriched",
        help="Review status to set when applying successful enrichments",
    )

    review_talkorigins_parser = subparsers.add_parser(
        "review-talkorigins",
        help="Export weak TalkOrigins clusters plus dry-run enrichment outcomes for manual review",
    )
    review_talkorigins_parser.add_argument("manifest", help="Path to talkorigins_manifest.json")
    review_talkorigins_parser.add_argument("--limit", type=int, default=20, help="Maximum weak clusters to export")
    review_talkorigins_parser.add_argument(
        "--min-count",
        type=int,
        default=2,
        help="Minimum duplicate-cluster size to include",
    )
    review_talkorigins_parser.add_argument("--match", help="Optional text filter for weak canonical clusters")
    review_talkorigins_parser.add_argument("--topic", help="Optional topic slug to restrict review export")
    review_talkorigins_parser.add_argument("--output", help="Write review export JSON to a file instead of stdout")

    apply_review_talkorigins_parser = subparsers.add_parser(
        "apply-talkorigins-corrections",
        help="Apply curated TalkOrigins review corrections to the consolidated database",
    )
    apply_review_talkorigins_parser.add_argument("manifest", help="Path to talkorigins_manifest.json")
    apply_review_talkorigins_parser.add_argument("corrections", help="Path to corrections JSON")
    apply_review_talkorigins_parser.add_argument(
        "--status",
        default="reviewed",
        help="Default review status to set on corrected entries",
    )

    topics_parser = subparsers.add_parser("topics", help="List known topics in the database")
    topics_parser.add_argument("--limit", type=int, default=100, help="Maximum number of topics to list")
    topics_parser.add_argument(
        "--phrase-review-status",
        choices=["unreviewed", "pending", "accepted", "rejected"],
        help="Restrict topics to one stored phrase review state",
    )

    topic_phrase_reviews_parser = subparsers.add_parser(
        "topic-phrase-reviews",
        help="List staged topic phrase suggestions and their review state",
    )
    topic_phrase_reviews_parser.add_argument("--limit", type=int, default=100, help="Maximum reviews to list")
    topic_phrase_reviews_parser.add_argument(
        "--phrase-review-status",
        choices=["unreviewed", "pending", "accepted", "rejected"],
        help="Restrict results to one stored phrase review state",
    )

    export_topic_phrase_reviews_parser = subparsers.add_parser(
        "export-topic-phrase-reviews",
        help="Export an editable JSON review template for staged topic phrase suggestions",
    )
    export_topic_phrase_reviews_parser.add_argument("--limit", type=int, default=100, help="Maximum reviews to export")
    export_topic_phrase_reviews_parser.add_argument(
        "--phrase-review-status",
        choices=["unreviewed", "pending", "accepted", "rejected"],
        default="pending",
        help="Restrict exported reviews to one stored phrase review state",
    )
    export_topic_phrase_reviews_parser.add_argument(
        "--output",
        help="Write the review template JSON to a file instead of stdout",
    )

    topic_entries_parser = subparsers.add_parser(
        "topic-entries",
        help="List entries assigned to one topic",
    )
    topic_entries_parser.add_argument("topic_slug", help="Topic slug to inspect")
    topic_entries_parser.add_argument("--limit", type=int, default=100, help="Maximum entries to list")

    export_topic_parser = subparsers.add_parser(
        "export-topic",
        help="Export one topic slice as BibTeX",
    )
    export_topic_parser.add_argument("topic_slug", help="Topic slug to export")
    export_topic_parser.add_argument("--output", help="Write BibTeX to a file instead of stdout")

    return parser


def main(argv: list[str] | None = None) -> int:
    parser = build_parser()
    args = parser.parse_args(argv)
    store = BibliographyStore(args.db)
    # Dispatch to the matching runner; the store is closed even if a runner raises.
    try:
        if args.command == "ingest":
            return _run_ingest(store, Path(args.input), args.status, args.source_label)
        if args.command == "search":
            return _run_search(store, args.query, args.limit, args.topic)
        if args.command == "show":
            return _run_show(store, args.citation_key, args.limit, args.provenance, args.conflicts)
        if args.command == "export":
            return _run_export(store, args.citation_keys, args.output)
        if args.command == "set-status":
            return _run_set_status(store, args.citation_key, args.review_status)
        if args.command == "resolve-conflicts":
            return _run_resolve_conflicts(store, args.citation_key, args.field_name, args.status)
        if args.command == "apply-conflict":
            return _run_apply_conflict(store, args.citation_key, args.field_name)
        if args.command == "extract":
            return _run_extract(Path(args.input), args.output)
        if args.command == "resolve":
            return _run_resolve(store, args.citation_keys)
        if args.command == "graph":
            return _run_graph(
                store,
                args.citation_keys,
                args.relations,
                args.depth,
                args.review_status,
                args.missing_only,
            )
        if args.command == "expand":
            return _run_expand(store, args.citation_keys, args.source, args.relation, args.limit)
        if args.command == "expand-topic":
            return _run_expand_topic(
                store,
                args.topic_slug,
                args.topic_phrase,
                args.source,
                args.relation,
                args.seed_limit,
                args.per_seed_limit,
                args.min_relevance,
                args.seed_keys,
                args.preview,
            )
        if args.command == "set-topic-phrase":
            return _run_set_topic_phrase(store, args.topic_slug, args.phrase, args.clear)
        if args.command == "harvest-oai":
            return _run_harvest_oai(
                store,
                args.base_url,
                args.metadata_prefix,
                args.set_spec,
                args.date_from,
                args.date_until,
                args.limit,
                args.status,
            )
        if args.command == "discover-oai":
            return _run_discover_oai(args.base_url)
        if args.command == "bootstrap":
            return _run_bootstrap(
                store,
                args.seed_bib,
                args.topic,
                args.topic_limit,
                args.topic_commit_limit,
                not args.no_expand,
                args.status,
                args.preview,
                args.topic_slug,
                args.topic_name,
                args.store_topic_phrase,
            )
        if args.command == "bootstrap-batch":
            return _run_bootstrap_batch(store, Path(args.input))
        if args.command == "scrape-talkorigins":
            return _run_scrape_talkorigins(
                store,
                args.base_url,
                Path(args.output_dir),
                args.limit_topics,
                args.limit_entries_per_topic,
                args.resolve_seeds,
                args.ingest,
                not args.no_expand,
                not args.no_resume,
                args.topic_limit,
                args.topic_commit_limit,
                args.status,
            )
        if args.command == "validate-talkorigins":
            return _run_validate_talkorigins(Path(args.manifest))
        if args.command == "suggest-talkorigins-phrases":
            return _run_suggest_talkorigins_phrases(Path(args.manifest), args.topic, args.limit, args.output)
        if args.command == "apply-topic-phrases":
            return _run_apply_topic_phrases(store, Path(args.input))
        if args.command == "stage-topic-phrases":
            return _run_stage_topic_phrases(store, Path(args.input))
        if args.command == "review-topic-phrase":
            return _run_review_topic_phrase(store, args.topic_slug, args.status, args.notes, args.phrase)
        if args.command == "review-topic-phrases":
            return _run_review_topic_phrases(store, Path(args.input))
        if args.command == "duplicates-talkorigins":
            return _run_duplicates_talkorigins(
                Path(args.manifest),
                args.limit,
                args.min_count,
                args.match,
                args.topic,
                args.preview,
                args.weak_only,
            )
        if args.command == "ingest-talkorigins":
            return _run_ingest_talkorigins(store, Path(args.manifest), args.status, not args.no_dedupe)
        if args.command == "enrich-talkorigins":
            return _run_enrich_talkorigins(
                store,
                Path(args.manifest),
                args.limit,
                args.min_count,
                args.match,
                args.topic,
                args.apply,
                args.status,
                args.allow_unsafe_search_matches,
            )
        if args.command == "review-talkorigins":
            return _run_review_talkorigins(
                store,
                Path(args.manifest),
                args.limit,
                args.min_count,
                args.match,
                args.topic,
                args.output,
            )
        if args.command == "apply-talkorigins-corrections":
            return _run_apply_talkorigins_corrections(
                store,
                Path(args.manifest),
                Path(args.corrections),
                args.status,
            )
        if args.command == "topics":
            return _run_topics(store, args.limit, args.phrase_review_status)
        if args.command == "topic-phrase-reviews":
            return _run_topic_phrase_reviews(store, args.limit, args.phrase_review_status)
        if args.command == "export-topic-phrase-reviews":
            return _run_export_topic_phrase_reviews(store, args.limit, args.phrase_review_status, args.output)
        if args.command == "topic-entries":
            return _run_topic_entries(store, args.topic_slug, args.limit)
        if args.command == "export-topic":
            return _run_export_topic(store, args.topic_slug, args.output)
    finally:
        store.close()
    # Reached only if a registered subcommand has no dispatch branch above.
    parser.error(f"Unknown command: {args.command}")
    return 2


def _run_ingest(
    store: BibliographyStore,
    input_path: Path,
    review_status: str,
    source_label: str | None,
) -> int:
    text = input_path.read_text(encoding="utf-8")
    keys = store.ingest_bibtex(
        text,
        source_label=source_label or str(input_path),
        review_status=review_status,
    )
    for key in keys:
        print(key)
    return 0


def _run_search(store: BibliographyStore, query: str, limit: int, topic_slug: str | None) -> int:
    for row in store.search_text(query, limit=limit, topic_slug=topic_slug):
        score = row.get("score", 0.0)
        print(f"{row['citation_key']}\t{row.get('year') or ''}\t{score:.3f}\t{row.get('title') or ''}")
    return 0


def _run_show(
    store: BibliographyStore,
    citation_key: str | None,
    limit: int,
    provenance: bool,
    conflicts: bool,
) -> int:
    if citation_key:
        entry = store.get_entry(citation_key)
        if entry is None:
            print(f"Entry not found: {citation_key}", file=sys.stderr)
            return 1
        if provenance:
            entry["field_provenance"] = store.get_field_provenance(citation_key)
        if conflicts:
            entry["field_conflicts"] = store.get_field_conflicts(citation_key)
        print(json.dumps(entry, indent=2, sort_keys=True))
        return 0
    print(json.dumps(store.list_entries(limit=limit), indent=2))
    return 0


def _run_export(store: BibliographyStore, citation_keys: list[str], output: str | None) -> int:
    rendered = store.export_bibtex(citation_keys or None)
    if output:
        Path(output).write_text(rendered + ("\n" if rendered else ""), encoding="utf-8")
    else:
        if rendered:
            print(rendered)
    return 0


def _run_set_status(store: BibliographyStore, citation_key: str, review_status: str) -> int:
    if not store.set_entry_status(citation_key, review_status):
        print(f"Entry not found: {citation_key}", file=sys.stderr)
        return 1
    print(f"{citation_key}\t{review_status}")
    return 0


def _run_resolve_conflicts(store: BibliographyStore, citation_key: str, field_name: str, status: str) -> int:
    count = store.set_conflict_status(citation_key, field_name, status)
    if count == 0:
        print(f"No open conflicts updated for {citation_key}:{field_name}", file=sys.stderr)
        return 1
    print(f"{citation_key}\t{field_name}\t{status}\t{count}")
    return 0


def _run_apply_conflict(store: BibliographyStore, citation_key: str, field_name: str) -> int:
    if not store.apply_conflict_value(citation_key, field_name):
        print(f"No open conflict applied for {citation_key}:{field_name}", file=sys.stderr)
        return 1
    print(f"{citation_key}\t{field_name}\tapplied")
    return 0


def _run_extract(input_path: Path, output: str | None) -> int:
    text = input_path.read_text(encoding="utf-8")
    entries = extract_references(text)
    rendered = render_bibtex(entries)
    if output:
        Path(output).write_text(rendered + ("\n" if rendered else ""), encoding="utf-8")
    else:
        if rendered:
            print(rendered)
    return 0


def _run_resolve(store: BibliographyStore, citation_keys: list[str]) -> int:
    resolver = MetadataResolver()
    exit_code = 0
    for citation_key in citation_keys:
        existing = store.get_entry(citation_key)
        if existing is None:
            print(f"Entry not found: {citation_key}", file=sys.stderr)
            exit_code = 1
            continue
        bibtex = store.get_entry_bibtex(citation_key)
        if not bibtex:
            print(f"Entry not renderable: {citation_key}", file=sys.stderr)
            exit_code = 1
            continue
        current_entry = parse_bibtex(bibtex)[0]
        resolution = resolver.resolve_entry(current_entry)
        if resolution is None:
            print(f"No resolver match: {citation_key}", file=sys.stderr)
            exit_code = 1
            continue
        merged, conflicts = merge_entries_with_conflicts(current_entry, resolution.entry)
        store.replace_entry(
            citation_key,
            merged,
            source_type=resolution.source_type,
            source_label=resolution.source_label,
            review_status="enriched",
        )
        if conflicts:
            store.record_conflicts(
                citation_key,
                conflicts,
                source_type=resolution.source_type,
                source_label=resolution.source_label,
            )
        print(f"{citation_key}\t{resolution.source_label}")
    return exit_code


def _run_graph(
    store: BibliographyStore,
    citation_keys: list[str],
    relations: list[str] | None,
    depth: int,
    review_status: str | None,
    missing_only: bool,
) -> int:
    rows = store.traverse_graph(
        citation_keys,
        relation_types=relations or ["cites"],
        max_depth=depth,
        review_status=review_status,
        include_missing=True,
    )
    if missing_only:
        rows = [row for row in rows if not row["target_exists"]]
    print(json.dumps(rows, indent=2))
    return 0


def _run_expand(
    store: BibliographyStore,
    citation_keys: list[str],
    source: str,
    relation: str,
    limit: int,
) -> int:
    if source == "crossref":
        expander = CrossrefExpander()
        expand_fn = lambda key: expander.expand_entry_references(store, key)
    elif source == "openalex":
        expander = OpenAlexExpander()
        expand_fn = lambda key: expander.expand_entry(store, key, relation_type=relation, limit=limit)
    else:
        print(f"Unsupported expansion source: {source}", file=sys.stderr)
        return 1
    all_results = []
    for citation_key in citation_keys:
        all_results.extend(expand_fn(citation_key))
    print(json.dumps([asdict(result) for result in all_results], indent=2))
    return 0


def _run_expand_topic(
    store: BibliographyStore,
    topic_slug: str,
    topic_phrase: str | None,
    source: str,
    relation: str,
    seed_limit: int,
    per_seed_limit: int,
    min_relevance: float,
    seed_keys: list[str] | None,
    preview: bool,
) -> int:
    expander = TopicExpander()
    stored_topic = store.get_topic(topic_slug)
    effective_phrase = topic_phrase
    if effective_phrase is None and stored_topic is not None:
        effective_phrase = str(stored_topic.get("expansion_phrase") or "") or None
    results = expander.expand_topic(
        store,
        topic_slug,
        topic_phrase=effective_phrase,
        source=source,
        relation_type=relation,
        seed_limit=seed_limit,
        per_seed_limit=per_seed_limit,
        min_relevance=min_relevance,
        seed_keys=seed_keys,
        preview_only=preview,
    )
    print(json.dumps([asdict(result) for result in results], indent=2))
    return 0


def _run_set_topic_phrase(
    store: BibliographyStore,
    topic_slug: str,
    phrase: str | None,
    clear: bool,
) -> int:
    if clear:
        phrase = None
    elif phrase is None:
        print("set-topic-phrase requires a phrase or --clear", file=sys.stderr)
        return 1
    if not store.set_topic_expansion_phrase(topic_slug, phrase):
        print(f"Topic not found: {topic_slug}", file=sys.stderr)
        return 1
    payload = store.get_topic(topic_slug)
    print(json.dumps(payload, indent=2))
    return 0


def _run_harvest_oai(
    store: BibliographyStore,
    base_url: str,
    metadata_prefix: str,
    set_spec: str | None,
    date_from: str | None,
    date_until: str | None,
    limit: int,
    review_status: str,
) -> int:
    harvester = OaiPmhHarvester()
    harvested = harvester.list_records(
        base_url,
        metadata_prefix=metadata_prefix,
        set_spec=set_spec,
        date_from=date_from,
        date_until=date_until,
        limit=limit,
    )
    for result in harvested:
        store.upsert_entry(
            result.entry,
            raw_bibtex=render_bibtex([result.entry]),
            source_type="harvest",
            source_label=f"oai:{result.base_url}",
            review_status=review_status,
        )
        print(result.entry.citation_key)
    # Commit once after all harvested records have been upserted.
    store.connection.commit()
    return 0


def _run_discover_oai(base_url: str) -> int:
    harvester = OaiPmhHarvester()
    payload = {
        "identify": harvester.identify(base_url),
        "metadata_formats": [asdict(result) for result in harvester.list_metadata_formats(base_url)],
        "sets": [asdict(result) for result in harvester.list_sets(base_url)],
    }
    print(json.dumps(payload, indent=2, sort_keys=True))
    return 0


def _run_bootstrap(
    store: BibliographyStore,
    seed_bib: str | None,
    topic: str | None,
    topic_limit: int,
    topic_commit_limit: int | None,
    expand: bool,
    review_status: str,
    preview: bool,
    topic_slug: str | None,
    topic_name: str | None,
    stored_topic_phrase: str | None,
) -> int:
    if not seed_bib and not topic:
        print("bootstrap requires --seed-bib, --topic, or both", file=sys.stderr)
        return 1
    seed_bibtex = Path(seed_bib).read_text(encoding="utf-8") if seed_bib else None
    bootstrapper = Bootstrapper()
    results = bootstrapper.bootstrap(
        store,
        seed_bibtex=seed_bibtex,
        topic=topic,
        topic_limit=topic_limit,
        topic_commit_limit=topic_commit_limit,
        expand=expand,
        review_status=review_status,
        preview_only=preview,
        topic_slug=topic_slug,
        topic_name=topic_name,
        topic_phrase=stored_topic_phrase,
    )
    print(json.dumps([asdict(result) for result in results], indent=2))
    return 0


def _run_bootstrap_batch(store: BibliographyStore, input_path: Path) -> int:
    jobs = load_batch_jobs(input_path)
    runner = BatchBootstrapRunner()
    results = runner.run(store, jobs)
    payload = []
    for job_result in results:
        payload.append(
            {
                "job_name": job_result.job_name,
                "result_count": job_result.result_count,
                "results": [asdict(item) for item in job_result.results],
            }
        )
    print(json.dumps(payload, indent=2))
    return 0


def _run_scrape_talkorigins(
    store: BibliographyStore,
    base_url: str,
    output_dir: Path,
    limit_topics: int | None,
    limit_entries_per_topic: int | None,
    resolve_seeds: bool,
    ingest: bool,
    expand: bool,
    resume: bool,
    topic_limit: int,
    topic_commit_limit: int | None,
    review_status: str,
) -> int:
    scraper = TalkOriginsScraper()
    export = scraper.scrape_to_directory(
        base_url=base_url,
        output_dir=output_dir,
        limit_topics=limit_topics,
        limit_entries_per_topic=limit_entries_per_topic,
        resolve_seeds=resolve_seeds,
        ingest_store=store if ingest else None,
        review_status=review_status,
        expand=expand,
        resume=resume,
        topic_limit=topic_limit,
        topic_commit_limit=topic_commit_limit,
    )
    print(json.dumps(asdict(export), indent=2))
    return 0


def _run_validate_talkorigins(manifest_path: Path) -> int:
    scraper = TalkOriginsScraper()
    report = scraper.validate_export(manifest_path)
    print(json.dumps(asdict(report), indent=2))
    return 0


def _run_suggest_talkorigins_phrases(
    manifest_path: Path,
    topic_slug: str | None,
    limit: int | None,
    output: str | None,
) -> int:
    scraper = TalkOriginsScraper()
    suggestions = scraper.suggest_topic_phrases(manifest_path, limit=limit, topic_slug=topic_slug)
    payload = json.dumps([asdict(item) for item in suggestions], indent=2)
    if output:
        Path(output).write_text(payload + "\n", encoding="utf-8")
    else:
        print(payload)
    return 0


def _run_apply_topic_phrases(store: BibliographyStore, input_path: Path) -> int:
    payload = json.loads(input_path.read_text(encoding="utf-8"))
    if isinstance(payload, dict):
        items = payload.get("topics", [])
    else:
        items = payload
    if not isinstance(items, list):
        print("Topic phrase JSON must be a list or an object with a 'topics' list", file=sys.stderr)
        return 1
    results: list[dict[str, object]] = []
    exit_code = 0
    for item in items:
        if not isinstance(item, dict):
            continue
        slug = str(item.get("slug") or "")
        phrase = item.get("suggested_phrase", item.get("phrase"))
        if not slug:
            continue
        if phrase is not None:
            phrase = str(phrase)
        applied = store.set_topic_expansion_phrase(slug, phrase)
        if not applied:
            exit_code = 1
        results.append(
            {
                "slug": slug,
                "expansion_phrase": phrase,
                "applied": applied,
            }
        )
    print(json.dumps(results, indent=2))
    return exit_code


def _run_stage_topic_phrases(store: BibliographyStore, input_path: Path) -> int:
    payload = json.loads(input_path.read_text(encoding="utf-8"))
    if isinstance(payload, dict):
        items = payload.get("topics", payload.get("items", []))
    else:
        items = payload
    if not isinstance(items, list):
        print("Topic phrase JSON must be a list or an object with a 'topics' or 'items' list", file=sys.stderr)
        return 1
    results: list[dict[str, object]] = []
    exit_code = 0
    for item in items:
        if not isinstance(item, dict):
            continue
        slug = str(item.get("slug") or "")
        phrase = item.get("suggested_phrase", item.get("phrase"))
        notes = item.get("review_notes")
        if not slug:
            continue
        if phrase is not None:
            phrase = str(phrase)
        if notes is not None:
            notes = str(notes)
        staged = store.stage_topic_phrase_suggestion(
            slug,
            suggested_phrase=phrase,
            review_status="pending",
            review_notes=notes,
        )
        if not staged:
            exit_code = 1
        results.append(
            {
                "slug": slug,
                "suggested_phrase": phrase,
                "phrase_review_status": "pending",
                "staged": staged,
            }
        )
    print(json.dumps(results, indent=2))
    return exit_code


def _run_review_topic_phrase(
    store: BibliographyStore,
    topic_slug: str,
    status: str,
    notes: str | None,
    phrase: str | None,
) -> int:
    if not store.review_topic_phrase_suggestion(
        topic_slug,
        review_status=status,
        review_notes=notes,
        applied_phrase=phrase,
    ):
        print(f"Topic not found: {topic_slug}", file=sys.stderr)
        return 1
    payload = store.get_topic(topic_slug)
    print(json.dumps(payload, indent=2))
    return 0


def _run_review_topic_phrases(store: BibliographyStore, input_path: Path) -> int:
    payload = json.loads(input_path.read_text(encoding="utf-8"))
    if isinstance(payload, dict):
        items = payload.get("topics", payload.get("items", []))
    else:
        items = payload
    if not isinstance(items, list):
        print("Topic phrase review JSON must be a list or an object with a 'topics' or 'items' list", file=sys.stderr)
        return 1
    results: list[dict[str, object]] = []
    exit_code = 0
    for item in items:
        if not isinstance(item, dict):
            continue
        slug = str(item.get("slug") or "")
        status = str(item.get("status") or item.get("phrase_review_status") or "")
        notes = item.get("review_notes")
        phrase = item.get("phrase", item.get("expansion_phrase"))
        if not slug or status not in {"accepted", "rejected"}:
            continue
        if notes is not None:
            notes = str(notes)
        if phrase is not None:
            phrase = str(phrase)
        reviewed = store.review_topic_phrase_suggestion(
            slug,
            review_status=status,
            review_notes=notes,
            applied_phrase=phrase,
        )
        if not reviewed:
            exit_code = 1
        results.append(
            {
                "slug": slug,
                "phrase_review_status": status,
                "expansion_phrase": phrase,
                "reviewed": reviewed,
            }
        )
    print(json.dumps(results, indent=2))
    return exit_code


def _run_duplicates_talkorigins(
    manifest_path: Path,
    limit: int,
    min_count: int,
    match: str | None,
    topic_slug: str | None,
    preview: bool,
    weak_only: bool,
) -> int:
    scraper = TalkOriginsScraper()
    clusters = scraper.inspect_duplicate_clusters(
        manifest_path,
        limit=limit,
        min_count=min_count,
        match=match,
        topic_slug=topic_slug,
        preview_canonical=preview,
        weak_only=weak_only,
    )
    print(json.dumps([asdict(cluster) for cluster in clusters], indent=2))
    return 0


def _run_ingest_talkorigins(
    store: BibliographyStore,
    manifest_path: Path,
    review_status: str,
    dedupe: bool,
) -> int:
    scraper = TalkOriginsScraper()
    report = scraper.ingest_export(
        manifest_path,
        store,
        review_status=review_status,
        dedupe=dedupe,
    )
    print(json.dumps(asdict(report), indent=2))
    return 0


def _run_enrich_talkorigins(
    store: BibliographyStore,
    manifest_path: Path,
    limit: int,
    min_count: int,
    match: str | None,
    topic_slug: str | None,
    apply: bool,
    review_status: str,
    allow_unsafe_matches: bool,
) -> int:
    scraper = TalkOriginsScraper()
    results = scraper.enrich_weak_canonicals(
        manifest_path,
        store,
        limit=limit,
        min_count=min_count,
        match=match,
        topic_slug=topic_slug,
        apply=apply,
        review_status=review_status,
        allow_unsafe_matches=allow_unsafe_matches,
    )
    print(json.dumps([asdict(result) for result in results], indent=2))
    return 0


def _run_review_talkorigins(
    store: BibliographyStore,
    manifest_path: Path,
    limit: int,
    min_count: int,
    match: str | None,
    topic_slug: str | None,
    output: str | None,
) -> int:
    scraper = TalkOriginsScraper()
    review = scraper.build_review_export(
        manifest_path,
        store,
        limit=limit,
        min_count=min_count,
        match=match,
        topic_slug=topic_slug,
    )
    payload = json.dumps(asdict(review), indent=2)
    if output:
        Path(output).write_text(payload + "\n", encoding="utf-8")
    else:
        print(payload)
    return 0


def _run_apply_talkorigins_corrections(
    store: BibliographyStore,
    manifest_path: Path,
    corrections_path: Path,
    review_status: str,
) -> int:
    scraper = TalkOriginsScraper()
    results = scraper.apply_review_corrections(
        manifest_path,
        corrections_path,
        store,
        default_review_status=review_status,
    )
    print(json.dumps([asdict(result) for result in results], indent=2))
    return 0


def _run_topics(store: BibliographyStore, limit: int, phrase_review_status: str | None) -> int:
    print(json.dumps(store.list_topics(limit=limit, phrase_review_status=phrase_review_status), indent=2))
    return 0


def _run_topic_phrase_reviews(store: BibliographyStore, limit: int, phrase_review_status: str | None) -> int:
    print(json.dumps(store.list_topic_phrase_reviews(limit=limit, phrase_review_status=phrase_review_status), indent=2))
    return 0


def _run_export_topic_phrase_reviews(
    store: BibliographyStore,
    limit: int,
    phrase_review_status: str | None,
    output: str | None,
) -> int:
    items = store.list_topic_phrase_reviews(limit=limit, phrase_review_status=phrase_review_status)
    payload = [
        {
            "slug": item["slug"],
            "topic": item["name"],
            "current_expansion_phrase": item.get("expansion_phrase"),
            "suggested_phrase": item.get("suggested_phrase"),
            "current_status": item.get("phrase_review_status"),
            "review_notes": item.get("phrase_review_notes"),
            "status": "",
            "phrase": item.get("suggested_phrase"),
        }
        for item in items
    ]
    rendered = json.dumps(payload, indent=2)
    if output:
        Path(output).write_text(rendered + "\n", encoding="utf-8")
    else:
        print(rendered)
    return 0


def _run_topic_entries(store: BibliographyStore, topic_slug: str, limit: int) -> int:
    topic = store.get_topic(topic_slug)
    if topic is None:
        print(f"Topic not found: {topic_slug}", file=sys.stderr)
        return 1
    payload = {
        "topic": topic,
        "entries": store.list_topic_entries(topic_slug, limit=limit),
    }
    print(json.dumps(payload, indent=2))
    return 0


def _run_export_topic(store: BibliographyStore, topic_slug: str, output: str | None) -> int:
    topic = store.get_topic(topic_slug)
    if topic is None:
        print(f"Topic not found: {topic_slug}", file=sys.stderr)
        return 1
    citation_keys = [row["citation_key"] for row in store.list_topic_entries(topic_slug, limit=100000)]
    rendered = store.export_bibtex(citation_keys)
    if output:
        Path(output).write_text(rendered + ("\n" if rendered else ""), encoding="utf-8")
    else:
        if rendered:
            print(rendered)
    return 0
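

if __name__ == "__main__":
    # Optional direct-execution hook: mirrors what a console-script entry
    # point wired to main() would do, exiting with the command's return code.
    sys.exit(main())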