From 65fde034e1e12e2fcdf97c1e09cde9089e31f6bb Mon Sep 17 00:00:00 2001 From: welsberr Date: Tue, 7 Apr 2026 12:41:54 -0400 Subject: [PATCH] Extend JabRef sync workflow with review annotations --- README.md | 17 ++++ examples/cli/README.md | 20 ++++ src/citegeist/cli.py | 134 +++++++++++++++++++++++++- src/citegeist/storage.py | 3 + tests/test_cli.py | 197 +++++++++++++++++++++++++++++++++++++++ 5 files changed, 370 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index d48f0a1..054fe01 100644 --- a/README.md +++ b/README.md @@ -176,6 +176,7 @@ PYTHONPATH=src .venv/bin/python -m citegeist verify --bib draft.bib --output ver PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 resolve smith2024graphs PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 resolve-stubs --doi-only --preview --limit 25 PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 resolve-stubs --doi-only --all-misc --limit 25 +PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 sync-jabref my-library.bib --output my-library.enriched.bib PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 topics PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 topic-entries abiogenesis PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 export-topic abiogenesis --output abiogenesis.bib @@ -208,6 +209,22 @@ OpenAlex expansion is also conservative about noisy secondary records. Discoveri For live-source development, prefer fixture-backed or cache-backed source clients so resolver and expansion work can be exercised repeatedly without re-hitting upstream APIs on every run. +## JabRef Workflow + +`citegeist` is not a replacement for JabRef's day-to-day BibTeX editing UX. 
The intended near-term integration model is file-based round-tripping: + +- use JabRef to inspect, edit, and review your main `.bib` library; +- use `citegeist sync-jabref` to ingest that file into CiteGeist, run metadata enrichment against imported entries, and write an enriched `.bib` export back out; +- reopen the enriched file in JabRef for human review and curation. + +That keeps JabRef as the primary manual review surface while letting CiteGeist handle source-backed resolution and discovery work that reference managers usually do not automate well. + +Useful options for that round trip: + +- `--in-place`: overwrite the input `.bib` file instead of writing to a separate export path; one of `--output` or `--in-place` is required, and `--in-place` takes precedence when both are given. The file is rewritten from the imported entries only, so keep a backup if it holds other content (such as JabRef metadata comments) that may not survive the round trip +- `--annotate-review`: add `x_citegeist_*` sidecar fields such as review status, open-conflict count, and last source label so JabRef can surface CiteGeist review cues directly in the BibTeX record +- `--no-resolve`: skip live metadata resolution and only perform import plus re-export + ## Adopted Ideas From Earlier Repos `citegeist` now absorbs two useful patterns from adjacent bibliography tools while keeping them inside the main Python 3 package boundary: diff --git a/examples/cli/README.md b/examples/cli/README.md index 44b8857..d80725d 100644 --- a/examples/cli/README.md +++ b/examples/cli/README.md @@ -315,6 +315,26 @@ Limit discoveries per seed: .venv/bin/python -m citegeist --db library.sqlite3 expand langton1989artificial1 --source openalex --limit 10 ``` +### JabRef Round Trip + +Use JabRef as the main editor and CiteGeist as the enrichment pass: + +```bash +.venv/bin/python -m citegeist --db library.sqlite3 sync-jabref my-library.bib --output my-library.enriched.bib +``` + +Skip resolver calls if you only want a normalized import/export pass: + +```bash +.venv/bin/python -m citegeist --db library.sqlite3 sync-jabref my-library.bib --output my-library.enriched.bib --no-resolve +``` + +Write back to the same file and include CiteGeist review cues as BibTeX sidecar fields
for JabRef: + +```bash +.venv/bin/python -m citegeist --db library.sqlite3 sync-jabref my-library.bib --in-place --annotate-review +``` + ## Build A Topic-Centered Bibliography Purpose: create, expand, inspect, and export a topic slice such as `artificial life`. diff --git a/src/citegeist/cli.py b/src/citegeist/cli.py index c4f440e..0d729a8 100644 --- a/src/citegeist/cli.py +++ b/src/citegeist/cli.py @@ -8,7 +8,7 @@ import sys from pathlib import Path from .batch import BatchBootstrapRunner, load_batch_jobs -from .bibtex import parse_bibtex, render_bibtex +from .bibtex import BibEntry, parse_bibtex, render_bibtex from .bootstrap import Bootstrapper from .examples.talkorigins import TalkOriginsScraper from .expand import CrossrefExpander, OpenAlexExpander, TopicExpander, _expand_relation_types @@ -56,6 +56,30 @@ def build_parser() -> argparse.ArgumentParser: help="Include DOI-only placeholder records in broad exports", ) + sync_jabref_parser = subparsers.add_parser( + "sync-jabref", + help="Round-trip a JabRef-managed BibTeX file through CiteGeist ingest, enrichment, and export", + ) + sync_jabref_parser.add_argument("input", help="BibTeX file managed in JabRef") + sync_jabref_parser.add_argument("--output", help="Path to write the enriched BibTeX export") + sync_jabref_parser.add_argument( + "--in-place", + action="store_true", + help="Write the enriched BibTeX back to the input file instead of a separate output path", + ) + sync_jabref_parser.add_argument("--status", default="draft", help="Initial review status for newly ingested entries") + sync_jabref_parser.add_argument("--source-label", help="Provenance label for the ingest step") + sync_jabref_parser.add_argument( + "--no-resolve", + action="store_true", + help="Skip metadata resolution after ingest and only re-export the imported entries", + ) + sync_jabref_parser.add_argument( + "--annotate-review", + action="store_true", + help="Add CiteGeist review/status sidecar fields to the exported BibTeX for easier 
JabRef review",
+    )
+
     status_parser = subparsers.add_parser("set-status", help="Set the review status for one entry")
     status_parser.add_argument("citation_key", help="Citation key to update")
     status_parser.add_argument("review_status", help="New review status")
@@ -662,6 +686,17 @@ def main(argv: list[str] | None = None) -> int:
         return _run_show(store, args.citation_key, args.limit, args.provenance, args.conflicts)
     if args.command == "export":
         return _run_export(store, args.citation_keys, args.output, args.include_stubs)
+    if args.command == "sync-jabref":
+        return _run_sync_jabref(
+            store,
+            Path(args.input),
+            Path(args.output) if args.output else None,
+            args.in_place,
+            args.status,
+            args.source_label,
+            args.no_resolve,
+            args.annotate_review,
+        )
     if args.command == "set-status":
         return _run_set_status(store, args.citation_key, args.review_status)
     if args.command == "resolve-conflicts":
@@ -907,6 +942,103 @@ def _run_export(
     return 0
+
+
+def _run_sync_jabref(
+    store: BibliographyStore,
+    input_path: Path,
+    output_path: Path | None,
+    in_place: bool,
+    review_status: str,
+    source_label: str | None,
+    skip_resolve: bool,
+    annotate_review: bool,
+) -> int:
+    """Ingest a JabRef-managed BibTeX file, optionally resolve it, and re-export it.
+
+    Args:
+        store: Bibliography store that receives the ingested entries.
+        input_path: BibTeX file to read.
+        output_path: Export destination; ignored when ``in_place`` is set.
+        in_place: Write the export back to ``input_path``.
+        review_status: Review status handed to ``store.ingest_bibtex``.
+        source_label: Provenance label for the ingest; falls back to the
+            input path when empty or ``None``.
+        skip_resolve: Skip the per-entry metadata resolution pass.
+        annotate_review: Add ``x_citegeist_*`` sidecar fields to the export.
+
+    Returns:
+        0 on success. 1 when no destination was given, the input cannot be
+        read, an in-place export would blank a non-empty input, or at least
+        one entry failed to resolve (the export is still written in that
+        last case).
+    """
+    if in_place:
+        effective_output_path = input_path
+    elif output_path is not None:
+        effective_output_path = output_path
+    else:
+        print("sync-jabref requires --output or --in-place", file=sys.stderr)
+        return 1
+
+    try:
+        text = input_path.read_text(encoding="utf-8")
+    except (OSError, UnicodeDecodeError) as exc:
+        # Report an unreadable input like other CLI errors instead of a traceback.
+        print(f"sync-jabref could not read {input_path}: {exc}", file=sys.stderr)
+        return 1
+    imported_keys = store.ingest_bibtex(
+        text,
+        source_label=source_label or str(input_path),
+        review_status=review_status,
+    )
+
+    resolved_keys: list[str] = []
+    failed_keys: list[str] = []
+    if not skip_resolve:
+        resolver = MetadataResolver()
+        total = len(imported_keys)
+        for index, citation_key in enumerate(imported_keys, start=1):
+            _print_progress("sync-jabref resolving", index, total, citation_key)
+            if _resolve_one(store, resolver, citation_key):
+                resolved_keys.append(citation_key)
+            else:
+                failed_keys.append(citation_key)
+
+    rendered = _render_jabref_sync_export(store, imported_keys, annotate_review=annotate_review)
+    if in_place and not rendered and text.strip():
+        # Nothing exportable came back for a non-empty input; writing now
+        # would replace the user's library with an empty file.
+        print(
+            f"sync-jabref produced no exportable entries; leaving {input_path} unchanged",
+            file=sys.stderr,
+        )
+        return 1
+    effective_output_path.write_text(rendered + ("\n" if rendered else ""), encoding="utf-8")
+    print(
+        json.dumps(
+            {
+                "input": str(input_path),
+                "output": str(effective_output_path),
+                "imported_count": len(imported_keys),
+                "resolved_count": len(resolved_keys),
+                "failed_resolve_count": len(failed_keys),
+                "skipped_resolution": skip_resolve,
+                "annotated_review": annotate_review,
+                "in_place": in_place,
+                "citation_keys": imported_keys,
+            },
+            indent=2,
+        )
+    )
+    return 0 if skip_resolve or not failed_keys else 1
+
+
+def _render_jabref_sync_export(
+    store: BibliographyStore,
+    citation_keys: list[str],
+    *,
+    annotate_review: bool,
+) -> str:
+    """Render the stored entries for ``citation_keys`` as one BibTeX string.
+
+    Keys for which ``store.get_bib_entry`` returns ``None`` are skipped.
+    Returns an empty string when no entry could be loaded.
+    """
+    entries: list[BibEntry] = []
+    for citation_key in citation_keys:
+        entry = store.get_bib_entry(citation_key)
+        if entry is None:
+            continue
+        if annotate_review:
+            entry = _annotated_jabref_entry(store, entry)
+        entries.append(entry)
+    return render_bibtex(entries) if entries else ""
+
+
+def _annotated_jabref_entry(store: BibliographyStore, entry: BibEntry) -> BibEntry:
+    """Return a copy of ``entry`` carrying fresh ``x_citegeist_*`` review fields.
+
+    The copy gets the review status, the number and names of fields with
+    open conflicts, and the source label of the last provenance record,
+    each only when a non-empty value is available. ``entry`` itself is not
+    modified.
+    """
+    row = store.get_entry(entry.citation_key) or {}
+    # Drop sidecar fields inherited from an earlier annotated export so a
+    # stale cue (for example a conflict count for a since-resolved conflict)
+    # cannot survive the round trip.
+    # NOTE(review): presumably ingest stores these like any other field;
+    # confirm against ingest_bibtex.
+    fields = {
+        name: value
+        for name, value in dict(entry.fields).items()
+        if not str(name).lower().startswith("x_citegeist_")
+    }
+    annotated = BibEntry(
+        entry_type=entry.entry_type,
+        citation_key=entry.citation_key,
+        fields=fields,
+    )
+    review_status = str(row.get("review_status") or "")
+    if review_status:
+        annotated.fields["x_citegeist_review_status"] = review_status
+    open_conflicts = store.get_field_conflicts(entry.citation_key, status="open")
+    if open_conflicts:
+        annotated.fields["x_citegeist_open_conflicts"] = str(len(open_conflicts))
+        conflict_fields = sorted(
+            {str(conflict.get("field_name")) for conflict in open_conflicts if conflict.get("field_name")}
+        )
+        if conflict_fields:
+            annotated.fields["x_citegeist_conflict_fields"] = ", ".join(conflict_fields)
+    provenance = store.get_field_provenance(entry.citation_key)
+    if provenance:
+        last_source = str(provenance[-1].get("source_label") or "")
+        if last_source:
+            annotated.fields["x_citegeist_last_source"] = last_source
+    return annotated
+
+
 def
_run_set_status(store: BibliographyStore, citation_key: str, review_status: str) -> int: if not store.set_entry_status(citation_key, review_status): print(f"Entry not found: {citation_key}", file=sys.stderr) diff --git a/src/citegeist/storage.py b/src/citegeist/storage.py index 70b7a3c..dfe4be7 100644 --- a/src/citegeist/storage.py +++ b/src/citegeist/storage.py @@ -1048,6 +1048,9 @@ class BibliographyStore: return None return render_bibtex([entry]) + def get_bib_entry(self, citation_key: str) -> BibEntry | None: + return self._load_bib_entry(citation_key) + def export_bibtex(self, citation_keys: list[str] | None = None, include_stubs: bool | None = None) -> str: return self.export_bibtex_report(citation_keys, include_stubs=include_stubs)["bibtex"] diff --git a/tests/test_cli.py b/tests/test_cli.py index f6622ff..6a0fe92 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -247,6 +247,203 @@ def test_cli_verify_bib_outputs_json(tmp_path: Path): assert payload[0]["entry"]["citation_key"] == "candidate2024" +def test_cli_sync_jabref_ingests_resolves_and_exports(tmp_path: Path): + bib_path = tmp_path / "jabref-library.bib" + bib_path.write_text( + """ +@article{smith2024graphs, + author = {Smith, Jane}, + title = {Graph-first bibliography augmentation}, + year = {2024} +} +""", + encoding="utf-8", + ) + output_path = tmp_path / "jabref-library.enriched.bib" + + class FakeStore: + def __init__(self) -> None: + self.ingest_calls: list[tuple[str, str, str]] = [] + self.closed = False + + def ingest_bibtex(self, text: str, source_label: str, review_status: str) -> list[str]: + self.ingest_calls.append((text, source_label, review_status)) + return ["smith2024graphs"] + + def get_bib_entry(self, citation_key: str): + from citegeist.bibtex import BibEntry + + return BibEntry("article", citation_key, {"title": "Resolved Work"}) + + def get_entry(self, citation_key: str): + return {"citation_key": citation_key, "review_status": "enriched"} + + def 
get_field_conflicts(self, citation_key: str, status: str | None = None): + return [] + + def get_field_provenance(self, citation_key: str): + return [] + + def close(self) -> None: + self.closed = True + + fake_store = FakeStore() + resolve_calls: list[str] = [] + + stdout_buffer = io.StringIO() + with ( + patch("citegeist.cli.BibliographyStore", return_value=fake_store), + patch("citegeist.cli.MetadataResolver"), + patch("citegeist.cli.render_bibtex", return_value="@article{smith2024graphs,\n title = {Resolved Work}\n}"), + patch( + "citegeist.cli._resolve_one", + side_effect=lambda store, resolver, citation_key: resolve_calls.append(citation_key) or True, + ), + redirect_stdout(stdout_buffer), + ): + exit_code = main( + [ + "--db", + str(tmp_path / "library.sqlite3"), + "sync-jabref", + str(bib_path), + "--output", + str(output_path), + "--status", + "draft", + "--source-label", + "jabref:test", + ] + ) + + assert exit_code == 0 + assert fake_store.ingest_calls[0][1:] == ("jabref:test", "draft") + assert resolve_calls == ["smith2024graphs"] + assert "@article{smith2024graphs," in output_path.read_text(encoding="utf-8") + payload = json.loads(stdout_buffer.getvalue()) + assert payload["imported_count"] == 1 + assert payload["resolved_count"] == 1 + assert payload["failed_resolve_count"] == 0 + assert payload["skipped_resolution"] is False + + +def test_cli_sync_jabref_can_skip_resolution(tmp_path: Path): + bib_path = tmp_path / "jabref-library.bib" + bib_path.write_text("@article{seed2024, title = {Seed}}\n", encoding="utf-8") + output_path = tmp_path / "jabref-library.enriched.bib" + + class FakeStore: + def ingest_bibtex(self, text: str, source_label: str, review_status: str) -> list[str]: + return ["seed2024"] + + def get_bib_entry(self, citation_key: str): + from citegeist.bibtex import BibEntry + + return BibEntry("article", citation_key, {"title": "Seed"}) + + def get_entry(self, citation_key: str): + return {"citation_key": citation_key, "review_status": 
"draft"} + + def get_field_conflicts(self, citation_key: str, status: str | None = None): + return [] + + def get_field_provenance(self, citation_key: str): + return [] + + def close(self) -> None: + return None + + stdout_buffer = io.StringIO() + with ( + patch("citegeist.cli.BibliographyStore", return_value=FakeStore()), + patch("citegeist.cli.render_bibtex", return_value="@article{seed2024,\n title = {Seed}\n}"), + patch("citegeist.cli._resolve_one") as mocked_resolve, + redirect_stdout(stdout_buffer), + ): + exit_code = main( + [ + "--db", + str(tmp_path / "library.sqlite3"), + "sync-jabref", + str(bib_path), + "--output", + str(output_path), + "--no-resolve", + ] + ) + + assert exit_code == 0 + mocked_resolve.assert_not_called() + payload = json.loads(stdout_buffer.getvalue()) + assert payload["skipped_resolution"] is True + assert payload["resolved_count"] == 0 + + +def test_cli_sync_jabref_can_annotate_review_fields_and_write_in_place(tmp_path: Path): + bib_path = tmp_path / "jabref-library.bib" + bib_path.write_text("@article{seed2024, title = {Seed}}\n", encoding="utf-8") + + class FakeStore: + def ingest_bibtex(self, text: str, source_label: str, review_status: str) -> list[str]: + return ["seed2024"] + + def get_bib_entry(self, citation_key: str): + from citegeist.bibtex import BibEntry + + return BibEntry("article", citation_key, {"title": "Seed"}) + + def get_entry(self, citation_key: str): + return {"citation_key": citation_key, "review_status": "enriched"} + + def get_field_conflicts(self, citation_key: str, status: str | None = None): + return [{"field_name": "title"}] + + def get_field_provenance(self, citation_key: str): + return [{"source_label": "pubmed:pmid:12345678"}] + + def close(self) -> None: + return None + + stdout_buffer = io.StringIO() + with ( + patch("citegeist.cli.BibliographyStore", return_value=FakeStore()), + patch( + "citegeist.cli.render_bibtex", + side_effect=lambda entries: "\n".join( + [ + "@article{seed2024,", + f" title = 
{{{entries[0].fields['title']}}},", + f" x_citegeist_review_status = {{{entries[0].fields.get('x_citegeist_review_status', '')}}},", + f" x_citegeist_open_conflicts = {{{entries[0].fields.get('x_citegeist_open_conflicts', '')}}},", + f" x_citegeist_last_source = {{{entries[0].fields.get('x_citegeist_last_source', '')}}}", + "}", + ] + ), + ), + patch("citegeist.cli._resolve_one", return_value=True), + redirect_stdout(stdout_buffer), + ): + exit_code = main( + [ + "--db", + str(tmp_path / "library.sqlite3"), + "sync-jabref", + str(bib_path), + "--in-place", + "--annotate-review", + ] + ) + + assert exit_code == 0 + rendered = bib_path.read_text(encoding="utf-8") + assert "x_citegeist_review_status" in rendered + assert "x_citegeist_open_conflicts" in rendered + assert "x_citegeist_last_source" in rendered + payload = json.loads(stdout_buffer.getvalue()) + assert payload["in_place"] is True + assert payload["annotated_review"] is True + + def test_cli_resolve_updates_entry(tmp_path: Path): bib_path = tmp_path / "input.bib" bib_path.write_text(