Extend JabRef sync workflow with review annotations

2026-04-07 12:41:54 -04:00 · 2026-04-07 12:41:54 -04:00 · 65fde034e1
parent b26f662af9
commit 65fde034e1
5 changed files with 370 additions and 1 deletions
--- a/README.md
+++ b/README.md
@@ -176,6 +176,7 @@ PYTHONPATH=src .venv/bin/python -m citegeist verify --bib draft.bib --output ver
 PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 resolve smith2024graphs
 PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 resolve-stubs --doi-only --preview --limit 25
 PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 resolve-stubs --doi-only --all-misc --limit 25
+PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 sync-jabref my-library.bib --output my-library.enriched.bib
 PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 topics
 PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 topic-entries abiogenesis
 PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 export-topic abiogenesis --output abiogenesis.bib
@@ -208,6 +209,22 @@ OpenAlex expansion is also conservative about noisy secondary records. Discoveri

 For live-source development, prefer fixture-backed or cache-backed source clients so resolver and expansion work can be exercised repeatedly without re-hitting upstream APIs on every run.

+## JabRef Workflow
+
+`citegeist` is not a replacement for JabRef's day-to-day BibTeX editing UX. The intended near-term integration model is file-based round-tripping:
+
+- use JabRef to inspect, edit, and review your main `.bib` library;
+- use `citegeist sync-jabref` to ingest that file into CiteGeist, run metadata enrichment against imported entries, and write an enriched `.bib` export back out;
+- reopen the enriched file in JabRef for human review and curation.
+
+That keeps JabRef as the primary manual review surface while letting CiteGeist handle source-backed resolution and discovery work that reference managers usually do not automate well.
+
+Useful options for that round trip (one of `--output` or `--in-place` is required):
+
+- `--in-place`: overwrite the input `.bib` file instead of writing to a separate export path
+- `--annotate-review`: add `x_citegeist_*` sidecar fields such as review status, open-conflict count, and last source label so JabRef can surface CiteGeist review cues directly in the BibTeX record
+- `--no-resolve`: skip live metadata resolution and only perform import plus re-export
+
 ## Adopted Ideas From Earlier Repos

 `citegeist` now absorbs two useful patterns from adjacent bibliography tools while keeping them inside the main Python 3 package boundary:
--- a/examples/cli/README.md
+++ b/examples/cli/README.md
@@ -315,6 +315,26 @@ Limit discoveries per seed:
 .venv/bin/python -m citegeist --db library.sqlite3 expand langton1989artificial1 --source openalex --limit 10
 ```

+### JabRef Round Trip
+
+Use JabRef as the main editor and CiteGeist as the enrichment pass:
+
+```bash
+.venv/bin/python -m citegeist --db library.sqlite3 sync-jabref my-library.bib --output my-library.enriched.bib
+```
+
+Skip resolver calls if you only want a normalized import/export pass:
+
+```bash
+.venv/bin/python -m citegeist --db library.sqlite3 sync-jabref my-library.bib --output my-library.enriched.bib --no-resolve
+```
+
+Write back to the same file and include CiteGeist review cues as BibTeX sidecar fields for JabRef:
+
+```bash
+.venv/bin/python -m citegeist --db library.sqlite3 sync-jabref my-library.bib --in-place --annotate-review
+```
+
 ## Build A Topic-Centered Bibliography

 Purpose: create, expand, inspect, and export a topic slice such as `artificial life`.
--- a/src/citegeist/cli.py
+++ b/src/citegeist/cli.py
@@ -8,7 +8,7 @@ import sys
 from pathlib import Path

 from .batch import BatchBootstrapRunner, load_batch_jobs
-from .bibtex import parse_bibtex, render_bibtex
+from .bibtex import BibEntry, parse_bibtex, render_bibtex
 from .bootstrap import Bootstrapper
 from .examples.talkorigins import TalkOriginsScraper
 from .expand import CrossrefExpander, OpenAlexExpander, TopicExpander, _expand_relation_types
@@ -56,6 +56,30 @@ def build_parser() -> argparse.ArgumentParser:
        help="Include DOI-only placeholder records in broad exports",
    )

+    sync_jabref_parser = subparsers.add_parser(
+        "sync-jabref",
+        help="Round-trip a JabRef-managed BibTeX file through CiteGeist ingest, enrichment, and export",
+    )
+    sync_jabref_parser.add_argument("input", help="BibTeX file managed in JabRef")
+    sync_jabref_parser.add_argument("--output", help="Path to write the enriched BibTeX export")
+    sync_jabref_parser.add_argument(
+        "--in-place",
+        action="store_true",
+        help="Write the enriched BibTeX back to the input file instead of a separate output path",
+    )
+    sync_jabref_parser.add_argument("--status", default="draft", help="Initial review status for newly ingested entries")
+    sync_jabref_parser.add_argument("--source-label", help="Provenance label for the ingest step")
+    sync_jabref_parser.add_argument(
+        "--no-resolve",
+        action="store_true",
+        help="Skip metadata resolution after ingest and only re-export the imported entries",
+    )
+    sync_jabref_parser.add_argument(
+        "--annotate-review",
+        action="store_true",
+        help="Add CiteGeist review/status sidecar fields to the exported BibTeX for easier JabRef review",
+    )
+
    status_parser = subparsers.add_parser("set-status", help="Set the review status for one entry")
    status_parser.add_argument("citation_key", help="Citation key to update")
    status_parser.add_argument("review_status", help="New review status")
@@ -662,6 +686,17 @@ def main(argv: list[str] | None = None) -> int:
            return _run_show(store, args.citation_key, args.limit, args.provenance, args.conflicts)
        if args.command == "export":
            return _run_export(store, args.citation_keys, args.output, args.include_stubs)
+        if args.command == "sync-jabref":
+            return _run_sync_jabref(
+                store,
+                Path(args.input),
+                Path(args.output) if args.output else None,
+                args.in_place,
+                args.status,
+                args.source_label,
+                args.no_resolve,
+                args.annotate_review,
+            )
        if args.command == "set-status":
            return _run_set_status(store, args.citation_key, args.review_status)
        if args.command == "resolve-conflicts":
@@ -907,6 +942,103 @@ def _run_export(
    return 0


+def _run_sync_jabref(
+    store: BibliographyStore,
+    input_path: Path,
+    output_path: Path | None,
+    in_place: bool,
+    review_status: str,
+    source_label: str | None,
+    skip_resolve: bool,
+    annotate_review: bool,
+) -> int:
+    """Round-trip a BibTeX file through ingest, optional resolution, and export.
+
+    Reads ``input_path`` as UTF-8, ingests it into ``store``, runs
+    ``_resolve_one`` for every imported key unless ``skip_resolve`` is set,
+    writes the re-rendered entries to the chosen destination, and prints a
+    JSON summary to stdout.
+
+    Args:
+        store: Store that receives the ingested entries and serves the export.
+        input_path: BibTeX file to ingest.
+        output_path: Separate export destination; ignored when ``in_place``.
+        in_place: Write the export back over ``input_path``.
+        review_status: Review status passed to the ingest step.
+        source_label: Provenance label for the ingest step; falls back to
+            ``str(input_path)`` when empty or ``None``.
+        skip_resolve: Skip the per-entry resolution pass.
+        annotate_review: Add ``x_citegeist_*`` sidecar fields to the export.
+
+    Returns:
+        0 on success. 1 when no destination was given, when an in-place write
+        would replace a non-empty input with an empty export, or when
+        resolution ran and at least one key failed to resolve.
+    """
+    if in_place:
+        if output_path is not None:
+            # --in-place has always taken precedence; say so instead of
+            # dropping the requested --output path silently.
+            print("sync-jabref: ignoring --output because --in-place was given", file=sys.stderr)
+        effective_output_path = input_path
+    elif output_path is not None:
+        effective_output_path = output_path
+    else:
+        print("sync-jabref requires --output or --in-place", file=sys.stderr)
+        return 1
+
+    text = input_path.read_text(encoding="utf-8")
+    imported_keys = store.ingest_bibtex(
+        text,
+        source_label=source_label or str(input_path),
+        review_status=review_status,
+    )
+
+    resolved_keys: list[str] = []
+    failed_keys: list[str] = []
+    if not skip_resolve:
+        resolver = MetadataResolver()
+        total = len(imported_keys)
+        for index, citation_key in enumerate(imported_keys, start=1):
+            _print_progress("sync-jabref resolving", index, total, citation_key)
+            if _resolve_one(store, resolver, citation_key):
+                resolved_keys.append(citation_key)
+            else:
+                failed_keys.append(citation_key)
+
+    rendered = _render_jabref_sync_export(store, imported_keys, annotate_review=annotate_review)
+    if in_place and not rendered and text.strip():
+        # Nothing came back from the store for a non-empty library: writing
+        # now would truncate the user's only copy of the file.
+        print(
+            f"sync-jabref produced no entries; refusing to overwrite {input_path}",
+            file=sys.stderr,
+        )
+        return 1
+    effective_output_path.write_text(rendered + ("\n" if rendered else ""), encoding="utf-8")
+    print(
+        json.dumps(
+            {
+                "input": str(input_path),
+                "output": str(effective_output_path),
+                "imported_count": len(imported_keys),
+                "resolved_count": len(resolved_keys),
+                "failed_resolve_count": len(failed_keys),
+                "skipped_resolution": skip_resolve,
+                "annotated_review": annotate_review,
+                "in_place": in_place,
+                "citation_keys": imported_keys,
+            },
+            indent=2,
+        )
+    )
+    # The export is written even when some keys failed to resolve; the exit
+    # code is what signals the partial failure.
+    return 0 if skip_resolve or not failed_keys else 1
+
+
+def _render_jabref_sync_export(
+    store: BibliographyStore,
+    citation_keys: list[str],
+    *,
+    annotate_review: bool,
+) -> str:
+    """Render the stored entries for ``citation_keys`` as one BibTeX string.
+
+    Keys for which ``store.get_bib_entry`` returns ``None`` are left out.
+    When ``annotate_review`` is true each entry is passed through
+    ``_annotated_jabref_entry`` before rendering. Returns ``""`` when no
+    entry could be loaded.
+    """
+    exported: list[BibEntry] = []
+    for key in citation_keys:
+        stored = store.get_bib_entry(key)
+        if stored is not None:
+            exported.append(_annotated_jabref_entry(store, stored) if annotate_review else stored)
+    if not exported:
+        return ""
+    return render_bibtex(exported)
+
+
+def _annotated_jabref_entry(store: BibliographyStore, entry: BibEntry) -> BibEntry:
+    """Return a copy of ``entry`` carrying ``x_citegeist_*`` review sidecar fields.
+
+    The input entry is not modified. Sidecar fields are only written when
+    they have a non-empty value:
+
+    - ``x_citegeist_review_status``: the ``review_status`` of the store row.
+    - ``x_citegeist_open_conflicts``: number of conflicts returned for
+      ``status="open"``.
+    - ``x_citegeist_conflict_fields``: sorted, comma-separated distinct
+      ``field_name`` values of those conflicts.
+    - ``x_citegeist_last_source``: ``source_label`` of the final provenance
+      record (assumes the store returns provenance oldest-first; confirm
+      against ``get_field_provenance``).
+    """
+    row = store.get_entry(entry.citation_key) or {}
+    # Drop sidecar fields left over from an earlier annotated export so a
+    # re-imported file cannot keep reporting, say, conflicts that are closed.
+    base_fields = {
+        name: value
+        for name, value in entry.fields.items()
+        if not name.lower().startswith("x_citegeist_")
+    }
+    annotated = BibEntry(
+        entry_type=entry.entry_type,
+        citation_key=entry.citation_key,
+        fields=base_fields,
+    )
+    review_status = str(row.get("review_status") or "")
+    if review_status:
+        annotated.fields["x_citegeist_review_status"] = review_status
+    open_conflicts = store.get_field_conflicts(entry.citation_key, status="open")
+    if open_conflicts:
+        annotated.fields["x_citegeist_open_conflicts"] = str(len(open_conflicts))
+        conflict_fields = sorted(
+            {str(conflict.get("field_name")) for conflict in open_conflicts if conflict.get("field_name")}
+        )
+        if conflict_fields:
+            annotated.fields["x_citegeist_conflict_fields"] = ", ".join(conflict_fields)
+    provenance = store.get_field_provenance(entry.citation_key)
+    if provenance:
+        last_source = str(provenance[-1].get("source_label") or "")
+        if last_source:
+            annotated.fields["x_citegeist_last_source"] = last_source
+    return annotated
+
+
 def _run_set_status(store: BibliographyStore, citation_key: str, review_status: str) -> int:
    if not store.set_entry_status(citation_key, review_status):
        print(f"Entry not found: {citation_key}", file=sys.stderr)
--- a/src/citegeist/storage.py
+++ b/src/citegeist/storage.py
@@ -1048,6 +1048,9 @@ class BibliographyStore:
            return None
        return render_bibtex([entry])

+    def get_bib_entry(self, citation_key: str) -> BibEntry | None:
+        """Return the entry for ``citation_key`` as a ``BibEntry``.
+
+        Public wrapper around ``_load_bib_entry`` for callers that need the
+        structured entry rather than rendered BibTeX text. The annotation
+        allows ``None``; presumably that means the key is unknown to the
+        store (TODO confirm against ``_load_bib_entry``).
+        """
+        return self._load_bib_entry(citation_key)
+
    def export_bibtex(self, citation_keys: list[str] | None = None, include_stubs: bool | None = None) -> str:
        return self.export_bibtex_report(citation_keys, include_stubs=include_stubs)["bibtex"]

--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -247,6 +247,203 @@ def test_cli_verify_bib_outputs_json(tmp_path: Path):
    assert payload[0]["entry"]["citation_key"] == "candidate2024"


+def test_cli_sync_jabref_ingests_resolves_and_exports(tmp_path: Path):
+    """sync-jabref ingests the file, resolves each imported key, and writes the export.
+
+    Runs ``main`` with ``--output``, ``--status`` and ``--source-label`` against
+    a fake store, then checks the ingest arguments, the resolve calls, the
+    written output file, and the counts in the JSON summary.
+    """
+    bib_path = tmp_path / "jabref-library.bib"
+    bib_path.write_text(
+        """
+@article{smith2024graphs,
+  author = {Smith, Jane},
+  title = {Graph-first bibliography augmentation},
+  year = {2024}
+}
+""",
+        encoding="utf-8",
+    )
+    output_path = tmp_path / "jabref-library.enriched.bib"
+
+    # Minimal stand-in for BibliographyStore: records ingest calls and returns
+    # canned data for the lookups used during export.
+    class FakeStore:
+        def __init__(self) -> None:
+            self.ingest_calls: list[tuple[str, str, str]] = []
+            self.closed = False
+
+        def ingest_bibtex(self, text: str, source_label: str, review_status: str) -> list[str]:
+            self.ingest_calls.append((text, source_label, review_status))
+            return ["smith2024graphs"]
+
+        def get_bib_entry(self, citation_key: str):
+            from citegeist.bibtex import BibEntry
+
+            return BibEntry("article", citation_key, {"title": "Resolved Work"})
+
+        def get_entry(self, citation_key: str):
+            return {"citation_key": citation_key, "review_status": "enriched"}
+
+        def get_field_conflicts(self, citation_key: str, status: str | None = None):
+            return []
+
+        def get_field_provenance(self, citation_key: str):
+            return []
+
+        def close(self) -> None:
+            self.closed = True
+
+    fake_store = FakeStore()
+    resolve_calls: list[str] = []
+
+    stdout_buffer = io.StringIO()
+    # Swap out the store, resolver, renderer, and per-key resolve step so the
+    # command runs entirely against the fakes defined in this test.
+    with (
+        patch("citegeist.cli.BibliographyStore", return_value=fake_store),
+        patch("citegeist.cli.MetadataResolver"),
+        patch("citegeist.cli.render_bibtex", return_value="@article{smith2024graphs,\n  title = {Resolved Work}\n}"),
+        # list.append returns None, so `or True` makes the fake record the key
+        # and still report a successful resolve.
+        patch(
+            "citegeist.cli._resolve_one",
+            side_effect=lambda store, resolver, citation_key: resolve_calls.append(citation_key) or True,
+        ),
+        redirect_stdout(stdout_buffer),
+    ):
+        exit_code = main(
+            [
+                "--db",
+                str(tmp_path / "library.sqlite3"),
+                "sync-jabref",
+                str(bib_path),
+                "--output",
+                str(output_path),
+                "--status",
+                "draft",
+                "--source-label",
+                "jabref:test",
+            ]
+        )
+
+    assert exit_code == 0
+    # Only the label and status are compared; element 0 is the raw file text.
+    assert fake_store.ingest_calls[0][1:] == ("jabref:test", "draft")
+    assert resolve_calls == ["smith2024graphs"]
+    assert "@article{smith2024graphs," in output_path.read_text(encoding="utf-8")
+    payload = json.loads(stdout_buffer.getvalue())
+    assert payload["imported_count"] == 1
+    assert payload["resolved_count"] == 1
+    assert payload["failed_resolve_count"] == 0
+    assert payload["skipped_resolution"] is False
+
+
+def test_cli_sync_jabref_can_skip_resolution(tmp_path: Path):
+    """--no-resolve exits 0 without calling ``_resolve_one`` and reports the skip.
+
+    Checks the exit code, that the patched ``_resolve_one`` was never called,
+    and the ``skipped_resolution`` / ``resolved_count`` values in the JSON
+    summary.
+    """
+    bib_path = tmp_path / "jabref-library.bib"
+    bib_path.write_text("@article{seed2024, title = {Seed}}\n", encoding="utf-8")
+    output_path = tmp_path / "jabref-library.enriched.bib"
+
+    # Minimal stand-in for BibliographyStore returning canned data.
+    class FakeStore:
+        def ingest_bibtex(self, text: str, source_label: str, review_status: str) -> list[str]:
+            return ["seed2024"]
+
+        def get_bib_entry(self, citation_key: str):
+            from citegeist.bibtex import BibEntry
+
+            return BibEntry("article", citation_key, {"title": "Seed"})
+
+        def get_entry(self, citation_key: str):
+            return {"citation_key": citation_key, "review_status": "draft"}
+
+        def get_field_conflicts(self, citation_key: str, status: str | None = None):
+            return []
+
+        def get_field_provenance(self, citation_key: str):
+            return []
+
+        def close(self) -> None:
+            return None
+
+    stdout_buffer = io.StringIO()
+    with (
+        patch("citegeist.cli.BibliographyStore", return_value=FakeStore()),
+        patch("citegeist.cli.render_bibtex", return_value="@article{seed2024,\n  title = {Seed}\n}"),
+        # Patched as a plain mock so the test can assert it is never invoked.
+        patch("citegeist.cli._resolve_one") as mocked_resolve,
+        redirect_stdout(stdout_buffer),
+    ):
+        exit_code = main(
+            [
+                "--db",
+                str(tmp_path / "library.sqlite3"),
+                "sync-jabref",
+                str(bib_path),
+                "--output",
+                str(output_path),
+                "--no-resolve",
+            ]
+        )
+
+    assert exit_code == 0
+    mocked_resolve.assert_not_called()
+    payload = json.loads(stdout_buffer.getvalue())
+    assert payload["skipped_resolution"] is True
+    assert payload["resolved_count"] == 0
+
+
+def test_cli_sync_jabref_can_annotate_review_fields_and_write_in_place(tmp_path: Path):
+    """--in-place --annotate-review rewrites the input file with sidecar values.
+
+    The fake renderer always prints the ``x_citegeist_*`` field names, so the
+    assertions check the rendered *values*; a name-only check would pass even
+    if no annotation had been applied.
+    """
+    bib_path = tmp_path / "jabref-library.bib"
+    bib_path.write_text("@article{seed2024, title = {Seed}}\n", encoding="utf-8")
+
+    # Minimal stand-in for BibliographyStore with one open conflict and one
+    # provenance record, so every sidecar field has a value to render.
+    class FakeStore:
+        def ingest_bibtex(self, text: str, source_label: str, review_status: str) -> list[str]:
+            return ["seed2024"]
+
+        def get_bib_entry(self, citation_key: str):
+            from citegeist.bibtex import BibEntry
+
+            return BibEntry("article", citation_key, {"title": "Seed"})
+
+        def get_entry(self, citation_key: str):
+            return {"citation_key": citation_key, "review_status": "enriched"}
+
+        def get_field_conflicts(self, citation_key: str, status: str | None = None):
+            return [{"field_name": "title"}]
+
+        def get_field_provenance(self, citation_key: str):
+            return [{"source_label": "pubmed:pmid:12345678"}]
+
+        def close(self) -> None:
+            return None
+
+    stdout_buffer = io.StringIO()
+    with (
+        patch("citegeist.cli.BibliographyStore", return_value=FakeStore()),
+        # Resolution runs in this test, so keep the real resolver from being
+        # constructed, as the ingest/resolve/export test does.
+        patch("citegeist.cli.MetadataResolver"),
+        patch(
+            "citegeist.cli.render_bibtex",
+            side_effect=lambda entries: "\n".join(
+                [
+                    "@article{seed2024,",
+                    f"  title = {{{entries[0].fields['title']}}},",
+                    f"  x_citegeist_review_status = {{{entries[0].fields.get('x_citegeist_review_status', '')}}},",
+                    f"  x_citegeist_open_conflicts = {{{entries[0].fields.get('x_citegeist_open_conflicts', '')}}},",
+                    f"  x_citegeist_conflict_fields = {{{entries[0].fields.get('x_citegeist_conflict_fields', '')}}},",
+                    f"  x_citegeist_last_source = {{{entries[0].fields.get('x_citegeist_last_source', '')}}}",
+                    "}",
+                ]
+            ),
+        ),
+        patch("citegeist.cli._resolve_one", return_value=True),
+        redirect_stdout(stdout_buffer),
+    ):
+        exit_code = main(
+            [
+                "--db",
+                str(tmp_path / "library.sqlite3"),
+                "sync-jabref",
+                str(bib_path),
+                "--in-place",
+                "--annotate-review",
+            ]
+        )
+
+    assert exit_code == 0
+    rendered = bib_path.read_text(encoding="utf-8")
+    assert "x_citegeist_review_status = {enriched}" in rendered
+    assert "x_citegeist_open_conflicts = {1}" in rendered
+    assert "x_citegeist_conflict_fields = {title}" in rendered
+    assert "x_citegeist_last_source = {pubmed:pmid:12345678}" in rendered
+    payload = json.loads(stdout_buffer.getvalue())
+    assert payload["in_place"] is True
+    assert payload["annotated_review"] is True
+
+
 def test_cli_resolve_updates_entry(tmp_path: Path):
    bib_path = tmp_path / "input.bib"
    bib_path.write_text(