Extend JabRef sync workflow with review annotations

This commit is contained in:
welsberr 2026-04-07 12:41:54 -04:00
parent b26f662af9
commit 65fde034e1
5 changed files with 370 additions and 1 deletions

View File

@ -176,6 +176,7 @@ PYTHONPATH=src .venv/bin/python -m citegeist verify --bib draft.bib --output ver
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 resolve smith2024graphs
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 resolve-stubs --doi-only --preview --limit 25
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 resolve-stubs --doi-only --all-misc --limit 25
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 sync-jabref my-library.bib --output my-library.enriched.bib
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 topics
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 topic-entries abiogenesis
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 export-topic abiogenesis --output abiogenesis.bib
@ -208,6 +209,22 @@ OpenAlex expansion is also conservative about noisy secondary records. Discoveri
For live-source development, prefer fixture-backed or cache-backed source clients so resolver and expansion work can be exercised repeatedly without re-hitting upstream APIs on every run.
## JabRef Workflow
`citegeist` is not a replacement for JabRef's day-to-day BibTeX editing UX. The intended near-term integration model is file-based round-tripping:
- use JabRef to inspect, edit, and review your main `.bib` library;
- use `citegeist sync-jabref` to ingest that file into CiteGeist, run metadata enrichment against imported entries, and write an enriched `.bib` export back out;
- reopen the enriched file in JabRef for human review and curation.
That keeps JabRef as the primary manual review surface while letting CiteGeist handle source-backed resolution and discovery work that reference managers usually do not automate well.
Useful options for that round trip:
- `--in-place`: overwrite the input `.bib` file instead of writing to a separate export path
- `--annotate-review`: add `x_citegeist_*` sidecar fields such as review status, open-conflict count, and last source label so JabRef can surface CiteGeist review cues directly in the BibTeX record
- `--no-resolve`: skip live metadata resolution and only perform import plus re-export
## Adopted Ideas From Earlier Repos
`citegeist` now absorbs two useful patterns from adjacent bibliography tools while keeping them inside the main Python 3 package boundary:

View File

@ -315,6 +315,26 @@ Limit discoveries per seed:
.venv/bin/python -m citegeist --db library.sqlite3 expand langton1989artificial1 --source openalex --limit 10
```
### JabRef Round Trip
Use JabRef as the main editor and CiteGeist as the enrichment pass:
```bash
.venv/bin/python -m citegeist --db library.sqlite3 sync-jabref my-library.bib --output my-library.enriched.bib
```
Skip resolver calls if you only want a normalized import/export pass:
```bash
.venv/bin/python -m citegeist --db library.sqlite3 sync-jabref my-library.bib --output my-library.enriched.bib --no-resolve
```
Write back to the same file and include CiteGeist review cues as BibTeX sidecar fields for JabRef:
```bash
.venv/bin/python -m citegeist --db library.sqlite3 sync-jabref my-library.bib --in-place --annotate-review
```
## Build A Topic-Centered Bibliography
Purpose: create, expand, inspect, and export a topic slice such as `artificial life`.

View File

@ -8,7 +8,7 @@ import sys
from pathlib import Path
from .batch import BatchBootstrapRunner, load_batch_jobs
from .bibtex import parse_bibtex, render_bibtex
from .bibtex import BibEntry, parse_bibtex, render_bibtex
from .bootstrap import Bootstrapper
from .examples.talkorigins import TalkOriginsScraper
from .expand import CrossrefExpander, OpenAlexExpander, TopicExpander, _expand_relation_types
@ -56,6 +56,30 @@ def build_parser() -> argparse.ArgumentParser:
help="Include DOI-only placeholder records in broad exports",
)
sync_jabref_parser = subparsers.add_parser(
"sync-jabref",
help="Round-trip a JabRef-managed BibTeX file through CiteGeist ingest, enrichment, and export",
)
sync_jabref_parser.add_argument("input", help="BibTeX file managed in JabRef")
sync_jabref_parser.add_argument("--output", help="Path to write the enriched BibTeX export")
sync_jabref_parser.add_argument(
"--in-place",
action="store_true",
help="Write the enriched BibTeX back to the input file instead of a separate output path",
)
sync_jabref_parser.add_argument("--status", default="draft", help="Initial review status for newly ingested entries")
sync_jabref_parser.add_argument("--source-label", help="Provenance label for the ingest step")
sync_jabref_parser.add_argument(
"--no-resolve",
action="store_true",
help="Skip metadata resolution after ingest and only re-export the imported entries",
)
sync_jabref_parser.add_argument(
"--annotate-review",
action="store_true",
help="Add CiteGeist review/status sidecar fields to the exported BibTeX for easier JabRef review",
)
status_parser = subparsers.add_parser("set-status", help="Set the review status for one entry")
status_parser.add_argument("citation_key", help="Citation key to update")
status_parser.add_argument("review_status", help="New review status")
@ -662,6 +686,17 @@ def main(argv: list[str] | None = None) -> int:
return _run_show(store, args.citation_key, args.limit, args.provenance, args.conflicts)
if args.command == "export":
return _run_export(store, args.citation_keys, args.output, args.include_stubs)
if args.command == "sync-jabref":
return _run_sync_jabref(
store,
Path(args.input),
Path(args.output) if args.output else None,
args.in_place,
args.status,
args.source_label,
args.no_resolve,
args.annotate_review,
)
if args.command == "set-status":
return _run_set_status(store, args.citation_key, args.review_status)
if args.command == "resolve-conflicts":
@ -907,6 +942,103 @@ def _run_export(
return 0
def _run_sync_jabref(
store: BibliographyStore,
input_path: Path,
output_path: Path | None,
in_place: bool,
review_status: str,
source_label: str | None,
skip_resolve: bool,
annotate_review: bool,
) -> int:
if in_place:
effective_output_path = input_path
elif output_path is not None:
effective_output_path = output_path
else:
print("sync-jabref requires --output or --in-place", file=sys.stderr)
return 1
text = input_path.read_text(encoding="utf-8")
imported_keys = store.ingest_bibtex(
text,
source_label=source_label or str(input_path),
review_status=review_status,
)
resolved_keys: list[str] = []
failed_keys: list[str] = []
if not skip_resolve:
resolver = MetadataResolver()
total = len(imported_keys)
for index, citation_key in enumerate(imported_keys, start=1):
_print_progress("sync-jabref resolving", index, total, citation_key)
if _resolve_one(store, resolver, citation_key):
resolved_keys.append(citation_key)
else:
failed_keys.append(citation_key)
rendered = _render_jabref_sync_export(store, imported_keys, annotate_review=annotate_review)
effective_output_path.write_text(rendered + ("\n" if rendered else ""), encoding="utf-8")
print(
json.dumps(
{
"input": str(input_path),
"output": str(effective_output_path),
"imported_count": len(imported_keys),
"resolved_count": len(resolved_keys),
"failed_resolve_count": len(failed_keys),
"skipped_resolution": skip_resolve,
"annotated_review": annotate_review,
"in_place": in_place,
"citation_keys": imported_keys,
},
indent=2,
)
)
return 0 if skip_resolve or not failed_keys else 1
def _render_jabref_sync_export(
store: BibliographyStore,
citation_keys: list[str],
*,
annotate_review: bool,
) -> str:
entries: list[BibEntry] = []
for citation_key in citation_keys:
entry = store.get_bib_entry(citation_key)
if entry is None:
continue
if annotate_review:
entry = _annotated_jabref_entry(store, entry)
entries.append(entry)
return render_bibtex(entries) if entries else ""
def _annotated_jabref_entry(store: BibliographyStore, entry: BibEntry) -> BibEntry:
    """Return a copy of ``entry`` with fresh ``x_citegeist_*`` review sidecar fields.

    The input entry is not modified. Sidecar fields written (each only when it
    has a non-empty value):

    - ``x_citegeist_review_status``: ``review_status`` of the stored entry row.
    - ``x_citegeist_open_conflicts``: number of open field conflicts.
    - ``x_citegeist_conflict_fields``: sorted, comma-separated conflicting field names.
    - ``x_citegeist_last_source``: ``source_label`` of the last provenance record.

    Args:
        store: Store queried for the entry row, open conflicts, and provenance.
        entry: Entry to annotate.

    Returns:
        A new ``BibEntry`` with the same type and key plus the sidecar fields.
    """
    sidecar_prefix = "x_citegeist_"
    row = store.get_entry(entry.citation_key) or {}
    # Discard sidecar values already present on the entry. They can only come
    # from an earlier annotated export (e.g. an --in-place file that was
    # ingested again) and would otherwise survive as stale review cues when
    # the corresponding condition below no longer holds.
    carried_fields = {
        name: value
        for name, value in entry.fields.items()
        if not str(name).lower().startswith(sidecar_prefix)
    }
    annotated = BibEntry(
        entry_type=entry.entry_type,
        citation_key=entry.citation_key,
        fields=carried_fields,
    )

    review_status = str(row.get("review_status") or "")
    if review_status:
        annotated.fields["x_citegeist_review_status"] = review_status

    open_conflicts = store.get_field_conflicts(entry.citation_key, status="open")
    if open_conflicts:
        annotated.fields["x_citegeist_open_conflicts"] = str(len(open_conflicts))
        conflict_fields = sorted(
            {str(conflict.get("field_name")) for conflict in open_conflicts if conflict.get("field_name")}
        )
        if conflict_fields:
            annotated.fields["x_citegeist_conflict_fields"] = ", ".join(conflict_fields)

    provenance = store.get_field_provenance(entry.citation_key)
    if provenance:
        # NOTE(review): assumes the store returns provenance oldest-first so
        # the final record is the most recent source — confirm against the store.
        last_source = str(provenance[-1].get("source_label") or "")
        if last_source:
            annotated.fields["x_citegeist_last_source"] = last_source
    return annotated
def _run_set_status(store: BibliographyStore, citation_key: str, review_status: str) -> int:
if not store.set_entry_status(citation_key, review_status):
print(f"Entry not found: {citation_key}", file=sys.stderr)

View File

@ -1048,6 +1048,9 @@ class BibliographyStore:
return None
return render_bibtex([entry])
def get_bib_entry(self, citation_key: str) -> BibEntry | None:
    """Return the entry object for ``citation_key`` as produced by ``_load_bib_entry``.

    Public accessor for callers that need the entry itself rather than
    rendered BibTeX text; the result of ``_load_bib_entry`` is passed through
    unchanged.
    """
    loaded_entry = self._load_bib_entry(citation_key)
    return loaded_entry
def export_bibtex(self, citation_keys: list[str] | None = None, include_stubs: bool | None = None) -> str:
    """Return the ``"bibtex"`` value of the report built by ``export_bibtex_report``.

    Both arguments are forwarded unchanged; ``include_stubs`` is passed by keyword.
    """
    report = self.export_bibtex_report(citation_keys, include_stubs=include_stubs)
    return report["bibtex"]

View File

@ -247,6 +247,203 @@ def test_cli_verify_bib_outputs_json(tmp_path: Path):
assert payload[0]["entry"]["citation_key"] == "candidate2024"
def test_cli_sync_jabref_ingests_resolves_and_exports(tmp_path: Path):
    """sync-jabref ingests the input, resolves each imported key, and writes the export."""
    source_bib = tmp_path / "jabref-library.bib"
    source_bib.write_text(
        """
@article{smith2024graphs,
author = {Smith, Jane},
title = {Graph-first bibliography augmentation},
year = {2024}
}
""",
        encoding="utf-8",
    )
    enriched_bib = tmp_path / "jabref-library.enriched.bib"

    class FakeStore:
        """Store stand-in that records ingest calls and serves one fixed entry."""

        def __init__(self) -> None:
            self.ingest_calls: list[tuple[str, str, str]] = []
            self.closed = False

        def ingest_bibtex(self, text: str, source_label: str, review_status: str) -> list[str]:
            self.ingest_calls.append((text, source_label, review_status))
            return ["smith2024graphs"]

        def get_bib_entry(self, citation_key: str):
            from citegeist.bibtex import BibEntry

            return BibEntry("article", citation_key, {"title": "Resolved Work"})

        def get_entry(self, citation_key: str):
            return {"citation_key": citation_key, "review_status": "enriched"}

        def get_field_conflicts(self, citation_key: str, status: str | None = None):
            return []

        def get_field_provenance(self, citation_key: str):
            return []

        def close(self) -> None:
            self.closed = True

    fake_store = FakeStore()
    resolve_calls: list[str] = []

    def record_resolve(store, resolver, citation_key):
        # Note which key was resolved and report success.
        resolve_calls.append(citation_key)
        return True

    cli_args = [
        "--db",
        str(tmp_path / "library.sqlite3"),
        "sync-jabref",
        str(source_bib),
        "--output",
        str(enriched_bib),
        "--status",
        "draft",
        "--source-label",
        "jabref:test",
    ]
    captured_stdout = io.StringIO()
    with (
        patch("citegeist.cli.BibliographyStore", return_value=fake_store),
        patch("citegeist.cli.MetadataResolver"),
        patch("citegeist.cli.render_bibtex", return_value="@article{smith2024graphs,\n title = {Resolved Work}\n}"),
        patch("citegeist.cli._resolve_one", side_effect=record_resolve),
        redirect_stdout(captured_stdout),
    ):
        exit_code = main(cli_args)

    assert exit_code == 0
    assert fake_store.ingest_calls[0][1:] == ("jabref:test", "draft")
    assert resolve_calls == ["smith2024graphs"]
    assert "@article{smith2024graphs," in enriched_bib.read_text(encoding="utf-8")

    summary = json.loads(captured_stdout.getvalue())
    assert summary["imported_count"] == 1
    assert summary["resolved_count"] == 1
    assert summary["failed_resolve_count"] == 0
    assert summary["skipped_resolution"] is False
def test_cli_sync_jabref_can_skip_resolution(tmp_path: Path):
    """With --no-resolve the resolver is never invoked and the summary says so."""
    source_bib = tmp_path / "jabref-library.bib"
    source_bib.write_text("@article{seed2024, title = {Seed}}\n", encoding="utf-8")
    enriched_bib = tmp_path / "jabref-library.enriched.bib"

    class FakeStore:
        """Store stand-in that serves a single draft entry."""

        def ingest_bibtex(self, text: str, source_label: str, review_status: str) -> list[str]:
            return ["seed2024"]

        def get_bib_entry(self, citation_key: str):
            from citegeist.bibtex import BibEntry

            return BibEntry("article", citation_key, {"title": "Seed"})

        def get_entry(self, citation_key: str):
            return {"citation_key": citation_key, "review_status": "draft"}

        def get_field_conflicts(self, citation_key: str, status: str | None = None):
            return []

        def get_field_provenance(self, citation_key: str):
            return []

        def close(self) -> None:
            return None

    cli_args = [
        "--db",
        str(tmp_path / "library.sqlite3"),
        "sync-jabref",
        str(source_bib),
        "--output",
        str(enriched_bib),
        "--no-resolve",
    ]
    captured_stdout = io.StringIO()
    with (
        patch("citegeist.cli.BibliographyStore", return_value=FakeStore()),
        patch("citegeist.cli.render_bibtex", return_value="@article{seed2024,\n title = {Seed}\n}"),
        patch("citegeist.cli._resolve_one") as mocked_resolve,
        redirect_stdout(captured_stdout),
    ):
        exit_code = main(cli_args)

    assert exit_code == 0
    mocked_resolve.assert_not_called()

    summary = json.loads(captured_stdout.getvalue())
    assert summary["skipped_resolution"] is True
    assert summary["resolved_count"] == 0
def test_cli_sync_jabref_can_annotate_review_fields_and_write_in_place(tmp_path: Path):
    """--in-place --annotate-review writes the review sidecar values into the input file."""
    bib_path = tmp_path / "jabref-library.bib"
    bib_path.write_text("@article{seed2024, title = {Seed}}\n", encoding="utf-8")

    class FakeStore:
        """Store stand-in with one open conflict and one provenance record."""

        def ingest_bibtex(self, text: str, source_label: str, review_status: str) -> list[str]:
            return ["seed2024"]

        def get_bib_entry(self, citation_key: str):
            from citegeist.bibtex import BibEntry

            return BibEntry("article", citation_key, {"title": "Seed"})

        def get_entry(self, citation_key: str):
            return {"citation_key": citation_key, "review_status": "enriched"}

        def get_field_conflicts(self, citation_key: str, status: str | None = None):
            return [{"field_name": "title"}]

        def get_field_provenance(self, citation_key: str):
            return [{"source_label": "pubmed:pmid:12345678"}]

        def close(self) -> None:
            return None

    def fake_render(entries):
        # Emits every sidecar field name unconditionally, so the assertions
        # below must check the rendered values, not just the field names.
        fields = entries[0].fields
        return "\n".join(
            [
                "@article{seed2024,",
                f" title = {{{fields['title']}}},",
                f" x_citegeist_review_status = {{{fields.get('x_citegeist_review_status', '')}}},",
                f" x_citegeist_open_conflicts = {{{fields.get('x_citegeist_open_conflicts', '')}}},",
                f" x_citegeist_last_source = {{{fields.get('x_citegeist_last_source', '')}}}",
                "}",
            ]
        )

    stdout_buffer = io.StringIO()
    with (
        patch("citegeist.cli.BibliographyStore", return_value=FakeStore()),
        # Resolution runs in this test, so keep the real resolver from being
        # constructed, matching the ingest/resolve/export test.
        patch("citegeist.cli.MetadataResolver"),
        patch("citegeist.cli.render_bibtex", side_effect=fake_render),
        patch("citegeist.cli._resolve_one", return_value=True),
        redirect_stdout(stdout_buffer),
    ):
        exit_code = main(
            [
                "--db",
                str(tmp_path / "library.sqlite3"),
                "sync-jabref",
                str(bib_path),
                "--in-place",
                "--annotate-review",
            ]
        )

    assert exit_code == 0
    rendered = bib_path.read_text(encoding="utf-8")
    # Value checks: a bare field-name check would pass even if annotation never ran.
    assert "x_citegeist_review_status = {enriched}" in rendered
    assert "x_citegeist_open_conflicts = {1}" in rendered
    assert "x_citegeist_last_source = {pubmed:pmid:12345678}" in rendered

    payload = json.loads(stdout_buffer.getvalue())
    assert payload["in_place"] is True
    assert payload["annotated_review"] is True
def test_cli_resolve_updates_entry(tmp_path: Path):
bib_path = tmp_path / "input.bib"
bib_path.write_text(