Add citation graph expansion workflow

welsberr 2026-03-19 21:06:12 -04:00
parent ac405943fb
commit 10280a6229
12 changed files with 1339 additions and 16 deletions

View File

@ -46,6 +46,11 @@ The initial repo includes:
- `pybtex`-backed BibTeX parsing and export in a repo-local virtual environment;
- a SQLite-backed bibliography store;
- a small CLI for ingest, search, inspection, and export;
- review-state tracking on entries and per-field ingest provenance;
- first-pass plaintext reference extraction into draft BibTeX;
- identifier-first metadata resolution for DOI, DBLP, and arXiv-backed entries;
- local citation-graph traversal over stored `cites`, `cited_by`, and `crossref` edges;
- Crossref-backed graph expansion that materializes draft referenced works and edge provenance;
- normalized tables for entries, creators, identifiers, and citation relations;
- full-text-search-ready indexing over title, abstract, and fulltext when SQLite FTS5 is available;
- tests covering parsing, ingestion, relation storage, and search.
@ -106,15 +111,19 @@ Or use the CLI directly:
cd citegeist
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 ingest references.bib
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 search "semantic search"
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 show --provenance smith2024graphs
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 set-status smith2024graphs reviewed
PYTHONPATH=src .venv/bin/python -m citegeist extract references.txt --output draft.bib
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 resolve smith2024graphs
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 graph smith2024graphs --relation cites --depth 2 --missing-only
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 expand smith2024graphs --source crossref
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 export --output reviewed.bib
```
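
For scripting, the same workflow can be driven from Python. A minimal sketch follows; the database path, input file, and citation key are illustrative, and graph expansion reaches the Crossref API over the network:

```
from pathlib import Path

from citegeist import BibliographyStore, CrossrefExpander, extract_references
from citegeist.bibtex import render_bibtex

store = BibliographyStore("library.sqlite3")
try:
    # Turn a plaintext reference list into draft BibTeX and ingest it with provenance.
    drafts = extract_references(Path("references.txt").read_text(encoding="utf-8"))
    store.ingest_bibtex(render_bibtex(drafts), source_label="references.txt", review_status="draft")

    # Materialize referenced works and "cites" edges for one seed entry via Crossref.
    for result in CrossrefExpander().expand_entry_references(store, "smith2024graphs"):
        print(result.discovered_citation_key, result.created_entry)

    # Walk the stored citation graph up to two hops from the seed.
    for edge in store.traverse_graph(["smith2024graphs"], relation_types=["cites"], max_depth=2):
        print(edge["target_citation_key"], edge["target_exists"])
finally:
    store.close()
```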
## Near-Term Priorities
- stronger plaintext extraction coverage for more citation styles;
- richer graph expansion from additional external citation sources.
See [ROADMAP.md](./ROADMAP.md) for the prioritized phase plan and rationale.

View File

@ -1,4 +1,15 @@
from .bibtex import BibEntry, parse_bibtex
from .expand import CrossrefExpander
from .extract import extract_references
from .resolve import MetadataResolver, merge_entries
from .storage import BibliographyStore
__all__ = [
"BibEntry",
"BibliographyStore",
"CrossrefExpander",
"MetadataResolver",
"extract_references",
"merge_entries",
"parse_bibtex",
]

View File

@ -1,10 +1,15 @@
from __future__ import annotations
import argparse
from dataclasses import asdict
import json
import sys
from pathlib import Path
from .bibtex import parse_bibtex, render_bibtex
from .expand import CrossrefExpander
from .extract import extract_references
from .resolve import MetadataResolver, merge_entries
from .storage import BibliographyStore
@ -16,6 +21,8 @@ def build_parser() -> argparse.ArgumentParser:
ingest_parser = subparsers.add_parser("ingest", help="Ingest BibTeX into the database")
ingest_parser.add_argument("input", help="BibTeX file to ingest")
ingest_parser.add_argument("--status", default="draft", help="Initial review status")
ingest_parser.add_argument("--source-label", help="Provenance label for this ingest run")
search_parser = subparsers.add_parser("search", help="Search titles, abstracts, and fulltext")
search_parser.add_argument("query", help="Search query")
@ -24,11 +31,49 @@ def build_parser() -> argparse.ArgumentParser:
show_parser = subparsers.add_parser("show", help="Show one entry or list entries")
show_parser.add_argument("citation_key", nargs="?", help="Citation key to show")
show_parser.add_argument("--limit", type=int, default=20, help="Maximum entries when listing")
show_parser.add_argument("--provenance", action="store_true", help="Include field provenance")
export_parser = subparsers.add_parser("export", help="Export entries as BibTeX")
export_parser.add_argument("citation_keys", nargs="*", help="Optional citation keys to export")
export_parser.add_argument("--output", help="Write BibTeX to a file instead of stdout")
status_parser = subparsers.add_parser("set-status", help="Set the review status for one entry")
status_parser.add_argument("citation_key", help="Citation key to update")
status_parser.add_argument("review_status", help="New review status")
extract_parser = subparsers.add_parser("extract", help="Extract draft BibTeX from plaintext references")
extract_parser.add_argument("input", help="Plaintext file containing bibliography-style references")
extract_parser.add_argument("--output", help="Write extracted BibTeX to a file instead of stdout")
resolve_parser = subparsers.add_parser("resolve", help="Enrich stored entries from external metadata sources")
resolve_parser.add_argument("citation_keys", nargs="+", help="Citation keys to enrich")
graph_parser = subparsers.add_parser("graph", help="Traverse citation relations from one or more seed entries")
graph_parser.add_argument("citation_keys", nargs="+", help="Seed citation keys")
graph_parser.add_argument(
"--relation",
action="append",
dest="relations",
choices=["cites", "cited_by", "crossref"],
help="Relation type to traverse; may be passed multiple times",
)
graph_parser.add_argument("--depth", type=int, default=1, help="Maximum traversal depth")
graph_parser.add_argument("--review-status", help="Filter results by target review status")
graph_parser.add_argument(
"--missing-only",
action="store_true",
help="Show only unresolved target nodes that are not yet present in the database",
)
expand_parser = subparsers.add_parser("expand", help="Expand graph edges from external metadata sources")
expand_parser.add_argument("citation_keys", nargs="+", help="Seed citation keys to expand")
expand_parser.add_argument(
"--source",
choices=["crossref"],
default="crossref",
help="External source used for graph expansion",
)
return parser
@ -39,13 +84,30 @@ def main(argv: list[str] | None = None) -> int:
store = BibliographyStore(args.db)
try:
if args.command == "ingest":
return _run_ingest(store, Path(args.input), args.status, args.source_label)
if args.command == "search":
return _run_search(store, args.query, args.limit)
if args.command == "show":
return _run_show(store, args.citation_key, args.limit, args.provenance)
if args.command == "export":
return _run_export(store, args.citation_keys, args.output)
if args.command == "set-status":
return _run_set_status(store, args.citation_key, args.review_status)
if args.command == "extract":
return _run_extract(Path(args.input), args.output)
if args.command == "resolve":
return _run_resolve(store, args.citation_keys)
if args.command == "graph":
return _run_graph(
store,
args.citation_keys,
args.relations,
args.depth,
args.review_status,
args.missing_only,
)
if args.command == "expand":
return _run_expand(store, args.citation_keys, args.source)
finally:
store.close()
@ -53,9 +115,18 @@ def main(argv: list[str] | None = None) -> int:
return 2
def _run_ingest(
store: BibliographyStore,
input_path: Path,
review_status: str,
source_label: str | None,
) -> int:
text = input_path.read_text(encoding="utf-8")
keys = store.ingest_bibtex(
text,
source_label=source_label or str(input_path),
review_status=review_status,
)
for key in keys:
print(key)
return 0
@ -68,12 +139,14 @@ def _run_search(store: BibliographyStore, query: str, limit: int) -> int:
return 0
def _run_show(store: BibliographyStore, citation_key: str | None, limit: int, provenance: bool) -> int:
if citation_key:
entry = store.get_entry(citation_key)
if entry is None:
print(f"Entry not found: {citation_key}", file=sys.stderr)
return 1
if provenance:
entry["field_provenance"] = store.get_field_provenance(citation_key)
print(json.dumps(entry, indent=2, sort_keys=True))
return 0
@ -89,3 +162,89 @@ def _run_export(store: BibliographyStore, citation_keys: list[str], output: str
if rendered:
print(rendered)
return 0
def _run_set_status(store: BibliographyStore, citation_key: str, review_status: str) -> int:
if not store.set_entry_status(citation_key, review_status):
print(f"Entry not found: {citation_key}", file=sys.stderr)
return 1
print(f"{citation_key}\t{review_status}")
return 0
def _run_extract(input_path: Path, output: str | None) -> int:
text = input_path.read_text(encoding="utf-8")
entries = extract_references(text)
rendered = render_bibtex(entries)
if output:
Path(output).write_text(rendered + ("\n" if rendered else ""), encoding="utf-8")
else:
if rendered:
print(rendered)
return 0
def _run_resolve(store: BibliographyStore, citation_keys: list[str]) -> int:
resolver = MetadataResolver()
exit_code = 0
for citation_key in citation_keys:
existing = store.get_entry(citation_key)
if existing is None:
print(f"Entry not found: {citation_key}", file=sys.stderr)
exit_code = 1
continue
bibtex = store.get_entry_bibtex(citation_key)
if not bibtex:
print(f"Entry not renderable: {citation_key}", file=sys.stderr)
exit_code = 1
continue
current_entry = parse_bibtex(bibtex)[0]
resolution = resolver.resolve_entry(current_entry)
if resolution is None:
print(f"No resolver match: {citation_key}", file=sys.stderr)
exit_code = 1
continue
merged = merge_entries(current_entry, resolution.entry)
store.replace_entry(
citation_key,
merged,
source_type=resolution.source_type,
source_label=resolution.source_label,
review_status="enriched",
)
print(f"{citation_key}\t{resolution.source_label}")
return exit_code
def _run_graph(
store: BibliographyStore,
citation_keys: list[str],
relations: list[str] | None,
depth: int,
review_status: str | None,
missing_only: bool,
) -> int:
rows = store.traverse_graph(
citation_keys,
relation_types=relations or ["cites"],
max_depth=depth,
review_status=review_status,
include_missing=True,
)
if missing_only:
rows = [row for row in rows if not row["target_exists"]]
print(json.dumps(rows, indent=2))
return 0
def _run_expand(store: BibliographyStore, citation_keys: list[str], source: str) -> int:
if source != "crossref":
print(f"Unsupported expansion source: {source}", file=sys.stderr)
return 1
expander = CrossrefExpander()
all_results = []
for citation_key in citation_keys:
all_results.extend(expander.expand_entry_references(store, citation_key))
print(json.dumps([asdict(result) for result in all_results], indent=2))
return 0

121
src/citegeist/expand.py Normal file
View File

@ -0,0 +1,121 @@
from __future__ import annotations
import re
from dataclasses import dataclass
from .bibtex import BibEntry
from .resolve import MetadataResolver
from .storage import BibliographyStore
@dataclass(slots=True)
class ExpansionResult:
source_citation_key: str
discovered_citation_key: str
created_entry: bool
relation_type: str
source_label: str
class CrossrefExpander:
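"""Expand stored entries into draft referenced works and 'cites' edges using Crossref reference metadata."""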
def __init__(self, resolver: MetadataResolver | None = None) -> None:
self.resolver = resolver or MetadataResolver()
def expand_entry_references(
self,
store: BibliographyStore,
citation_key: str,
) -> list[ExpansionResult]:
entry = store.get_entry(citation_key)
if entry is None:
return []
doi = entry.get("doi")
if not doi:
return []
payload = self.resolver._get_json( # noqa: SLF001
f"https://api.crossref.org/works/{doi}?mailto=welsberr@gmail.com"
)
references = payload.get("message", {}).get("reference", [])
results: list[ExpansionResult] = []
for index, reference in enumerate(references, start=1):
discovered = _crossref_reference_to_entry(reference, citation_key, index)
created = False
if store.get_entry(discovered.citation_key) is None:
store.upsert_entry(
discovered,
raw_bibtex=None,
source_type="graph_expand",
source_label=f"crossref:references:{doi}",
review_status="draft",
)
store.connection.commit()
created = True
store.add_relation(
citation_key,
discovered.citation_key,
"cites",
source_type="graph_expand",
source_label=f"crossref:references:{doi}",
confidence=1.0 if reference.get("DOI") else 0.6,
)
results.append(
ExpansionResult(
source_citation_key=citation_key,
discovered_citation_key=discovered.citation_key,
created_entry=created,
relation_type="cites",
source_label=f"crossref:references:{doi}",
)
)
return results
def _crossref_reference_to_entry(reference: dict, source_citation_key: str, ordinal: int) -> BibEntry:
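"""Build a draft BibEntry from one Crossref reference record, falling back to a placeholder title."""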
title = (
reference.get("article-title")
or reference.get("volume-title")
or reference.get("journal-title")
or reference.get("unstructured")
or f"Referenced work {ordinal}"
)
year = str(reference.get("year") or "")
author = reference.get("author") or ""
doi = reference.get("DOI") or ""
journal_title = reference.get("journal-title") or ""
fields: dict[str, str] = {
"title": _normalize_text(title),
"note": f"discovered_from = {{{source_citation_key}}}",
}
if year:
fields["year"] = year
if author:
fields["author"] = _normalize_text(author)
if doi:
fields["doi"] = doi
fields["url"] = f"https://doi.org/{doi}"
if journal_title:
fields["journal"] = _normalize_text(journal_title)
citation_key = _reference_citation_key(reference, title, year, ordinal)
entry_type = "article" if journal_title else "misc"
return BibEntry(entry_type=entry_type, citation_key=citation_key, fields=fields)
def _reference_citation_key(reference: dict, title: str, year: str, ordinal: int) -> str:
if doi := reference.get("DOI"):
suffix = re.sub(r"[^A-Za-z0-9]+", "", doi).lower()
return f"doi{suffix}"
author = reference.get("author") or "ref"
family = author.split(",")[0].split()[-1]
family = re.sub(r"[^A-Za-z0-9]+", "", family).lower() or "ref"
first_word = re.sub(r"[^A-Za-z0-9]+", "", title.split()[0]).lower() if title.split() else "untitled"
return f"{family}{year or 'nd'}{first_word}{ordinal}"
def _normalize_text(value: str) -> str:
return " ".join(value.split())

102
src/citegeist/extract.py Normal file
View File

@ -0,0 +1,102 @@
from __future__ import annotations
import re
from .bibtex import BibEntry
YEAR_PATTERN = re.compile(r"\b(19|20)\d{2}\b")
def extract_references(text: str) -> list[BibEntry]:
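"""Parse bibliography-style plaintext lines into draft BibEntry records using lightweight heuristics."""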
entries: list[BibEntry] = []
for index, line in enumerate(_iter_reference_lines(text), start=1):
parsed = _parse_reference_line(line, index)
if parsed is not None:
entries.append(parsed)
return entries
def render_extracted_bibtex(text: str) -> str:
from .bibtex import render_bibtex
return render_bibtex(extract_references(text))
def _iter_reference_lines(text: str) -> list[str]:
lines: list[str] = []
for raw_line in text.splitlines():
line = raw_line.strip()
if not line:
continue
line = re.sub(r"^\[\d+\]\s*", "", line)
line = re.sub(r"^\d+\.\s*", "", line)
line = re.sub(r"^\(\d+\)\s*", "", line)
if len(line) < 20:
continue
lines.append(" ".join(line.split()))
return lines
def _parse_reference_line(line: str, ordinal: int) -> BibEntry | None:
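"""Split a reference line at its first plausible year: authors before it, title then venue after it."""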
year_match = YEAR_PATTERN.search(line)
if year_match is None:
return None
year = year_match.group(0)
author_part = line[: year_match.start()].strip(" .")
remainder = line[year_match.end() :].strip(" .")
if not author_part or not remainder:
return None
segments = [segment.strip(" .") for segment in remainder.split(".") if segment.strip(" .")]
if not segments:
return None
title = segments[0]
venue = segments[1] if len(segments) > 1 else ""
authors = _normalize_authors(author_part)
citation_key = _make_citation_key(authors, year, title, ordinal)
entry_type = _guess_entry_type(venue)
fields: dict[str, str] = {
"author": authors,
"year": year,
"title": title,
"note": f"extracted_reference = {{true}}; raw_reference = {{{line}}}",
}
if venue:
if entry_type == "article":
fields["journal"] = venue
else:
fields["booktitle"] = venue
return BibEntry(entry_type=entry_type, citation_key=citation_key, fields=fields)
def _normalize_authors(author_part: str) -> str:
normalized = author_part.replace(" & ", " and ")
normalized = re.sub(r"\bet al\.$", "and others", normalized)
normalized = re.sub(r"\s+and\s+", " and ", normalized)
normalized = re.sub(r"\s*,\s*", ", ", normalized)
return normalized.strip(" .")
def _make_citation_key(authors: str, year: str, title: str, ordinal: int) -> str:
first_author = authors.split(" and ")[0]
family_name = first_author.split(",")[0] if "," in first_author else first_author.split()[-1]
family_name = re.sub(r"[^A-Za-z0-9]+", "", family_name).lower() or "ref"
first_word = re.sub(r"[^A-Za-z0-9]+", "", title.split()[0]).lower() if title.split() else "untitled"
if not first_word:
first_word = "untitled"
return f"{family_name}{year}{first_word}{ordinal}"
def _guess_entry_type(venue: str) -> str:
lowered = venue.lower()
if any(token in lowered for token in ("journal", "transactions", "review", "letters")):
return "article"
if any(token in lowered for token in ("proceedings", "conference", "workshop", "symposium")):
return "inproceedings"
return "misc"

240
src/citegeist/resolve.py Normal file
View File

@ -0,0 +1,240 @@
from __future__ import annotations
import json
import urllib.parse
import urllib.request
import xml.etree.ElementTree as ET
from dataclasses import dataclass
from .bibtex import BibEntry, parse_bibtex
@dataclass(slots=True)
class Resolution:
entry: BibEntry
source_type: str
source_label: str
class MetadataResolver:
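"""Fetch entry metadata from external sources; resolve_entry tries DOI (Crossref), then DBLP, then arXiv."""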
def __init__(self, user_agent: str = "citegeist/0.1 (local research tool)") -> None:
self.user_agent = user_agent
def resolve_entry(self, entry: BibEntry) -> Resolution | None:
if doi := entry.fields.get("doi"):
resolved = self.resolve_doi(doi)
if resolved is not None:
return resolved
if dblp_key := entry.fields.get("dblp"):
resolved = self.resolve_dblp(dblp_key)
if resolved is not None:
return resolved
if arxiv_id := entry.fields.get("arxiv"):
resolved = self.resolve_arxiv(arxiv_id)
if resolved is not None:
return resolved
return None
def resolve_doi(self, doi: str) -> Resolution | None:
encoded = urllib.parse.quote(doi, safe="")
payload = self._get_json(f"https://api.crossref.org/works/{encoded}")
message = payload.get("message", {})
if not message:
return None
return Resolution(
entry=_crossref_message_to_entry(message),
source_type="resolver",
source_label=f"crossref:doi:{doi}",
)
def search_crossref(self, title: str, limit: int = 5) -> list[BibEntry]:
query = urllib.parse.urlencode({"query.title": title, "rows": limit})
payload = self._get_json(f"https://api.crossref.org/works?{query}")
items = payload.get("message", {}).get("items", [])
return [_crossref_message_to_entry(item) for item in items]
def resolve_dblp(self, dblp_key: str) -> Resolution | None:
encoded_key = urllib.parse.quote(dblp_key, safe="/:")
text = self._get_text(f"https://dblp.org/rec/{encoded_key}.bib")
entries = parse_bibtex(text)
if not entries:
return None
return Resolution(
entry=entries[0],
source_type="resolver",
source_label=f"dblp:key:{dblp_key}",
)
def search_dblp(self, query_text: str, limit: int = 5) -> list[BibEntry]:
query = urllib.parse.urlencode({"q": query_text, "format": "json", "h": limit})
payload = self._get_json(f"https://dblp.org/search/publ/api?{query}")
hits = payload.get("result", {}).get("hits", {}).get("hit", [])
if isinstance(hits, dict):
hits = [hits]
results: list[BibEntry] = []
for hit in hits:
info = hit.get("info", {})
dblp_key = info.get("key")
if dblp_key:
resolved = self.resolve_dblp(dblp_key)
if resolved is not None:
results.append(resolved.entry)
return results
def resolve_arxiv(self, arxiv_id: str) -> Resolution | None:
query = urllib.parse.urlencode({"id_list": arxiv_id})
root = self._get_xml(f"https://export.arxiv.org/api/query?{query}")
namespace = {"atom": "http://www.w3.org/2005/Atom"}
entry = root.find("atom:entry", namespace)
if entry is None:
return None
return Resolution(
entry=_arxiv_atom_entry_to_bib(entry, arxiv_id),
source_type="resolver",
source_label=f"arxiv:id:{arxiv_id}",
)
def _get_json(self, url: str) -> dict:
with urllib.request.urlopen(self._request(url)) as response:
return json.load(response)
def _get_text(self, url: str) -> str:
with urllib.request.urlopen(self._request(url)) as response:
return response.read().decode("utf-8")
def _get_xml(self, url: str) -> ET.Element:
with urllib.request.urlopen(self._request(url)) as response:
return ET.fromstring(response.read())
def _request(self, url: str) -> urllib.request.Request:
return urllib.request.Request(
url,
headers={
"User-Agent": self.user_agent,
},
)
def merge_entries(base: BibEntry, resolved: BibEntry) -> BibEntry:
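"""Keep the base entry's existing field values and fill in only fields the resolved entry adds."""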
merged_fields = dict(base.fields)
for key, value in resolved.fields.items():
if value and (key not in merged_fields or not merged_fields[key]):
merged_fields[key] = value
return BibEntry(
entry_type=base.entry_type or resolved.entry_type,
citation_key=base.citation_key,
fields=merged_fields,
)
def _crossref_message_to_entry(message: dict) -> BibEntry:
entry_type = _crossref_type_to_bibtype(message.get("type", "article"))
title_values = message.get("title", [])
title = title_values[0] if title_values else ""
year = _extract_crossref_year(message)
authors = " and ".join(_crossref_person_to_name(person) for person in message.get("author", []))
venue = ""
if container_title := message.get("container-title", []):
venue = container_title[0]
fields: dict[str, str] = {}
if authors:
fields["author"] = authors
if title:
fields["title"] = title
if year:
fields["year"] = year
if doi := message.get("DOI"):
fields["doi"] = doi
if url := message.get("URL"):
fields["url"] = url
if abstract := message.get("abstract"):
fields["abstract"] = abstract
if venue:
if entry_type == "article":
fields["journal"] = venue
else:
fields["booktitle"] = venue
if volume := message.get("volume"):
fields["volume"] = str(volume)
if issue := message.get("issue"):
fields["number"] = str(issue)
if pages := message.get("page"):
fields["pages"] = str(pages)
citation_key = _make_resolution_key(fields.get("author", "crossref"), year or "n.d.", title or "untitled")
return BibEntry(entry_type=entry_type, citation_key=citation_key, fields=fields)
def _arxiv_atom_entry_to_bib(node: ET.Element, arxiv_id: str) -> BibEntry:
ns = {
"atom": "http://www.w3.org/2005/Atom",
"arxiv": "http://arxiv.org/schemas/atom",
}
title = _node_text(node.find("atom:title", ns))
summary = _node_text(node.find("atom:summary", ns))
published = _node_text(node.find("atom:published", ns))
year = published[:4] if published else ""
authors = " and ".join(
_node_text(author.find("atom:name", ns)) for author in node.findall("atom:author", ns)
)
doi = _node_text(node.find("arxiv:doi", ns))
fields: dict[str, str] = {
"title": title,
"author": authors,
"year": year,
"arxiv": arxiv_id,
"url": f"https://arxiv.org/abs/{arxiv_id}",
"pdf": f"https://arxiv.org/pdf/{arxiv_id}.pdf",
}
if summary:
fields["abstract"] = summary
if doi:
fields["doi"] = doi
return BibEntry(entry_type="article", citation_key=f"arxiv{arxiv_id.replace('.', '').replace('/', '')}", fields=fields)
def _crossref_type_to_bibtype(crossref_type: str) -> str:
mapping = {
"journal-article": "article",
"proceedings-article": "inproceedings",
"book-chapter": "incollection",
"book": "book",
"proceedings": "proceedings",
}
return mapping.get(crossref_type, "misc")
def _extract_crossref_year(message: dict) -> str:
for field_name in ("published-print", "published-online", "issued", "created"):
date_parts = message.get(field_name, {}).get("date-parts", [])
if date_parts and date_parts[0]:
return str(date_parts[0][0])
return ""
def _crossref_person_to_name(person: dict) -> str:
family = person.get("family", "")
given = person.get("given", "")
if family and given:
return f"{family}, {given}"
return family or given
def _node_text(node: ET.Element | None) -> str:
if node is None or node.text is None:
return ""
return " ".join(node.text.split())
def _make_resolution_key(author_text: str, year: str, title: str) -> str:
first_author = author_text.split(" and ")[0]
family_name = first_author.split(",")[0] if "," in first_author else first_author.split()[-1]
family_name = "".join(ch for ch in family_name.lower() if ch.isalnum()) or "ref"
first_word = "".join(ch for ch in title.split()[0].lower() if ch.isalnum()) if title.split() else "untitled"
return f"{family_name}{year}{first_word}"

View File

@ -2,6 +2,7 @@ from __future__ import annotations
import json
import sqlite3
from collections import deque
from collections import OrderedDict
from pathlib import Path
@ -47,6 +48,7 @@ class BibliographyStore:
id INTEGER PRIMARY KEY,
citation_key TEXT NOT NULL UNIQUE,
entry_type TEXT NOT NULL,
review_status TEXT NOT NULL DEFAULT 'draft',
title TEXT,
year TEXT,
journal TEXT,
@ -92,9 +94,34 @@ class BibliographyStore:
relation_type TEXT NOT NULL,
PRIMARY KEY (source_entry_id, target_citation_key, relation_type)
);
CREATE TABLE IF NOT EXISTS field_provenance (
id INTEGER PRIMARY KEY,
entry_id INTEGER NOT NULL REFERENCES entries(id) ON DELETE CASCADE,
field_name TEXT NOT NULL,
field_value TEXT,
source_type TEXT NOT NULL,
source_label TEXT NOT NULL,
operation TEXT NOT NULL,
confidence REAL,
recorded_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
);
CREATE TABLE IF NOT EXISTS relation_provenance (
id INTEGER PRIMARY KEY,
source_entry_id INTEGER NOT NULL REFERENCES entries(id) ON DELETE CASCADE,
target_citation_key TEXT NOT NULL,
relation_type TEXT NOT NULL,
source_type TEXT NOT NULL,
source_label TEXT NOT NULL,
confidence REAL,
recorded_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
);
""" """
) )
self._ensure_entry_columns()
if self._fts5_enabled:
self.connection.execute(
"""
@ -109,24 +136,45 @@ class BibliographyStore:
)
self.connection.commit()
def ingest_bibtex(
self,
text: str,
fulltext_by_key: dict[str, str] | None = None,
source_label: str = "bibtex_import",
review_status: str = "draft",
) -> list[str]:
fulltext_by_key = fulltext_by_key or {}
entries = parse_bibtex(text)
keys: list[str] = []
for entry in entries:
fulltext = fulltext_by_key.get(entry.citation_key)
self.upsert_entry(
entry,
fulltext=fulltext,
raw_bibtex=_entry_to_bibtex(entry),
source_type="bibtex",
source_label=source_label,
review_status=review_status,
)
keys.append(entry.citation_key)
self.connection.commit()
return keys
def upsert_entry(
self,
entry: BibEntry,
fulltext: str | None = None,
raw_bibtex: str | None = None,
source_type: str = "manual",
source_label: str = "manual",
review_status: str = "draft",
) -> int:
row = self.connection.execute(
"""
INSERT INTO entries (
citation_key, entry_type, review_status, title, year, journal, booktitle, publisher,
abstract, keywords, url, doi, isbn, fulltext, raw_bibtex, extra_fields_json
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
ON CONFLICT(citation_key) DO UPDATE SET
entry_type = excluded.entry_type,
title = excluded.title,
@ -148,6 +196,7 @@ class BibliographyStore:
(
entry.citation_key,
entry.entry_type,
review_status,
entry.fields.get("title"), entry.fields.get("title"),
entry.fields.get("year"), entry.fields.get("year"),
entry.fields.get("journal"), entry.fields.get("journal"),
@ -165,6 +214,15 @@ class BibliographyStore:
).fetchone()
entry_id = int(row["id"])
self._record_field_provenance(
entry_id=entry_id,
entry=entry,
source_type=source_type,
source_label=source_label,
operation="upsert",
fulltext=fulltext,
)
self.connection.execute("DELETE FROM entry_creators WHERE entry_id = ?", (entry_id,)) self.connection.execute("DELETE FROM entry_creators WHERE entry_id = ?", (entry_id,))
for role in ("author", "editor"): for role in ("author", "editor"):
names = _split_names(entry.fields.get(role, "")) names = _split_names(entry.fields.get(role, ""))
@ -262,6 +320,64 @@ class BibliographyStore:
).fetchall()
return [str(row["target_citation_key"]) for row in rows]
def traverse_graph(
self,
seed_keys: list[str],
relation_types: list[str] | None = None,
max_depth: int = 1,
review_status: str | None = None,
include_missing: bool = True,
) -> list[dict[str, object]]:
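"""Breadth-first walk over stored relations from the seed keys, reporting each edge up to max_depth hops."""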
relation_types = relation_types or ["cites"]
allowed_relations = set(relation_types)
visited: dict[str, int] = {}
queue: deque[tuple[str, int]] = deque()
for seed_key in seed_keys:
queue.append((seed_key, 0))
visited[seed_key] = 0
results: list[dict[str, object]] = []
while queue:
citation_key, depth = queue.popleft()
if depth >= max_depth:
continue
for edge in self._iter_graph_edges(citation_key, allowed_relations):
target_key = str(edge["target_citation_key"])
target_entry = self.get_entry(target_key)
target_status = target_entry.get("review_status") if target_entry else None
if review_status is not None and target_status != review_status:
if target_entry is not None or not include_missing:
continue
next_depth = depth + 1
result = {
"source_citation_key": citation_key,
"target_citation_key": target_key,
"relation_type": str(edge["relation_type"]),
"depth": next_depth,
"target_exists": target_entry is not None,
"target_review_status": target_status,
"target_title": target_entry.get("title") if target_entry else None,
}
results.append(result)
if target_entry is not None and (target_key not in visited or next_depth < visited[target_key]):
visited[target_key] = next_depth
queue.append((target_key, next_depth))
results.sort(
key=lambda row: (
int(row["depth"]),
str(row["relation_type"]),
str(row["source_citation_key"]),
str(row["target_citation_key"]),
)
)
return results
def get_entry(self, citation_key: str) -> dict[str, object] | None:
row = self.connection.execute(
"SELECT * FROM entries WHERE citation_key = ?",
@ -272,7 +388,7 @@ class BibliographyStore:
def list_entries(self, limit: int = 50) -> list[dict[str, object]]:
rows = self.connection.execute(
"""
SELECT citation_key, entry_type, review_status, title, year
FROM entries
ORDER BY COALESCE(year, ''), citation_key
LIMIT ?
@ -281,6 +397,109 @@ class BibliographyStore:
).fetchall()
return [dict(row) for row in rows]
def set_entry_status(self, citation_key: str, review_status: str) -> bool:
row = self.connection.execute(
"""
UPDATE entries
SET review_status = ?, updated_at = CURRENT_TIMESTAMP
WHERE citation_key = ?
RETURNING id
""",
(review_status, citation_key),
).fetchone()
self.connection.commit()
return row is not None
def replace_entry(
self,
citation_key: str,
entry: BibEntry,
source_type: str,
source_label: str,
review_status: str = "enriched",
) -> bool:
existing = self.get_entry(citation_key)
if existing is None:
return False
replacement = BibEntry(
entry_type=entry.entry_type,
citation_key=citation_key,
fields=entry.fields,
)
self.upsert_entry(
replacement,
fulltext=existing.get("fulltext"),
raw_bibtex=_entry_to_bibtex(replacement),
source_type=source_type,
source_label=source_label,
review_status=review_status,
)
self.connection.commit()
return True
def add_relation(
self,
source_citation_key: str,
target_citation_key: str,
relation_type: str,
source_type: str,
source_label: str,
confidence: float = 1.0,
) -> bool:
row = self.connection.execute(
"SELECT id FROM entries WHERE citation_key = ?",
(source_citation_key,),
).fetchone()
if row is None:
return False
source_entry_id = int(row["id"])
self.connection.execute(
"""
INSERT OR IGNORE INTO relations (source_entry_id, target_citation_key, relation_type)
VALUES (?, ?, ?)
""",
(source_entry_id, target_citation_key, relation_type),
)
self.connection.execute(
"""
INSERT INTO relation_provenance (
source_entry_id, target_citation_key, relation_type, source_type, source_label, confidence
) VALUES (?, ?, ?, ?, ?, ?)
""",
(source_entry_id, target_citation_key, relation_type, source_type, source_label, confidence),
)
self.connection.commit()
return True
def get_field_provenance(self, citation_key: str) -> list[dict[str, object]]:
rows = self.connection.execute(
"""
SELECT fp.field_name, fp.field_value, fp.source_type, fp.source_label,
fp.operation, fp.confidence, fp.recorded_at
FROM field_provenance fp
JOIN entries e ON e.id = fp.entry_id
WHERE e.citation_key = ?
ORDER BY fp.recorded_at, fp.id
""",
(citation_key,),
).fetchall()
return [dict(row) for row in rows]
def get_relation_provenance(self, citation_key: str) -> list[dict[str, object]]:
rows = self.connection.execute(
"""
SELECT rp.target_citation_key, rp.relation_type, rp.source_type, rp.source_label,
rp.confidence, rp.recorded_at
FROM relation_provenance rp
JOIN entries e ON e.id = rp.source_entry_id
WHERE e.citation_key = ?
ORDER BY rp.recorded_at, rp.id
""",
(citation_key,),
).fetchall()
return [dict(row) for row in rows]
def get_entry_bibtex(self, citation_key: str) -> str | None:
entry = self._load_bib_entry(citation_key)
if entry is None:
@ -382,6 +601,72 @@ class BibliographyStore:
).fetchall()
return [str(row["full_name"]) for row in rows]
def _iter_graph_edges(self, citation_key: str, allowed_relations: set[str]) -> list[sqlite3.Row]:
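"""Collect outgoing edges for one entry, deriving reverse 'cited_by' edges from stored 'cites' rows."""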
rows = self.connection.execute(
"""
SELECT e.citation_key AS source_citation_key, r.target_citation_key, r.relation_type
FROM relations r
JOIN entries e ON e.id = r.source_entry_id
WHERE e.citation_key = ? AND r.relation_type IN ({placeholders})
ORDER BY r.relation_type, r.target_citation_key
""".format(placeholders=",".join("?" for _ in allowed_relations)),
(citation_key, *sorted(allowed_relations)),
).fetchall()
reverse_rows = []
if "cited_by" in allowed_relations:
reverse_rows = self.connection.execute(
"""
SELECT ? AS source_citation_key, e.citation_key AS target_citation_key, 'cited_by' AS relation_type
FROM relations r
JOIN entries e ON e.id = r.source_entry_id
WHERE r.target_citation_key = ? AND r.relation_type = 'cites'
ORDER BY e.citation_key
""",
(citation_key, citation_key),
).fetchall()
seen: set[tuple[str, str]] = set()
merged: list[sqlite3.Row] = []
for row in list(rows) + list(reverse_rows):
key = (str(row["relation_type"]), str(row["target_citation_key"]))
if key not in seen:
seen.add(key)
merged.append(row)
return merged
def _ensure_entry_columns(self) -> None:
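"""Lightweight migration: add the review_status column to databases created before it existed."""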
columns = {
row["name"] for row in self.connection.execute("PRAGMA table_info(entries)").fetchall()
}
if "review_status" not in columns:
self.connection.execute(
"ALTER TABLE entries ADD COLUMN review_status TEXT NOT NULL DEFAULT 'draft'"
)
def _record_field_provenance(
self,
entry_id: int,
entry: BibEntry,
source_type: str,
source_label: str,
operation: str,
fulltext: str | None,
) -> None:
field_items = list(entry.fields.items())
if fulltext:
field_items.append(("fulltext", fulltext))
for field_name, field_value in field_items:
self.connection.execute(
"""
INSERT INTO field_provenance (
entry_id, field_name, field_value, source_type, source_label, operation, confidence
) VALUES (?, ?, ?, ?, ?, ?, ?)
""",
(entry_id, field_name, field_value, source_type, source_label, operation, 1.0),
)
def _split_names(value: str) -> list[str]:
if not value:

View File

@ -4,6 +4,9 @@ import json
import subprocess
import sys
from pathlib import Path
from unittest.mock import patch
from citegeist.cli import main
SAMPLE_BIB = """ SAMPLE_BIB = """
@ -59,3 +62,144 @@ def test_cli_ingest_show_search_and_export(tmp_path: Path):
assert export_result.returncode == 0
exported = export_path.read_text(encoding="utf-8")
assert "@article{smith2024graphs," in exported
def test_cli_provenance_and_status_updates(tmp_path: Path):
bib_path = tmp_path / "input.bib"
bib_path.write_text(SAMPLE_BIB, encoding="utf-8")
ingest = run_cli(
tmp_path,
"ingest",
"--status",
"draft",
"--source-label",
"tests/input.bib",
str(bib_path),
)
assert ingest.returncode == 0
show = run_cli(tmp_path, "show", "--provenance", "smith2024graphs")
assert show.returncode == 0
payload = json.loads(show.stdout)
assert payload["review_status"] == "draft"
assert payload["field_provenance"][0]["source_label"] == "tests/input.bib"
status = run_cli(tmp_path, "set-status", "smith2024graphs", "reviewed")
assert status.returncode == 0
assert "reviewed" in status.stdout
def test_cli_resolve_updates_entry(tmp_path: Path):
bib_path = tmp_path / "input.bib"
bib_path.write_text(
"""
@article{smith2024graphs,
author = {Smith, Jane},
title = {Graph-first bibliography augmentation},
year = {2024},
doi = {10.1000/example-doi}
}
""",
encoding="utf-8",
)
ingest = run_cli(tmp_path, "ingest", str(bib_path))
assert ingest.returncode == 0
from citegeist.bibtex import BibEntry
from citegeist.resolve import Resolution
database = tmp_path / "library.sqlite3"
with patch("citegeist.cli.MetadataResolver.resolve_entry") as mocked_resolve:
mocked_resolve.return_value = Resolution(
entry=BibEntry(
entry_type="article",
citation_key="resolvedkey",
fields={
"author": "Smith, Jane",
"title": "Graph-first bibliography augmentation",
"year": "2024",
"doi": "10.1000/example-doi",
"journal": "Journal of Graph Studies",
},
),
source_type="resolver",
source_label="crossref:doi:10.1000/example-doi",
)
exit_code = main(
[
"--db",
str(database),
"resolve",
"smith2024graphs",
]
)
assert exit_code == 0
def test_cli_graph_outputs_missing_targets(tmp_path: Path):
bib_path = tmp_path / "graph.bib"
bib_path.write_text(
"""
@article{seed2024,
author = {Seed, Alice},
title = {Seed Paper},
year = {2024},
references = {known2023, missing2022}
}
@article{known2023,
author = {Known, Bob},
title = {Known Paper},
year = {2023}
}
""",
encoding="utf-8",
)
ingest = run_cli(tmp_path, "ingest", str(bib_path))
assert ingest.returncode == 0
graph = run_cli(tmp_path, "graph", "seed2024", "--missing-only")
assert graph.returncode == 0
payload = json.loads(graph.stdout)
assert len(payload) == 1
assert payload[0]["target_citation_key"] == "missing2022"
assert payload[0]["target_exists"] is False
def test_cli_expand_with_mocked_crossref(tmp_path: Path):
bib_path = tmp_path / "expand.bib"
bib_path.write_text(
"""
@article{seed2024,
author = {Seed, Alice},
title = {Seed Paper},
year = {2024},
doi = {10.1000/seed-doi}
}
""",
encoding="utf-8",
)
ingest = run_cli(tmp_path, "ingest", str(bib_path))
assert ingest.returncode == 0
from citegeist.expand import ExpansionResult
with patch("citegeist.cli.CrossrefExpander.expand_entry_references") as mocked_expand:
mocked_expand.return_value = [
ExpansionResult(
source_citation_key="seed2024",
discovered_citation_key="doi101000exampleref",
created_entry=True,
relation_type="cites",
source_label="crossref:references:10.1000/seed-doi",
)
]
database = tmp_path / "library.sqlite3"
exit_code = main(["--db", str(database), "expand", "seed2024"])
assert exit_code == 0

69
tests/test_expand.py Normal file
View File

@ -0,0 +1,69 @@
from citegeist.expand import CrossrefExpander, _crossref_reference_to_entry
from citegeist.storage import BibliographyStore
def test_crossref_reference_to_entry_prefers_doi_key():
entry = _crossref_reference_to_entry(
{
"DOI": "10.1000/example-ref",
"article-title": "Discovered Reference",
"author": "Doe, Alex",
"year": "2022",
"journal-title": "Journal of Discovery",
},
"seed2024",
1,
)
assert entry.citation_key == "doi101000exampleref"
assert entry.fields["doi"] == "10.1000/example-ref"
assert entry.fields["journal"] == "Journal of Discovery"
def test_crossref_expander_creates_draft_nodes_and_relations():
store = BibliographyStore()
try:
store.ingest_bibtex(
"""
@article{seed2024,
author = {Seed, Alice},
title = {Seed Paper},
year = {2024},
doi = {10.1000/seed-doi}
}
"""
)
expander = CrossrefExpander()
expander.resolver._get_json = lambda _url: { # type: ignore[method-assign]
"message": {
"reference": [
{
"DOI": "10.1000/example-ref",
"article-title": "Discovered Reference",
"author": "Doe, Alex",
"year": "2022",
"journal-title": "Journal of Discovery",
},
{
"unstructured": "Unstructured reference string",
"year": "2021",
},
]
}
}
results = expander.expand_entry_references(store, "seed2024")
assert [result.discovered_citation_key for result in results] == [
"doi101000exampleref",
"ref2021unstructured2",
]
discovered = store.get_entry("doi101000exampleref")
assert discovered is not None
assert discovered["review_status"] == "draft"
assert store.get_relations("seed2024") == ["doi101000exampleref", "ref2021unstructured2"]
relation_provenance = store.get_relation_provenance("seed2024")
assert relation_provenance[0]["source_type"] == "graph_expand"
finally:
store.close()

35
tests/test_extract.py Normal file
View File

@ -0,0 +1,35 @@
from citegeist import extract_references, parse_bibtex
from citegeist.cli import main
SAMPLE_REFERENCES = """
[1] Smith, Jane and Doe, Alex. 2024. Graph-first bibliography augmentation. Journal of Research Systems.
[2] Miller, Sam. 2023. Semantic search for research corpora. Proceedings of the Retrieval Workshop.
"""
def test_extract_references_builds_draft_entries():
entries = extract_references(SAMPLE_REFERENCES)
assert [entry.citation_key for entry in entries] == [
"smith2024graphfirst1",
"miller2023semantic2",
]
assert entries[0].entry_type == "article"
assert entries[0].fields["journal"] == "Journal of Research Systems"
assert entries[1].entry_type == "inproceedings"
assert entries[1].fields["booktitle"] == "Proceedings of the Retrieval Workshop"
def test_extract_cli_writes_bibtex(tmp_path):
input_path = tmp_path / "references.txt"
output_path = tmp_path / "draft.bib"
input_path.write_text(SAMPLE_REFERENCES, encoding="utf-8")
exit_code = main(["extract", str(input_path), "--output", str(output_path)])
assert exit_code == 0
exported = output_path.read_text(encoding="utf-8")
parsed = {entry.citation_key: entry for entry in parse_bibtex(exported)}
assert parsed["smith2024graphfirst1"].fields["journal"] == "Journal of Research Systems"
assert parsed["miller2023semantic2"].fields["booktitle"] == "Proceedings of the Retrieval Workshop"

85
tests/test_resolve.py Normal file
View File

@ -0,0 +1,85 @@
from xml.etree import ElementTree as ET
from citegeist.bibtex import BibEntry
from citegeist.resolve import MetadataResolver, _arxiv_atom_entry_to_bib, _crossref_message_to_entry, merge_entries
def test_crossref_message_to_entry_maps_basic_fields():
entry = _crossref_message_to_entry(
{
"type": "journal-article",
"title": ["Graph-first bibliography augmentation"],
"DOI": "10.1000/example-doi",
"URL": "https://doi.org/10.1000/example-doi",
"container-title": ["Journal of Graph Studies"],
"author": [{"family": "Smith", "given": "Jane"}],
"issued": {"date-parts": [[2024, 5, 1]]},
}
)
assert entry.entry_type == "article"
assert entry.fields["author"] == "Smith, Jane"
assert entry.fields["journal"] == "Journal of Graph Studies"
assert entry.fields["year"] == "2024"
def test_arxiv_atom_entry_to_bib_maps_basic_fields():
xml = ET.fromstring(
"""
<entry xmlns="http://www.w3.org/2005/Atom" xmlns:arxiv="http://arxiv.org/schemas/atom">
<title>Semantic search for research corpora</title>
<summary>Dense retrieval improves recall.</summary>
<published>2023-01-15T00:00:00Z</published>
<author><name>Miller, Sam</name></author>
<arxiv:doi>10.1000/arxiv-example</arxiv:doi>
</entry>
"""
)
entry = _arxiv_atom_entry_to_bib(xml, "2301.12345")
assert entry.fields["author"] == "Miller, Sam"
assert entry.fields["arxiv"] == "2301.12345"
assert entry.fields["doi"] == "10.1000/arxiv-example"
def test_merge_entries_prefers_existing_values_and_adds_missing_fields():
base = BibEntry(
entry_type="article",
citation_key="smith2024graphs",
fields={"title": "Graph-first bibliography augmentation", "doi": "10.1000/example-doi"},
)
resolved = BibEntry(
entry_type="article",
citation_key="otherkey",
fields={"title": "Different title", "journal": "Journal of Graph Studies"},
)
merged = merge_entries(base, resolved)
assert merged.fields["title"] == "Graph-first bibliography augmentation"
assert merged.fields["journal"] == "Journal of Graph Studies"
def test_resolver_tries_doi_before_dblp():
resolver = MetadataResolver()
calls: list[tuple[str, str]] = []
def fake_doi(value: str):
calls.append(("doi", value))
return None
def fake_dblp(value: str):
calls.append(("dblp", value))
return None
resolver.resolve_doi = fake_doi # type: ignore[method-assign]
resolver.resolve_dblp = fake_dblp # type: ignore[method-assign]
resolver.resolve_entry(
BibEntry(
entry_type="article",
citation_key="smith2024graphs",
fields={"doi": "10.1000/example-doi", "dblp": "conf/test/Smith24"},
)
)
assert calls == [("doi", "10.1000/example-doi"), ("dblp", "conf/test/Smith24")]

View File

@ -67,3 +67,66 @@ def test_store_exports_bibtex_from_normalized_rows():
assert parsed["smith2024graphs"].fields["references"] == "miller2023search" assert parsed["smith2024graphs"].fields["references"] == "miller2023search"
finally: finally:
store.close() store.close()
def test_store_records_provenance_and_review_status():
store = BibliographyStore()
try:
store.ingest_bibtex(SAMPLE_BIB, source_label="fixtures/sample.bib", review_status="draft")
entry = store.get_entry("smith2024graphs")
assert entry is not None
assert entry["review_status"] == "draft"
provenance = store.get_field_provenance("smith2024graphs")
assert provenance
assert provenance[0]["source_type"] == "bibtex"
assert provenance[0]["source_label"] == "fixtures/sample.bib"
assert store.set_entry_status("smith2024graphs", "reviewed") is True
updated = store.get_entry("smith2024graphs")
assert updated is not None
assert updated["review_status"] == "reviewed"
finally:
store.close()
def test_store_traverses_graph_and_surfaces_missing_targets():
store = BibliographyStore()
try:
store.ingest_bibtex(
"""
@article{seed2024,
author = {Seed, Alice},
title = {Seed Paper},
year = {2024},
references = {known2023, missing2022}
}
@article{known2023,
author = {Known, Bob},
title = {Known Paper},
year = {2023},
references = {leaf2021}
}
@article{leaf2021,
author = {Leaf, Carol},
title = {Leaf Paper},
year = {2021}
}
""",
review_status="reviewed",
)
rows = store.traverse_graph(["seed2024"], relation_types=["cites"], max_depth=2)
assert [row["target_citation_key"] for row in rows] == [
"known2023",
"missing2022",
"leaf2021",
]
assert rows[1]["target_exists"] is False
assert rows[2]["depth"] == 2
finally:
store.close()