diff --git a/README.md b/README.md index ef4ece4..0219a33 100644 --- a/README.md +++ b/README.md @@ -46,6 +46,11 @@ The initial repo includes: - `pybtex`-backed BibTeX parsing and export in a repo-local virtual environment; - a SQLite-backed bibliography store; - a small CLI for ingest, search, inspection, and export; +- review-state tracking on entries and per-field ingest provenance; +- first-pass plaintext reference extraction into draft BibTeX; +- identifier-first metadata resolution for DOI, DBLP, and arXiv-backed entries; +- local citation-graph traversal over stored `cites`, `cited_by`, and `crossref` edges; +- Crossref-backed graph expansion that materializes draft referenced works and edge provenance; - normalized tables for entries, creators, identifiers, and citation relations; - full-text-search-ready indexing over title, abstract, and fulltext when SQLite FTS5 is available; - tests covering parsing, ingestion, relation storage, and search. @@ -106,15 +111,19 @@ Or use the CLI directly: cd citegeist PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 ingest references.bib PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 search "semantic search" -PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 show smith2024graphs +PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 show --provenance smith2024graphs +PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 set-status smith2024graphs reviewed +PYTHONPATH=src .venv/bin/python -m citegeist extract references.txt --output draft.bib +PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 resolve smith2024graphs +PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 graph smith2024graphs --relation cites --depth 2 --missing-only +PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 expand smith2024graphs --source crossref PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 export --output 
reviewed.bib ``` ## Near-Term Priorities -- provenance tracking and entry review states; -- plaintext reference extraction into draft BibTeX; -- metadata resolvers for DOI, Crossref, DBLP, and arXiv. +- stronger plaintext extraction coverage for more citation styles; +- richer graph expansion from additional external citation sources. See [ROADMAP.md](./ROADMAP.md) for the prioritized phase plan and rationale. diff --git a/src/citegeist/__init__.py b/src/citegeist/__init__.py index 1681197..4022f8e 100644 --- a/src/citegeist/__init__.py +++ b/src/citegeist/__init__.py @@ -1,4 +1,15 @@ from .bibtex import BibEntry, parse_bibtex +from .expand import CrossrefExpander +from .extract import extract_references +from .resolve import MetadataResolver, merge_entries from .storage import BibliographyStore -__all__ = ["BibEntry", "BibliographyStore", "parse_bibtex"] +__all__ = [ + "BibEntry", + "BibliographyStore", + "CrossrefExpander", + "MetadataResolver", + "extract_references", + "merge_entries", + "parse_bibtex", +] diff --git a/src/citegeist/cli.py b/src/citegeist/cli.py index 0b44015..25c5739 100644 --- a/src/citegeist/cli.py +++ b/src/citegeist/cli.py @@ -1,10 +1,15 @@ from __future__ import annotations import argparse +from dataclasses import asdict import json import sys from pathlib import Path +from .bibtex import parse_bibtex, render_bibtex +from .expand import CrossrefExpander +from .extract import extract_references +from .resolve import MetadataResolver, merge_entries from .storage import BibliographyStore @@ -16,6 +21,8 @@ def build_parser() -> argparse.ArgumentParser: ingest_parser = subparsers.add_parser("ingest", help="Ingest BibTeX into the database") ingest_parser.add_argument("input", help="BibTeX file to ingest") + ingest_parser.add_argument("--status", default="draft", help="Initial review status") + ingest_parser.add_argument("--source-label", help="Provenance label for this ingest run") search_parser = subparsers.add_parser("search", help="Search 
titles, abstracts, and fulltext") search_parser.add_argument("query", help="Search query") @@ -24,11 +31,49 @@ def build_parser() -> argparse.ArgumentParser: show_parser = subparsers.add_parser("show", help="Show one entry or list entries") show_parser.add_argument("citation_key", nargs="?", help="Citation key to show") show_parser.add_argument("--limit", type=int, default=20, help="Maximum entries when listing") + show_parser.add_argument("--provenance", action="store_true", help="Include field provenance") export_parser = subparsers.add_parser("export", help="Export entries as BibTeX") export_parser.add_argument("citation_keys", nargs="*", help="Optional citation keys to export") export_parser.add_argument("--output", help="Write BibTeX to a file instead of stdout") + status_parser = subparsers.add_parser("set-status", help="Set the review status for one entry") + status_parser.add_argument("citation_key", help="Citation key to update") + status_parser.add_argument("review_status", help="New review status") + + extract_parser = subparsers.add_parser("extract", help="Extract draft BibTeX from plaintext references") + extract_parser.add_argument("input", help="Plaintext file containing bibliography-style references") + extract_parser.add_argument("--output", help="Write extracted BibTeX to a file instead of stdout") + + resolve_parser = subparsers.add_parser("resolve", help="Enrich stored entries from external metadata sources") + resolve_parser.add_argument("citation_keys", nargs="+", help="Citation keys to enrich") + + graph_parser = subparsers.add_parser("graph", help="Traverse citation relations from one or more seed entries") + graph_parser.add_argument("citation_keys", nargs="+", help="Seed citation keys") + graph_parser.add_argument( + "--relation", + action="append", + dest="relations", + choices=["cites", "cited_by", "crossref"], + help="Relation type to traverse; may be passed multiple times", + ) + graph_parser.add_argument("--depth", type=int, 
default=1, help="Maximum traversal depth") + graph_parser.add_argument("--review-status", help="Filter results by target review status") + graph_parser.add_argument( + "--missing-only", + action="store_true", + help="Show only unresolved target nodes that are not yet present in the database", + ) + + expand_parser = subparsers.add_parser("expand", help="Expand graph edges from external metadata sources") + expand_parser.add_argument("citation_keys", nargs="+", help="Seed citation keys to expand") + expand_parser.add_argument( + "--source", + choices=["crossref"], + default="crossref", + help="External source used for graph expansion", + ) + return parser @@ -39,13 +84,30 @@ def main(argv: list[str] | None = None) -> int: store = BibliographyStore(args.db) try: if args.command == "ingest": - return _run_ingest(store, Path(args.input)) + return _run_ingest(store, Path(args.input), args.status, args.source_label) if args.command == "search": return _run_search(store, args.query, args.limit) if args.command == "show": - return _run_show(store, args.citation_key, args.limit) + return _run_show(store, args.citation_key, args.limit, args.provenance) if args.command == "export": return _run_export(store, args.citation_keys, args.output) + if args.command == "set-status": + return _run_set_status(store, args.citation_key, args.review_status) + if args.command == "extract": + return _run_extract(Path(args.input), args.output) + if args.command == "resolve": + return _run_resolve(store, args.citation_keys) + if args.command == "graph": + return _run_graph( + store, + args.citation_keys, + args.relations, + args.depth, + args.review_status, + args.missing_only, + ) + if args.command == "expand": + return _run_expand(store, args.citation_keys, args.source) finally: store.close() @@ -53,9 +115,18 @@ def main(argv: list[str] | None = None) -> int: return 2 -def _run_ingest(store: BibliographyStore, input_path: Path) -> int: +def _run_ingest( + store: BibliographyStore, + 
input_path: Path, + review_status: str, + source_label: str | None, +) -> int: text = input_path.read_text(encoding="utf-8") - keys = store.ingest_bibtex(text) + keys = store.ingest_bibtex( + text, + source_label=source_label or str(input_path), + review_status=review_status, + ) for key in keys: print(key) return 0 @@ -68,12 +139,14 @@ def _run_search(store: BibliographyStore, query: str, limit: int) -> int: return 0 -def _run_show(store: BibliographyStore, citation_key: str | None, limit: int) -> int: +def _run_show(store: BibliographyStore, citation_key: str | None, limit: int, provenance: bool) -> int: if citation_key: entry = store.get_entry(citation_key) if entry is None: print(f"Entry not found: {citation_key}", file=sys.stderr) return 1 + if provenance: + entry["field_provenance"] = store.get_field_provenance(citation_key) print(json.dumps(entry, indent=2, sort_keys=True)) return 0 @@ -89,3 +162,89 @@ def _run_export(store: BibliographyStore, citation_keys: list[str], output: str if rendered: print(rendered) return 0 + + +def _run_set_status(store: BibliographyStore, citation_key: str, review_status: str) -> int: + if not store.set_entry_status(citation_key, review_status): + print(f"Entry not found: {citation_key}", file=sys.stderr) + return 1 + print(f"{citation_key}\t{review_status}") + return 0 + + +def _run_extract(input_path: Path, output: str | None) -> int: + text = input_path.read_text(encoding="utf-8") + entries = extract_references(text) + rendered = render_bibtex(entries) + if output: + Path(output).write_text(rendered + ("\n" if rendered else ""), encoding="utf-8") + else: + if rendered: + print(rendered) + return 0 + + +def _run_resolve(store: BibliographyStore, citation_keys: list[str]) -> int: + resolver = MetadataResolver() + exit_code = 0 + for citation_key in citation_keys: + existing = store.get_entry(citation_key) + if existing is None: + print(f"Entry not found: {citation_key}", file=sys.stderr) + exit_code = 1 + continue + bibtex = 
store.get_entry_bibtex(citation_key) + if not bibtex: + print(f"Entry not renderable: {citation_key}", file=sys.stderr) + exit_code = 1 + continue + current_entry = parse_bibtex(bibtex)[0] + resolution = resolver.resolve_entry(current_entry) + if resolution is None: + print(f"No resolver match: {citation_key}", file=sys.stderr) + exit_code = 1 + continue + merged = merge_entries(current_entry, resolution.entry) + store.replace_entry( + citation_key, + merged, + source_type=resolution.source_type, + source_label=resolution.source_label, + review_status="enriched", + ) + print(f"{citation_key}\t{resolution.source_label}") + return exit_code + + +def _run_graph( + store: BibliographyStore, + citation_keys: list[str], + relations: list[str] | None, + depth: int, + review_status: str | None, + missing_only: bool, +) -> int: + rows = store.traverse_graph( + citation_keys, + relation_types=relations or ["cites"], + max_depth=depth, + review_status=review_status, + include_missing=True, + ) + if missing_only: + rows = [row for row in rows if not row["target_exists"]] + print(json.dumps(rows, indent=2)) + return 0 + + +def _run_expand(store: BibliographyStore, citation_keys: list[str], source: str) -> int: + if source != "crossref": + print(f"Unsupported expansion source: {source}", file=sys.stderr) + return 1 + + expander = CrossrefExpander() + all_results = [] + for citation_key in citation_keys: + all_results.extend(expander.expand_entry_references(store, citation_key)) + print(json.dumps([asdict(result) for result in all_results], indent=2)) + return 0 diff --git a/src/citegeist/expand.py b/src/citegeist/expand.py new file mode 100644 index 0000000..82670e8 --- /dev/null +++ b/src/citegeist/expand.py @@ -0,0 +1,121 @@ +from __future__ import annotations + +import re +from dataclasses import dataclass + +from .bibtex import BibEntry +from .resolve import MetadataResolver +from .storage import BibliographyStore + + +@dataclass(slots=True) +class ExpansionResult: + 
class CrossrefExpander:
    """Grow the local citation graph from the reference list Crossref reports.

    For a stored entry that has a DOI, the Crossref work record is fetched and
    each item of its ``reference`` list becomes a ``cites`` edge; referenced
    works not yet in the store are inserted as ``draft`` entries.
    """

    def __init__(self, resolver: MetadataResolver | None = None) -> None:
        # The resolver supplies the HTTP plumbing (headers, JSON decoding).
        self.resolver = resolver or MetadataResolver()

    def expand_entry_references(
        self,
        store: BibliographyStore,
        citation_key: str,
    ) -> list[ExpansionResult]:
        """Record ``cites`` edges for every Crossref reference of ``citation_key``.

        Returns one ``ExpansionResult`` per reference, in Crossref order. An
        empty list is returned when the entry is unknown, has no DOI, or
        Crossref lists no references for it.
        """
        # Local import keeps this block self-contained.
        import urllib.parse

        entry = store.get_entry(citation_key)
        if entry is None:
            return []

        doi = entry.get("doi")
        if not doi:
            return []

        # DOIs may legally contain characters such as '#', '?', '<', '>' or
        # spaces; percent-encode so they cannot be misread as URL syntax.
        # This mirrors what MetadataResolver.resolve_doi does.
        encoded_doi = urllib.parse.quote(str(doi), safe="")
        # NOTE(review): the mailto address is hard-coded; consider making it
        # configurable alongside the resolver's user agent.
        payload = self.resolver._get_json(  # noqa: SLF001
            f"https://api.crossref.org/works/{encoded_doi}?mailto=welsberr@gmail.com"
        )
        references = payload.get("message", {}).get("reference") or []
        source_label = f"crossref:references:{doi}"

        results: list[ExpansionResult] = []
        for index, reference in enumerate(references, start=1):
            discovered = _crossref_reference_to_entry(reference, citation_key, index)
            created = False
            if store.get_entry(discovered.citation_key) is None:
                store.upsert_entry(
                    discovered,
                    raw_bibtex=None,
                    source_type="graph_expand",
                    source_label=source_label,
                    review_status="draft",
                )
                store.connection.commit()
                created = True

            store.add_relation(
                citation_key,
                discovered.citation_key,
                "cites",
                source_type="graph_expand",
                source_label=source_label,
                # A reference carrying its own DOI is an unambiguous match;
                # anything else was reconstructed from free text.
                confidence=1.0 if reference.get("DOI") else 0.6,
            )
            results.append(
                ExpansionResult(
                    source_citation_key=citation_key,
                    discovered_citation_key=discovered.citation_key,
                    created_entry=created,
                    relation_type="cites",
                    source_label=source_label,
                )
            )
        return results
reference.get("DOI") or "" + journal_title = reference.get("journal-title") or "" + + fields: dict[str, str] = { + "title": _normalize_text(title), + "note": f"discovered_from = {{{source_citation_key}}}", + } + if year: + fields["year"] = year + if author: + fields["author"] = _normalize_text(author) + if doi: + fields["doi"] = doi + fields["url"] = f"https://doi.org/{doi}" + if journal_title: + fields["journal"] = _normalize_text(journal_title) + + citation_key = _reference_citation_key(reference, title, year, ordinal) + entry_type = "article" if journal_title else "misc" + return BibEntry(entry_type=entry_type, citation_key=citation_key, fields=fields) + + +def _reference_citation_key(reference: dict, title: str, year: str, ordinal: int) -> str: + if doi := reference.get("DOI"): + suffix = re.sub(r"[^A-Za-z0-9]+", "", doi).lower() + return f"doi{suffix}" + + author = reference.get("author") or "ref" + family = author.split(",")[0].split()[-1] + family = re.sub(r"[^A-Za-z0-9]+", "", family).lower() or "ref" + first_word = re.sub(r"[^A-Za-z0-9]+", "", title.split()[0]).lower() if title.split() else "untitled" + return f"{family}{year or 'nd'}{first_word}{ordinal}" + + +def _normalize_text(value: str) -> str: + return " ".join(value.split()) diff --git a/src/citegeist/extract.py b/src/citegeist/extract.py new file mode 100644 index 0000000..5df2eb6 --- /dev/null +++ b/src/citegeist/extract.py @@ -0,0 +1,102 @@ +from __future__ import annotations + +import re + +from .bibtex import BibEntry + +YEAR_PATTERN = re.compile(r"\b(19|20)\d{2}\b") + + +def extract_references(text: str) -> list[BibEntry]: + entries: list[BibEntry] = [] + for index, line in enumerate(_iter_reference_lines(text), start=1): + parsed = _parse_reference_line(line, index) + if parsed is not None: + entries.append(parsed) + return entries + + +def render_extracted_bibtex(text: str) -> str: + from .bibtex import render_bibtex + + return render_bibtex(extract_references(text)) + + +def 
_iter_reference_lines(text: str) -> list[str]: + lines: list[str] = [] + for raw_line in text.splitlines(): + line = raw_line.strip() + if not line: + continue + line = re.sub(r"^\[\d+\]\s*", "", line) + line = re.sub(r"^\d+\.\s*", "", line) + line = re.sub(r"^\(\d+\)\s*", "", line) + if len(line) < 20: + continue + lines.append(" ".join(line.split())) + return lines + + +def _parse_reference_line(line: str, ordinal: int) -> BibEntry | None: + year_match = YEAR_PATTERN.search(line) + if year_match is None: + return None + + year = year_match.group(0) + author_part = line[: year_match.start()].strip(" .") + remainder = line[year_match.end() :].strip(" .") + if not author_part or not remainder: + return None + + segments = [segment.strip(" .") for segment in remainder.split(".") if segment.strip(" .")] + if not segments: + return None + + title = segments[0] + venue = segments[1] if len(segments) > 1 else "" + + authors = _normalize_authors(author_part) + citation_key = _make_citation_key(authors, year, title, ordinal) + entry_type = _guess_entry_type(venue) + + fields: dict[str, str] = { + "author": authors, + "year": year, + "title": title, + "note": f"extracted_reference = {{true}}; raw_reference = {{{line}}}", + } + if venue: + if entry_type == "article": + fields["journal"] = venue + else: + fields["booktitle"] = venue + + return BibEntry(entry_type=entry_type, citation_key=citation_key, fields=fields) + + +def _normalize_authors(author_part: str) -> str: + normalized = author_part.replace(" & ", " and ") + normalized = re.sub(r"\bet al\.$", "and others", normalized) + normalized = re.sub(r"\s+and\s+", " and ", normalized) + normalized = re.sub(r"\s*,\s*", ", ", normalized) + return normalized.strip(" .") + + +def _make_citation_key(authors: str, year: str, title: str, ordinal: int) -> str: + first_author = authors.split(" and ")[0] + family_name = first_author.split(",")[0] if "," in first_author else first_author.split()[-1] + family_name = 
class MetadataResolver:
    """Fetch bibliographic metadata from Crossref, DBLP, and arXiv over HTTP.

    Every request carries the configured ``User-Agent`` header and is bounded
    by ``timeout`` seconds so that a stalled remote service cannot hang the
    caller indefinitely.
    """

    def __init__(
        self,
        user_agent: str = "citegeist/0.1 (local research tool)",
        timeout: float = 30.0,
    ) -> None:
        self.user_agent = user_agent
        self.timeout = timeout

    def resolve_entry(self, entry: BibEntry) -> Resolution | None:
        """Resolve ``entry`` by its identifiers, trying DOI, then DBLP, then arXiv.

        Returns the first successful resolution, or ``None`` when the entry has
        none of those fields or no source yields a record.
        """
        if doi := entry.fields.get("doi"):
            resolved = self.resolve_doi(doi)
            if resolved is not None:
                return resolved

        if dblp_key := entry.fields.get("dblp"):
            resolved = self.resolve_dblp(dblp_key)
            if resolved is not None:
                return resolved

        if arxiv_id := entry.fields.get("arxiv"):
            resolved = self.resolve_arxiv(arxiv_id)
            if resolved is not None:
                return resolved

        return None

    def resolve_doi(self, doi: str) -> Resolution | None:
        """Look up ``doi`` at Crossref; ``None`` when Crossref has no such work."""
        encoded = urllib.parse.quote(doi, safe="")
        payload = self._fetch_or_none(self._get_json, f"https://api.crossref.org/works/{encoded}")
        if payload is None:
            return None
        message = payload.get("message", {})
        if not message:
            return None
        return Resolution(
            entry=_crossref_message_to_entry(message),
            source_type="resolver",
            source_label=f"crossref:doi:{doi}",
        )

    def search_crossref(self, title: str, limit: int = 5) -> list[BibEntry]:
        """Return up to ``limit`` Crossref works matching ``title``."""
        query = urllib.parse.urlencode({"query.title": title, "rows": limit})
        payload = self._get_json(f"https://api.crossref.org/works?{query}")
        items = payload.get("message", {}).get("items", [])
        return [_crossref_message_to_entry(item) for item in items]

    def resolve_dblp(self, dblp_key: str) -> Resolution | None:
        """Fetch the BibTeX record for ``dblp_key``; ``None`` when DBLP has none."""
        encoded_key = urllib.parse.quote(dblp_key, safe="/:")
        text = self._fetch_or_none(self._get_text, f"https://dblp.org/rec/{encoded_key}.bib")
        if text is None:
            return None
        entries = parse_bibtex(text)
        if not entries:
            return None
        return Resolution(
            entry=entries[0],
            source_type="resolver",
            source_label=f"dblp:key:{dblp_key}",
        )

    def search_dblp(self, query_text: str, limit: int = 5) -> list[BibEntry]:
        """Search DBLP for ``query_text`` and resolve each hit to a full entry."""
        query = urllib.parse.urlencode({"q": query_text, "format": "json", "h": limit})
        payload = self._get_json(f"https://dblp.org/search/publ/api?{query}")
        hits = payload.get("result", {}).get("hits", {}).get("hit", [])
        # Normalize a lone hit delivered as a bare object into a one-item list.
        if isinstance(hits, dict):
            hits = [hits]

        results: list[BibEntry] = []
        for hit in hits:
            info = hit.get("info", {})
            dblp_key = info.get("key")
            if dblp_key:
                resolved = self.resolve_dblp(dblp_key)
                if resolved is not None:
                    results.append(resolved.entry)
        return results

    def resolve_arxiv(self, arxiv_id: str) -> Resolution | None:
        """Query the arXiv Atom API for ``arxiv_id``; ``None`` when the feed is empty."""
        query = urllib.parse.urlencode({"id_list": arxiv_id})
        root = self._get_xml(f"https://export.arxiv.org/api/query?{query}")
        namespace = {"atom": "http://www.w3.org/2005/Atom"}
        entry = root.find("atom:entry", namespace)
        if entry is None:
            return None
        return Resolution(
            entry=_arxiv_atom_entry_to_bib(entry, arxiv_id),
            source_type="resolver",
            source_label=f"arxiv:id:{arxiv_id}",
        )

    @staticmethod
    def _fetch_or_none(fetch, url: str):
        """Return ``fetch(url)``, mapping an HTTP 404 to ``None``.

        A 404 means "no such record", which the resolve methods report as
        ``None``. Every other HTTP error is a real failure and propagates.
        """
        # Local import keeps this block self-contained.
        from urllib.error import HTTPError

        try:
            return fetch(url)
        except HTTPError as error:
            if error.code != 404:
                raise
            return None

    def _get_json(self, url: str) -> dict:
        """GET ``url`` and decode the response body as JSON."""
        with urllib.request.urlopen(self._request(url), timeout=self.timeout) as response:
            return json.load(response)

    def _get_text(self, url: str) -> str:
        """GET ``url`` and return the response body decoded as UTF-8."""
        with urllib.request.urlopen(self._request(url), timeout=self.timeout) as response:
            return response.read().decode("utf-8")

    def _get_xml(self, url: str) -> ET.Element:
        """GET ``url`` and parse the response body into an element tree root."""
        with urllib.request.urlopen(self._request(url), timeout=self.timeout) as response:
            return ET.fromstring(response.read())

    def _request(self, url: str) -> urllib.request.Request:
        """Build a request for ``url`` carrying the configured User-Agent."""
        return urllib.request.Request(
            url,
            headers={
                "User-Agent": self.user_agent,
            },
        )
"untitled") + return BibEntry(entry_type=entry_type, citation_key=citation_key, fields=fields) + + +def _arxiv_atom_entry_to_bib(node: ET.Element, arxiv_id: str) -> BibEntry: + ns = { + "atom": "http://www.w3.org/2005/Atom", + "arxiv": "http://arxiv.org/schemas/atom", + } + title = _node_text(node.find("atom:title", ns)) + summary = _node_text(node.find("atom:summary", ns)) + published = _node_text(node.find("atom:published", ns)) + year = published[:4] if published else "" + authors = " and ".join( + _node_text(author.find("atom:name", ns)) for author in node.findall("atom:author", ns) + ) + doi = _node_text(node.find("arxiv:doi", ns)) + + fields: dict[str, str] = { + "title": title, + "author": authors, + "year": year, + "arxiv": arxiv_id, + "url": f"https://arxiv.org/abs/{arxiv_id}", + "pdf": f"https://arxiv.org/pdf/{arxiv_id}.pdf", + } + if summary: + fields["abstract"] = summary + if doi: + fields["doi"] = doi + return BibEntry(entry_type="article", citation_key=f"arxiv{arxiv_id.replace('.', '').replace('/', '')}", fields=fields) + + +def _crossref_type_to_bibtype(crossref_type: str) -> str: + mapping = { + "journal-article": "article", + "proceedings-article": "inproceedings", + "book-chapter": "incollection", + "book": "book", + "proceedings": "proceedings", + } + return mapping.get(crossref_type, "misc") + + +def _extract_crossref_year(message: dict) -> str: + for field_name in ("published-print", "published-online", "issued", "created"): + date_parts = message.get(field_name, {}).get("date-parts", []) + if date_parts and date_parts[0]: + return str(date_parts[0][0]) + return "" + + +def _crossref_person_to_name(person: dict) -> str: + family = person.get("family", "") + given = person.get("given", "") + if family and given: + return f"{family}, {given}" + return family or given + + +def _node_text(node: ET.Element | None) -> str: + if node is None or node.text is None: + return "" + return " ".join(node.text.split()) + + +def 
_make_resolution_key(author_text: str, year: str, title: str) -> str: + first_author = author_text.split(" and ")[0] + family_name = first_author.split(",")[0] if "," in first_author else first_author.split()[-1] + family_name = "".join(ch for ch in family_name.lower() if ch.isalnum()) or "ref" + first_word = "".join(ch for ch in title.split()[0].lower() if ch.isalnum()) if title.split() else "untitled" + return f"{family_name}{year}{first_word}" diff --git a/src/citegeist/storage.py b/src/citegeist/storage.py index 203b2d7..10fdc6a 100644 --- a/src/citegeist/storage.py +++ b/src/citegeist/storage.py @@ -2,6 +2,7 @@ from __future__ import annotations import json import sqlite3 +from collections import deque from collections import OrderedDict from pathlib import Path @@ -47,6 +48,7 @@ class BibliographyStore: id INTEGER PRIMARY KEY, citation_key TEXT NOT NULL UNIQUE, entry_type TEXT NOT NULL, + review_status TEXT NOT NULL DEFAULT 'draft', title TEXT, year TEXT, journal TEXT, @@ -92,9 +94,34 @@ class BibliographyStore: relation_type TEXT NOT NULL, PRIMARY KEY (source_entry_id, target_citation_key, relation_type) ); + + CREATE TABLE IF NOT EXISTS field_provenance ( + id INTEGER PRIMARY KEY, + entry_id INTEGER NOT NULL REFERENCES entries(id) ON DELETE CASCADE, + field_name TEXT NOT NULL, + field_value TEXT, + source_type TEXT NOT NULL, + source_label TEXT NOT NULL, + operation TEXT NOT NULL, + confidence REAL, + recorded_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP + ); + + CREATE TABLE IF NOT EXISTS relation_provenance ( + id INTEGER PRIMARY KEY, + source_entry_id INTEGER NOT NULL REFERENCES entries(id) ON DELETE CASCADE, + target_citation_key TEXT NOT NULL, + relation_type TEXT NOT NULL, + source_type TEXT NOT NULL, + source_label TEXT NOT NULL, + confidence REAL, + recorded_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP + ); """ ) + self._ensure_entry_columns() + if self._fts5_enabled: self.connection.execute( """ @@ -109,24 +136,45 @@ class BibliographyStore: ) 
def ingest_bibtex(
    self,
    text: str,
    fulltext_by_key: dict[str, str] | None = None,
    source_label: str = "bibtex_import",
    review_status: str = "draft",
) -> list[str]:
    """Parse ``text`` as BibTeX and upsert every entry it contains.

    Each entry is stored with source type ``bibtex``, the given
    ``source_label`` and ``review_status``, and any full text registered for
    its citation key in ``fulltext_by_key``. All upserts are committed
    together at the end.

    Returns the citation keys in the order they were parsed.
    """
    fulltext_lookup = fulltext_by_key or {}
    ingested_keys: list[str] = []

    for parsed in parse_bibtex(text):
        key = parsed.citation_key
        self.upsert_entry(
            parsed,
            fulltext=fulltext_lookup.get(key),
            raw_bibtex=_entry_to_bibtex(parsed),
            source_type="bibtex",
            source_label=source_label,
            review_status=review_status,
        )
        ingested_keys.append(key)

    self.connection.commit()
    return ingested_keys
def traverse_graph(
    self,
    seed_keys: list[str],
    relation_types: list[str] | None = None,
    max_depth: int = 1,
    review_status: str | None = None,
    include_missing: bool = True,
) -> list[dict[str, object]]:
    """Walk stored relations breadth-first from ``seed_keys``.

    Args:
        seed_keys: Citation keys to start from; duplicates are ignored.
        relation_types: Relation types to follow; defaults to ``["cites"]``.
        max_depth: Maximum number of hops away from a seed.
        review_status: When given, keep only edges whose existing target has
            this review status. Targets filtered out are not traversed
            further.
        include_missing: Whether to report edges whose target key has no
            entry in the database.

    Returns:
        One dict per reported edge (source, target, relation type, depth, and
        the target's existence, review status and title), sorted by depth,
        relation type, source key, then target key.
    """
    allowed_relations = set(relation_types or ["cites"])
    visited: dict[str, int] = {}
    queue: deque[tuple[str, int]] = deque()

    for seed_key in seed_keys:
        # A repeated seed would be expanded twice and duplicate every row.
        if seed_key in visited:
            continue
        visited[seed_key] = 0
        queue.append((seed_key, 0))

    results: list[dict[str, object]] = []
    while queue:
        citation_key, depth = queue.popleft()
        if depth >= max_depth:
            continue

        next_depth = depth + 1
        for edge in self._iter_graph_edges(citation_key, allowed_relations):
            target_key = str(edge["target_citation_key"])
            target_entry = self.get_entry(target_key)
            target_status = target_entry.get("review_status") if target_entry else None

            if target_entry is None:
                # Honor include_missing even when no status filter is active.
                if not include_missing:
                    continue
            elif review_status is not None and target_status != review_status:
                continue

            results.append(
                {
                    "source_citation_key": citation_key,
                    "target_citation_key": target_key,
                    "relation_type": str(edge["relation_type"]),
                    "depth": next_depth,
                    "target_exists": target_entry is not None,
                    "target_review_status": target_status,
                    "target_title": target_entry.get("title") if target_entry else None,
                }
            )

            # Breadth-first order reaches every key at its minimum depth
            # first, so a key only needs to be queued once.
            if target_entry is not None and target_key not in visited:
                visited[target_key] = next_depth
                queue.append((target_key, next_depth))

    results.sort(
        key=lambda row: (
            int(row["depth"]),
            str(row["relation_type"]),
            str(row["source_citation_key"]),
            str(row["target_citation_key"]),
        )
    )
    return results
+ RETURNING id + """, + (review_status, citation_key), + ).fetchone() + self.connection.commit() + return row is not None + + def replace_entry( + self, + citation_key: str, + entry: BibEntry, + source_type: str, + source_label: str, + review_status: str = "enriched", + ) -> bool: + existing = self.get_entry(citation_key) + if existing is None: + return False + replacement = BibEntry( + entry_type=entry.entry_type, + citation_key=citation_key, + fields=entry.fields, + ) + self.upsert_entry( + replacement, + fulltext=existing.get("fulltext"), + raw_bibtex=_entry_to_bibtex(replacement), + source_type=source_type, + source_label=source_label, + review_status=review_status, + ) + self.connection.commit() + return True + + def add_relation( + self, + source_citation_key: str, + target_citation_key: str, + relation_type: str, + source_type: str, + source_label: str, + confidence: float = 1.0, + ) -> bool: + row = self.connection.execute( + "SELECT id FROM entries WHERE citation_key = ?", + (source_citation_key,), + ).fetchone() + if row is None: + return False + + source_entry_id = int(row["id"]) + self.connection.execute( + """ + INSERT OR IGNORE INTO relations (source_entry_id, target_citation_key, relation_type) + VALUES (?, ?, ?) + """, + (source_entry_id, target_citation_key, relation_type), + ) + self.connection.execute( + """ + INSERT INTO relation_provenance ( + source_entry_id, target_citation_key, relation_type, source_type, source_label, confidence + ) VALUES (?, ?, ?, ?, ?, ?) + """, + (source_entry_id, target_citation_key, relation_type, source_type, source_label, confidence), + ) + self.connection.commit() + return True + + def get_field_provenance(self, citation_key: str) -> list[dict[str, object]]: + rows = self.connection.execute( + """ + SELECT fp.field_name, fp.field_value, fp.source_type, fp.source_label, + fp.operation, fp.confidence, fp.recorded_at + FROM field_provenance fp + JOIN entries e ON e.id = fp.entry_id + WHERE e.citation_key = ? 
+ ORDER BY fp.recorded_at, fp.id + """, + (citation_key,), + ).fetchall() + return [dict(row) for row in rows] + + def get_relation_provenance(self, citation_key: str) -> list[dict[str, object]]: + rows = self.connection.execute( + """ + SELECT rp.target_citation_key, rp.relation_type, rp.source_type, rp.source_label, + rp.confidence, rp.recorded_at + FROM relation_provenance rp + JOIN entries e ON e.id = rp.source_entry_id + WHERE e.citation_key = ? + ORDER BY rp.recorded_at, rp.id + """, + (citation_key,), + ).fetchall() + return [dict(row) for row in rows] + def get_entry_bibtex(self, citation_key: str) -> str | None: entry = self._load_bib_entry(citation_key) if entry is None: @@ -382,6 +601,72 @@ class BibliographyStore: ).fetchall() return [str(row["full_name"]) for row in rows] + def _iter_graph_edges(self, citation_key: str, allowed_relations: set[str]) -> list[sqlite3.Row]: + rows = self.connection.execute( + """ + SELECT e.citation_key AS source_citation_key, r.target_citation_key, r.relation_type + FROM relations r + JOIN entries e ON e.id = r.source_entry_id + WHERE e.citation_key = ? AND r.relation_type IN ({placeholders}) + ORDER BY r.relation_type, r.target_citation_key + """.format(placeholders=",".join("?" for _ in allowed_relations)), + (citation_key, *sorted(allowed_relations)), + ).fetchall() + + reverse_rows = [] + if "cited_by" in allowed_relations: + reverse_rows = self.connection.execute( + """ + SELECT ? AS source_citation_key, e.citation_key AS target_citation_key, 'cited_by' AS relation_type + FROM relations r + JOIN entries e ON e.id = r.source_entry_id + WHERE r.target_citation_key = ? 
AND r.relation_type = 'cites' + ORDER BY e.citation_key + """, + (citation_key, citation_key), + ).fetchall() + + seen: set[tuple[str, str]] = set() + merged: list[sqlite3.Row] = [] + for row in list(rows) + list(reverse_rows): + key = (str(row["relation_type"]), str(row["target_citation_key"])) + if key not in seen: + seen.add(key) + merged.append(row) + return merged + + def _ensure_entry_columns(self) -> None: + columns = { + row["name"] for row in self.connection.execute("PRAGMA table_info(entries)").fetchall() + } + if "review_status" not in columns: + self.connection.execute( + "ALTER TABLE entries ADD COLUMN review_status TEXT NOT NULL DEFAULT 'draft'" + ) + + def _record_field_provenance( + self, + entry_id: int, + entry: BibEntry, + source_type: str, + source_label: str, + operation: str, + fulltext: str | None, + ) -> None: + field_items = list(entry.fields.items()) + if fulltext: + field_items.append(("fulltext", fulltext)) + + for field_name, field_value in field_items: + self.connection.execute( + """ + INSERT INTO field_provenance ( + entry_id, field_name, field_value, source_type, source_label, operation, confidence + ) VALUES (?, ?, ?, ?, ?, ?, ?) 
+ """, + (entry_id, field_name, field_value, source_type, source_label, operation, 1.0), + ) + def _split_names(value: str) -> list[str]: if not value: diff --git a/tests/test_cli.py b/tests/test_cli.py index afc7091..bfc1734 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -4,6 +4,9 @@ import json import subprocess import sys from pathlib import Path +from unittest.mock import patch + +from citegeist.cli import main SAMPLE_BIB = """ @@ -59,3 +62,144 @@ def test_cli_ingest_show_search_and_export(tmp_path: Path): assert export_result.returncode == 0 exported = export_path.read_text(encoding="utf-8") assert "@article{smith2024graphs," in exported + + +def test_cli_provenance_and_status_updates(tmp_path: Path): + bib_path = tmp_path / "input.bib" + bib_path.write_text(SAMPLE_BIB, encoding="utf-8") + + ingest = run_cli( + tmp_path, + "ingest", + "--status", + "draft", + "--source-label", + "tests/input.bib", + str(bib_path), + ) + assert ingest.returncode == 0 + + show = run_cli(tmp_path, "show", "--provenance", "smith2024graphs") + assert show.returncode == 0 + payload = json.loads(show.stdout) + assert payload["review_status"] == "draft" + assert payload["field_provenance"][0]["source_label"] == "tests/input.bib" + + status = run_cli(tmp_path, "set-status", "smith2024graphs", "reviewed") + assert status.returncode == 0 + assert "reviewed" in status.stdout + + +def test_cli_resolve_updates_entry(tmp_path: Path): + bib_path = tmp_path / "input.bib" + bib_path.write_text( + """ +@article{smith2024graphs, + author = {Smith, Jane}, + title = {Graph-first bibliography augmentation}, + year = {2024}, + doi = {10.1000/example-doi} +} +""", + encoding="utf-8", + ) + + ingest = run_cli(tmp_path, "ingest", str(bib_path)) + assert ingest.returncode == 0 + + from citegeist.bibtex import BibEntry + from citegeist.resolve import Resolution + + database = tmp_path / "library.sqlite3" + + with patch("citegeist.cli.MetadataResolver.resolve_entry") as mocked_resolve: + 
mocked_resolve.return_value = Resolution( + entry=BibEntry( + entry_type="article", + citation_key="resolvedkey", + fields={ + "author": "Smith, Jane", + "title": "Graph-first bibliography augmentation", + "year": "2024", + "doi": "10.1000/example-doi", + "journal": "Journal of Graph Studies", + }, + ), + source_type="resolver", + source_label="crossref:doi:10.1000/example-doi", + ) + exit_code = main( + [ + "--db", + str(database), + "resolve", + "smith2024graphs", + ] + ) + + assert exit_code == 0 + + +def test_cli_graph_outputs_missing_targets(tmp_path: Path): + bib_path = tmp_path / "graph.bib" + bib_path.write_text( + """ +@article{seed2024, + author = {Seed, Alice}, + title = {Seed Paper}, + year = {2024}, + references = {known2023, missing2022} +} + +@article{known2023, + author = {Known, Bob}, + title = {Known Paper}, + year = {2023} +} +""", + encoding="utf-8", + ) + + ingest = run_cli(tmp_path, "ingest", str(bib_path)) + assert ingest.returncode == 0 + + graph = run_cli(tmp_path, "graph", "seed2024", "--missing-only") + assert graph.returncode == 0 + payload = json.loads(graph.stdout) + assert len(payload) == 1 + assert payload[0]["target_citation_key"] == "missing2022" + assert payload[0]["target_exists"] is False + + +def test_cli_expand_with_mocked_crossref(tmp_path: Path): + bib_path = tmp_path / "expand.bib" + bib_path.write_text( + """ +@article{seed2024, + author = {Seed, Alice}, + title = {Seed Paper}, + year = {2024}, + doi = {10.1000/seed-doi} +} +""", + encoding="utf-8", + ) + ingest = run_cli(tmp_path, "ingest", str(bib_path)) + assert ingest.returncode == 0 + + from citegeist.expand import ExpansionResult + + with patch("citegeist.cli.CrossrefExpander.expand_entry_references") as mocked_expand: + mocked_expand.return_value = [ + ExpansionResult( + source_citation_key="seed2024", + discovered_citation_key="doi101000exampleref", + created_entry=True, + relation_type="cites", + source_label="crossref:references:10.1000/seed-doi", + ) + ] + 
database = tmp_path / "library.sqlite3" + exit_code = main(["--db", str(database), "expand", "seed2024"]) + + assert exit_code == 0 diff --git a/tests/test_expand.py b/tests/test_expand.py new file mode 100644 index 0000000..365ba2c --- /dev/null +++ b/tests/test_expand.py @@ -0,0 +1,69 @@ +from citegeist.expand import CrossrefExpander, _crossref_reference_to_entry +from citegeist.storage import BibliographyStore + + +def test_crossref_reference_to_entry_prefers_doi_key(): + entry = _crossref_reference_to_entry( + { + "DOI": "10.1000/example-ref", + "article-title": "Discovered Reference", + "author": "Doe, Alex", + "year": "2022", + "journal-title": "Journal of Discovery", + }, + "seed2024", + 1, + ) + + assert entry.citation_key == "doi101000exampleref" + assert entry.fields["doi"] == "10.1000/example-ref" + assert entry.fields["journal"] == "Journal of Discovery" + + +def test_crossref_expander_creates_draft_nodes_and_relations(): + store = BibliographyStore() + try: + store.ingest_bibtex( + """ +@article{seed2024, + author = {Seed, Alice}, + title = {Seed Paper}, + year = {2024}, + doi = {10.1000/seed-doi} +} +""" + ) + + expander = CrossrefExpander() + expander.resolver._get_json = lambda _url: { # type: ignore[method-assign] + "message": { + "reference": [ + { + "DOI": "10.1000/example-ref", + "article-title": "Discovered Reference", + "author": "Doe, Alex", + "year": "2022", + "journal-title": "Journal of Discovery", + }, + { + "unstructured": "Unstructured reference string", + "year": "2021", + }, + ] + } + } + + results = expander.expand_entry_references(store, "seed2024") + + assert [result.discovered_citation_key for result in results] == [ + "doi101000exampleref", + "ref2021unstructured2", + ] + discovered = store.get_entry("doi101000exampleref") + assert discovered is not None + assert discovered["review_status"] == "draft" + assert store.get_relations("seed2024") == ["doi101000exampleref", "ref2021unstructured2"] + relation_provenance = 
store.get_relation_provenance("seed2024") + assert relation_provenance[0]["source_type"] == "graph_expand" + finally: + store.close() diff --git a/tests/test_extract.py b/tests/test_extract.py new file mode 100644 index 0000000..18b0283 --- /dev/null +++ b/tests/test_extract.py @@ -0,0 +1,35 @@ +from citegeist import extract_references, parse_bibtex +from citegeist.cli import main + + +SAMPLE_REFERENCES = """ +[1] Smith, Jane and Doe, Alex. 2024. Graph-first bibliography augmentation. Journal of Research Systems. +[2] Miller, Sam. 2023. Semantic search for research corpora. Proceedings of the Retrieval Workshop. +""" + + +def test_extract_references_builds_draft_entries(): + entries = extract_references(SAMPLE_REFERENCES) + + assert [entry.citation_key for entry in entries] == [ + "smith2024graphfirst1", + "miller2023semantic2", + ] + assert entries[0].entry_type == "article" + assert entries[0].fields["journal"] == "Journal of Research Systems" + assert entries[1].entry_type == "inproceedings" + assert entries[1].fields["booktitle"] == "Proceedings of the Retrieval Workshop" + + +def test_extract_cli_writes_bibtex(tmp_path): + input_path = tmp_path / "references.txt" + output_path = tmp_path / "draft.bib" + input_path.write_text(SAMPLE_REFERENCES, encoding="utf-8") + + exit_code = main(["extract", str(input_path), "--output", str(output_path)]) + assert exit_code == 0 + + exported = output_path.read_text(encoding="utf-8") + parsed = {entry.citation_key: entry for entry in parse_bibtex(exported)} + assert parsed["smith2024graphfirst1"].fields["journal"] == "Journal of Research Systems" + assert parsed["miller2023semantic2"].fields["booktitle"] == "Proceedings of the Retrieval Workshop" diff --git a/tests/test_resolve.py b/tests/test_resolve.py new file mode 100644 index 0000000..f66220d --- /dev/null +++ b/tests/test_resolve.py @@ -0,0 +1,85 @@ +from xml.etree import ElementTree as ET + +from citegeist.bibtex import BibEntry +from citegeist.resolve import 
MetadataResolver, _arxiv_atom_entry_to_bib, _crossref_message_to_entry, merge_entries + + +def test_crossref_message_to_entry_maps_basic_fields(): + entry = _crossref_message_to_entry( + { + "type": "journal-article", + "title": ["Graph-first bibliography augmentation"], + "DOI": "10.1000/example-doi", + "URL": "https://doi.org/10.1000/example-doi", + "container-title": ["Journal of Graph Studies"], + "author": [{"family": "Smith", "given": "Jane"}], + "issued": {"date-parts": [[2024, 5, 1]]}, + } + ) + + assert entry.entry_type == "article" + assert entry.fields["author"] == "Smith, Jane" + assert entry.fields["journal"] == "Journal of Graph Studies" + assert entry.fields["year"] == "2024" + + +def test_arxiv_atom_entry_to_bib_maps_basic_fields(): + xml = ET.fromstring( + """ + + Semantic search for research corpora + Dense retrieval improves recall. + 2023-01-15T00:00:00Z + Miller, Sam + 10.1000/arxiv-example + +""" + ) + entry = _arxiv_atom_entry_to_bib(xml, "2301.12345") + assert entry.fields["author"] == "Miller, Sam" + assert entry.fields["arxiv"] == "2301.12345" + assert entry.fields["doi"] == "10.1000/arxiv-example" + + +def test_merge_entries_prefers_existing_values_and_adds_missing_fields(): + base = BibEntry( + entry_type="article", + citation_key="smith2024graphs", + fields={"title": "Graph-first bibliography augmentation", "doi": "10.1000/example-doi"}, + ) + resolved = BibEntry( + entry_type="article", + citation_key="otherkey", + fields={"title": "Different title", "journal": "Journal of Graph Studies"}, + ) + + merged = merge_entries(base, resolved) + + assert merged.fields["title"] == "Graph-first bibliography augmentation" + assert merged.fields["journal"] == "Journal of Graph Studies" + + +def test_resolver_tries_doi_before_dblp(): + resolver = MetadataResolver() + calls: list[tuple[str, str]] = [] + + def fake_doi(value: str): + calls.append(("doi", value)) + return None + + def fake_dblp(value: str): + calls.append(("dblp", value)) + return 
None + + resolver.resolve_doi = fake_doi # type: ignore[method-assign] + resolver.resolve_dblp = fake_dblp # type: ignore[method-assign] + + resolver.resolve_entry( + BibEntry( + entry_type="article", + citation_key="smith2024graphs", + fields={"doi": "10.1000/example-doi", "dblp": "conf/test/Smith24"}, + ) + ) + + assert calls == [("doi", "10.1000/example-doi"), ("dblp", "conf/test/Smith24")] diff --git a/tests/test_storage.py b/tests/test_storage.py index a8ed075..3458f52 100644 --- a/tests/test_storage.py +++ b/tests/test_storage.py @@ -67,3 +67,66 @@ def test_store_exports_bibtex_from_normalized_rows(): assert parsed["smith2024graphs"].fields["references"] == "miller2023search" finally: store.close() + + +def test_store_records_provenance_and_review_status(): + store = BibliographyStore() + try: + store.ingest_bibtex(SAMPLE_BIB, source_label="fixtures/sample.bib", review_status="draft") + + entry = store.get_entry("smith2024graphs") + assert entry is not None + assert entry["review_status"] == "draft" + + provenance = store.get_field_provenance("smith2024graphs") + assert provenance + assert provenance[0]["source_type"] == "bibtex" + assert provenance[0]["source_label"] == "fixtures/sample.bib" + + assert store.set_entry_status("smith2024graphs", "reviewed") is True + updated = store.get_entry("smith2024graphs") + assert updated is not None + assert updated["review_status"] == "reviewed" + finally: + store.close() + + +def test_store_traverses_graph_and_surfaces_missing_targets(): + store = BibliographyStore() + try: + store.ingest_bibtex( + """ +@article{seed2024, + author = {Seed, Alice}, + title = {Seed Paper}, + year = {2024}, + references = {known2023, missing2022} +} + +@article{known2023, + author = {Known, Bob}, + title = {Known Paper}, + year = {2023}, + references = {leaf2021} +} + +@article{leaf2021, + author = {Leaf, Carol}, + title = {Leaf Paper}, + year = {2021} +} +""", + review_status="reviewed", + ) + + rows = store.traverse_graph(["seed2024"], 
relation_types=["cites"], max_depth=2) + + assert [row["target_citation_key"] for row in rows] == [ + "known2023", + "missing2022", + "leaf2021", + ] + assert rows[1]["target_exists"] is False + assert rows[2]["depth"] == 2 + finally: + store.close()