From e4eaf52393451d1acb01984c7830c9dd0792e655 Mon Sep 17 00:00:00 2001 From: welsberr Date: Fri, 20 Mar 2026 08:39:46 -0400 Subject: [PATCH] Add graph export and viewer workflows --- README.md | 5 + src/citegeist/cli.py | 374 ++++++++++++++++++++++++++++++++++++++++++- tests/test_cli.py | 184 +++++++++++++++++++++ 3 files changed, 562 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 62ded23..fe33179 100644 --- a/README.md +++ b/README.md @@ -136,6 +136,11 @@ PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 topics PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 topic-entries abiogenesis PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 export-topic abiogenesis --output abiogenesis.bib PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 graph smith2024graphs --relation cites --depth 2 --missing-only +PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 graph smith2024graphs --relation cites --depth 2 --format json-graph +PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 graph smith2024graphs --relation cites --depth 2 --format dot +PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 graph smith2024graphs --relation cites --depth 2 --format dot --output graph.dot +PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 graph smith2024graphs --relation cites --depth 2 --format json-graph --output graph.json +PYTHONPATH=src .venv/bin/python -m citegeist graph-view graph.json --output graph.html --title "CiteGeist Graph" PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 expand smith2024graphs --source crossref PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 expand smith2024graphs --source openalex --relation cited_by --limit 10 PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 expand-topic abiogenesis --topic-phrase "abiogenesis origin chemistry" --source openalex --relation cites --seed-key seed2024 --min-relevance 0.3 --preview diff --git a/src/citegeist/cli.py b/src/citegeist/cli.py index 7d50f22..6c5a85b 100644 --- a/src/citegeist/cli.py +++ b/src/citegeist/cli.py @@ -2,6 +2,7 @@ from __future__ import annotations import argparse from dataclasses import asdict +from html import escape as html_escape import json import sys from pathlib import Path @@ -82,6 +83,24 @@ def build_parser() -> argparse.ArgumentParser: action="store_true", help="Show only unresolved target nodes that are not yet present in the database", ) + graph_parser.add_argument( + "--format", + choices=["json", "dot", "json-graph"], + default="json", + help="Output format for traversed graph results", + ) + graph_parser.add_argument( + "--output", + help="Write graph output to a file instead of stdout", + ) + + graph_view_parser = subparsers.add_parser( + "graph-view", + help="Render a self-contained HTML viewer from a json-graph export", + ) + graph_view_parser.add_argument("input", help="Path to a graph JSON file exported with --format json-graph") + graph_view_parser.add_argument("--output", required=True, help="Path to write the HTML viewer") + graph_view_parser.add_argument("--title", default="CiteGeist Graph View", help="HTML page title") expand_parser = subparsers.add_parser("expand", help="Expand graph edges from external metadata sources") expand_parser.add_argument("citation_keys", nargs="+", help="Seed citation keys to expand") @@ -491,7 +510,11 @@ def main(argv: list[str] | None = None) -> int: args.depth, args.review_status, args.missing_only, + args.format, + args.output, ) + if args.command == "graph-view": + return _run_graph_view(Path(args.input), Path(args.output), args.title) if args.command == "expand": return _run_expand(store, args.citation_keys, args.source, args.relation, args.limit) if args.command == "expand-topic": @@ -763,6 +786,8 @@ def _run_graph( depth: int, review_status: str | None, missing_only: bool, + output_format: str, + output: str | None, ) -> int: rows = store.traverse_graph( citation_keys, @@ -773,10 +798,357 @@ def _run_graph( ) if missing_only: rows = [row for row in rows if not row["target_exists"]] - print(json.dumps(rows, indent=2)) + rendered: str + if output_format == "dot": + rendered = _render_graph_dot(store, citation_keys, rows) + elif output_format == "json-graph": + rendered = json.dumps(_render_graph_json(store, citation_keys, rows), indent=2) + else: + rendered = json.dumps(rows, indent=2) + if output: + Path(output).write_text(rendered + ("\n" if rendered and not rendered.endswith("\n") else ""), encoding="utf-8") + else: + print(rendered) return 0 +def _run_graph_view(input_path: Path, output_path: Path, title: str) -> int: + payload = json.loads(input_path.read_text(encoding="utf-8")) + if not isinstance(payload, dict) or not isinstance(payload.get("nodes"), list) or not isinstance(payload.get("edges"), list): + print("graph-view expects a json-graph payload with 'nodes' and 'edges'", file=sys.stderr) + return 1 + output_path.write_text(_render_graph_html(payload, title), encoding="utf-8") + return 0 + + +def _render_graph_dot( + store: BibliographyStore, + seed_keys: list[str], + rows: list[dict[str, object]], +) -> str: + node_payloads = _collect_graph_nodes(store, seed_keys, rows) + + lines = ["digraph citegeist {", " rankdir=LR;"] + for citation_key, payload in sorted(node_payloads.items()): + attributes = { + "label": _graph_node_label(payload), + "shape": "doublecircle" if payload.get("is_seed") else "ellipse", + } + if not payload.get("target_exists"): + attributes["style"] = "dashed" + attributes["color"] = "gray50" + elif payload.get("review_status") == "reviewed": + attributes["color"] = "forestgreen" + elif payload.get("review_status") == "draft": + attributes["color"] = "goldenrod" + attr_string = ", ".join(f'{key}="{_dot_escape(str(value))}"' for key, value in attributes.items()) + lines.append(f' "{_dot_escape(citation_key)}" [{attr_string}];') + + for row in rows: + source_key = _dot_escape(str(row["source_citation_key"])) + target_key = _dot_escape(str(row["target_citation_key"])) + relation_type = _dot_escape(str(row["relation_type"])) + depth_value = _dot_escape(str(row["depth"])) + lines.append( + f' "{source_key}" -> "{target_key}" [label="{relation_type} d={depth_value}"];' + ) + lines.append("}") + return "\n".join(lines) + + +def _render_graph_json( + store: BibliographyStore, + seed_keys: list[str], + rows: list[dict[str, object]], +) -> dict[str, object]: + node_payloads = _collect_graph_nodes(store, seed_keys, rows) + nodes = [] + for citation_key, payload in sorted(node_payloads.items()): + nodes.append( + { + "id": citation_key, + "label": citation_key, + "title": payload.get("title"), + "review_status": payload.get("review_status"), + "target_exists": payload.get("target_exists"), + "is_seed": payload.get("is_seed"), + } + ) + edges = [] + for index, row in enumerate(rows, start=1): + edges.append( + { + "id": f"edge-{index}", + "source": str(row["source_citation_key"]), + "target": str(row["target_citation_key"]), + "relation_type": str(row["relation_type"]), + "depth": int(row["depth"]), + "target_exists": bool(row["target_exists"]), + } + ) + return {"nodes": nodes, "edges": edges} + + +def _render_graph_html(payload: dict[str, object], title: str) -> str: + graph_json = json.dumps(payload) + safe_title = html_escape(title) + return """ + + + + + {title} + + + +
+ +
+ + + + + +
+
+ + + +""".format(title=safe_title, graph_json=graph_json) + + +def _collect_graph_nodes( + store: BibliographyStore, + seed_keys: list[str], + rows: list[dict[str, object]], +) -> dict[str, dict[str, object]]: + node_payloads: dict[str, dict[str, object]] = {} + entry_cache: dict[str, dict[str, object] | None] = {} + + def get_entry(citation_key: str) -> dict[str, object] | None: + if citation_key not in entry_cache: + entry_cache[citation_key] = store.get_entry(citation_key) + return entry_cache[citation_key] + + for seed_key in seed_keys: + entry = get_entry(seed_key) + node_payloads[seed_key] = { + "citation_key": seed_key, + "title": entry.get("title") if entry else None, + "review_status": entry.get("review_status") if entry else None, + "target_exists": entry is not None, + "is_seed": True, + } + + for row in rows: + source_key = str(row["source_citation_key"]) + target_key = str(row["target_citation_key"]) + source_entry = get_entry(source_key) + node_payloads.setdefault( + source_key, + { + "citation_key": source_key, + "title": source_entry.get("title") if source_entry else None, + "review_status": source_entry.get("review_status") if source_entry else None, + "target_exists": source_entry is not None, + "is_seed": source_key in seed_keys, + }, + ) + node_payloads[target_key] = { + "citation_key": target_key, + "title": row.get("target_title"), + "review_status": row.get("target_review_status"), + "target_exists": bool(row.get("target_exists")), + "is_seed": target_key in seed_keys, + } + return node_payloads + + +def _graph_node_label(payload: dict[str, object]) -> str: + citation_key = str(payload.get("citation_key") or "") + title = str(payload.get("title") or "").strip() + review_status = str(payload.get("review_status") or "").strip() + parts = [citation_key] + if title: + parts.append(title) + if review_status: + parts.append(f"[{review_status}]") + return "\\n".join(parts) + + +def _dot_escape(value: str) -> str: + return value.replace("\\", "\\\\").replace('"', '\\"') + + def _run_expand( store: BibliographyStore, citation_keys: list[str], diff --git a/tests/test_cli.py b/tests/test_cli.py index 38b567d..0b4968e 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1144,6 +1144,190 @@ def test_cli_graph_outputs_missing_targets(tmp_path: Path): assert payload[0]["target_exists"] is False +def test_cli_graph_can_render_dot_output(tmp_path: Path): + bib_path = tmp_path / "graph.bib" + bib_path.write_text( + """ +@article{seed2024, + author = {Seed, Alice}, + title = {Seed Paper}, + year = {2024}, + references = {known2023, missing2022} +} + +@article{known2023, + author = {Known, Bob}, + title = {Known Paper}, + year = {2023} +} +""", + encoding="utf-8", + ) + + ingest = run_cli(tmp_path, "ingest", str(bib_path)) + assert ingest.returncode == 0 + + graph = run_cli(tmp_path, "graph", "seed2024", "--format", "dot") + assert graph.returncode == 0 + assert "digraph citegeist {" in graph.stdout + assert '"seed2024" [label="seed2024\\\\nSeed Paper\\\\n[draft]"' in graph.stdout + assert '"seed2024" -> "known2023" [label="cites d=1"]' in graph.stdout + assert '"seed2024" -> "missing2022" [label="cites d=1"]' in graph.stdout + + +def test_cli_graph_can_write_dot_output_to_file(tmp_path: Path): + bib_path = tmp_path / "graph.bib" + bib_path.write_text( + """ +@article{seed2024, + author = {Seed, Alice}, + title = {Seed Paper}, + year = {2024}, + references = {known2023} +} + +@article{known2023, + author = {Known, Bob}, + title = {Known Paper}, + year = {2023} +} +""", + encoding="utf-8", + ) + + ingest = run_cli(tmp_path, "ingest", str(bib_path)) + assert ingest.returncode == 0 + + output_path = tmp_path / "graph.dot" + graph = run_cli(tmp_path, "graph", "seed2024", "--format", "dot", "--output", str(output_path)) + assert graph.returncode == 0 + assert graph.stdout == "" + rendered = output_path.read_text(encoding="utf-8") + assert "digraph citegeist {" in rendered + assert '"seed2024" -> "known2023" [label="cites d=1"]' in rendered + + +def test_cli_graph_can_render_json_graph_output(tmp_path: Path): + bib_path = tmp_path / "graph.bib" + bib_path.write_text( + """ +@article{seed2024, + author = {Seed, Alice}, + title = {Seed Paper}, + year = {2024}, + references = {known2023, missing2022} +} + +@article{known2023, + author = {Known, Bob}, + title = {Known Paper}, + year = {2023} +} +""", + encoding="utf-8", + ) + + ingest = run_cli(tmp_path, "ingest", str(bib_path)) + assert ingest.returncode == 0 + + graph = run_cli(tmp_path, "graph", "seed2024", "--format", "json-graph") + assert graph.returncode == 0 + payload = json.loads(graph.stdout) + assert [node["id"] for node in payload["nodes"]] == ["known2023", "missing2022", "seed2024"] + assert payload["nodes"][2]["is_seed"] is True + assert payload["edges"][0]["source"] == "seed2024" + assert payload["edges"][0]["target"] == "known2023" + assert payload["edges"][1]["target_exists"] is False + + +def test_cli_graph_can_write_json_graph_output_to_file(tmp_path: Path): + bib_path = tmp_path / "graph.bib" + bib_path.write_text( + """ +@article{seed2024, + author = {Seed, Alice}, + title = {Seed Paper}, + year = {2024}, + references = {known2023} +} + +@article{known2023, + author = {Known, Bob}, + title = {Known Paper}, + year = {2023} +} +""", + encoding="utf-8", + ) + + ingest = run_cli(tmp_path, "ingest", str(bib_path)) + assert ingest.returncode == 0 + + output_path = tmp_path / "graph.json" + graph = run_cli(tmp_path, "graph", "seed2024", "--format", "json-graph", "--output", str(output_path)) + assert graph.returncode == 0 + assert graph.stdout == "" + payload = json.loads(output_path.read_text(encoding="utf-8")) + assert [edge["target"] for edge in payload["edges"]] == ["known2023"] + + +def test_cli_graph_view_renders_html_from_json_graph(tmp_path: Path): + graph_path = tmp_path / "graph.json" + graph_path.write_text( + json.dumps( + { + "nodes": [ + { + "id": "seed2024", + "label": "seed2024", + "title": "Seed Paper", + "review_status": "draft", + "target_exists": True, + "is_seed": True, + }, + { + "id": "known2023", + "label": "known2023", + "title": "Known Paper", + "review_status": "reviewed", + "target_exists": True, + "is_seed": False, + }, + ], + "edges": [ + { + "id": "edge-1", + "source": "seed2024", + "target": "known2023", + "relation_type": "cites", + "depth": 1, + "target_exists": True, + } + ], + } + ), + encoding="utf-8", + ) + + output_path = tmp_path / "graph.html" + result = run_cli( + tmp_path, + "graph-view", + str(graph_path), + "--output", + str(output_path), + "--title", + "Graph Demo", + ) + assert result.returncode == 0 + assert result.stdout == "" + html = output_path.read_text(encoding="utf-8") + assert "" in html + assert "Graph Demo" in html + assert '"seed2024"' in html + assert '"known2023"' in html + + def test_cli_expand_with_mocked_crossref(tmp_path: Path): bib_path = tmp_path / "expand.bib" bib_path.write_text(