From dc53d16af56f26a27bcfa10b04d477c00e0cda27 Mon Sep 17 00:00:00 2001 From: welsberr Date: Fri, 20 Mar 2026 08:14:41 -0400 Subject: [PATCH] Add topic phrase review export workflow --- README.md | 7 +- src/citegeist/cli.py | 117 +++++++++++++++++++++++++++ src/citegeist/storage.py | 42 +++++++++- tests/test_cli.py | 168 ++++++++++++++++++++++++++++++++++++++- tests/test_storage.py | 48 ++++++++++- 5 files changed, 378 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index d2294f7..609ff29 100644 --- a/README.md +++ b/README.md @@ -164,7 +164,9 @@ PYTHONPATH=src .venv/bin/python -m citegeist duplicates-talkorigins talkorigins- PYTHONPATH=src .venv/bin/python -m citegeist duplicates-talkorigins talkorigins-out/talkorigins_manifest.json --limit 20 --preview --weak-only PYTHONPATH=src .venv/bin/python -m citegeist suggest-talkorigins-phrases talkorigins-out/talkorigins_manifest.json --output topic-phrases.json PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 stage-topic-phrases topic-phrases.json +PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 export-topic-phrase-reviews --output topic-phrase-review.json PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 review-topic-phrase abiogenesis accepted --notes "curated from local corpus" +PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 review-topic-phrases topic-phrase-review.json PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 apply-topic-phrases topic-phrases.json PYTHONPATH=src .venv/bin/python -m citegeist --db talkorigins.sqlite3 enrich-talkorigins talkorigins-out/talkorigins_manifest.json --limit 20 PYTHONPATH=src .venv/bin/python -m citegeist --db talkorigins-copy.sqlite3 enrich-talkorigins talkorigins-out/talkorigins_manifest.json --limit 5 --apply --allow-unsafe-search-matches @@ -179,8 +181,11 @@ Use `duplicates-talkorigins` when you want to inspect specific clusters, filter Use `suggest-talkorigins-phrases` to derive candidate stored expansion phrases from the existing TalkOrigins topic corpus itself. The output is deterministic JSON keyed by topic slug, with a suggested phrase plus the extracted keywords that drove it. This is a useful first pass before setting topic phrases in the database or editing generated batch jobs. Use `stage-topic-phrases` to load those suggestions into the database as review items. Staging stores the candidate in `suggested_phrase` and marks the topic `pending` without changing the active `expansion_phrase`. -Use `review-topic-phrase` to accept or reject one staged suggestion in place. Accepting a suggestion copies it into `expansion_phrase`; rejecting it preserves the review state without changing the live phrase. +Use `export-topic-phrase-reviews` to write an editable JSON template directly from the database for the currently staged suggestions. That gives you a round-trip path from DB review queue to file edits and back into `review-topic-phrases`. +Use `review-topic-phrase` to accept or reject one staged suggestion in place. Accepting a suggestion copies it into `expansion_phrase` and clears it from the staged review queue; rejecting it preserves the staged suggestion together with its review state. +Use `review-topic-phrases` when you want to apply many accept/reject decisions from one JSON file. Each item should carry `slug`, `status`, and optional `phrase` / `review_notes`. Use `apply-topic-phrases` when you want a direct patch path instead of the staged review flow. It accepts either the raw suggestion list or an object with a `topics` list, and will apply `suggested_phrase` or `phrase` to matching topic slugs immediately. +Use `topic-phrase-reviews --phrase-review-status pending` when you want a compact audit view of unresolved staged suggestions, including both the current live phrase and the pending replacement. Use `enrich-talkorigins` when you want to target those weak canonical entries for resolver-based metadata upgrades before retrying graph expansion on imported topic slices. Use `review-talkorigins` when you want one JSON review artifact that combines weak canonical clusters with dry-run enrichment outcomes for manual cleanup. Use `expand-topic` when you already have both a topic phrase and a curated topic seed set in the database: it expands outward from the topic’s existing entries, then only assigns discovered works back to that topic if they clear a topic-relevance threshold. Write-enabled assignment is stricter than preview ranking: a candidate must clear the score threshold and show a non-generic title anchor to the topic phrase, so broad methods papers do not get attached just because their abstracts or related terms overlap. On large noisy topics, prefer `--seed-key` to restrict the run to just the trusted seed entries you want to expand from, and use `--preview` first to inspect discovered candidates and relevance scores before writing anything. diff --git a/src/citegeist/cli.py b/src/citegeist/cli.py index 7973e87..801f272 100644 --- a/src/citegeist/cli.py +++ b/src/citegeist/cli.py @@ -298,6 +298,12 @@ def build_parser() -> argparse.ArgumentParser: help="Optional expansion phrase override to apply with the review decision", ) + review_topic_phrases_parser = subparsers.add_parser( + "review-topic-phrases", + help="Apply topic phrase review decisions in bulk from JSON", + ) + review_topic_phrases_parser.add_argument("input", help="Path to JSON file containing topic phrase review records") + duplicates_talkorigins_parser = subparsers.add_parser( "duplicates-talkorigins", help="Inspect duplicate clusters in a generated TalkOrigins manifest", @@ -401,6 +407,33 @@ def build_parser() -> argparse.ArgumentParser: help="Restrict topics to one stored phrase review state", ) + topic_phrase_reviews_parser = subparsers.add_parser( + "topic-phrase-reviews", + help="List staged topic phrase suggestions and their review state", + ) + topic_phrase_reviews_parser.add_argument("--limit", type=int, default=100, help="Maximum reviews to list") + topic_phrase_reviews_parser.add_argument( + "--phrase-review-status", + choices=["unreviewed", "pending", "accepted", "rejected"], + help="Restrict results to one stored phrase review state", + ) + + export_topic_phrase_reviews_parser = subparsers.add_parser( + "export-topic-phrase-reviews", + help="Export an editable JSON review template for staged topic phrase suggestions", + ) + export_topic_phrase_reviews_parser.add_argument("--limit", type=int, default=100, help="Maximum reviews to export") + export_topic_phrase_reviews_parser.add_argument( + "--phrase-review-status", + choices=["unreviewed", "pending", "accepted", "rejected"], + default="pending", + help="Restrict exported reviews to one stored phrase review state", + ) + export_topic_phrase_reviews_parser.add_argument( + "--output", + help="Write the review template JSON to a file instead of stdout", + ) + topic_entries_parser = subparsers.add_parser( "topic-entries", help="List entries assigned to one topic", @@ -522,6 +555,8 @@ def main(argv: list[str] | None = None) -> int: return _run_stage_topic_phrases(store, Path(args.input)) if args.command == "review-topic-phrase": return _run_review_topic_phrase(store, args.topic_slug, args.status, args.notes, args.phrase) + if args.command == "review-topic-phrases": + return _run_review_topic_phrases(store, Path(args.input)) if args.command == "duplicates-talkorigins": return _run_duplicates_talkorigins( Path(args.manifest), @@ -565,6 +600,10 @@ def main(argv: list[str] | None = None) -> int: ) if args.command == "topics": return _run_topics(store, args.limit, args.phrase_review_status) + if args.command == "topic-phrase-reviews": + return _run_topic_phrase_reviews(store, args.limit, args.phrase_review_status) + if args.command == "export-topic-phrase-reviews": + return _run_export_topic_phrase_reviews(store, args.limit, args.phrase_review_status, args.output) if args.command == "topic-entries": return _run_topic_entries(store, args.topic_slug, args.limit) if args.command == "export-topic": @@ -1056,6 +1095,51 @@ def _run_review_topic_phrase( return 0 +def _run_review_topic_phrases(store: BibliographyStore, input_path: Path) -> int: + payload = json.loads(input_path.read_text(encoding="utf-8")) + if isinstance(payload, dict): + items = payload.get("topics", payload.get("items", [])) + else: + items = payload + if not isinstance(items, list): + print("Topic phrase review JSON must be a list or an object with a 'topics' or 'items' list", file=sys.stderr) + return 1 + + results: list[dict[str, object]] = [] + exit_code = 0 + for item in items: + if not isinstance(item, dict): + continue + slug = str(item.get("slug") or "") + status = str(item.get("status") or item.get("phrase_review_status") or "") + notes = item.get("review_notes") + phrase = item.get("phrase", item.get("expansion_phrase")) + if not slug or status not in {"accepted", "rejected"}: + continue + if notes is not None: + notes = str(notes) + if phrase is not None: + phrase = str(phrase) + reviewed = store.review_topic_phrase_suggestion( + slug, + review_status=status, + review_notes=notes, + applied_phrase=phrase, + ) + if not reviewed: + exit_code = 1 + results.append( + { + "slug": slug, + "phrase_review_status": status, + "expansion_phrase": phrase, + "reviewed": reviewed, + } + ) + print(json.dumps(results, indent=2)) + return exit_code + + def _run_duplicates_talkorigins( manifest_path: Path, limit: int, @@ -1171,6 +1255,39 @@ def _run_topics(store: BibliographyStore, limit: int, phrase_review_status: str return 0 +def _run_topic_phrase_reviews(store: BibliographyStore, limit: int, phrase_review_status: str | None) -> int: + print(json.dumps(store.list_topic_phrase_reviews(limit=limit, phrase_review_status=phrase_review_status), indent=2)) + return 0 + + +def _run_export_topic_phrase_reviews( + store: BibliographyStore, + limit: int, + phrase_review_status: str | None, + output: str | None, +) -> int: + items = store.list_topic_phrase_reviews(limit=limit, phrase_review_status=phrase_review_status) + payload = [ + { + "slug": item["slug"], + "topic": item["name"], + "current_expansion_phrase": item.get("expansion_phrase"), + "suggested_phrase": item.get("suggested_phrase"), + "current_status": item.get("phrase_review_status"), + "review_notes": item.get("phrase_review_notes"), + "status": "", + "phrase": item.get("suggested_phrase"), + } + for item in items + ] + rendered = json.dumps(payload, indent=2) + if output: + Path(output).write_text(rendered + "\n", encoding="utf-8") + else: + print(rendered) + return 0 + + def _run_topic_entries(store: BibliographyStore, topic_slug: str, limit: int) -> int: topic = store.get_topic(topic_slug) if topic is None: diff --git a/src/citegeist/storage.py b/src/citegeist/storage.py index f2578f8..f0b85ab 100644 --- a/src/citegeist/storage.py +++ b/src/citegeist/storage.py @@ -603,6 +603,43 @@ class BibliographyStore: ).fetchone() return dict(row) if row else None + def list_topic_phrase_reviews( + self, + limit: int = 100, + phrase_review_status: str | None = None, + ) -> list[dict[str, object]]: + where = "WHERE t.suggested_phrase IS NOT NULL" + params: list[object] = [] + if phrase_review_status is not None: + where += " AND t.phrase_review_status = ?" + params.append(phrase_review_status) + params.append(limit) + rows = self.connection.execute( + f""" + SELECT t.slug, t.name, t.expansion_phrase, t.suggested_phrase, + t.phrase_review_status, t.phrase_review_notes, + COUNT(et.entry_id) AS entry_count + FROM topics t + LEFT JOIN entry_topics et ON et.topic_id = t.id + {where} + GROUP BY t.id, t.slug, t.name, t.expansion_phrase, t.suggested_phrase, + t.phrase_review_status, t.phrase_review_notes + ORDER BY + CASE t.phrase_review_status + WHEN 'pending' THEN 0 + WHEN 'unreviewed' THEN 1 + WHEN 'rejected' THEN 2 + WHEN 'accepted' THEN 3 + ELSE 4 + END, + t.name, + t.slug + LIMIT ? + """, + params, + ).fetchall() + return [dict(row) for row in rows] + def set_topic_expansion_phrase(self, slug: str, expansion_phrase: str | None) -> bool: row = self.connection.execute( """ @@ -651,8 +688,10 @@ class BibliographyStore: suggested_phrase = topic.get("suggested_phrase") expansion_phrase = topic.get("expansion_phrase") + stored_suggested_phrase = suggested_phrase if review_status == "accepted": expansion_phrase = applied_phrase if applied_phrase is not None else suggested_phrase + stored_suggested_phrase = None elif applied_phrase is not None: expansion_phrase = applied_phrase @@ -660,13 +699,14 @@ class BibliographyStore: """ UPDATE topics SET expansion_phrase = ?, + suggested_phrase = ?, phrase_review_status = ?, phrase_review_notes = ?, updated_at = CURRENT_TIMESTAMP WHERE slug = ? RETURNING id """, - (expansion_phrase, review_status, review_notes, slug), + (expansion_phrase, stored_suggested_phrase, review_status, review_notes, slug), ).fetchone() self.connection.commit() return row is not None diff --git a/tests/test_cli.py b/tests/test_cli.py index 7ab29c9..368571b 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -797,7 +797,7 @@ def test_cli_can_review_topic_phrase(tmp_path: Path): ) assert result.returncode == 0 payload = json.loads(result.stdout) - assert payload["suggested_phrase"] == "graph networks biology" + assert payload["suggested_phrase"] is None assert payload["expansion_phrase"] == "graph networks biology" assert payload["phrase_review_status"] == "accepted" assert payload["phrase_review_notes"] == "curated and approved" @@ -844,6 +844,172 @@ def test_cli_topics_can_filter_by_phrase_review_status(tmp_path: Path): assert [topic["slug"] for topic in payload] == ["graph-methods"] +def test_cli_can_list_topic_phrase_reviews(tmp_path: Path): + bib_path = tmp_path / "input.bib" + bib_path.write_text( + """ +@article{seed2024, + author = {Seed, Alice}, + title = {Seed Paper}, + year = {2024} +} +""", + encoding="utf-8", + ) + ingest = run_cli(tmp_path, "ingest", str(bib_path)) + assert ingest.returncode == 0 + + from citegeist.storage import BibliographyStore + + database = tmp_path / "library.sqlite3" + store = BibliographyStore(database) + try: + store.add_entry_topic( + "seed2024", + topic_slug="graph-methods", + topic_name="Graph Methods", + source_type="talkorigins", + source_url="https://example.org/topics/graph-methods", + source_label="topic-seed", + ) + store.ensure_topic("abiogenesis", "Abiogenesis") + store.stage_topic_phrase_suggestion("graph-methods", "graph networks biology") + store.stage_topic_phrase_suggestion("abiogenesis", "abiogenesis life origin") + store.review_topic_phrase_suggestion("abiogenesis", "accepted") + finally: + store.close() + + result = run_cli(tmp_path, "topic-phrase-reviews", "--phrase-review-status", "pending") + assert result.returncode == 0 + payload = json.loads(result.stdout) + assert [review["slug"] for review in payload] == ["graph-methods"] + assert payload[0]["suggested_phrase"] == "graph networks biology" + assert payload[0]["phrase_review_status"] == "pending" + + +def test_cli_can_review_topic_phrases_in_bulk(tmp_path: Path): + bib_path = tmp_path / "input.bib" + bib_path.write_text( + """ +@article{seed2024, + author = {Seed, Alice}, + title = {Seed Paper}, + year = {2024} +} +""", + encoding="utf-8", + ) + ingest = run_cli(tmp_path, "ingest", str(bib_path)) + assert ingest.returncode == 0 + + from citegeist.storage import BibliographyStore + + database = tmp_path / "library.sqlite3" + store = BibliographyStore(database) + try: + store.add_entry_topic( + "seed2024", + topic_slug="graph-methods", + topic_name="Graph Methods", + source_type="talkorigins", + source_url="https://example.org/topics/graph-methods", + source_label="topic-seed", + ) + store.ensure_topic("abiogenesis", "Abiogenesis") + store.stage_topic_phrase_suggestion("graph-methods", "graph networks biology") + store.stage_topic_phrase_suggestion("abiogenesis", "abiogenesis life origin") + finally: + store.close() + + review_path = tmp_path / "phrase-review.json" + review_path.write_text( + json.dumps( + [ + { + "slug": "graph-methods", + "status": "accepted", + "review_notes": "good phrase", + }, + { + "slug": "abiogenesis", + "status": "rejected", + "review_notes": "too sparse", + }, + ] + ), + encoding="utf-8", + ) + + result = run_cli(tmp_path, "review-topic-phrases", str(review_path)) + assert result.returncode == 0 + payload = json.loads(result.stdout) + assert payload[0]["reviewed"] is True + assert payload[1]["reviewed"] is True + + pending_result = run_cli(tmp_path, "topic-phrase-reviews", "--phrase-review-status", "pending") + assert pending_result.returncode == 0 + assert json.loads(pending_result.stdout) == [] + + rejected_result = run_cli(tmp_path, "topic-phrase-reviews", "--phrase-review-status", "rejected") + assert rejected_result.returncode == 0 + rejected_payload = json.loads(rejected_result.stdout) + assert [review["slug"] for review in rejected_payload] == ["abiogenesis"] + + topics_result = run_cli(tmp_path, "topics", "--phrase-review-status", "accepted") + assert topics_result.returncode == 0 + topics_payload = json.loads(topics_result.stdout) + assert [topic["slug"] for topic in topics_payload] == ["graph-methods"] + + +def test_cli_can_export_topic_phrase_review_template(tmp_path: Path): + bib_path = tmp_path / "input.bib" + bib_path.write_text( + """ +@article{seed2024, + author = {Seed, Alice}, + title = {Seed Paper}, + year = {2024} +} +""", + encoding="utf-8", + ) + ingest = run_cli(tmp_path, "ingest", str(bib_path)) + assert ingest.returncode == 0 + + from citegeist.storage import BibliographyStore + + database = tmp_path / "library.sqlite3" + store = BibliographyStore(database) + try: + store.add_entry_topic( + "seed2024", + topic_slug="graph-methods", + topic_name="Graph Methods", + source_type="talkorigins", + source_url="https://example.org/topics/graph-methods", + source_label="topic-seed", + ) + store.stage_topic_phrase_suggestion("graph-methods", "graph networks biology") + finally: + store.close() + + output_path = tmp_path / "topic-phrase-review.json" + result = run_cli( + tmp_path, + "export-topic-phrase-reviews", + "--output", + str(output_path), + ) + assert result.returncode == 0 + payload = json.loads(output_path.read_text(encoding="utf-8")) + assert [item["slug"] for item in payload] == ["graph-methods"] + assert payload[0]["current_expansion_phrase"] is None + assert payload[0]["suggested_phrase"] == "graph networks biology" + assert payload[0]["current_status"] == "pending" + assert payload[0]["status"] == "" + assert payload[0]["phrase"] == "graph networks biology" + + def test_cli_export_topic(tmp_path: Path): bib_path = tmp_path / "input.bib" bib_path.write_text( diff --git a/tests/test_storage.py b/tests/test_storage.py index f432bfd..0344d1a 100644 --- a/tests/test_storage.py +++ b/tests/test_storage.py @@ -307,7 +307,7 @@ def test_store_can_stage_and_review_topic_phrase_suggestion(): reviewed = store.get_topic("graph-methods") assert reviewed is not None - assert reviewed["suggested_phrase"] == "graph networks biology" + assert reviewed["suggested_phrase"] is None assert reviewed["expansion_phrase"] == "graph networks biology" assert reviewed["phrase_review_status"] == "accepted" assert reviewed["phrase_review_notes"] == "looks good" @@ -333,6 +333,52 @@ def test_store_can_filter_topics_by_phrase_review_status(): store.close() +def test_store_can_list_topic_phrase_reviews(): + store = BibliographyStore() + try: + store.ensure_topic("graph-methods", "Graph Methods") + store.ensure_topic("abiogenesis", "Abiogenesis") + store.ensure_topic("plain-topic", "Plain Topic") + store.stage_topic_phrase_suggestion("graph-methods", "graph networks biology") + store.stage_topic_phrase_suggestion("abiogenesis", "abiogenesis life origin") + store.review_topic_phrase_suggestion("abiogenesis", "accepted") + + reviews = store.list_topic_phrase_reviews() + pending_reviews = store.list_topic_phrase_reviews(phrase_review_status="pending") + + assert [review["slug"] for review in reviews] == ["graph-methods"] + assert reviews[0]["suggested_phrase"] == "graph networks biology" + assert reviews[0]["phrase_review_status"] == "pending" + assert [review["slug"] for review in pending_reviews] == ["graph-methods"] + finally: + store.close() + + +def test_store_rejected_topic_phrase_stays_in_review_queue(): + store = BibliographyStore() + try: + store.ensure_topic("graph-methods", "Graph Methods") + store.stage_topic_phrase_suggestion("graph-methods", "graph networks biology") + + assert store.review_topic_phrase_suggestion( + "graph-methods", + "rejected", + review_notes="too broad", + ) is True + + topic = store.get_topic("graph-methods") + assert topic is not None + assert topic["suggested_phrase"] == "graph networks biology" + assert topic["expansion_phrase"] is None + assert topic["phrase_review_status"] == "rejected" + + reviews = store.list_topic_phrase_reviews() + assert [review["slug"] for review in reviews] == ["graph-methods"] + assert reviews[0]["phrase_review_status"] == "rejected" + finally: + store.close() + + def test_store_search_text_can_filter_by_topic(): store = BibliographyStore() try: