Add stub resolution workflow

This commit is contained in:
welsberr 2026-03-20 14:53:49 -04:00
parent 425e153bee
commit 4eba64d352
8 changed files with 322 additions and 39 deletions

View File

@ -133,6 +133,7 @@ PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 bootstrap --se
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 bootstrap --topic "bayesian nonparametrics" --preview --topic-commit-limit 5
PYTHONPATH=src .venv/bin/python -m citegeist extract references.txt --output draft.bib
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 resolve smith2024graphs
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 resolve-stubs --doi-only --preview --limit 25
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 topics
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 topic-entries abiogenesis
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 export-topic abiogenesis --output abiogenesis.bib
@ -157,19 +158,20 @@ For live-source development, prefer fixture-backed or cache-backed source client
## Example Application
- Use `stage-topic-phrases` to load those suggestions into the database as review items. Staging stores the candidate in `suggested_phrase` and marks the topic `pending` without changing the active `expansion_phrase`.
- Use `export-topic-phrase-reviews` to write an editable JSON template directly from the database for the currently staged suggestions. That gives you a round-trip path from DB review queue to file edits and back into `review-topic-phrases`.
- Use `review-topic-phrase` to accept or reject one staged suggestion in place. Accepting a suggestion copies it into `expansion_phrase` and clears it from the staged review queue; rejecting it preserves the staged suggestion together with its review state.
- Use `review-topic-phrases` when you want to apply many accept/reject decisions from one JSON file. Each item should carry `slug`, `status`, and optional `phrase` / `review_notes`.
- Use `apply-topic-phrases` when you want a direct patch path instead of the staged review flow. It accepts either the raw suggestion list or an object with a `topics` list, and will apply `suggested_phrase` or `phrase` to matching topic slugs immediately.
- Use `topic-phrase-reviews --phrase-review-status pending` when you want a compact audit view of unresolved staged suggestions, including both the current live phrase and the pending replacement.
- Use `enrich-talkorigins` when you want to target those weak canonical entries for resolver-based metadata upgrades before retrying graph expansion on imported topic slices.
- Use `review-talkorigins` when you want one JSON review artifact that combines weak canonical clusters with dry-run enrichment outcomes for manual cleanup.
- Use `expand-topic` when you already have both a topic phrase and a curated topic seed set in the database: it expands outward from the topic's existing entries, then only assigns discovered works back to that topic if they clear a topic-relevance threshold. Write-enabled assignment is stricter than preview ranking: a candidate must clear the score threshold and show a non-generic title anchor to the topic phrase, so broad methods papers do not get attached just because their abstracts or related terms overlap. On large noisy topics, prefer `--seed-key` to restrict the run to just the trusted seed entries you want to expand from, and use `--preview` first to inspect discovered candidates and relevance scores before writing anything.
- Use `set-topic-phrase` to store a curated expansion phrase on the topic itself. When a stored phrase exists, `expand-topic` will use it automatically if you do not pass `--topic-phrase`. Batch bootstrap jobs can also set `topic_slug`, `topic_name`, and `topic_phrase` so curated topic metadata is created as part of the run.
- Use `topics --phrase-review-status pending` when you want to audit only topics whose staged phrase suggestions still need review.
- `--allow-unsafe-search-matches` exists only for bounded experiments on copied databases when you explicitly want to relax trust to exercise downstream expansion behavior.
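As a concrete illustration of the `review-topic-phrases` input format described above, the decisions file is plain JSON with one object per topic; the slugs, phrases, and notes below are hypothetical, and such a file can be produced with a few lines of Python:

```python
import json

# Hypothetical accept/reject decisions for `review-topic-phrases`.
# Each item carries `slug`, `status`, and optional `phrase` / `review_notes`.
decisions = [
    {"slug": "abiogenesis", "status": "accepted", "phrase": "origin of life research"},
    {"slug": "artificial-life", "status": "rejected", "review_notes": "phrase too broad"},
]

with open("phrase-reviews.json", "w", encoding="utf-8") as fh:
    json.dump(decisions, fh, indent=2)
```

The resulting file can then be fed back through the `review-topic-phrases` command to apply all decisions in one pass.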
The TalkOrigins corpus pipeline remains in the repository as an example application rather than a core package surface. Use the example-scoped Python namespace:

View File

@ -181,6 +181,18 @@ Resolve one or more entries against remote metadata:
.venv/bin/python -m citegeist --db library.sqlite3 resolve langton1989artificial1 bedau2003artificial2
```
Preview DOI-bearing placeholder records before enriching them:
```bash
.venv/bin/python -m citegeist --db library.sqlite3 resolve-stubs --doi-only --preview --limit 25
```
Enrich DOI-bearing placeholder records inside one topic slice:
```bash
.venv/bin/python -m citegeist --db library.sqlite3 resolve-stubs --doi-only --topic artificial-life --limit 25
```
## Explore Citation Graphs
Purpose: traverse citation edges, export graph data, and render quick visualizations.

View File

@ -67,6 +67,26 @@ def build_parser() -> argparse.ArgumentParser:
    resolve_parser = subparsers.add_parser("resolve", help="Enrich stored entries from external metadata sources")
    resolve_parser.add_argument("citation_keys", nargs="+", help="Citation keys to enrich")
    resolve_stubs_parser = subparsers.add_parser(
        "resolve-stubs",
        help="Find and enrich stub-like stored entries, optionally limited to DOI-bearing candidates",
    )
    resolve_stubs_parser.add_argument("--limit", type=int, default=25, help="Maximum candidate entries to inspect")
    resolve_stubs_parser.add_argument(
        "--doi-only",
        action="store_true",
        help="Only consider candidates that already have a DOI",
    )
    resolve_stubs_parser.add_argument(
        "--topic",
        help="Optional topic slug to limit candidate selection",
    )
    resolve_stubs_parser.add_argument(
        "--preview",
        action="store_true",
        help="Show the selected candidate entries without resolving them",
    )
    graph_parser = subparsers.add_parser("graph", help="Traverse citation relations from one or more seed entries")
    graph_parser.add_argument("citation_keys", nargs="+", help="Seed citation keys")
    graph_parser.add_argument(
graph_parser.add_argument(
@ -502,6 +522,8 @@ def main(argv: list[str] | None = None) -> int:
        return _run_extract(Path(args.input), args.output)
    if args.command == "resolve":
        return _run_resolve(store, args.citation_keys)
    if args.command == "resolve-stubs":
        return _run_resolve_stubs(store, args.limit, args.doi_only, args.topic, args.preview)
    if args.command == "graph":
        return _run_graph(
            store,
@ -744,22 +766,25 @@ def _run_resolve(store: BibliographyStore, citation_keys: list[str]) -> int:
    resolver = MetadataResolver()
    exit_code = 0
    for citation_key in citation_keys:
        if not _resolve_one(store, resolver, citation_key):
            exit_code = 1
    return exit_code


def _resolve_one(store: BibliographyStore, resolver: MetadataResolver, citation_key: str) -> bool:
    existing = store.get_entry(citation_key)
    if existing is None:
        print(f"Entry not found: {citation_key}", file=sys.stderr)
        return False
    bibtex = store.get_entry_bibtex(citation_key)
    if not bibtex:
        print(f"Entry not renderable: {citation_key}", file=sys.stderr)
        return False
    current_entry = parse_bibtex(bibtex)[0]
    resolution = resolver.resolve_entry(current_entry)
    if resolution is None:
        print(f"No resolver match: {citation_key}", file=sys.stderr)
        return False
    merged, conflicts = merge_entries_with_conflicts(current_entry, resolution.entry)
    store.replace_entry(
        citation_key,
@ -776,6 +801,31 @@ def _run_resolve(store: BibliographyStore, citation_keys: list[str]) -> int:
        source_label=resolution.source_label,
    )
    print(f"{citation_key}\t{resolution.source_label}")
    return True

def _run_resolve_stubs(
    store: BibliographyStore,
    limit: int,
    doi_only: bool,
    topic_slug: str | None,
    preview: bool,
) -> int:
    candidates = store.list_resolution_candidates(
        limit=limit,
        doi_only=doi_only,
        stub_only=True,
        topic_slug=topic_slug,
    )
    if preview:
        print(json.dumps(candidates, indent=2))
        return 0
    resolver = MetadataResolver()
    exit_code = 0
    for candidate in candidates:
        if not _resolve_one(store, resolver, str(candidate["citation_key"])):
            exit_code = 1
    return exit_code

View File

@ -239,6 +239,9 @@ def merge_entries_with_conflicts(base: BibEntry, resolved: BibEntry) -> tuple[Bi
        if not value:
            continue
        current_value = merged_fields.get(key, "")
        if _is_placeholder_value(key, current_value) and current_value != value:
            merged_fields[key] = value
            continue
        if current_value and current_value != value:
            conflicts.append(
                {
@ -260,6 +263,16 @@ def merge_entries_with_conflicts(base: BibEntry, resolved: BibEntry) -> tuple[Bi
)
def _is_placeholder_value(field_name: str, value: str) -> bool:
    normalized = " ".join((value or "").split()).strip()
    if not normalized:
        return True
    lowered = normalized.lower()
    if field_name == "title":
        return bool(re.fullmatch(r"referenced work \d+", lowered)) or lowered.startswith("untitled")
    return False
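Taken on its own, the title heuristic above can be exercised with a standalone sketch (the function name here is just for the example; the diff's real helper also takes a field name):

```python
import re

def is_placeholder_title(value: str) -> bool:
    # Same heuristic as the "title" branch of `_is_placeholder_value`:
    # blank, "Referenced work N", or anything starting with "untitled".
    normalized = " ".join((value or "").split()).strip()
    if not normalized:
        return True
    lowered = normalized.lower()
    return bool(re.fullmatch(r"referenced work \d+", lowered)) or lowered.startswith("untitled")

print(is_placeholder_title("Referenced work 6"))  # True: stub title, safe to overwrite
print(is_placeholder_title("Complete Record"))    # False: real title, record a conflict instead
```

Because a placeholder title is treated as absent, a resolver match replaces it silently rather than surfacing a field conflict.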
def _crossref_message_to_entry(message: dict) -> BibEntry:
    entry_type = _crossref_type_to_bibtype(message.get("type", "article"))
    title_values = message.get("title", [])

View File

@ -466,6 +466,72 @@ class BibliographyStore:
        ).fetchall()
        return [dict(row) for row in rows]
    def list_resolution_candidates(
        self,
        *,
        limit: int = 50,
        doi_only: bool = False,
        stub_only: bool = False,
        topic_slug: str | None = None,
    ) -> list[dict[str, object]]:
        clauses: list[str] = []
        params: list[object] = []
        joins = ""
        if topic_slug is not None:
            joins = """
                JOIN entry_topics et ON et.entry_id = e.id
                JOIN topics t ON t.id = et.topic_id
            """
            clauses.append("t.slug = ?")
            params.append(topic_slug)
        if doi_only:
            clauses.append("e.doi IS NOT NULL AND TRIM(e.doi) <> ''")
        if stub_only:
            clauses.append(
                """
                (
                    e.title IS NULL
                    OR TRIM(e.title) = ''
                    OR LOWER(TRIM(e.title)) GLOB 'referenced work *'
                    OR LOWER(TRIM(e.title)) GLOB 'untitled*'
                    OR (
                        e.entry_type = 'misc'
                        AND (
                            e.abstract IS NULL
                            OR TRIM(e.abstract) = ''
                        )
                    )
                )
                """
            )
        where_clause = ""
        if clauses:
            where_clause = "WHERE " + " AND ".join(clauses)
        rows = self.connection.execute(
            f"""
            SELECT DISTINCT
                e.citation_key,
                e.entry_type,
                e.review_status,
                e.title,
                e.year,
                e.doi,
                e.abstract
            FROM entries e
            {joins}
            {where_clause}
            ORDER BY COALESCE(e.year, ''), e.citation_key
            LIMIT ?
            """,
            (*params, limit),
        ).fetchall()
        return [dict(row) for row in rows]
    def ensure_topic(
        self,
        slug: str,

View File

@ -154,6 +154,87 @@ def test_cli_resolve_updates_entry(tmp_path: Path):
    assert payload["field_conflicts"][0]["field_name"] == "title"


def test_cli_resolve_stubs_preview_lists_doi_stub_candidates(tmp_path: Path):
    bib_path = tmp_path / "input.bib"
    bib_path.write_text(
        """
@misc{stubdoi,
title = {Referenced work 6},
doi = {10.1200/JCO.2002.04.117},
url = {https://doi.org/10.1200/JCO.2002.04.117}
}

@article{complete,
author = {Smith, Jane},
title = {Complete Record},
year = {2024},
doi = {10.1000/complete}
}
""",
        encoding="utf-8",
    )
    assert run_cli(tmp_path, "ingest", str(bib_path)).returncode == 0
    result = run_cli(tmp_path, "resolve-stubs", "--doi-only", "--preview", "--limit", "10")
    assert result.returncode == 0
    payload = json.loads(result.stdout)
    assert [row["citation_key"] for row in payload] == ["stubdoi"]
    assert payload[0]["title"] == "Referenced work 6"

def test_cli_resolve_stubs_enriches_matching_candidates(tmp_path: Path):
    bib_path = tmp_path / "input.bib"
    bib_path.write_text(
        """
@misc{stubdoi,
title = {Referenced work 6},
doi = {10.1200/JCO.2002.04.117},
url = {https://doi.org/10.1200/JCO.2002.04.117}
}
""",
        encoding="utf-8",
    )
    assert run_cli(tmp_path, "ingest", str(bib_path)).returncode == 0
    from citegeist.bibtex import BibEntry
    from citegeist.resolve import Resolution

    database = tmp_path / "library.sqlite3"
    with patch("citegeist.cli.MetadataResolver.resolve_entry") as mocked_resolve:
        mocked_resolve.return_value = Resolution(
            entry=BibEntry(
                entry_type="article",
                citation_key="resolvedkey",
                fields={
                    "author": "Doe, Alex",
                    "title": "Resolved Work",
                    "year": "2002",
                    "doi": "10.1200/JCO.2002.04.117",
                    "journal": "Journal of Clinical Oncology",
                },
            ),
            source_type="resolver",
            source_label="crossref:doi:10.1200/JCO.2002.04.117",
        )
        exit_code = main(
            [
                "--db",
                str(database),
                "resolve-stubs",
                "--doi-only",
                "--limit",
                "10",
            ]
        )
    assert exit_code == 0
    show = run_cli(tmp_path, "show", "stubdoi")
    payload = json.loads(show.stdout)
    assert payload["title"] == "Resolved Work"
    assert payload["review_status"] == "enriched"

def test_cli_resolve_conflicts_updates_status(tmp_path: Path):
    bib_path = tmp_path / "input.bib"
    bib_path.write_text(

View File

@ -108,6 +108,25 @@ def test_merge_entries_with_conflicts_records_disagreements():
]
def test_merge_entries_replaces_placeholder_titles_without_conflict():
    base = BibEntry(
        entry_type="misc",
        citation_key="stubdoi",
        fields={"title": "Referenced work 6", "doi": "10.1200/JCO.2002.04.117"},
    )
    resolved = BibEntry(
        entry_type="article",
        citation_key="resolved",
        fields={"title": "Resolved Work", "journal": "Journal of Clinical Oncology"},
    )
    merged, conflicts = merge_entries_with_conflicts(base, resolved)
    assert merged.fields["title"] == "Resolved Work"
    assert merged.fields["journal"] == "Journal of Clinical Oncology"
    assert conflicts == []

def test_resolver_tries_doi_before_dblp():
    resolver = MetadataResolver()
    calls: list[tuple[str, str]] = []

View File

@ -281,6 +281,46 @@ def test_store_can_set_topic_expansion_phrase():
        store.close()

def test_store_lists_stub_resolution_candidates():
    store = BibliographyStore()
    try:
        store.ingest_bibtex(
            """
@misc{stubdoi,
title = {Referenced work 6},
doi = {10.1200/JCO.2002.04.117},
url = {https://doi.org/10.1200/JCO.2002.04.117}
}

@article{complete,
author = {Smith, Jane},
title = {Complete Record},
year = {2024},
doi = {10.1000/complete}
}
"""
        )
        store.add_entry_topic(
            "stubdoi",
            topic_slug="artificial-life",
            topic_name="Artificial life",
            source_label="test",
        )
        candidates = store.list_resolution_candidates(limit=10, doi_only=True, stub_only=True)
        assert [row["citation_key"] for row in candidates] == ["stubdoi"]
        topic_candidates = store.list_resolution_candidates(
            limit=10,
            doi_only=True,
            stub_only=True,
            topic_slug="artificial-life",
        )
        assert [row["citation_key"] for row in topic_candidates] == ["stubdoi"]
    finally:
        store.close()

def test_store_can_stage_and_review_topic_phrase_suggestion():
    store = BibliographyStore()
    try: