From b74582b72f09f36b63e459c26e3cc7ea3d0696c2 Mon Sep 17 00:00:00 2001 From: welsberr Date: Fri, 20 Mar 2026 07:42:49 -0400 Subject: [PATCH] Add topic review workflow and expansion tooling --- Makefile | 5 +- README.md | 102 ++- src/citegeist/__init__.py | 36 +- src/citegeist/batch.py | 78 ++ src/citegeist/bibtex.py | 60 +- src/citegeist/bootstrap.py | 145 ++++ src/citegeist/cli.py | 947 +++++++++++++++++++++- src/citegeist/expand.py | 309 +++++++ src/citegeist/harvest.py | 317 ++++++++ src/citegeist/resolve.py | 254 +++++- src/citegeist/sources.py | 12 +- src/citegeist/storage.py | 473 ++++++++++- src/citegeist/talkorigins.py | 1485 ++++++++++++++++++++++++++++++++++ tests/test_batch.py | 129 +++ tests/test_bootstrap.py | 175 ++++ tests/test_cli.py | 839 ++++++++++++++++++- tests/test_harvest.py | 293 +++++++ tests/test_resolve.py | 251 +++++- tests/test_sources.py | 11 + tests/test_storage.py | 247 ++++++ tests/test_talkorigins.py | 1024 +++++++++++++++++++++++ tests/test_topic_expand.py | 242 ++++++ 22 files changed, 7365 insertions(+), 69 deletions(-) create mode 100644 src/citegeist/batch.py create mode 100644 src/citegeist/bootstrap.py create mode 100644 src/citegeist/harvest.py create mode 100644 src/citegeist/talkorigins.py create mode 100644 tests/test_batch.py create mode 100644 tests/test_bootstrap.py create mode 100644 tests/test_harvest.py create mode 100644 tests/test_talkorigins.py create mode 100644 tests/test_topic_expand.py diff --git a/Makefile b/Makefile index 8b20c95..6456e47 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ PYTHONPATH_SRC=PYTHONPATH=src VENV_PYTHON=.venv/bin/python -.PHONY: test test-live live-smoke +.PHONY: test test-live live-smoke validate-talkorigins test: $(PYTHONPATH_SRC) $(VENV_PYTHON) -m pytest -q @@ -11,3 +11,6 @@ test-live: live-smoke: CITEGEIST_SOURCE_CACHE=.cache/citegeist $(PYTHONPATH_SRC) $(VENV_PYTHON) scripts/live_smoke.py + +validate-talkorigins: + $(PYTHONPATH_SRC) $(VENV_PYTHON) -m citegeist validate-talkorigins talkorigins-out/talkorigins_manifest.json diff --git a/README.md b/README.md index e795a94..d2294f7 100644 --- a/README.md +++ b/README.md @@ -46,12 +46,17 @@ The initial repo includes: - `pybtex`-backed BibTeX parsing and export in a repo-local virtual environment; - a SQLite-backed bibliography store; - a small CLI for ingest, search, inspection, and export; -- review-state tracking on entries and per-field ingest provenance; +- review-state tracking on entries, per-field ingest provenance, and field-level conflict review; - plaintext reference extraction into draft BibTeX for numbered, APA-like, wrapped-line, and simple book-style references; -- identifier-first metadata resolution for DOI, OpenAlex, DBLP, and arXiv-backed entries, with OpenAlex title-search fallback; +- identifier-first metadata resolution for DOI, OpenAlex, DBLP, arXiv, and DataCite-backed entries, with OpenAlex/DataCite title-search fallback; - local citation-graph traversal over stored `cites`, `cited_by`, and `crossref` edges; - Crossref- and OpenAlex-backed graph expansion that materializes draft related works and edge provenance; - a dedicated source-client layer with fixture/cache support for live-source development; +- OAI-PMH Dublin Core harvesting for institutional repositories and thesis/dissertation sources; +- OAI-PMH repository discovery via `Identify`, `ListSets`, and `ListMetadataFormats` to target harvests more precisely; +- bibliography bootstrap workflows that can start from a seed `.bib`, a topic phrase, or both; +- batch bootstrap 
orchestration from JSON job files containing seed BibTeX paths, topic phrases, or both; +- a TalkOrigins scraper that fixes repeated-author plaintext references, emits per-topic seed BibTeX files, and writes a batch JSON specification; - normalized tables for entries, creators, identifiers, and citation relations; - full-text-search-ready indexing over title, abstract, and fulltext when SQLite FTS5 is available; - tests covering parsing, ingestion, relation storage, and search. @@ -113,18 +118,107 @@ Or use the CLI directly: cd citegeist PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 ingest references.bib PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 search "semantic search" -PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 show --provenance smith2024graphs +PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 search "origin" --topic abiogenesis +PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 show --provenance --conflicts smith2024graphs PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 set-status smith2024graphs reviewed +PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 resolve-conflicts smith2024graphs title accepted +PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 apply-conflict smith2024graphs title +PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 bootstrap --seed-bib seed.bib --topic "bayesian nonparametrics" +PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 bootstrap --topic "bayesian nonparametrics" --preview --topic-commit-limit 5 +PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 scrape-talkorigins talkorigins-out --limit-topics 5 --limit-entries-per-topic 20 PYTHONPATH=src .venv/bin/python -m citegeist extract references.txt --output draft.bib PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 resolve smith2024graphs +PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 topics +PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 topic-entries abiogenesis +PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 export-topic abiogenesis --output abiogenesis.bib PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 graph smith2024graphs --relation cites --depth 2 --missing-only PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 expand smith2024graphs --source crossref PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 expand smith2024graphs --source openalex --relation cited_by --limit 10 +PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 expand-topic abiogenesis --topic-phrase "abiogenesis origin chemistry" --source openalex --relation cites --seed-key seed2024 --min-relevance 0.3 --preview +PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 set-topic-phrase abiogenesis "abiogenesis origin chemistry prebiotic" +PYTHONPATH=src .venv/bin/python -m citegeist discover-oai https://example.edu/oai +PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 harvest-oai https://example.edu/oai --metadata-prefix mods --from 2024-01-01 --until 2024-12-31 --limit 10 PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 export --output reviewed.bib ``` For live-source development, prefer fixture-backed or cache-backed source clients so resolver and expansion work can be exercised repeatedly without re-hitting upstream APIs on every run. 
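+
+Batch jobs for `bootstrap-batch` live in a plain JSON file: either a bare list of job objects or an object with a `jobs` list. A minimal sketch (the paths, phrases, and slugs are illustrative; relative `seed_bib` paths are resolved against the location of the JSON file itself):
+
+```json
+{
+  "jobs": [
+    {
+      "name": "abiogenesis_seed",
+      "seed_bib": "seeds/abiogenesis.bib",
+      "topic": "abiogenesis origin chemistry",
+      "topic_slug": "abiogenesis",
+      "topic_name": "Abiogenesis",
+      "topic_phrase": "abiogenesis origin chemistry prebiotic",
+      "topic_limit": 5,
+      "topic_commit_limit": 3,
+      "expand": true,
+      "status": "draft"
+    }
+  ]
+}
+```
+
+The TalkOrigins workflow below generates such a batch file (`talkorigins_jobs.json`) from its per-topic seeds.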
+For large legacy plaintext corpora such as the TalkOrigins bibliography, prefer a two-step workflow:
+
+1. `scrape-talkorigins` to generate cleaned per-topic `seed_bib` files plus a `talkorigins_jobs.json` batch spec.
+2. `bootstrap-batch` on that JSON file when you want to ingest, resolve, and expand from the generated seeds.
+
+The TalkOrigins scrape output now includes:
+
+- `seeds/*.bib` per-topic seed BibTeX files for `bootstrap-batch`
+- `plaintext/*.txt` per-topic cleaned GSA-style plaintext with repeated authors expanded
+- `site/topics/*.html` reconstructed topic pages with hide/show BibTeX blocks
+- `talkorigins_full.txt` and `talkorigins_full.bib` aggregate downloads
+- `snapshots/*.json` cached topic payloads so reruns can resume without re-fetching already scraped topics
+
+After a full scrape, run:
+
+```bash
+PYTHONPATH=src .venv/bin/python -m citegeist validate-talkorigins talkorigins-out/talkorigins_manifest.json
+PYTHONPATH=src .venv/bin/python -m citegeist duplicates-talkorigins talkorigins-out/talkorigins_manifest.json --limit 20
+PYTHONPATH=src .venv/bin/python -m citegeist duplicates-talkorigins talkorigins-out/talkorigins_manifest.json --limit 20 --preview --weak-only
+PYTHONPATH=src .venv/bin/python -m citegeist suggest-talkorigins-phrases talkorigins-out/talkorigins_manifest.json --output topic-phrases.json
+PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 stage-topic-phrases topic-phrases.json
+PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 review-topic-phrase abiogenesis accepted --notes "curated from local corpus"
+PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 apply-topic-phrases topic-phrases.json
+PYTHONPATH=src .venv/bin/python -m citegeist --db talkorigins.sqlite3 enrich-talkorigins talkorigins-out/talkorigins_manifest.json --limit 20
+PYTHONPATH=src .venv/bin/python -m citegeist --db talkorigins-copy.sqlite3 enrich-talkorigins talkorigins-out/talkorigins_manifest.json --limit 5 --apply --allow-unsafe-search-matches
+PYTHONPATH=src .venv/bin/python -m citegeist --db talkorigins.sqlite3 review-talkorigins talkorigins-out/talkorigins_manifest.json --output talkorigins-review.json
+PYTHONPATH=src .venv/bin/python -m citegeist --db talkorigins.sqlite3 apply-talkorigins-corrections talkorigins-out/talkorigins_manifest.json talkorigins-corrections.json
+```
+
+The `validate-talkorigins` report summarizes parse coverage and flags suspicious entry-type/venue combinations for manual cleanup.
+It also reports duplicate clusters across topic seed files so you can gauge how much deduplication to expect before ingestion.
+Use `duplicates-talkorigins` when you want to inspect specific clusters, filter by text, restrict the audit to one topic slug, or preview only weak canonicalization outcomes before importing.
+
+Use `suggest-talkorigins-phrases` to derive candidate stored expansion phrases from the existing TalkOrigins topic corpus itself. The output is a deterministic JSON list of per-topic records, each carrying the topic slug, a suggested phrase, and the extracted keywords that drove it. This is a useful first pass before setting topic phrases in the database or editing generated batch jobs.
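+
+A minimal sketch of a suggestion record (the values are illustrative, and the exact name of the keywords field is an assumption; `stage-topic-phrases` and `apply-topic-phrases` consume `slug` and `suggested_phrase`):
+
+```json
+[
+  {
+    "slug": "abiogenesis",
+    "suggested_phrase": "abiogenesis prebiotic chemistry origin of life",
+    "keywords": ["abiogenesis", "prebiotic", "chemistry", "origin", "life"]
+  }
+]
+```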
+Use `stage-topic-phrases` to load those suggestions into the database as review items. Staging stores the candidate in `suggested_phrase` and marks the topic `pending` without changing the active `expansion_phrase`.
+Use `review-topic-phrase` to accept or reject one staged suggestion in place. Accepting a suggestion copies it into `expansion_phrase`; rejecting it records the decision without changing the live phrase.
+Use `apply-topic-phrases` when you want a direct patch path instead of the staged review flow. It accepts either the raw suggestion list or an object with a `topics` list, and applies `suggested_phrase` or `phrase` to matching topic slugs immediately.
+Use `enrich-talkorigins` when you want to target weak canonical entries flagged by the duplicate audit for resolver-based metadata upgrades before retrying graph expansion on imported topic slices.
+Use `review-talkorigins` when you want one JSON review artifact that combines weak canonical clusters with dry-run enrichment outcomes for manual cleanup.
+Use `expand-topic` when you already have both a topic phrase and a curated topic seed set in the database: it expands outward from the topic’s existing entries, then assigns discovered works back to that topic only if they clear a topic-relevance threshold. Write-enabled assignment is stricter than preview ranking: a candidate must clear the score threshold and show a non-generic title anchor to the topic phrase, so broad methods papers do not get attached just because their abstracts or related terms overlap (a minimal sketch of the relevance score appears at the end of this README). On large noisy topics, prefer `--seed-key` to restrict the run to just the trusted seed entries you want to expand from, and use `--preview` first to inspect discovered candidates and relevance scores before writing anything.
+
+Use `set-topic-phrase` to store a curated expansion phrase on the topic itself. When a stored phrase exists, `expand-topic` uses it automatically if you do not pass `--topic-phrase`. Batch bootstrap jobs can also set `topic_slug`, `topic_name`, and `topic_phrase` so curated topic metadata is created as part of the run.
+Use `topics --phrase-review-status pending` when you want to audit only topics whose staged phrase suggestions still need review.
+`--allow-unsafe-search-matches` exists only for bounded experiments on copied databases, where you explicitly relax match trust to exercise downstream expansion behavior.
+
+Correction files are simple JSON:
+
+```json
+{
+  "corrections": [
+    {
+      "key": "smith jane|1999|weak duplicate",
+      "entry_type": "article",
+      "review_status": "reviewed",
+      "fields": {
+        "journal": "Journal of Better Metadata",
+        "doi": "10.1000/weak",
+        "note": null
+      }
+    }
+  ]
+}
+```
+
+`fields` values overwrite the canonical entry for that duplicate-cluster key. Set a field to `null` to remove it.
+
+To import the reconstructed corpus into SQLite while collapsing duplicate works across topics into canonical entries:
+
+```bash
+PYTHONPATH=src .venv/bin/python -m citegeist --db talkorigins.sqlite3 ingest-talkorigins talkorigins-out/talkorigins_manifest.json
+```
+
+That import preserves many-to-many topic membership through the `topics` and `entry_topics` tables.
+After import, use `topics`, `topic-entries`, `search --topic`, and `export-topic` to inspect or export topic slices from the consolidated database.
+
 Live-source workflow:
 
 ```bash
@@ -147,7 +241,7 @@ make live-smoke
 
 ## Near-Term Priorities
 
-- additional resolvers and expansion paths for non-DOI scholarly ecosystems.
+- source adapters beyond OAI-PMH for additional non-DOI scholarly ecosystems.
 
 See [ROADMAP.md](./ROADMAP.md) for the prioritized phase plan and rationale.
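+
+## Topic-Relevance Scoring Sketch
+
+The relevance gate used by topic bootstrap and `expand-topic` is a plain token-overlap score. The sketch below mirrors `_topic_relevance_score` in `citegeist.bootstrap`; the exact scaling `expand-topic` compares against `--min-relevance`, and its additional title-anchor check, live in `citegeist.expand`.
+
+```python
+import re
+
+
+def tokenize(value: str) -> set[str]:
+    # Lowercased word tokens; empty fragments from the split are dropped.
+    return {token for token in re.split(r"\W+", value.lower()) if token}
+
+
+def topic_relevance(title: str, abstract: str, topic_phrase: str) -> float:
+    # Count how many topic terms appear anywhere in the title or abstract.
+    topic_terms = tokenize(topic_phrase)
+    document_terms = tokenize(title) | tokenize(abstract)
+    return float(len(topic_terms & document_terms))
+
+
+# All four topic terms appear in the title or abstract, so this prints 4.0.
+print(topic_relevance(
+    "Prebiotic chemistry and the origin of life",
+    "We survey abiogenesis scenarios.",
+    "abiogenesis origin chemistry prebiotic",
+))
+```
+
+A candidate below the threshold is still reported with its score in preview output, but it is never assigned back to the topic.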
diff --git a/src/citegeist/__init__.py b/src/citegeist/__init__.py index 952a02c..bacb1c7 100644 --- a/src/citegeist/__init__.py +++ b/src/citegeist/__init__.py @@ -1,18 +1,52 @@ +from .batch import BatchBootstrapRunner, BatchJobResult, load_batch_jobs from .bibtex import BibEntry, parse_bibtex +from .bootstrap import BootstrapResult, Bootstrapper from .expand import CrossrefExpander, OpenAlexExpander from .extract import extract_references -from .resolve import MetadataResolver, merge_entries +from .harvest import OaiMetadataFormat, OaiPmhHarvester, OaiSet +from .resolve import MetadataResolver, merge_entries, merge_entries_with_conflicts from .sources import SourceClient from .storage import BibliographyStore +from .talkorigins import ( + TalkOriginsBatchExport, + TalkOriginsDuplicateCluster, + TalkOriginsEnrichmentResult, + TalkOriginsIngestReport, + TalkOriginsReviewExport, + TalkOriginsScraper, + TalkOriginsSeedSet, + TalkOriginsTopicPhraseSuggestion, + TalkOriginsTopic, + TalkOriginsValidationReport, +) __all__ = [ "BibEntry", + "BatchBootstrapRunner", + "BatchJobResult", "BibliographyStore", + "BootstrapResult", + "Bootstrapper", "CrossrefExpander", "MetadataResolver", "OpenAlexExpander", + "OaiPmhHarvester", + "OaiMetadataFormat", + "OaiSet", "SourceClient", + "TalkOriginsBatchExport", + "TalkOriginsDuplicateCluster", + "TalkOriginsEnrichmentResult", + "TalkOriginsIngestReport", + "TalkOriginsReviewExport", + "TalkOriginsScraper", + "TalkOriginsSeedSet", + "TalkOriginsTopicPhraseSuggestion", + "TalkOriginsTopic", + "TalkOriginsValidationReport", "extract_references", + "load_batch_jobs", "merge_entries", + "merge_entries_with_conflicts", "parse_bibtex", ] diff --git a/src/citegeist/batch.py b/src/citegeist/batch.py new file mode 100644 index 0000000..203a4a1 --- /dev/null +++ b/src/citegeist/batch.py @@ -0,0 +1,78 @@ +from __future__ import annotations + +import json +from dataclasses import dataclass +from pathlib import Path + +from .bootstrap import BootstrapResult, Bootstrapper +from .storage import BibliographyStore + + +@dataclass(slots=True) +class BatchJobResult: + job_name: str + result_count: int + results: list[BootstrapResult] + + +def load_batch_jobs(path: str | Path) -> list[dict]: + path = Path(path) + payload = json.loads(path.read_text(encoding="utf-8")) + if isinstance(payload, dict): + jobs = payload.get("jobs", []) + else: + jobs = payload + if not isinstance(jobs, list): + raise ValueError("Batch JSON must be a list of jobs or an object with a 'jobs' list") + normalized_jobs: list[dict] = [] + for job in jobs: + if not isinstance(job, dict): + raise ValueError("Each batch job must be an object") + normalized = dict(job) + seed_bib = normalized.get("seed_bib") + if isinstance(seed_bib, str) and seed_bib: + seed_path = Path(seed_bib) + if not seed_path.is_absolute(): + normalized["seed_bib"] = str((path.parent / seed_path).resolve()) + normalized_jobs.append(normalized) + return normalized_jobs + + +class BatchBootstrapRunner: + def __init__(self, bootstrapper: Bootstrapper | None = None) -> None: + self.bootstrapper = bootstrapper or Bootstrapper() + + def run(self, store: BibliographyStore, jobs: list[dict]) -> list[BatchJobResult]: + results: list[BatchJobResult] = [] + for index, job in enumerate(jobs, start=1): + seed_bib = job.get("seed_bib") + topic = job.get("topic") + topic_limit = int(job.get("topic_limit", 5)) + topic_commit_limit = job.get("topic_commit_limit") + expand = bool(job.get("expand", True)) + review_status = str(job.get("status", "draft")) 
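+            # A preview job ranks and reports candidates; Bootstrapper skips all database writes when preview_only is set.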
+ preview = bool(job.get("preview", False)) + name = str(job.get("name") or f"job_{index}") + topic_slug = job.get("topic_slug") + topic_name = job.get("topic_name") + topic_phrase = job.get("topic_phrase") + + seed_bibtex = None + if seed_bib: + seed_bibtex = Path(seed_bib).read_text(encoding="utf-8") + + job_results = self.bootstrapper.bootstrap( + store, + seed_bibtex=seed_bibtex, + topic=topic, + topic_limit=topic_limit, + topic_commit_limit=int(topic_commit_limit) if topic_commit_limit is not None else None, + expand=expand, + review_status=review_status, + preview_only=preview, + topic_slug=str(topic_slug) if topic_slug else None, + topic_name=str(topic_name) if topic_name else None, + topic_phrase=str(topic_phrase) if topic_phrase else None, + ) + results.append(BatchJobResult(name, len(job_results), job_results)) + return results diff --git a/src/citegeist/bibtex.py b/src/citegeist/bibtex.py index 41ed97d..d815b9a 100644 --- a/src/citegeist/bibtex.py +++ b/src/citegeist/bibtex.py @@ -5,8 +5,10 @@ from io import StringIO try: from pybtex.database import BibliographyData, Entry, Person, parse_string + from pybtex.bibtex.exceptions import BibTeXError from pybtex.database.output.bibtex import Writer except ImportError: # pragma: no cover - exercised only outside the configured venv + BibTeXError = None BibliographyData = Entry = Person = Writer = None parse_string = None @@ -40,7 +42,11 @@ def render_bibtex(entries: list[BibEntry]) -> str: _require_pybtex() bibliography_entries = {} for entry in entries: - fields = {key: value for key, value in entry.fields.items() if key not in {"author", "editor"}} + fields = { + key: _sanitize_bibtex_value(value) + for key, value in entry.fields.items() + if key not in {"author", "editor"} + } persons = {} for role in ("author", "editor"): raw_names = entry.fields.get(role) @@ -49,7 +55,24 @@ def render_bibtex(entries: list[BibEntry]) -> str: bibliography_entries[entry.citation_key] = Entry(entry.entry_type, fields=fields, persons=persons) buffer = StringIO() - Writer().write_stream(BibliographyData(entries=bibliography_entries), buffer) + try: + Writer().write_stream(BibliographyData(entries=bibliography_entries), buffer) + except BibTeXError: + conservative_entries = {} + for entry in entries: + fields = { + key: _flatten_bibtex_braces(value) + for key, value in entry.fields.items() + if key not in {"author", "editor"} + } + persons = {} + for role in ("author", "editor"): + raw_names = entry.fields.get(role) + if raw_names: + persons[role] = [Person(name.strip()) for name in raw_names.split(" and ") if name.strip()] + conservative_entries[entry.citation_key] = Entry(entry.entry_type, fields=fields, persons=persons) + buffer = StringIO() + Writer().write_stream(BibliographyData(entries=conservative_entries), buffer) return buffer.getvalue().strip() @@ -58,3 +81,36 @@ def _require_pybtex() -> None: raise RuntimeError( "pybtex is required. Use the repo-local virtual environment under .venv/ for citegeist commands." 
        )
+
+
+def _sanitize_bibtex_value(value: str) -> str:
+    # Balance braces without disturbing properly nested groups: an unmatched
+    # closing brace becomes ")" immediately, and any opening braces still
+    # unmatched at the end of the value are rewritten to "(" at their recorded
+    # positions, leaving matched pairs elsewhere in the value intact.
+    parts: list[str] = []
+    open_positions: list[int] = []
+    for char in value:
+        if char == "{":
+            open_positions.append(len(parts))
+            parts.append(char)
+            continue
+        if char == "}":
+            if open_positions:
+                open_positions.pop()
+                parts.append(char)
+            else:
+                parts.append(")")
+            continue
+        parts.append(char)
+    for index in open_positions:
+        parts[index] = "("
+    return "".join(parts)
+
+
+def _flatten_bibtex_braces(value: str) -> str:
+    return value.replace("{", "(").replace("}", ")")
diff --git a/src/citegeist/bootstrap.py b/src/citegeist/bootstrap.py
new file mode 100644
index 0000000..80bb4e6
--- /dev/null
+++ b/src/citegeist/bootstrap.py
@@ -0,0 +1,145 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+import re
+
+from .bibtex import BibEntry, parse_bibtex
+from .expand import CrossrefExpander, OpenAlexExpander
+from .resolve import MetadataResolver
+from .storage import BibliographyStore
+
+
+@dataclass(slots=True)
+class BootstrapResult:
+    citation_key: str
+    origin: str
+    created: bool
+    score: float = 0.0
+
+
+class Bootstrapper:
+    def __init__(
+        self,
+        resolver: MetadataResolver | None = None,
+        crossref_expander: CrossrefExpander | None = None,
+        openalex_expander: OpenAlexExpander | None = None,
+    ) -> None:
+        self.resolver = resolver or MetadataResolver()
+        self.crossref_expander = crossref_expander or CrossrefExpander(self.resolver)
+        self.openalex_expander = openalex_expander or OpenAlexExpander(self.resolver)
+
+    def bootstrap(
+        self,
+        store: BibliographyStore,
+        seed_bibtex: str | None = None,
+        topic: str | None = None,
+        topic_limit: int = 5,
+        topic_commit_limit: int | None = None,
+        expand: bool = True,
+        review_status: str = "draft",
+        preview_only: bool = False,
+        topic_slug: str | None = None,
+        topic_name: str | None = None,
+        topic_phrase: str | None = None,
+    ) -> list[BootstrapResult]:
+        results: list[BootstrapResult] = []
+        seed_keys: list[str] = []
+
+        if seed_bibtex:
+            for entry in parse_bibtex(seed_bibtex):
+                created = store.get_entry(entry.citation_key) is None
+                if not preview_only:
+                    store.upsert_entry(
+                        entry,
+                        raw_bibtex=None,
+                        source_type="bootstrap",
+                        source_label="seed_bibtex",
+                        review_status=review_status,
+                    )
+                seed_keys.append(entry.citation_key)
+                results.append(BootstrapResult(entry.citation_key, "seed_bibtex", created))
+
+        if topic:
+            if not preview_only and (topic_slug or topic_name or topic_phrase):
+                store.ensure_topic(
+                    slug=topic_slug or _slugify(topic),
+                    name=topic_name or topic,
+                    source_type="bootstrap",
+                    expansion_phrase=topic_phrase or topic,
+                )
+            ranked_candidates = self._topic_candidates(topic, seed_keys, topic_limit)
+            if topic_commit_limit is not None:
+                ranked_candidates = ranked_candidates[:topic_commit_limit]
+
+            for entry, score in ranked_candidates:
+                created = store.get_entry(entry.citation_key) is None
+                if not preview_only:
+                    store.upsert_entry(
+                        entry,
+                        raw_bibtex=None,
+                        source_type="bootstrap",
+                        source_label=f"topic:{topic}",
+                        review_status=review_status,
+                    )
+                seed_keys.append(entry.citation_key)
+                results.append(BootstrapResult(entry.citation_key, "topic", created, score=score))
+
+        if expand and not preview_only:
+            expanded_keys = list(dict.fromkeys(seed_keys))
+            for citation_key in expanded_keys:
+                for item in self.crossref_expander.expand_entry_references(store, citation_key):
+                    
results.append(BootstrapResult(item.discovered_citation_key, "crossref_expand", item.created_entry)) + for item in self.openalex_expander.expand_entry(store, citation_key, relation_type="cites", limit=topic_limit): + results.append(BootstrapResult(item.discovered_citation_key, "openalex_expand", item.created_entry)) + + store.connection.commit() + return results + + def _topic_candidates(self, topic: str, seed_keys: list[str], limit: int) -> list[tuple[BibEntry, float]]: + scored: dict[str, tuple[BibEntry, float]] = {} + + for source_name, base_score, entries in ( + ("openalex", 3.0, self.resolver.search_openalex(topic, limit=limit)), + ("crossref", 2.0, self.resolver.search_crossref(topic, limit=limit)), + ("datacite", 1.5, self.resolver.search_datacite(topic, limit=limit)), + ): + for entry in entries: + score = base_score + _topic_relevance_score(entry, topic) + _seed_overlap_score(entry, seed_keys) + existing = scored.get(entry.citation_key) + if existing is None or score > existing[1]: + scored[entry.citation_key] = (entry, score) + + ranked = sorted( + scored.values(), + key=lambda item: (-item[1], item[0].citation_key), + ) + return ranked[:limit] + + +def _topic_relevance_score(entry: BibEntry, topic: str) -> float: + topic_terms = _tokenize(topic) + title_terms = _tokenize(entry.fields.get("title", "")) + abstract_terms = _tokenize(entry.fields.get("abstract", "")) + overlap = len(topic_terms & (title_terms | abstract_terms)) + return float(overlap) + + +def _seed_overlap_score(entry: BibEntry, seed_keys: list[str]) -> float: + if not seed_keys: + return 0.0 + title_terms = _tokenize(entry.fields.get("title", "")) + score = 0.0 + for seed_key in seed_keys: + seed_terms = _tokenize(seed_key) + if seed_terms & title_terms: + score += 0.25 + return score + + +def _tokenize(value: str) -> set[str]: + return {token for token in re.split(r"\W+", value.lower()) if token} + + +def _slugify(value: str) -> str: + slug = re.sub(r"[^a-z0-9]+", "-", value.lower()).strip("-") + return slug or "topic" diff --git a/src/citegeist/cli.py b/src/citegeist/cli.py index 2af8de9..7973e87 100644 --- a/src/citegeist/cli.py +++ b/src/citegeist/cli.py @@ -6,11 +6,15 @@ import json import sys from pathlib import Path +from .batch import BatchBootstrapRunner, load_batch_jobs from .bibtex import parse_bibtex, render_bibtex -from .expand import CrossrefExpander, OpenAlexExpander +from .bootstrap import Bootstrapper +from .expand import CrossrefExpander, OpenAlexExpander, TopicExpander from .extract import extract_references -from .resolve import MetadataResolver, merge_entries +from .harvest import OaiPmhHarvester +from .resolve import MetadataResolver, merge_entries_with_conflicts from .storage import BibliographyStore +from .talkorigins import TalkOriginsScraper def build_parser() -> argparse.ArgumentParser: @@ -27,11 +31,13 @@ def build_parser() -> argparse.ArgumentParser: search_parser = subparsers.add_parser("search", help="Search titles, abstracts, and fulltext") search_parser.add_argument("query", help="Search query") search_parser.add_argument("--limit", type=int, default=10, help="Maximum number of results") + search_parser.add_argument("--topic", help="Optional topic slug to filter search results") show_parser = subparsers.add_parser("show", help="Show one entry or list entries") show_parser.add_argument("citation_key", nargs="?", help="Citation key to show") show_parser.add_argument("--limit", type=int, default=20, help="Maximum entries when listing") show_parser.add_argument("--provenance", 
action="store_true", help="Include field provenance") + show_parser.add_argument("--conflicts", action="store_true", help="Include field conflicts") export_parser = subparsers.add_parser("export", help="Export entries as BibTeX") export_parser.add_argument("citation_keys", nargs="*", help="Optional citation keys to export") @@ -41,6 +47,18 @@ def build_parser() -> argparse.ArgumentParser: status_parser.add_argument("citation_key", help="Citation key to update") status_parser.add_argument("review_status", help="New review status") + conflict_parser = subparsers.add_parser("resolve-conflicts", help="Update conflict review status for one field") + conflict_parser.add_argument("citation_key", help="Citation key to update") + conflict_parser.add_argument("field_name", help="Field name whose open conflicts should be updated") + conflict_parser.add_argument("status", choices=["accepted", "rejected"], help="New conflict status") + + apply_conflict_parser = subparsers.add_parser( + "apply-conflict", + help="Accept the proposed value for the latest open conflict on a field", + ) + apply_conflict_parser.add_argument("citation_key", help="Citation key to update") + apply_conflict_parser.add_argument("field_name", help="Field name whose proposed value should be applied") + extract_parser = subparsers.add_parser("extract", help="Extract draft BibTeX from plaintext references") extract_parser.add_argument("input", help="Plaintext file containing bibliography-style references") extract_parser.add_argument("--output", help="Write extracted BibTeX to a file instead of stdout") @@ -81,6 +99,322 @@ def build_parser() -> argparse.ArgumentParser: ) expand_parser.add_argument("--limit", type=int, default=25, help="Maximum related works to fetch per seed") + expand_topic_parser = subparsers.add_parser( + "expand-topic", + help="Expand one topic from its existing seed entries and assign only relevant discoveries back to that topic", + ) + expand_topic_parser.add_argument("topic_slug", help="Topic slug to expand from") + expand_topic_parser.add_argument( + "--topic-phrase", + help="Optional phrase used for relevance gating; defaults to the stored topic name", + ) + expand_topic_parser.add_argument( + "--source", + choices=["crossref", "openalex"], + default="openalex", + help="External source used for topic expansion", + ) + expand_topic_parser.add_argument( + "--relation", + choices=["cites", "cited_by"], + default="cites", + help="Graph direction to expand for sources that support it", + ) + expand_topic_parser.add_argument("--seed-limit", type=int, default=25, help="Maximum topic seed entries to expand from") + expand_topic_parser.add_argument("--per-seed-limit", type=int, default=25, help="Maximum discovered works to fetch per seed") + expand_topic_parser.add_argument( + "--seed-key", + action="append", + dest="seed_keys", + help="Restrict expansion to one trusted seed entry; may be passed multiple times", + ) + expand_topic_parser.add_argument( + "--min-relevance", + type=float, + default=0.2, + help="Minimum topic-term overlap score required to assign a discovered work back to the topic", + ) + expand_topic_parser.add_argument( + "--preview", + action="store_true", + help="Discover and score candidate expansions without writing entries, relations, or topic assignments", + ) + + set_topic_phrase_parser = subparsers.add_parser( + "set-topic-phrase", + help="Set or clear the stored expansion phrase for one topic", + ) + set_topic_phrase_parser.add_argument("topic_slug", help="Topic slug to update") + 
set_topic_phrase_parser.add_argument( + "phrase", + nargs="?", + help="Expansion phrase to store; omit with --clear to remove it", + ) + set_topic_phrase_parser.add_argument( + "--clear", + action="store_true", + help="Clear the stored expansion phrase for this topic", + ) + + harvest_parser = subparsers.add_parser("harvest-oai", help="Harvest draft entries from an OAI-PMH repository") + harvest_parser.add_argument("base_url", help="OAI-PMH base URL") + harvest_parser.add_argument("--metadata-prefix", default="oai_dc", help="OAI-PMH metadataPrefix to harvest") + harvest_parser.add_argument("--set", dest="set_spec", help="Optional OAI-PMH set spec") + harvest_parser.add_argument("--from", dest="date_from", help="Optional OAI-PMH lower date bound") + harvest_parser.add_argument("--until", dest="date_until", help="Optional OAI-PMH upper date bound") + harvest_parser.add_argument("--limit", type=int, default=20, help="Maximum harvested records to ingest") + harvest_parser.add_argument("--status", default="draft", help="Initial review status") + + discover_parser = subparsers.add_parser("discover-oai", help="Inspect OAI-PMH repository identity and sets") + discover_parser.add_argument("base_url", help="OAI-PMH base URL") + + bootstrap_parser = subparsers.add_parser( + "bootstrap", + help="Start bibliography expansion from a seed BibTeX file, a topic phrase, or both", + ) + bootstrap_parser.add_argument("--seed-bib", help="Optional seed BibTeX file") + bootstrap_parser.add_argument("--topic", help="Optional topic phrase") + bootstrap_parser.add_argument("--topic-slug", help="Optional stored topic slug for this bootstrap topic") + bootstrap_parser.add_argument("--topic-name", help="Optional stored topic name for this bootstrap topic") + bootstrap_parser.add_argument( + "--store-topic-phrase", + help="Optional stored expansion phrase to save with the bootstrap topic; defaults to --topic when topic metadata is provided", + ) + bootstrap_parser.add_argument("--topic-limit", type=int, default=5, help="Maximum topic-search seed candidates") + bootstrap_parser.add_argument( + "--topic-commit-limit", + type=int, + help="Maximum ranked topic candidates to actually commit and expand", + ) + bootstrap_parser.add_argument( + "--no-expand", + action="store_true", + help="Do not run immediate graph expansion after seeding", + ) + bootstrap_parser.add_argument( + "--preview", + action="store_true", + help="Preview ranked bootstrap candidates without writing to the database or expanding", + ) + bootstrap_parser.add_argument("--status", default="draft", help="Initial review status for imported entries") + + batch_parser = subparsers.add_parser( + "bootstrap-batch", + help="Run multiple bootstrap jobs from a JSON specification file", + ) + batch_parser.add_argument("input", help="Path to batch JSON file") + + talkorigins_parser = subparsers.add_parser( + "scrape-talkorigins", + help="Scrape TalkOrigins into per-topic seed BibTeX files and a bootstrap-batch JSON file", + ) + talkorigins_parser.add_argument( + "output_dir", + help="Directory where seed BibTeX files, manifest, and batch JSON should be written", + ) + talkorigins_parser.add_argument( + "--base-url", + default="https://www.talkorigins.org/origins/biblio/", + help="TalkOrigins bibliography index URL", + ) + talkorigins_parser.add_argument("--limit-topics", type=int, help="Limit the number of scraped topic pages") + talkorigins_parser.add_argument( + "--limit-entries-per-topic", + type=int, + help="Limit the number of parsed references per topic page", 
+ ) + talkorigins_parser.add_argument( + "--resolve-seeds", + action="store_true", + help="Attempt metadata resolution on parsed seed entries before writing BibTeX", + ) + talkorigins_parser.add_argument( + "--ingest", + action="store_true", + help="Also ingest the generated seed BibTeX into the configured database", + ) + talkorigins_parser.add_argument( + "--no-expand", + action="store_true", + help="Write generated batch jobs with graph expansion disabled", + ) + talkorigins_parser.add_argument( + "--no-resume", + action="store_true", + help="Do not reuse saved TalkOrigins topic snapshots from a prior run", + ) + talkorigins_parser.add_argument( + "--topic-limit", + type=int, + default=5, + help="Default bootstrap topic-search limit to include in generated jobs", + ) + talkorigins_parser.add_argument( + "--topic-commit-limit", + type=int, + help="Default bootstrap topic commit limit to include in generated jobs", + ) + talkorigins_parser.add_argument("--status", default="draft", help="Review status for generated seed jobs") + + validate_talkorigins_parser = subparsers.add_parser( + "validate-talkorigins", + help="Validate a generated TalkOrigins manifest and report parse coverage and suspicious entries", + ) + validate_talkorigins_parser.add_argument("manifest", help="Path to talkorigins_manifest.json") + + suggest_talkorigins_parser = subparsers.add_parser( + "suggest-talkorigins-phrases", + help="Suggest stored topic expansion phrases from a TalkOrigins manifest", + ) + suggest_talkorigins_parser.add_argument("manifest", help="Path to talkorigins_manifest.json") + suggest_talkorigins_parser.add_argument("--topic", help="Optional topic slug to restrict suggestions") + suggest_talkorigins_parser.add_argument("--limit", type=int, help="Maximum topics to include") + suggest_talkorigins_parser.add_argument("--output", help="Write suggestions JSON to a file instead of stdout") + + apply_topic_phrases_parser = subparsers.add_parser( + "apply-topic-phrases", + help="Apply stored topic expansion phrases from a JSON suggestion or patch file", + ) + apply_topic_phrases_parser.add_argument("input", help="Path to JSON file containing topic phrase records") + + stage_topic_phrases_parser = subparsers.add_parser( + "stage-topic-phrases", + help="Stage topic phrase suggestions from JSON for later review in the database", + ) + stage_topic_phrases_parser.add_argument("input", help="Path to JSON file containing topic phrase records") + + review_topic_phrase_parser = subparsers.add_parser( + "review-topic-phrase", + help="Accept or reject one staged topic phrase suggestion", + ) + review_topic_phrase_parser.add_argument("topic_slug", help="Topic slug to review") + review_topic_phrase_parser.add_argument("status", choices=["accepted", "rejected"], help="Review decision") + review_topic_phrase_parser.add_argument( + "--notes", + help="Optional review notes to store with the decision", + ) + review_topic_phrase_parser.add_argument( + "--phrase", + help="Optional expansion phrase override to apply with the review decision", + ) + + duplicates_talkorigins_parser = subparsers.add_parser( + "duplicates-talkorigins", + help="Inspect duplicate clusters in a generated TalkOrigins manifest", + ) + duplicates_talkorigins_parser.add_argument("manifest", help="Path to talkorigins_manifest.json") + duplicates_talkorigins_parser.add_argument("--limit", type=int, default=20, help="Maximum clusters to show") + duplicates_talkorigins_parser.add_argument( + "--min-count", + type=int, + default=2, + help="Minimum cluster 
size to include", + ) + duplicates_talkorigins_parser.add_argument("--match", help="Optional text filter for duplicate clusters") + duplicates_talkorigins_parser.add_argument("--topic", help="Optional topic slug to restrict inspection") + duplicates_talkorigins_parser.add_argument( + "--preview", + action="store_true", + help="Include the canonical merged entry that ingest-talkorigins would choose", + ) + duplicates_talkorigins_parser.add_argument( + "--weak-only", + action="store_true", + help="Show only clusters whose canonical preview still looks weak", + ) + + ingest_talkorigins_parser = subparsers.add_parser( + "ingest-talkorigins", + help="Ingest a TalkOrigins manifest into the database with duplicate consolidation and topic membership", + ) + ingest_talkorigins_parser.add_argument("manifest", help="Path to talkorigins_manifest.json") + ingest_talkorigins_parser.add_argument("--status", default="draft", help="Review status for imported entries") + ingest_talkorigins_parser.add_argument( + "--no-dedupe", + action="store_true", + help="Disable duplicate consolidation and import each parsed entry separately", + ) + + enrich_talkorigins_parser = subparsers.add_parser( + "enrich-talkorigins", + help="Attempt metadata enrichment for weak TalkOrigins canonical entries", + ) + enrich_talkorigins_parser.add_argument("manifest", help="Path to talkorigins_manifest.json") + enrich_talkorigins_parser.add_argument("--limit", type=int, default=20, help="Maximum weak clusters to inspect") + enrich_talkorigins_parser.add_argument( + "--min-count", + type=int, + default=2, + help="Minimum duplicate-cluster size to include", + ) + enrich_talkorigins_parser.add_argument("--match", help="Optional text filter for weak canonical clusters") + enrich_talkorigins_parser.add_argument("--topic", help="Optional topic slug to restrict enrichment") + enrich_talkorigins_parser.add_argument( + "--apply", + action="store_true", + help="Write successful enrichments back into the configured database", + ) + enrich_talkorigins_parser.add_argument( + "--allow-unsafe-search-matches", + action="store_true", + help="Allow low-trust title-search resolver matches for bounded experiments on copied databases", + ) + enrich_talkorigins_parser.add_argument( + "--status", + default="enriched", + help="Review status to set when applying successful enrichments", + ) + + review_talkorigins_parser = subparsers.add_parser( + "review-talkorigins", + help="Export weak TalkOrigins clusters plus dry-run enrichment outcomes for manual review", + ) + review_talkorigins_parser.add_argument("manifest", help="Path to talkorigins_manifest.json") + review_talkorigins_parser.add_argument("--limit", type=int, default=20, help="Maximum weak clusters to export") + review_talkorigins_parser.add_argument( + "--min-count", + type=int, + default=2, + help="Minimum duplicate-cluster size to include", + ) + review_talkorigins_parser.add_argument("--match", help="Optional text filter for weak canonical clusters") + review_talkorigins_parser.add_argument("--topic", help="Optional topic slug to restrict review export") + review_talkorigins_parser.add_argument("--output", help="Write review export JSON to a file instead of stdout") + + apply_review_talkorigins_parser = subparsers.add_parser( + "apply-talkorigins-corrections", + help="Apply curated TalkOrigins review corrections to the consolidated database", + ) + apply_review_talkorigins_parser.add_argument("manifest", help="Path to talkorigins_manifest.json") + 
apply_review_talkorigins_parser.add_argument("corrections", help="Path to corrections JSON") + apply_review_talkorigins_parser.add_argument( + "--status", + default="reviewed", + help="Default review status to set on corrected entries", + ) + + topics_parser = subparsers.add_parser("topics", help="List known topics in the database") + topics_parser.add_argument("--limit", type=int, default=100, help="Maximum number of topics to list") + topics_parser.add_argument( + "--phrase-review-status", + choices=["unreviewed", "pending", "accepted", "rejected"], + help="Restrict topics to one stored phrase review state", + ) + + topic_entries_parser = subparsers.add_parser( + "topic-entries", + help="List entries assigned to one topic", + ) + topic_entries_parser.add_argument("topic_slug", help="Topic slug to inspect") + topic_entries_parser.add_argument("--limit", type=int, default=100, help="Maximum entries to list") + + export_topic_parser = subparsers.add_parser( + "export-topic", + help="Export one topic slice as BibTeX", + ) + export_topic_parser.add_argument("topic_slug", help="Topic slug to export") + export_topic_parser.add_argument("--output", help="Write BibTeX to a file instead of stdout") + return parser @@ -93,13 +427,17 @@ def main(argv: list[str] | None = None) -> int: if args.command == "ingest": return _run_ingest(store, Path(args.input), args.status, args.source_label) if args.command == "search": - return _run_search(store, args.query, args.limit) + return _run_search(store, args.query, args.limit, args.topic) if args.command == "show": - return _run_show(store, args.citation_key, args.limit, args.provenance) + return _run_show(store, args.citation_key, args.limit, args.provenance, args.conflicts) if args.command == "export": return _run_export(store, args.citation_keys, args.output) if args.command == "set-status": return _run_set_status(store, args.citation_key, args.review_status) + if args.command == "resolve-conflicts": + return _run_resolve_conflicts(store, args.citation_key, args.field_name, args.status) + if args.command == "apply-conflict": + return _run_apply_conflict(store, args.citation_key, args.field_name) if args.command == "extract": return _run_extract(Path(args.input), args.output) if args.command == "resolve": @@ -115,6 +453,122 @@ def main(argv: list[str] | None = None) -> int: ) if args.command == "expand": return _run_expand(store, args.citation_keys, args.source, args.relation, args.limit) + if args.command == "expand-topic": + return _run_expand_topic( + store, + args.topic_slug, + args.topic_phrase, + args.source, + args.relation, + args.seed_limit, + args.per_seed_limit, + args.min_relevance, + args.seed_keys, + args.preview, + ) + if args.command == "set-topic-phrase": + return _run_set_topic_phrase(store, args.topic_slug, args.phrase, args.clear) + if args.command == "harvest-oai": + return _run_harvest_oai( + store, + args.base_url, + args.metadata_prefix, + args.set_spec, + args.date_from, + args.date_until, + args.limit, + args.status, + ) + if args.command == "discover-oai": + return _run_discover_oai(args.base_url) + if args.command == "bootstrap": + return _run_bootstrap( + store, + args.seed_bib, + args.topic, + args.topic_limit, + args.topic_commit_limit, + not args.no_expand, + args.status, + args.preview, + args.topic_slug, + args.topic_name, + args.store_topic_phrase, + ) + if args.command == "bootstrap-batch": + return _run_bootstrap_batch(store, Path(args.input)) + if args.command == "scrape-talkorigins": + return _run_scrape_talkorigins( + 
store, + args.base_url, + Path(args.output_dir), + args.limit_topics, + args.limit_entries_per_topic, + args.resolve_seeds, + args.ingest, + not args.no_expand, + not args.no_resume, + args.topic_limit, + args.topic_commit_limit, + args.status, + ) + if args.command == "validate-talkorigins": + return _run_validate_talkorigins(Path(args.manifest)) + if args.command == "suggest-talkorigins-phrases": + return _run_suggest_talkorigins_phrases(Path(args.manifest), args.topic, args.limit, args.output) + if args.command == "apply-topic-phrases": + return _run_apply_topic_phrases(store, Path(args.input)) + if args.command == "stage-topic-phrases": + return _run_stage_topic_phrases(store, Path(args.input)) + if args.command == "review-topic-phrase": + return _run_review_topic_phrase(store, args.topic_slug, args.status, args.notes, args.phrase) + if args.command == "duplicates-talkorigins": + return _run_duplicates_talkorigins( + Path(args.manifest), + args.limit, + args.min_count, + args.match, + args.topic, + args.preview, + args.weak_only, + ) + if args.command == "ingest-talkorigins": + return _run_ingest_talkorigins(store, Path(args.manifest), args.status, not args.no_dedupe) + if args.command == "enrich-talkorigins": + return _run_enrich_talkorigins( + store, + Path(args.manifest), + args.limit, + args.min_count, + args.match, + args.topic, + args.apply, + args.status, + args.allow_unsafe_search_matches, + ) + if args.command == "review-talkorigins": + return _run_review_talkorigins( + store, + Path(args.manifest), + args.limit, + args.min_count, + args.match, + args.topic, + args.output, + ) + if args.command == "apply-talkorigins-corrections": + return _run_apply_talkorigins_corrections( + store, + Path(args.manifest), + Path(args.corrections), + args.status, + ) + if args.command == "topics": + return _run_topics(store, args.limit, args.phrase_review_status) + if args.command == "topic-entries": + return _run_topic_entries(store, args.topic_slug, args.limit) + if args.command == "export-topic": + return _run_export_topic(store, args.topic_slug, args.output) finally: store.close() @@ -139,14 +593,20 @@ def _run_ingest( return 0 -def _run_search(store: BibliographyStore, query: str, limit: int) -> int: - for row in store.search_text(query, limit=limit): +def _run_search(store: BibliographyStore, query: str, limit: int, topic_slug: str | None) -> int: + for row in store.search_text(query, limit=limit, topic_slug=topic_slug): score = row.get("score", 0.0) print(f"{row['citation_key']}\t{row.get('year') or ''}\t{score:.3f}\t{row.get('title') or ''}") return 0 -def _run_show(store: BibliographyStore, citation_key: str | None, limit: int, provenance: bool) -> int: +def _run_show( + store: BibliographyStore, + citation_key: str | None, + limit: int, + provenance: bool, + conflicts: bool, +) -> int: if citation_key: entry = store.get_entry(citation_key) if entry is None: @@ -154,6 +614,8 @@ def _run_show(store: BibliographyStore, citation_key: str | None, limit: int, pr return 1 if provenance: entry["field_provenance"] = store.get_field_provenance(citation_key) + if conflicts: + entry["field_conflicts"] = store.get_field_conflicts(citation_key) print(json.dumps(entry, indent=2, sort_keys=True)) return 0 @@ -179,6 +641,23 @@ def _run_set_status(store: BibliographyStore, citation_key: str, review_status: return 0 +def _run_resolve_conflicts(store: BibliographyStore, citation_key: str, field_name: str, status: str) -> int: + count = store.set_conflict_status(citation_key, field_name, status) + if 
count == 0: + print(f"No open conflicts updated for {citation_key}:{field_name}", file=sys.stderr) + return 1 + print(f"{citation_key}\t{field_name}\t{status}\t{count}") + return 0 + + +def _run_apply_conflict(store: BibliographyStore, citation_key: str, field_name: str) -> int: + if not store.apply_conflict_value(citation_key, field_name): + print(f"No open conflict applied for {citation_key}:{field_name}", file=sys.stderr) + return 1 + print(f"{citation_key}\t{field_name}\tapplied") + return 0 + + def _run_extract(input_path: Path, output: str | None) -> int: text = input_path.read_text(encoding="utf-8") entries = extract_references(text) @@ -211,7 +690,7 @@ def _run_resolve(store: BibliographyStore, citation_keys: list[str]) -> int: print(f"No resolver match: {citation_key}", file=sys.stderr) exit_code = 1 continue - merged = merge_entries(current_entry, resolution.entry) + merged, conflicts = merge_entries_with_conflicts(current_entry, resolution.entry) store.replace_entry( citation_key, merged, @@ -219,6 +698,13 @@ def _run_resolve(store: BibliographyStore, citation_keys: list[str]) -> int: source_label=resolution.source_label, review_status="enriched", ) + if conflicts: + store.record_conflicts( + citation_key, + conflicts, + source_type=resolution.source_type, + source_label=resolution.source_label, + ) print(f"{citation_key}\t{resolution.source_label}") return exit_code @@ -266,3 +752,448 @@ def _run_expand( all_results.extend(expand_fn(citation_key)) print(json.dumps([asdict(result) for result in all_results], indent=2)) return 0 + + +def _run_expand_topic( + store: BibliographyStore, + topic_slug: str, + topic_phrase: str | None, + source: str, + relation: str, + seed_limit: int, + per_seed_limit: int, + min_relevance: float, + seed_keys: list[str] | None, + preview: bool, +) -> int: + expander = TopicExpander() + stored_topic = store.get_topic(topic_slug) + effective_phrase = topic_phrase + if effective_phrase is None and stored_topic is not None: + effective_phrase = str(stored_topic.get("expansion_phrase") or "") or None + results = expander.expand_topic( + store, + topic_slug, + topic_phrase=effective_phrase, + source=source, + relation_type=relation, + seed_limit=seed_limit, + per_seed_limit=per_seed_limit, + min_relevance=min_relevance, + seed_keys=seed_keys, + preview_only=preview, + ) + print(json.dumps([asdict(result) for result in results], indent=2)) + return 0 + + +def _run_set_topic_phrase( + store: BibliographyStore, + topic_slug: str, + phrase: str | None, + clear: bool, +) -> int: + if clear: + phrase = None + elif phrase is None: + print("set-topic-phrase requires a phrase or --clear", file=sys.stderr) + return 1 + if not store.set_topic_expansion_phrase(topic_slug, phrase): + print(f"Topic not found: {topic_slug}", file=sys.stderr) + return 1 + payload = store.get_topic(topic_slug) + print(json.dumps(payload, indent=2)) + return 0 + + +def _run_harvest_oai( + store: BibliographyStore, + base_url: str, + metadata_prefix: str, + set_spec: str | None, + date_from: str | None, + date_until: str | None, + limit: int, + review_status: str, +) -> int: + harvester = OaiPmhHarvester() + harvested = harvester.list_records( + base_url, + metadata_prefix=metadata_prefix, + set_spec=set_spec, + date_from=date_from, + date_until=date_until, + limit=limit, + ) + for result in harvested: + store.upsert_entry( + result.entry, + raw_bibtex=render_bibtex([result.entry]), + source_type="harvest", + source_label=f"oai:{result.base_url}", + review_status=review_status, + ) + 
print(result.entry.citation_key) + store.connection.commit() + return 0 + + +def _run_discover_oai(base_url: str) -> int: + harvester = OaiPmhHarvester() + payload = { + "identify": harvester.identify(base_url), + "metadata_formats": [asdict(result) for result in harvester.list_metadata_formats(base_url)], + "sets": [asdict(result) for result in harvester.list_sets(base_url)], + } + print(json.dumps(payload, indent=2, sort_keys=True)) + return 0 + + +def _run_bootstrap( + store: BibliographyStore, + seed_bib: str | None, + topic: str | None, + topic_limit: int, + topic_commit_limit: int | None, + expand: bool, + review_status: str, + preview: bool, + topic_slug: str | None, + topic_name: str | None, + stored_topic_phrase: str | None, +) -> int: + if not seed_bib and not topic: + print("bootstrap requires --seed-bib, --topic, or both", file=sys.stderr) + return 1 + + seed_bibtex = Path(seed_bib).read_text(encoding="utf-8") if seed_bib else None + bootstrapper = Bootstrapper() + results = bootstrapper.bootstrap( + store, + seed_bibtex=seed_bibtex, + topic=topic, + topic_limit=topic_limit, + topic_commit_limit=topic_commit_limit, + expand=expand, + review_status=review_status, + preview_only=preview, + topic_slug=topic_slug, + topic_name=topic_name, + topic_phrase=stored_topic_phrase, + ) + print(json.dumps([asdict(result) for result in results], indent=2)) + return 0 + + +def _run_bootstrap_batch(store: BibliographyStore, input_path: Path) -> int: + jobs = load_batch_jobs(input_path) + runner = BatchBootstrapRunner() + results = runner.run(store, jobs) + payload = [] + for job_result in results: + payload.append( + { + "job_name": job_result.job_name, + "result_count": job_result.result_count, + "results": [asdict(item) for item in job_result.results], + } + ) + print(json.dumps(payload, indent=2)) + return 0 + + +def _run_scrape_talkorigins( + store: BibliographyStore, + base_url: str, + output_dir: Path, + limit_topics: int | None, + limit_entries_per_topic: int | None, + resolve_seeds: bool, + ingest: bool, + expand: bool, + resume: bool, + topic_limit: int, + topic_commit_limit: int | None, + review_status: str, +) -> int: + scraper = TalkOriginsScraper() + export = scraper.scrape_to_directory( + base_url=base_url, + output_dir=output_dir, + limit_topics=limit_topics, + limit_entries_per_topic=limit_entries_per_topic, + resolve_seeds=resolve_seeds, + ingest_store=store if ingest else None, + review_status=review_status, + expand=expand, + resume=resume, + topic_limit=topic_limit, + topic_commit_limit=topic_commit_limit, + ) + print(json.dumps(asdict(export), indent=2)) + return 0 + + +def _run_validate_talkorigins(manifest_path: Path) -> int: + scraper = TalkOriginsScraper() + report = scraper.validate_export(manifest_path) + print(json.dumps(asdict(report), indent=2)) + return 0 + + +def _run_suggest_talkorigins_phrases( + manifest_path: Path, + topic_slug: str | None, + limit: int | None, + output: str | None, +) -> int: + scraper = TalkOriginsScraper() + suggestions = scraper.suggest_topic_phrases(manifest_path, limit=limit, topic_slug=topic_slug) + payload = json.dumps([asdict(item) for item in suggestions], indent=2) + if output: + Path(output).write_text(payload + "\n", encoding="utf-8") + else: + print(payload) + return 0 + + +def _run_apply_topic_phrases(store: BibliographyStore, input_path: Path) -> int: + payload = json.loads(input_path.read_text(encoding="utf-8")) + if isinstance(payload, dict): + items = payload.get("topics", []) + else: + items = payload + if not 
+    if not isinstance(items, list):
+        print("Topic phrase JSON must be a list or an object with a 'topics' list", file=sys.stderr)
+        return 1
+
+    results: list[dict[str, object]] = []
+    exit_code = 0
+    for item in items:
+        if not isinstance(item, dict):
+            continue
+        slug = str(item.get("slug") or "")
+        phrase = item.get("suggested_phrase", item.get("phrase"))
+        if not slug:
+            continue
+        if phrase is not None:
+            phrase = str(phrase)
+        applied = store.set_topic_expansion_phrase(slug, phrase)
+        if not applied:
+            exit_code = 1
+        results.append(
+            {
+                "slug": slug,
+                "expansion_phrase": phrase,
+                "applied": applied,
+            }
+        )
+    print(json.dumps(results, indent=2))
+    return exit_code
+
+
+def _run_stage_topic_phrases(store: BibliographyStore, input_path: Path) -> int:
+    payload = json.loads(input_path.read_text(encoding="utf-8"))
+    if isinstance(payload, dict):
+        items = payload.get("topics", payload.get("items", []))
+    else:
+        items = payload
+    if not isinstance(items, list):
+        print("Topic phrase JSON must be a list or an object with a 'topics' or 'items' list", file=sys.stderr)
+        return 1
+
+    results: list[dict[str, object]] = []
+    exit_code = 0
+    for item in items:
+        if not isinstance(item, dict):
+            continue
+        slug = str(item.get("slug") or "")
+        phrase = item.get("suggested_phrase", item.get("phrase"))
+        notes = item.get("review_notes")
+        if not slug:
+            continue
+        if phrase is not None:
+            phrase = str(phrase)
+        if notes is not None:
+            notes = str(notes)
+        staged = store.stage_topic_phrase_suggestion(
+            slug,
+            suggested_phrase=phrase,
+            review_status="pending",
+            review_notes=notes,
+        )
+        if not staged:
+            exit_code = 1
+        results.append(
+            {
+                "slug": slug,
+                "suggested_phrase": phrase,
+                "phrase_review_status": "pending",
+                "staged": staged,
+            }
+        )
+    print(json.dumps(results, indent=2))
+    return exit_code
+
+
+def _run_review_topic_phrase(
+    store: BibliographyStore,
+    topic_slug: str,
+    status: str,
+    notes: str | None,
+    phrase: str | None,
+) -> int:
+    if not store.review_topic_phrase_suggestion(
+        topic_slug,
+        review_status=status,
+        review_notes=notes,
+        applied_phrase=phrase,
+    ):
+        print(f"Topic not found: {topic_slug}", file=sys.stderr)
+        return 1
+    payload = store.get_topic(topic_slug)
+    print(json.dumps(payload, indent=2))
+    return 0
+
+
+def _run_duplicates_talkorigins(
+    manifest_path: Path,
+    limit: int,
+    min_count: int,
+    match: str | None,
+    topic_slug: str | None,
+    preview: bool,
+    weak_only: bool,
+) -> int:
+    scraper = TalkOriginsScraper()
+    clusters = scraper.inspect_duplicate_clusters(
+        manifest_path,
+        limit=limit,
+        min_count=min_count,
+        match=match,
+        topic_slug=topic_slug,
+        preview_canonical=preview,
+        weak_only=weak_only,
+    )
+    print(json.dumps([asdict(cluster) for cluster in clusters], indent=2))
+    return 0
+
+
+def _run_ingest_talkorigins(
+    store: BibliographyStore,
+    manifest_path: Path,
+    review_status: str,
+    dedupe: bool,
+) -> int:
+    scraper = TalkOriginsScraper()
+    report = scraper.ingest_export(
+        manifest_path,
+        store,
+        review_status=review_status,
+        dedupe=dedupe,
+    )
+    print(json.dumps(asdict(report), indent=2))
+    return 0
+
+
+def _run_enrich_talkorigins(
+    store: BibliographyStore,
+    manifest_path: Path,
+    limit: int,
+    min_count: int,
+    match: str | None,
+    topic_slug: str | None,
+    apply: bool,
+    review_status: str,
+    allow_unsafe_matches: bool,
+) -> int:
+    scraper = TalkOriginsScraper()
+    results = scraper.enrich_weak_canonicals(
+        manifest_path,
+        store,
+        limit=limit,
+        min_count=min_count,
+        match=match,
+        topic_slug=topic_slug,
+        apply=apply,
+        review_status=review_status,
+        allow_unsafe_matches=allow_unsafe_matches,
+    )
+    print(json.dumps([asdict(result) for result in results], indent=2))
+    return 0
+
+
+def _run_review_talkorigins(
+    store: BibliographyStore,
+    manifest_path: Path,
+    limit: int,
+    min_count: int,
+    match: str | None,
+    topic_slug: str | None,
+    output: str | None,
+) -> int:
+    scraper = TalkOriginsScraper()
+    review = scraper.build_review_export(
+        manifest_path,
+        store,
+        limit=limit,
+        min_count=min_count,
+        match=match,
+        topic_slug=topic_slug,
+    )
+    payload = json.dumps(asdict(review), indent=2)
+    if output:
+        Path(output).write_text(payload + "\n", encoding="utf-8")
+    else:
+        print(payload)
+    return 0
+
+
+def _run_apply_talkorigins_corrections(
+    store: BibliographyStore,
+    manifest_path: Path,
+    corrections_path: Path,
+    review_status: str,
+) -> int:
+    scraper = TalkOriginsScraper()
+    results = scraper.apply_review_corrections(
+        manifest_path,
+        corrections_path,
+        store,
+        default_review_status=review_status,
+    )
+    print(json.dumps([asdict(result) for result in results], indent=2))
+    return 0
+
+
+def _run_topics(store: BibliographyStore, limit: int, phrase_review_status: str | None) -> int:
+    print(json.dumps(store.list_topics(limit=limit, phrase_review_status=phrase_review_status), indent=2))
+    return 0
+
+
+def _run_topic_entries(store: BibliographyStore, topic_slug: str, limit: int) -> int:
+    topic = store.get_topic(topic_slug)
+    if topic is None:
+        print(f"Topic not found: {topic_slug}", file=sys.stderr)
+        return 1
+    payload = {
+        "topic": topic,
+        "entries": store.list_topic_entries(topic_slug, limit=limit),
+    }
+    print(json.dumps(payload, indent=2))
+    return 0
+
+
+def _run_export_topic(store: BibliographyStore, topic_slug: str, output: str | None) -> int:
+    topic = store.get_topic(topic_slug)
+    if topic is None:
+        print(f"Topic not found: {topic_slug}", file=sys.stderr)
+        return 1
+    citation_keys = [row["citation_key"] for row in store.list_topic_entries(topic_slug, limit=100000)]
+    rendered = store.export_bibtex(citation_keys)
+    if output:
+        Path(output).write_text(rendered + ("\n" if rendered else ""), encoding="utf-8")
+    else:
+        if rendered:
+            print(rendered)
+    return 0
diff --git a/src/citegeist/expand.py b/src/citegeist/expand.py
index a9079a1..b93943d 100644
--- a/src/citegeist/expand.py
+++ b/src/citegeist/expand.py
@@ -18,6 +18,20 @@ class ExpansionResult:
     source_label: str
 
 
+@dataclass(slots=True)
+class TopicExpansionResult:
+    topic_slug: str
+    source_citation_key: str
+    discovered_citation_key: str
+    discovered_title: str
+    created_entry: bool
+    relation_type: str
+    source_label: str
+    relevance_score: float
+    meets_relevance_threshold: bool
+    assigned_to_topic: bool
+
+
 class CrossrefExpander:
     def __init__(self, resolver: MetadataResolver | None = None) -> None:
         self.resolver = resolver or MetadataResolver()
@@ -163,6 +177,192 @@ class OpenAlexExpander:
         return _normalize_openalex_id(results[0].get("id", ""))
 
 
+class TopicExpander:
+    def __init__(
+        self,
+        crossref_expander: CrossrefExpander | None = None,
+        openalex_expander: OpenAlexExpander | None = None,
+    ) -> None:
+        self.crossref_expander = crossref_expander or CrossrefExpander()
+        self.openalex_expander = openalex_expander or OpenAlexExpander()
+
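+    # Illustrative call (slug hypothetical): preview OpenAlex-based
+    # expansion for one topic without writing topic assignments:
+    #
+    #     TopicExpander().expand_topic(
+    #         store, "human-origins", source="openalex",
+    #         relation_type="cites", preview_only=True,
+    #     )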
+    def expand_topic(
+        self,
+        store: BibliographyStore,
+        topic_slug: str,
+        topic_phrase: str | None = None,
+        source: str = "openalex",
+        relation_type: str = "cites",
+        seed_limit: int = 25,
+        per_seed_limit: int = 25,
+        min_relevance: float = 0.2,
+        seed_keys: list[str] | None = None,
+        preview_only: bool = False,
+    ) -> list[TopicExpansionResult]:
+        topic = store.get_topic(topic_slug)
+        if topic is None:
+            return []
+
+        phrase = (topic_phrase or str(topic.get("name") or topic_slug)).strip()
+        seeds = store.list_topic_entries(topic_slug, limit=seed_limit)
+        if seed_keys:
+            allowed = set(seed_keys)
+            seeds = [seed for seed in seeds if str(seed["citation_key"]) in allowed]
+        results: list[TopicExpansionResult] = []
+
+        for seed in seeds:
+            seed_key = str(seed["citation_key"])
+            if preview_only:
+                discovered_rows = self._preview_discoveries(
+                    store,
+                    seed_key,
+                    source=source,
+                    relation_type=relation_type,
+                    limit=per_seed_limit,
+                )
+            else:
+                discovered_rows = self._materialized_discoveries(
+                    store,
+                    seed_key,
+                    source=source,
+                    relation_type=relation_type,
+                    limit=per_seed_limit,
+                )
+
+            for row, target_entry in discovered_rows:
+                score = _topic_relevance_score(phrase, target_entry)
+                meets_threshold = _meets_topic_assignment_threshold(
+                    phrase,
+                    target_entry,
+                    min_relevance=min_relevance,
+                    relevance_score=score,
+                )
+                assigned = False
+                if not preview_only and meets_threshold and target_entry is not None:
+                    assigned = store.add_entry_topic(
+                        row.discovered_citation_key,
+                        topic_slug=topic_slug,
+                        topic_name=str(topic.get("name") or topic_slug),
+                        source_type="topic_expand",
+                        source_url=str(topic.get("source_url") or ""),
+                        source_label=f"{source}:{relation_type}:{seed_key}",
+                        confidence=score,
+                    )
+                results.append(
+                    TopicExpansionResult(
+                        topic_slug=topic_slug,
+                        source_citation_key=row.source_citation_key,
+                        discovered_citation_key=row.discovered_citation_key,
+                        # target_entry may be None for materialized rows.
+                        discovered_title=str((target_entry or {}).get("title") or ""),
+                        created_entry=row.created_entry,
+                        relation_type=row.relation_type,
+                        source_label=row.source_label,
+                        relevance_score=score,
+                        meets_relevance_threshold=meets_threshold,
+                        assigned_to_topic=assigned,
+                    )
+                )
+        store.connection.commit()
+        return results
+
+    def _materialized_discoveries(
+        self,
+        store: BibliographyStore,
+        citation_key: str,
+        source: str,
+        relation_type: str,
+        limit: int,
+    ) -> list[tuple[ExpansionResult, dict[str, object] | None]]:
+        if source == "crossref":
+            expansion_rows = self.crossref_expander.expand_entry_references(store, citation_key)
+        else:
+            expansion_rows = self.openalex_expander.expand_entry(
+                store,
+                citation_key,
+                relation_type=relation_type,
+                limit=limit,
+            )
+        return [(row, store.get_entry(row.discovered_citation_key)) for row in expansion_rows]
+
+    def _preview_discoveries(
+        self,
+        store: BibliographyStore,
+        citation_key: str,
+        source: str,
+        relation_type: str,
+        limit: int,
+    ) -> list[tuple[ExpansionResult, dict[str, object]]]:
+        if source == "crossref":
+            return self._preview_crossref_discoveries(store, citation_key, limit)
+        return self._preview_openalex_discoveries(store, citation_key, relation_type, limit)
+
+    def _preview_crossref_discoveries(
+        self,
+        store: BibliographyStore,
+        citation_key: str,
+        limit: int,
+    ) -> list[tuple[ExpansionResult, dict[str, object]]]:
+        entry = store.get_entry(citation_key)
+        if entry is None or not entry.get("doi"):
+            return []
+        doi = str(entry["doi"])
+        payload = self.crossref_expander.resolver.source_client.get_json(
+            f"https://api.crossref.org/works/{doi}?mailto=welsberr@gmail.com"
+        )
+        references = payload.get("message", {}).get("reference", [])[:limit]
+        rows: list[tuple[ExpansionResult, dict[str, object]]] = []
+        for index, reference in enumerate(references, start=1):
+            discovered = _crossref_reference_to_entry(reference, citation_key, index)
+            rows.append(
+                (
+                    ExpansionResult(
+                        source_citation_key=citation_key,
+                        discovered_citation_key=discovered.citation_key,
+                        created_entry=store.get_entry(discovered.citation_key) is None,
+                        relation_type="cites",
+                        source_label=f"crossref:references:{doi}",
+                    ),
+                    dict(discovered.fields),
+                )
+            )
+        return rows
+
+    def _preview_openalex_discoveries(
+        self,
+        store: BibliographyStore,
+        citation_key: str,
+        relation_type: str,
+        limit: int,
+    ) -> list[tuple[ExpansionResult, dict[str, object]]]:
+        entry = store.get_entry(citation_key)
+        if entry is None:
+            return []
+        openalex_id = entry.get("openalex") or self.openalex_expander._lookup_openalex_id(entry)
+        if not openalex_id:
+            return []
+        filter_name = "cited_by" if relation_type == "cites" else "cites"
+        query = urlencode({"filter": f"{filter_name}:{openalex_id}", "per-page": limit})
+        payload = self.openalex_expander.resolver.source_client.get_json(f"https://api.openalex.org/works?{query}")
+        works = payload.get("results", [])
+        rows: list[tuple[ExpansionResult, dict[str, object]]] = []
+        for work in works:
+            discovered = _openalex_work_to_entry(work)
+            source_key = citation_key if relation_type == "cites" else discovered.citation_key
+            rows.append(
+                (
+                    ExpansionResult(
+                        source_citation_key=source_key,
+                        discovered_citation_key=discovered.citation_key,
+                        created_entry=store.get_entry(discovered.citation_key) is None,
+                        relation_type=relation_type,
+                        source_label=f"openalex:{relation_type}:{openalex_id}",
+                    ),
+                    dict(discovered.fields),
+                )
+            )
+        return rows
+
+
 def _crossref_reference_to_entry(reference: dict, source_citation_key: str, ordinal: int) -> BibEntry:
     title = (
         reference.get("article-title")
@@ -211,6 +411,115 @@ def _normalize_text(value: str) -> str:
     return " ".join(value.split())
 
 
+def _topic_relevance_score(topic_phrase: str, entry: dict[str, object] | None) -> float:
+    if entry is None:
+        return 0.0
+    topic_terms = _expanded_keyword_terms(topic_phrase)
+    if not topic_terms:
+        return 0.0
+    title_terms = _expanded_keyword_terms(str(entry.get("title") or ""))
+    abstract_terms = _expanded_keyword_terms(str(entry.get("abstract") or ""))
+    keyword_terms = _expanded_keyword_terms(str(entry.get("keywords") or ""))
+    venue_terms = _expanded_keyword_terms(" ".join(str(entry.get(field) or "") for field in ("journal", "booktitle")))
+
+    score = 0.0
+    score += 0.6 * _term_overlap_ratio(topic_terms, title_terms)
+    score += 0.25 * _term_overlap_ratio(topic_terms, abstract_terms)
+    score += 0.1 * _term_overlap_ratio(topic_terms, keyword_terms)
+    score += 0.05 * _term_overlap_ratio(topic_terms, venue_terms)
+
+    phrase = _normalize_text(topic_phrase.casefold())
+    title = _normalize_text(str(entry.get("title") or "").casefold())
+    if phrase and title and phrase in title:
+        score = max(score, 0.75)
+
+    return min(score, 1.0)
+
+
+def _meets_topic_assignment_threshold(
+    topic_phrase: str,
+    entry: dict[str, object] | None,
+    min_relevance: float,
+    relevance_score: float | None = None,
+) -> bool:
+    if entry is None:
+        return False
+    score = relevance_score if relevance_score is not None else _topic_relevance_score(topic_phrase, entry)
+    if score < min_relevance:
+        return False
+    title_anchor = _title_topic_anchor_ratio(topic_phrase, str(entry.get("title") or ""))
+    return title_anchor >= 0.2
+
+
+def _keyword_terms(text: str) -> set[str]:
+    return {
+        _normalize_keyword(term)
+        for term in re.findall(r"[A-Za-z0-9]+", text.casefold())
+        if len(term) >= 4
+    }
+
+
+def _expanded_keyword_terms(text: str) -> set[str]:
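+    # Worked example: _keyword_terms("human evolution") gives {"human",
+    # "evolution"}; the related-term groups below then contribute
+    # {"hominid", "hominin", "homo"} and {"evolutionary", "phylogeny",
+    # "phylogen", "ancestor", "ancestral"}, so near-synonym titles can
+    # still overlap with a topic phrase.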
+    terms = _keyword_terms(text)
+    expanded = set(terms)
+    for term in terms:
+        expanded.update(_related_topic_terms(term))
+    return expanded
+
+
+def _title_topic_anchor_ratio(topic_phrase: str, title: str) -> float:
+    normalized_phrase = _normalize_text(topic_phrase.casefold())
+    normalized_title = _normalize_text(title.casefold())
+    if normalized_phrase and normalized_title and normalized_phrase in normalized_title:
+        return 1.0
+
+    topic_terms = _core_topic_terms(topic_phrase)
+    title_terms = _keyword_terms(title)
+    if not topic_terms or not title_terms:
+        return 0.0
+    overlap = topic_terms & title_terms
+    if overlap:
+        return max(0.25, len(overlap) / len(topic_terms))
+    return 0.0
+
+
+def _core_topic_terms(topic_phrase: str) -> set[str]:
+    generic_terms = {"evolution", "origin", "origins", "science", "study", "studies"}
+    return {term for term in _keyword_terms(topic_phrase) if term not in generic_terms}
+
+
+def _term_overlap_ratio(topic_terms: set[str], candidate_terms: set[str]) -> float:
+    if not topic_terms or not candidate_terms:
+        return 0.0
+    return len(topic_terms & candidate_terms) / len(topic_terms)
+
+
+def _normalize_keyword(term: str) -> str:
+    normalized = term.casefold()
+    suffixes = (
+        "isms", "ists", "ation", "ment", "ings", "ness",
+        "ism", "ist", "ing", "ers", "er", "ies", "ied", "ed", "es", "s",
+    )
+    for suffix in suffixes:
+        if len(normalized) > len(suffix) + 3 and normalized.endswith(suffix):
+            if suffix in {"ies", "ied"}:
+                return normalized[: -len(suffix)] + "y"
+            return normalized[: -len(suffix)]
+    return normalized
+
+
+def _related_topic_terms(term: str) -> set[str]:
+    related_groups = (
+        {"human", "hominid", "hominin", "homo"},
+        {"chimpanzee", "chimp", "pan", "ape", "apes", "primate"},
+        {"primate", "ape", "apes", "hominid", "hominin"},
+        {"evolution", "evolutionary", "phylogeny", "phylogen", "ancestor", "ancestral"},
+        {"origin", "origins", "abiogenesis", "prebiotic"},
+        {"morphometry", "morphology", "cranial", "dental", "skeletal", "body"},
+        {"paleolithic", "paleoanthropology", "paleoanthropological", "pleistocene", "fossil"},
+    )
+    for group in related_groups:
+        if term in group:
+            return group - {term}
+    return set()
+
+
 def _openalex_work_to_entry(work: dict) -> BibEntry:
     title = _normalize_text(work.get("display_name", "") or "Untitled work")
     year = str(work.get("publication_year") or "")
diff --git a/src/citegeist/harvest.py b/src/citegeist/harvest.py
new file mode 100644
index 0000000..1a85662
--- /dev/null
+++ b/src/citegeist/harvest.py
@@ -0,0 +1,317 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from urllib.parse import urlencode
+import xml.etree.ElementTree as ET
+
+from .bibtex import BibEntry
+from .sources import SourceClient
+
+NS = {
+    "oai": "http://www.openarchives.org/OAI/2.0/",
+    "oai_dc": "http://www.openarchives.org/OAI/2.0/oai_dc/",
+    "dc": "http://purl.org/dc/elements/1.1/",
+    "mods": "http://www.loc.gov/mods/v3",
+}
+
+
+@dataclass(slots=True)
+class HarvestResult:
+    base_url: str
+    identifier: str
+    entry: BibEntry
+
+
+@dataclass(slots=True)
+class OaiSet:
+    set_spec: str
+    set_name: str
+    set_description: str = ""
+
+
+@dataclass(slots=True)
+class OaiMetadataFormat:
+    metadata_prefix: str
+    schema: str
+    metadata_namespace: str
+
+
+class OaiPmhHarvester:
+    def __init__(self, source_client: SourceClient | None = None) -> None:
+        self.source_client = source_client or SourceClient()
+
+    def identify(self, base_url: str) -> dict[str, str]:
+        root = self.source_client.get_xml(f"{base_url}?{urlencode({'verb': 'Identify'})}")
+        identify = root.find(".//oai:Identify", NS)
+        if identify is None:
+            return {}
+        payload: dict[str, str] = {}
+        for field_name in (
+            "repositoryName",
+            "baseURL",
+            "protocolVersion",
+            "adminEmail",
+            "earliestDatestamp",
+            "deletedRecord",
+            "granularity",
+        ):
+            payload[field_name] = _node_text(identify.find(f"oai:{field_name}", NS))
+        return payload
+
+    def list_sets(self, base_url: str) -> list[OaiSet]:
+        root = self.source_client.get_xml(f"{base_url}?{urlencode({'verb': 'ListSets'})}")
+        sets = root.findall(".//oai:set", NS)
+        results: list[OaiSet] = []
+        for node in sets:
+            results.append(
+                OaiSet(
+                    set_spec=_node_text(node.find("oai:setSpec", NS)),
+                    set_name=_node_text(node.find("oai:setName", NS)),
+                    set_description=_flatten_set_description(node.find("oai:setDescription", NS)),
+                )
+            )
+        return results
+
+    def list_metadata_formats(self, base_url: str, identifier: str | None = None) -> list[OaiMetadataFormat]:
+        params = {"verb": "ListMetadataFormats"}
+        if identifier:
+            params["identifier"] = identifier
+        root = self.source_client.get_xml(f"{base_url}?{urlencode(params)}")
+        formats = root.findall(".//oai:metadataFormat", NS)
+        results: list[OaiMetadataFormat] = []
+        for node in formats:
+            results.append(
+                OaiMetadataFormat(
+                    metadata_prefix=_node_text(node.find("oai:metadataPrefix", NS)),
+                    schema=_node_text(node.find("oai:schema", NS)),
+                    metadata_namespace=_node_text(node.find("oai:metadataNamespace", NS)),
+                )
+            )
+        return results
+
+    def list_records(
+        self,
+        base_url: str,
+        metadata_prefix: str = "oai_dc",
+        set_spec: str | None = None,
+        date_from: str | None = None,
+        date_until: str | None = None,
+        limit: int | None = None,
+    ) -> list[HarvestResult]:
+        results: list[HarvestResult] = []
+        params = {"verb": "ListRecords", "metadataPrefix": metadata_prefix}
+        if set_spec:
+            params["set"] = set_spec
+        if date_from:
+            params["from"] = date_from
+        if date_until:
+            params["until"] = date_until
+
+        ordinal = 1
+        next_url = f"{base_url}?{urlencode(params)}"
+        while next_url:
+            root = self.source_client.get_xml(next_url)
+            records = root.findall(".//oai:record", NS)
+            for record in records:
+                parsed = self._record_to_result(base_url, record, ordinal)
+                ordinal += 1
+                if parsed is not None:
+                    results.append(parsed)
+                    if limit is not None and len(results) >= limit:
+                        return results
+            next_url = self._resumption_url(base_url, root)
+        return results
+
+    def get_record(
+        self,
+        base_url: str,
+        identifier: str,
+        metadata_prefix: str = "oai_dc",
+    ) -> HarvestResult | None:
+        params = {
+            "verb": "GetRecord",
+            "metadataPrefix": metadata_prefix,
+            "identifier": identifier,
+        }
+        root = self.source_client.get_xml(f"{base_url}?{urlencode(params)}")
+        record = root.find(".//oai:record", NS)
+        if record is None:
+            return None
+        return self._record_to_result(base_url, record, 1)
+
+    def _record_to_result(self, base_url: str, record: ET.Element, ordinal: int) -> HarvestResult | None:
+        identifier = _node_text(record.find("./oai:header/oai:identifier", NS))
+        metadata_node = record.find("./oai:metadata/*", NS)
+        if metadata_node is None or not identifier:
+            return None
+
+        entry = _metadata_node_to_entry(base_url, identifier, metadata_node, ordinal)
+        return HarvestResult(base_url=base_url, identifier=identifier, entry=entry)
+
+    def _resumption_url(self, base_url: str, root: ET.Element) -> str | None:
+        token = _node_text(root.find(".//oai:resumptionToken", NS))
+        if not token:
+            return None
+        return f"{base_url}?{urlencode({'verb': 'ListRecords', 'resumptionToken': token})}"
+
+
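+# Illustrative harvest (endpoint hypothetical): pull a few Dublin Core
+# records and inspect the draft entries:
+#
+#     harvester = OaiPmhHarvester()
+#     for result in harvester.list_records(
+#         "https://repository.example.edu/oai", metadata_prefix="oai_dc", limit=5
+#     ):
+#         print(result.identifier, result.entry.citation_key)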
+def _oai_dc_to_entry(base_url: str, identifier: str, metadata: ET.Element, ordinal: int) -> BibEntry:
+    titles = _all_text(metadata.findall("dc:title", NS))
+    creators = _all_text(metadata.findall("dc:creator", NS))
+    dates = _all_text(metadata.findall("dc:date", NS))
+    descriptions = _all_text(metadata.findall("dc:description", NS))
+    identifiers = _all_text(metadata.findall("dc:identifier", NS))
+    publishers = _all_text(metadata.findall("dc:publisher", NS))
+    types = [value.lower() for value in _all_text(metadata.findall("dc:type", NS))]
+
+    title = titles[0] if titles else "Untitled record"
+    year = _first_year(dates)
+    entry_type = _guess_oai_entry_type(types)
+
+    fields: dict[str, str] = {
+        "title": title,
+        "oai": identifier,
+        "url": _best_identifier_url(identifiers) or f"{base_url}?verb=GetRecord&identifier={identifier}&metadataPrefix=oai_dc",
+        "note": "harvested_from = {oai_pmh}",
+    }
+    if creators:
+        fields["author"] = " and ".join(creators)
+    if year:
+        fields["year"] = year
+    if descriptions:
+        fields["abstract"] = descriptions[0]
+    if publishers:
+        fields["publisher"] = publishers[0]
+
+    citation_key = _oai_citation_key(creators, year, title, ordinal)
+    return BibEntry(entry_type=entry_type, citation_key=citation_key, fields=fields)
+
+
+def _mods_to_entry(base_url: str, identifier: str, metadata: ET.Element, ordinal: int) -> BibEntry:
+    title = _node_text(metadata.find(".//mods:titleInfo/mods:title", NS)) or "Untitled record"
+    sub_title = _node_text(metadata.find(".//mods:titleInfo/mods:subTitle", NS))
+    if sub_title:
+        title = f"{title}: {sub_title}"
+
+    creators: list[str] = []
+    for name in metadata.findall(".//mods:name", NS):
+        role_terms = [term.text or "" for term in name.findall(".//mods:roleTerm", NS)]
+        if role_terms and not any(term.lower() == "author" for term in role_terms):
+            continue
+        parts = [_node_text(part) for part in name.findall("./mods:namePart", NS)]
+        parts = [part for part in parts if part]
+        if parts:
+            creators.append(", ".join(parts) if len(parts) == 2 else " ".join(parts))
+
+    year = ""
+    for date_node in metadata.findall(".//mods:originInfo/mods:dateIssued", NS):
+        text = _node_text(date_node)
+        if len(text) >= 4 and text[:4].isdigit():
+            year = text[:4]
+            break
+
+    publisher = _node_text(metadata.find(".//mods:originInfo/mods:publisher", NS))
+    abstract = _node_text(metadata.find(".//mods:abstract", NS))
+    genre = _node_text(metadata.find(".//mods:genre", NS)).lower()
+    related_title = _node_text(metadata.find(".//mods:relatedItem/mods:titleInfo/mods:title", NS))
+    url = _node_text(metadata.find(".//mods:location/mods:url", NS))
+
+    entry_type = "phdthesis" if "thesis" in genre or "dissertation" in genre else "misc"
+    if entry_type != "phdthesis" and related_title:
+        entry_type = "article"
+
+    fields: dict[str, str] = {
+        "title": title,
+        "oai": identifier,
+        "url": url or f"{base_url}?verb=GetRecord&identifier={identifier}&metadataPrefix=mods",
+        "note": "harvested_from = {oai_pmh_mods}",
+    }
+    if creators:
+        fields["author"] = " and ".join(creators)
+    if year:
+        fields["year"] = year
+    if publisher:
+        fields["publisher"] = publisher
+    if abstract:
+        fields["abstract"] = abstract
+    if related_title:
+        fields["journal"] = related_title
+
+    citation_key = _oai_citation_key(creators, year, title, ordinal)
+    return BibEntry(entry_type=entry_type, citation_key=citation_key, fields=fields)
+
+
+def _metadata_node_to_entry(base_url: str, identifier: str, metadata: ET.Element, ordinal: int) -> BibEntry:
+    if metadata.tag.endswith("dc"):
+        return _oai_dc_to_entry(base_url, identifier, metadata, ordinal)
+    if metadata.tag.endswith("mods"):
+        return _mods_to_entry(base_url, identifier, metadata, ordinal)
+    return BibEntry(
+        entry_type="misc",
+        citation_key=_oai_citation_key([], "", identifier, ordinal),
+        fields={
+            "title": identifier,
+            "oai": identifier,
+            "url": f"{base_url}?verb=GetRecord&identifier={identifier}",
+            "note": f"unsupported_oai_metadata = {{{metadata.tag}}}",
+        },
+    )
+
+
+def _node_text(node: ET.Element | None) -> str:
+    if node is None or node.text is None:
+        return ""
+    return " ".join(node.text.split())
+
+
+def _all_text(nodes: list[ET.Element]) -> list[str]:
+    values = []
+    for node in nodes:
+        value = _node_text(node)
+        if value:
+            values.append(value)
+    return values
+
+
+def _first_year(dates: list[str]) -> str:
+    for date in dates:
+        if len(date) >= 4 and date[:4].isdigit():
+            return date[:4]
+    return ""
+
+
+def _guess_oai_entry_type(types: list[str]) -> str:
+    joined = " ".join(types)
+    if "thesis" in joined or "dissertation" in joined:
+        return "phdthesis"
+    if "article" in joined:
+        return "article"
+    if "book" in joined:
+        return "book"
+    return "misc"
+
+
+def _best_identifier_url(identifiers: list[str]) -> str:
+    for identifier in identifiers:
+        if identifier.startswith("http://") or identifier.startswith("https://"):
+            return identifier
+    return ""
+
+
+def _oai_citation_key(creators: list[str], year: str, title: str, ordinal: int) -> str:
+    author = creators[0] if creators else "oai"
+    family = author.split(",")[0] if "," in author else author.split()[-1]
+    family = "".join(ch for ch in family.lower() if ch.isalnum()) or "oai"
+    first_word = "".join(ch for ch in title.split()[0].lower() if ch.isalnum()) if title.split() else "untitled"
+    return f"{family}{year or 'nd'}{first_word}{ordinal}"
+
+
+def _flatten_set_description(node: ET.Element | None) -> str:
+    if node is None:
+        return ""
+    parts = []
+    for child in node.iter():
+        if child.text and child.text.strip():
+            parts.append(" ".join(child.text.split()))
+    return " ".join(parts)
diff --git a/src/citegeist/resolve.py b/src/citegeist/resolve.py
index 4d3ce28..5e5a205 100644
--- a/src/citegeist/resolve.py
+++ b/src/citegeist/resolve.py
@@ -30,6 +30,9 @@ class MetadataResolver:
             resolved = self.resolve_doi(doi)
             if resolved is not None:
                 return resolved
+            resolved = self.resolve_datacite_doi(doi)
+            if resolved is not None:
+                return resolved
 
         if openalex_id := entry.fields.get("openalex"):
             resolved = self.resolve_openalex(openalex_id)
@@ -47,6 +50,20 @@ class MetadataResolver:
             return resolved
 
         if title := entry.fields.get("title"):
+            resolved = self.search_crossref_best_match(
+                title=title,
+                author_text=entry.fields.get("author", ""),
+                year=entry.fields.get("year", ""),
+            )
+            if resolved is not None:
+                return resolved
+            resolved = self.search_datacite_best_match(
+                title=title,
+                author_text=entry.fields.get("author", ""),
+                year=entry.fields.get("year", ""),
+            )
+            if resolved is not None:
+                return resolved
             resolved = self.search_openalex_best_match(
                 title=title,
                 author_text=entry.fields.get("author", ""),
@@ -75,6 +92,26 @@ class MetadataResolver:
         items = payload.get("message", {}).get("items", [])
         return [_crossref_message_to_entry(item) for item in items]
 
+    def search_crossref_best_match(
+        self,
+        title: str,
+        author_text: str = "",
+        year: str = "",
+    ) -> Resolution | None:
+        candidate = _select_best_title_match(
+            self.search_crossref(title, limit=5),
+            title=title,
+            author_text=author_text,
+            year=year,
+        )
+        if candidate is None:
+            return None
+        return Resolution(
+            entry=candidate,
+            source_type="resolver",
+            source_label=f"crossref:search:{title}",
+        )
+
     def resolve_dblp(self, dblp_key: str) -> Resolution | None:
         encoded_key = urllib.parse.quote(dblp_key, safe="/:")
         text = self.source_client.get_text(f"https://dblp.org/rec/{encoded_key}.bib")
@@ -128,6 +165,43 @@ class MetadataResolver:
             source_label=f"openalex:id:{normalized_id}",
         )
 
+    def resolve_datacite_doi(self, doi: str) -> Resolution | None:
+        encoded = urllib.parse.quote(doi, safe="")
+        payload = self.source_client.get_json(f"https://api.datacite.org/dois/{encoded}")
+        data = payload.get("data", {})
+        if not data:
+            return None
+        return Resolution(
+            entry=_datacite_work_to_entry(data),
+            source_type="resolver",
+            source_label=f"datacite:doi:{doi}",
+        )
+
+    def search_datacite(self, title: str, limit: int = 5) -> list[BibEntry]:
+        query = urllib.parse.urlencode({"query": title, "page[size]": limit})
+        payload = self.source_client.get_json(f"https://api.datacite.org/dois?{query}")
+        return [_datacite_work_to_entry(item) for item in payload.get("data", [])]
+
+    def search_datacite_best_match(
+        self,
+        title: str,
+        author_text: str = "",
+        year: str = "",
+    ) -> Resolution | None:
+        candidate = _select_best_title_match(
+            self.search_datacite(title, limit=5),
+            title=title,
+            author_text=author_text,
+            year=year,
+        )
+        if candidate is None:
+            return None
+        return Resolution(
+            entry=candidate,
+            source_type="resolver",
+            source_label=f"datacite:search:{title}",
+        )
+
     def search_openalex(self, title: str, limit: int = 5) -> list[BibEntry]:
         query = urllib.parse.urlencode({"search": title, "per-page": limit})
         payload = self.source_client.get_json(f"https://api.openalex.org/works?{query}")
@@ -139,42 +213,50 @@ class MetadataResolver:
         author_text: str = "",
         year: str = "",
     ) -> Resolution | None:
-        candidates = self.search_openalex(title, limit=5)
-        if not candidates:
+        candidate = _select_best_title_match(
+            self.search_openalex(title, limit=5),
+            title=title,
+            author_text=author_text,
+            year=year,
+        )
+        if candidate is None:
             return None
-
-        title_norm = _normalize_match_text(title)
-        author_norm = _normalize_match_text(author_text)
-        for candidate in candidates:
-            candidate_title = _normalize_match_text(candidate.fields.get("title", ""))
-            candidate_author = _normalize_match_text(candidate.fields.get("author", ""))
-            candidate_year = candidate.fields.get("year", "")
-            if candidate_title == title_norm:
-                if author_norm and candidate_author and author_norm.split(" and ")[0] not in candidate_author:
-                    continue
-                if year and candidate_year and year != candidate_year:
-                    continue
-                return Resolution(
-                    entry=candidate,
-                    source_type="resolver",
-                    source_label=f"openalex:search:{title}",
-                )
-        return Resolution(
-            entry=candidates[0],
+        return Resolution(
+            entry=candidate,
             source_type="resolver",
             source_label=f"openalex:search:{title}",
         )
 
 
 def merge_entries(base: BibEntry, resolved: BibEntry) -> BibEntry:
+    merged, _ = merge_entries_with_conflicts(base, resolved)
+    return merged
+
+
+def merge_entries_with_conflicts(base: BibEntry, resolved: BibEntry) -> tuple[BibEntry, list[dict[str, str]]]:
     merged_fields = dict(base.fields)
+    conflicts: list[dict[str, str]] = []
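+    # e.g. base year=1994 vs proposed year=1995 is recorded as an open
+    # conflict rather than silently overwriting the stored value.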
"current_value": current_value, + "proposed_value": value, + } + ) + continue + if key not in merged_fields or not merged_fields[key]: merged_fields[key] = value - return BibEntry( - entry_type=base.entry_type or resolved.entry_type, - citation_key=base.citation_key, - fields=merged_fields, + return ( + BibEntry( + entry_type=base.entry_type or resolved.entry_type, + citation_key=base.citation_key, + fields=merged_fields, + ), + conflicts, ) @@ -363,3 +445,123 @@ def _normalize_match_text(value: str) -> str: lowered = value.lower() lowered = re.sub(r"\W+", " ", lowered) return " ".join(lowered.split()) + + +def _select_best_title_match( + candidates: list[BibEntry], + title: str, + author_text: str = "", + year: str = "", +) -> BibEntry | None: + if not candidates: + return None + + title_norm = _normalize_match_text(title) + author_tokens = _author_match_tokens(author_text) + year_text = str(year or "").strip() + + for candidate in candidates: + candidate_title = _normalize_match_text(candidate.fields.get("title", "")) + if candidate_title != title_norm: + continue + candidate_year = str(candidate.fields.get("year", "") or "").strip() + if year_text and candidate_year and year_text != candidate_year: + continue + if author_tokens and not _candidate_matches_author_tokens(candidate, author_tokens): + continue + return candidate + return None + + +def _author_match_tokens(author_text: str) -> set[str]: + normalized = _normalize_match_text(author_text) + if not normalized: + return set() + tokens = { + token + for token in re.findall(r"[a-z0-9]+", normalized) + if len(token) >= 2 and token not in {"and", "et", "al"} + } + return tokens + + +def _candidate_matches_author_tokens(candidate: BibEntry, author_tokens: set[str]) -> bool: + candidate_author = _normalize_match_text(candidate.fields.get("author", "")) + if not candidate_author: + return False + candidate_tokens = set(re.findall(r"[a-z0-9]+", candidate_author)) + return bool(author_tokens & candidate_tokens) + + +def _datacite_work_to_entry(data: dict) -> BibEntry: + attributes = data.get("attributes", {}) + doi = str(attributes.get("doi") or "") + titles = attributes.get("titles") or [] + creators = attributes.get("creators") or [] + descriptions = attributes.get("descriptions") or [] + publisher = str(attributes.get("publisher") or "") + year = str(attributes.get("publicationYear") or "") + url = str(attributes.get("url") or "") + types = attributes.get("types") or {} + + title = titles[0].get("title", "") if titles else "" + author_names = " and ".join(_datacite_creator_name(creator) for creator in creators if _datacite_creator_name(creator)) + abstract = _datacite_abstract(descriptions) + entry_type = _datacite_type_to_bibtype(str(types.get("resourceTypeGeneral") or "")) + + fields: dict[str, str] = {} + if title: + fields["title"] = title + if author_names: + fields["author"] = author_names + if year: + fields["year"] = year + if doi: + fields["doi"] = doi + if url: + fields["url"] = url + elif doi: + fields["url"] = f"https://doi.org/{doi}" + if publisher: + fields["publisher"] = publisher + if abstract: + fields["abstract"] = abstract + + citation_key = _make_resolution_key(author_names or "datacite", year or "n.d.", title or doi or "untitled") + return BibEntry(entry_type=entry_type, citation_key=citation_key, fields=fields) + + +def _datacite_creator_name(creator: dict) -> str: + family = str(creator.get("familyName") or "") + given = str(creator.get("givenName") or "") + if family and given: + return f"{family}, {given}" + 
+def _datacite_work_to_entry(data: dict) -> BibEntry:
+    attributes = data.get("attributes", {})
+    doi = str(attributes.get("doi") or "")
+    titles = attributes.get("titles") or []
+    creators = attributes.get("creators") or []
+    descriptions = attributes.get("descriptions") or []
+    publisher = str(attributes.get("publisher") or "")
+    year = str(attributes.get("publicationYear") or "")
+    url = str(attributes.get("url") or "")
+    types = attributes.get("types") or {}
+
+    title = titles[0].get("title", "") if titles else ""
+    author_names = " and ".join(_datacite_creator_name(creator) for creator in creators if _datacite_creator_name(creator))
+    abstract = _datacite_abstract(descriptions)
+    entry_type = _datacite_type_to_bibtype(str(types.get("resourceTypeGeneral") or ""))
+
+    fields: dict[str, str] = {}
+    if title:
+        fields["title"] = title
+    if author_names:
+        fields["author"] = author_names
+    if year:
+        fields["year"] = year
+    if doi:
+        fields["doi"] = doi
+    if url:
+        fields["url"] = url
+    elif doi:
+        fields["url"] = f"https://doi.org/{doi}"
+    if publisher:
+        fields["publisher"] = publisher
+    if abstract:
+        fields["abstract"] = abstract
+
+    citation_key = _make_resolution_key(author_names or "datacite", year or "n.d.", title or doi or "untitled")
+    return BibEntry(entry_type=entry_type, citation_key=citation_key, fields=fields)
+
+
+def _datacite_creator_name(creator: dict) -> str:
+    family = str(creator.get("familyName") or "")
+    given = str(creator.get("givenName") or "")
+    if family and given:
+        return f"{family}, {given}"
+    return str(creator.get("name") or family or given)
+
+
+def _datacite_abstract(descriptions: list[dict]) -> str:
+    for description in descriptions:
+        if str(description.get("descriptionType") or "").lower() == "abstract":
+            return str(description.get("description") or "")
+    return ""
+
+
+def _datacite_type_to_bibtype(resource_type: str) -> str:
+    lowered = resource_type.lower()
+    mapping = {
+        "audiovisual": "misc",
+        "book": "book",
+        "bookchapter": "incollection",
+        "collection": "misc",
+        "computationalnotebook": "misc",
+        "conferencepaper": "inproceedings",
+        "dataset": "misc",
+        "dissertation": "phdthesis",
+        "image": "misc",
+        "journalarticle": "article",
+        "model": "misc",
+        "report": "techreport",
+        "software": "misc",
+        "text": "misc",
+    }
+    return mapping.get(lowered, "misc")
diff --git a/src/citegeist/sources.py b/src/citegeist/sources.py
index 63bd23d..0f453e5 100644
--- a/src/citegeist/sources.py
+++ b/src/citegeist/sources.py
@@ -30,11 +30,11 @@ class SourceClient:
     def get_text(self, url: str) -> str:
         cached = self._read_cached(url, "txt")
         if cached is not None:
-            return cached.decode("utf-8")
+            return self._decode_text(cached)
 
         payload = self._fetch_bytes(url)
         self._write_cache(url, "txt", payload)
-        return payload.decode("utf-8")
+        return self._decode_text(payload)
 
     def get_xml(self, url: str) -> ET.Element:
         cached = self._read_cached(url, "xml")
@@ -76,3 +76,11 @@ class SourceClient:
         self.cache_dir.mkdir(parents=True, exist_ok=True)
         path = self.cache_dir / self._cache_key(url, suffix)
         path.write_bytes(payload)
+
+    def _decode_text(self, payload: bytes) -> str:
+        # utf-8-sig must come before plain utf-8: utf-8 decodes a BOM as a
+        # stray U+FEFF without raising, so the -sig variant would otherwise
+        # be unreachable.
+        for encoding in ("utf-8-sig", "utf-8"):
+            try:
+                return payload.decode(encoding)
+            except UnicodeDecodeError:
+                continue
+        # latin-1 maps every byte, so this final fallback cannot fail.
+        return payload.decode("latin-1")
diff --git a/src/citegeist/storage.py b/src/citegeist/storage.py
index 57e75ee..f2578f8 100644
--- a/src/citegeist/storage.py
+++ b/src/citegeist/storage.py
@@ -95,6 +95,29 @@ class BibliographyStore:
             PRIMARY KEY (source_entry_id, target_citation_key, relation_type)
             );
 
+            CREATE TABLE IF NOT EXISTS topics (
+                id INTEGER PRIMARY KEY,
+                slug TEXT NOT NULL UNIQUE,
+                name TEXT NOT NULL,
+                source_type TEXT NOT NULL,
+                source_url TEXT,
+                expansion_phrase TEXT,
+                suggested_phrase TEXT,
+                phrase_review_status TEXT NOT NULL DEFAULT 'unreviewed',
+                phrase_review_notes TEXT,
+                created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
+                updated_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
+            );
+
+            CREATE TABLE IF NOT EXISTS entry_topics (
+                entry_id INTEGER NOT NULL REFERENCES entries(id) ON DELETE CASCADE,
+                topic_id INTEGER NOT NULL REFERENCES topics(id) ON DELETE CASCADE,
+                source_label TEXT NOT NULL,
+                confidence REAL,
+                created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
+                PRIMARY KEY (entry_id, topic_id)
+            );
+
             CREATE TABLE IF NOT EXISTS field_provenance (
                 id INTEGER PRIMARY KEY,
                 entry_id INTEGER NOT NULL REFERENCES entries(id) ON DELETE CASCADE,
@@ -117,10 +140,23 @@ class BibliographyStore:
                 confidence REAL,
                 recorded_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
             );
+
+            CREATE TABLE IF NOT EXISTS field_conflicts (
+                id INTEGER PRIMARY KEY,
+                entry_id INTEGER NOT NULL REFERENCES entries(id) ON DELETE CASCADE,
+                field_name TEXT NOT NULL,
+                current_value TEXT,
+                proposed_value TEXT,
+                source_type TEXT NOT NULL,
+                source_label TEXT NOT NULL,
+                status TEXT NOT NULL DEFAULT 'open',
+                recorded_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
+            );
             """
         )
         self._ensure_entry_columns()
+        self._ensure_topic_columns()
         if self._fts5_enabled:
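+            # Without FTS5 the store still works: search_text() falls back
+            # to LIKE scans over title/abstract/fulltext (see below).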
             self.connection.execute(
@@ -177,6 +213,7 @@ class BibliographyStore:
             ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
             ON CONFLICT(citation_key) DO UPDATE SET
                 entry_type = excluded.entry_type,
+                review_status = excluded.review_status,
                 title = excluded.title,
                 year = excluded.year,
                 journal = excluded.journal,
@@ -280,30 +317,58 @@ class BibliographyStore:
 
         return entry_id
 
-    def search_text(self, query: str, limit: int = 10) -> list[dict[str, object]]:
+    def search_text(self, query: str, limit: int = 10, topic_slug: str | None = None) -> list[dict[str, object]]:
         if self._fts5_enabled:
-            rows = self.connection.execute(
-                """
-                SELECT e.citation_key, e.title, e.year, bm25(entry_text_fts) AS score
-                FROM entry_text_fts
-                JOIN entries e ON e.citation_key = entry_text_fts.citation_key
-                WHERE entry_text_fts MATCH ?
-                ORDER BY score
-                LIMIT ?
-                """,
-                (query, limit),
-            ).fetchall()
+            if topic_slug:
+                rows = self.connection.execute(
+                    """
+                    SELECT DISTINCT e.citation_key, e.title, e.year, bm25(entry_text_fts) AS score
+                    FROM entry_text_fts
+                    JOIN entries e ON e.citation_key = entry_text_fts.citation_key
+                    JOIN entry_topics et ON et.entry_id = e.id
+                    JOIN topics t ON t.id = et.topic_id
+                    WHERE entry_text_fts MATCH ? AND t.slug = ?
+                    ORDER BY score
+                    LIMIT ?
+                    """,
+                    (query, topic_slug, limit),
+                ).fetchall()
+            else:
+                rows = self.connection.execute(
+                    """
+                    SELECT e.citation_key, e.title, e.year, bm25(entry_text_fts) AS score
+                    FROM entry_text_fts
+                    JOIN entries e ON e.citation_key = entry_text_fts.citation_key
+                    WHERE entry_text_fts MATCH ?
+                    ORDER BY score
+                    LIMIT ?
+                    """,
+                    (query, limit),
+                ).fetchall()
         else:
             pattern = f"%{query}%"
-            rows = self.connection.execute(
-                """
-                SELECT citation_key, title, year, 0.0 AS score
-                FROM entries
-                WHERE title LIKE ? OR abstract LIKE ? OR fulltext LIKE ?
-                LIMIT ?
-                """,
-                (pattern, pattern, pattern, limit),
-            ).fetchall()
+            if topic_slug:
+                rows = self.connection.execute(
+                    """
+                    SELECT DISTINCT e.citation_key, e.title, e.year, 0.0 AS score
+                    FROM entries e
+                    JOIN entry_topics et ON et.entry_id = e.id
+                    JOIN topics t ON t.id = et.topic_id
+                    WHERE t.slug = ? AND (e.title LIKE ? OR e.abstract LIKE ? OR e.fulltext LIKE ?)
+                    LIMIT ?
+                    """,
+                    (topic_slug, pattern, pattern, pattern, limit),
+                ).fetchall()
+            else:
+                rows = self.connection.execute(
+                    """
+                    SELECT citation_key, title, year, 0.0 AS score
+                    FROM entries
+                    WHERE title LIKE ? OR abstract LIKE ? OR fulltext LIKE ?
+                    LIMIT ?
+                    """,
+                    (pattern, pattern, pattern, limit),
+                ).fetchall()
 
         return [dict(row) for row in rows]
 
@@ -383,7 +448,11 @@ class BibliographyStore:
             "SELECT * FROM entries WHERE citation_key = ?",
             (citation_key,),
         ).fetchone()
-        return self._row_to_entry_dict(row) if row else None
+        if row is None:
+            return None
+        payload = self._row_to_entry_dict(row)
+        payload["topics"] = self.get_entry_topics(citation_key)
+        return payload
 
     def list_entries(self, limit: int = 50) -> list[dict[str, object]]:
         rows = self.connection.execute(
@@ -397,6 +466,227 @@ class BibliographyStore:
         ).fetchall()
         return [dict(row) for row in rows]
 
+    def ensure_topic(
+        self,
+        slug: str,
+        name: str,
+        source_type: str = "manual",
+        source_url: str | None = None,
+        expansion_phrase: str | None = None,
+        suggested_phrase: str | None = None,
+        phrase_review_status: str | None = None,
+        phrase_review_notes: str | None = None,
+    ) -> int:
+        row = self.connection.execute(
+            """
+            INSERT INTO topics (
+                slug, name, source_type, source_url, expansion_phrase,
+                suggested_phrase, phrase_review_status, phrase_review_notes
+            )
+            VALUES (?, ?, ?, ?, ?, ?, COALESCE(?, 'unreviewed'), ?)
+            ON CONFLICT(slug) DO UPDATE SET
+                name = excluded.name,
+                source_type = excluded.source_type,
+                source_url = COALESCE(excluded.source_url, topics.source_url),
+                expansion_phrase = COALESCE(excluded.expansion_phrase, topics.expansion_phrase),
+                suggested_phrase = COALESCE(excluded.suggested_phrase, topics.suggested_phrase),
+                phrase_review_status = COALESCE(excluded.phrase_review_status, topics.phrase_review_status),
+                phrase_review_notes = COALESCE(excluded.phrase_review_notes, topics.phrase_review_notes),
+                updated_at = CURRENT_TIMESTAMP
+            RETURNING id
+            """,
+            (
+                slug,
+                name,
+                source_type,
+                source_url,
+                expansion_phrase,
+                suggested_phrase,
+                phrase_review_status,
+                phrase_review_notes,
+            ),
+        ).fetchone()
+        return int(row["id"])
+
+    def add_entry_topic(
+        self,
+        citation_key: str,
+        topic_slug: str,
+        topic_name: str,
+        source_type: str = "manual",
+        source_url: str | None = None,
+        source_label: str = "manual",
+        confidence: float = 1.0,
+        expansion_phrase: str | None = None,
+    ) -> bool:
+        entry_row = self.connection.execute(
+            "SELECT id FROM entries WHERE citation_key = ?",
+            (citation_key,),
+        ).fetchone()
+        if entry_row is None:
+            return False
+
+        topic_id = self.ensure_topic(
+            topic_slug,
+            topic_name,
+            source_type=source_type,
+            source_url=source_url,
+            expansion_phrase=expansion_phrase,
+        )
+        self.connection.execute(
+            """
+            INSERT INTO entry_topics (entry_id, topic_id, source_label, confidence)
+            VALUES (?, ?, ?, ?)
+            ON CONFLICT(entry_id, topic_id) DO UPDATE SET
+                source_label = excluded.source_label,
+                confidence = excluded.confidence
+            """,
+            (int(entry_row["id"]), topic_id, source_label, confidence),
+        )
+        return True
+
+    def get_entry_topics(self, citation_key: str) -> list[dict[str, object]]:
+        rows = self.connection.execute(
+            """
+            SELECT t.slug, t.name, t.source_type, t.source_url, et.source_label, et.confidence
+            FROM entry_topics et
+            JOIN entries e ON e.id = et.entry_id
+            JOIN topics t ON t.id = et.topic_id
+            WHERE e.citation_key = ?
+            ORDER BY t.name, t.slug
+            """,
+            (citation_key,),
+        ).fetchall()
+        return [dict(row) for row in rows]
+
+    def list_topics(
+        self,
+        limit: int = 100,
+        phrase_review_status: str | None = None,
+    ) -> list[dict[str, object]]:
+        where = ""
+        params: list[object] = []
+        if phrase_review_status is not None:
+            where = "WHERE t.phrase_review_status = ?"
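+            # e.g. list_topics(phrase_review_status="pending") narrows the
+            # listing to topics whose suggested phrase awaits review.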
+            params.append(phrase_review_status)
+        params.append(limit)
+        rows = self.connection.execute(
+            f"""
+            SELECT t.slug, t.name, t.source_type, t.source_url, t.expansion_phrase,
+                   t.suggested_phrase, t.phrase_review_status, t.phrase_review_notes,
+                   COUNT(et.entry_id) AS entry_count
+            FROM topics t
+            LEFT JOIN entry_topics et ON et.topic_id = t.id
+            {where}
+            GROUP BY t.id, t.slug, t.name, t.source_type, t.source_url, t.expansion_phrase,
+                     t.suggested_phrase, t.phrase_review_status, t.phrase_review_notes
+            ORDER BY t.name, t.slug
+            LIMIT ?
+            """,
+            params,
+        ).fetchall()
+        return [dict(row) for row in rows]
+
+    def get_topic(self, slug: str) -> dict[str, object] | None:
+        row = self.connection.execute(
+            """
+            SELECT t.slug, t.name, t.source_type, t.source_url, t.expansion_phrase,
+                   t.suggested_phrase, t.phrase_review_status, t.phrase_review_notes,
+                   COUNT(et.entry_id) AS entry_count
+            FROM topics t
+            LEFT JOIN entry_topics et ON et.topic_id = t.id
+            WHERE t.slug = ?
+            GROUP BY t.id, t.slug, t.name, t.source_type, t.source_url, t.expansion_phrase,
+                     t.suggested_phrase, t.phrase_review_status, t.phrase_review_notes
+            """,
+            (slug,),
+        ).fetchone()
+        return dict(row) if row else None
+
+    def set_topic_expansion_phrase(self, slug: str, expansion_phrase: str | None) -> bool:
+        row = self.connection.execute(
+            """
+            UPDATE topics
+            SET expansion_phrase = ?, updated_at = CURRENT_TIMESTAMP
+            WHERE slug = ?
+            RETURNING id
+            """,
+            (expansion_phrase, slug),
+        ).fetchone()
+        self.connection.commit()
+        return row is not None
+
+    def stage_topic_phrase_suggestion(
+        self,
+        slug: str,
+        suggested_phrase: str | None,
+        review_status: str = "pending",
+        review_notes: str | None = None,
+    ) -> bool:
+        row = self.connection.execute(
+            """
+            UPDATE topics
+            SET suggested_phrase = ?,
+                phrase_review_status = ?,
+                phrase_review_notes = ?,
+                updated_at = CURRENT_TIMESTAMP
+            WHERE slug = ?
+            RETURNING id
+            """,
+            (suggested_phrase, review_status, review_notes, slug),
+        ).fetchone()
+        self.connection.commit()
+        return row is not None
+
+    def review_topic_phrase_suggestion(
+        self,
+        slug: str,
+        review_status: str,
+        review_notes: str | None = None,
+        applied_phrase: str | None = None,
+    ) -> bool:
+        topic = self.get_topic(slug)
+        if topic is None:
+            return False
+
+        suggested_phrase = topic.get("suggested_phrase")
+        expansion_phrase = topic.get("expansion_phrase")
+        if review_status == "accepted":
+            expansion_phrase = applied_phrase if applied_phrase is not None else suggested_phrase
+        elif applied_phrase is not None:
+            expansion_phrase = applied_phrase
+
+        row = self.connection.execute(
+            """
+            UPDATE topics
+            SET expansion_phrase = ?,
+                phrase_review_status = ?,
+                phrase_review_notes = ?,
+                updated_at = CURRENT_TIMESTAMP
+            WHERE slug = ?
+            RETURNING id
+            """,
+            (expansion_phrase, review_status, review_notes, slug),
+        ).fetchone()
+        self.connection.commit()
+        return row is not None
+
+    def list_topic_entries(self, topic_slug: str, limit: int = 100) -> list[dict[str, object]]:
+        rows = self.connection.execute(
+            """
+            SELECT e.citation_key, e.entry_type, e.review_status, e.title, e.year,
+                   t.slug AS topic_slug, t.name AS topic_name, et.source_label, et.confidence
+            FROM entry_topics et
+            JOIN topics t ON t.id = et.topic_id
+            JOIN entries e ON e.id = et.entry_id
+            WHERE t.slug = ?
+            ORDER BY COALESCE(e.year, ''), e.citation_key
+            LIMIT ?
+            """,
+            (topic_slug, limit),
+        ).fetchall()
+        return [dict(row) for row in rows]
+
     def set_entry_status(self, citation_key: str, review_status: str) -> bool:
         row = self.connection.execute(
             """
@@ -437,6 +727,114 @@ class BibliographyStore:
         self.connection.commit()
         return True
 
+    def record_conflicts(
+        self,
+        citation_key: str,
+        conflicts: list[dict[str, str]],
+        source_type: str,
+        source_label: str,
+    ) -> bool:
+        row = self.connection.execute(
+            "SELECT id FROM entries WHERE citation_key = ?",
+            (citation_key,),
+        ).fetchone()
+        if row is None:
+            return False
+
+        entry_id = int(row["id"])
+        for conflict in conflicts:
+            self.connection.execute(
+                """
+                INSERT INTO field_conflicts (
+                    entry_id, field_name, current_value, proposed_value, source_type, source_label, status
+                ) VALUES (?, ?, ?, ?, ?, ?, 'open')
+                """,
+                (
+                    entry_id,
+                    conflict["field_name"],
+                    conflict.get("current_value"),
+                    conflict.get("proposed_value"),
+                    source_type,
+                    source_label,
+                ),
+            )
+        self.connection.commit()
+        return True
+
+    def get_field_conflicts(self, citation_key: str, status: str | None = None) -> list[dict[str, object]]:
+        where = ""
+        params: list[object] = [citation_key]
+        if status is not None:
+            where = " AND fc.status = ?"
+            params.append(status)
+
+        rows = self.connection.execute(
+            f"""
+            SELECT fc.field_name, fc.current_value, fc.proposed_value, fc.source_type,
+                   fc.source_label, fc.status, fc.recorded_at
+            FROM field_conflicts fc
+            JOIN entries e ON e.id = fc.entry_id
+            WHERE e.citation_key = ?{where}
+            ORDER BY fc.recorded_at, fc.id
+            """,
+            params,
+        ).fetchall()
+        return [dict(row) for row in rows]
+
+    def set_conflict_status(self, citation_key: str, field_name: str, status: str) -> int:
+        row = self.connection.execute(
+            "SELECT id FROM entries WHERE citation_key = ?",
+            (citation_key,),
+        ).fetchone()
+        if row is None:
+            return 0
+        entry_id = int(row["id"])
+        result = self.connection.execute(
+            """
+            UPDATE field_conflicts
+            SET status = ?
+            WHERE entry_id = ? AND field_name = ? AND status = 'open'
+            """,
+            (status, entry_id, field_name),
+        )
+        self.connection.commit()
+        return result.rowcount
+
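+    # Review lifecycle (illustrative): record_conflicts() files 'open' rows,
+    # get_field_conflicts(key, status="open") lists them, and each one is
+    # either applied via apply_conflict_value() below or dismissed with
+    # set_conflict_status(key, field, "rejected").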
+    def apply_conflict_value(self, citation_key: str, field_name: str) -> bool:
+        row = self.connection.execute(
+            """
+            SELECT fc.id, fc.proposed_value, e.review_status
+            FROM field_conflicts fc
+            JOIN entries e ON e.id = fc.entry_id
+            WHERE e.citation_key = ? AND fc.field_name = ? AND fc.status = 'open'
+            ORDER BY fc.recorded_at DESC, fc.id DESC
+            LIMIT 1
+            """,
+            (citation_key, field_name),
+        ).fetchone()
+        if row is None:
+            return False
+
+        entry = self._load_bib_entry(citation_key)
+        if entry is None:
+            return False
+
+        proposed_value = str(row["proposed_value"] or "")
+        entry.fields[field_name] = proposed_value
+        self.upsert_entry(
+            entry,
+            raw_bibtex=_entry_to_bibtex(entry),
+            source_type="manual_review",
+            source_label=f"conflict_accept:{field_name}",
+            review_status=str(row["review_status"] or "draft"),
+        )
+        self.connection.execute(
+            "UPDATE field_conflicts SET status = 'accepted' WHERE id = ?",
+            (int(row["id"]),),
+        )
+        self.connection.commit()
+        return True
+
     def add_relation(
         self,
         source_citation_key: str,
@@ -651,6 +1049,37 @@ class BibliographyStore:
                 "ALTER TABLE entries ADD COLUMN review_status TEXT NOT NULL DEFAULT 'draft'"
             )
 
+    def _ensure_topic_columns(self) -> None:
+        columns = {
+            row["name"] for row in self.connection.execute("PRAGMA table_info(topics)").fetchall()
+        }
+        additions = (
+            ("expansion_phrase", "ALTER TABLE topics ADD COLUMN expansion_phrase TEXT"),
+            ("suggested_phrase", "ALTER TABLE topics ADD COLUMN suggested_phrase TEXT"),
+            (
+                "phrase_review_status",
+                "ALTER TABLE topics ADD COLUMN phrase_review_status TEXT NOT NULL DEFAULT 'unreviewed'",
+            ),
+            ("phrase_review_notes", "ALTER TABLE topics ADD COLUMN phrase_review_notes TEXT"),
+        )
+        for column_name, ddl in additions:
+            if column_name in columns:
+                continue
+            try:
+                self.connection.execute(ddl)
+            except sqlite3.OperationalError as exc:
+                if "duplicate column name" not in str(exc).lower():
+                    raise
+
     def _record_field_provenance(
         self,
         entry_id: int,
diff --git a/src/citegeist/talkorigins.py b/src/citegeist/talkorigins.py
new file mode 100644
index 0000000..45ce910
--- /dev/null
+++ b/src/citegeist/talkorigins.py
@@ -0,0 +1,1485 @@
+from __future__ import annotations
+
+from collections import Counter
+from dataclasses import asdict, dataclass
+from html.parser import HTMLParser
+import hashlib
+import json
+import re
+from pathlib import Path
+from urllib.parse import urljoin, urlparse
+
+from .bibtex import BibEntry, render_bibtex
+from .resolve import MetadataResolver, merge_entries, merge_entries_with_conflicts
+from .sources import SourceClient
+from .storage import BibliographyStore
+
+YEAR_PATTERN = re.compile(r"\b(18|19|20)\d{2}\b")
+REPEATED_AUTHOR_PATTERN = re.compile(r"^\s*[-_]{3,}\s*,?\s*")
+WHITESPACE_PATTERN = re.compile(r"\s+")
+TOPIC_PHRASE_STOPWORDS = {
+    "about",
+    "across",
+    "after",
+    "among",
+    "analysis",
+    "book",
+    "books",
+    "conference",
+    "data",
+    "edition",
+    "effects",
+    "example",
+    "first",
+    "from",
+    "human",
+    "humans",
+    "journal",
+    "method",
+    "methods",
+    "paper",
+    "papers",
+    "review",
+    "science",
+    "second",
+    "studies",
+    "study",
+    "system",
+    "their",
+    "theory",
+    "title",
+    "using",
+}
+
+
+@dataclass(slots=True)
+class TalkOriginsTopic:
+    topic: str
+    url: str
+    raw_entries: list[str]
+
+
+@dataclass(slots=True)
+class TalkOriginsSeedSet:
+    topic: str
+    slug: str
+    url: str
+    raw_entry_count: int
+    parsed_entry_count: int
+    seed_bib: str
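+    # The paths below are per-topic artifacts written by
+    # scrape_to_directory(): the plaintext reference dump, the rendered
+    # HTML topic page, and the page snapshot reused when resuming.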
+    plaintext_path: str = ""
+    page_path: str = ""
+    snapshot_path: str = ""
+
+
+@dataclass(slots=True)
+class TalkOriginsBatchExport:
+    base_url: str
+    output_dir: str
+    topic_count: int
+    entry_count: int
+    jobs_path: str
+    manifest_path: str
+    seed_sets: list[TalkOriginsSeedSet]
+    full_bib_path: str = ""
+    full_plaintext_path: str = ""
+    site_index_path: str = ""
+
+
+@dataclass(slots=True)
+class TalkOriginsValidationReport:
+    manifest_path: str
+    topic_count: int
+    entry_count: int
+    parsed_ratio: float
+    missing_author_count: int
+    missing_title_count: int
+    missing_year_count: int
+    suspicious_entry_type_count: int
+    suspicious_examples: list[dict[str, str]]
+    duplicate_cluster_count: int
+    duplicate_entry_count: int
+    duplicate_examples: list[dict[str, object]]
+
+
+@dataclass(slots=True)
+class TalkOriginsIngestReport:
+    manifest_path: str
+    topic_count: int
+    raw_entry_count: int
+    stored_entry_count: int
+    duplicate_cluster_count: int
+    duplicate_entry_count: int
+    canonicalized_count: int
+
+
+@dataclass(slots=True)
+class TalkOriginsDuplicateCluster:
+    key: str
+    count: int
+    items: list[dict[str, str]]
+    canonical: dict[str, object] | None = None
+
+
+@dataclass(slots=True)
+class TalkOriginsEnrichmentResult:
+    key: str
+    citation_key: str
+    weak_reasons_before: list[str]
+    resolved: bool
+    applied: bool
+    source_label: str = ""
+    weak_reasons_after: list[str] | None = None
+    conflicts: list[dict[str, str]] | None = None
+    error: str = ""
+
+
+@dataclass(slots=True)
+class TalkOriginsReviewExport:
+    manifest_path: str
+    item_count: int
+    items: list[dict[str, object]]
+
+
+@dataclass(slots=True)
+class TalkOriginsCorrectionResult:
+    key: str
+    citation_key: str
+    applied: bool
+    error: str = ""
+
+
+@dataclass(slots=True)
+class TalkOriginsTopicPhraseSuggestion:
+    slug: str
+    topic: str
+    entry_count: int
+    suggested_phrase: str
+    keywords: list[str]
+    review_required: bool = False
+    review_reasons: list[str] | None = None
+
+
+class TalkOriginsScraper:
+    def __init__(
+        self,
+        source_client: SourceClient | None = None,
+        resolver: MetadataResolver | None = None,
+    ) -> None:
+        self.source_client = source_client or SourceClient()
+        self.resolver = resolver or MetadataResolver(source_client=self.source_client)
+
+    def scrape_to_directory(
+        self,
+        base_url: str,
+        output_dir: str | Path,
+        limit_topics: int | None = None,
+        limit_entries_per_topic: int | None = None,
+        resolve_seeds: bool = False,
+        ingest_store: BibliographyStore | None = None,
+        review_status: str = "draft",
+        expand: bool = False,
+        topic_limit: int = 5,
+        topic_commit_limit: int | None = None,
+        resume: bool = True,
+    ) -> TalkOriginsBatchExport:
+        output_root = Path(output_dir)
+        seeds_dir = output_root / "seeds"
+        plaintext_dir = output_root / "plaintext"
+        snapshots_dir = output_root / "snapshots"
+        site_dir = output_root / "site"
+        topics_dir = site_dir / "topics"
+        seeds_dir.mkdir(parents=True, exist_ok=True)
+        plaintext_dir.mkdir(parents=True, exist_ok=True)
+        snapshots_dir.mkdir(parents=True, exist_ok=True)
+        topics_dir.mkdir(parents=True, exist_ok=True)
+
+        seed_sets: list[TalkOriginsSeedSet] = []
+        total_entries = 0
+        jobs: list[dict[str, object]] = []
+        full_entries: list[BibEntry] = []
+        full_plaintext_blocks: list[str] = []
+
+        for topic in self.scrape_topics(
+            base_url,
+            snapshots_dir=snapshots_dir,
+            limit_topics=limit_topics,
+            resume=resume,
+        ):
+            raw_entries = topic.raw_entries[:limit_entries_per_topic] if limit_entries_per_topic else topic.raw_entries
+            entry_pairs = [
+                (raw_entry, self.parse_reference_entry(raw_entry, index + 1))
+                for index, raw_entry in enumerate(raw_entries)
+            ]
+            parsed_entries = [entry for _, entry in entry_pairs if entry is not None]
+            if resolve_seeds:
+                parsed_entries = [self._augment_entry(entry) for entry in parsed_entries]
+                if parsed_entries:
+                    augmented_iter = iter(parsed_entries)
+                    entry_pairs = [
+                        (raw_entry, next(augmented_iter) if parsed_entry is not None else None)
+                        for raw_entry, parsed_entry in entry_pairs
+                    ]
+
+            slug = _slugify(topic.topic)
+            seed_path = (seeds_dir / f"{slug}.bib").resolve()
+            plaintext_path = (plaintext_dir / f"{slug}.txt").resolve()
+            page_path = (topics_dir / f"{slug}.html").resolve()
+            snapshot_path = (snapshots_dir / f"{slug}.json").resolve()
+            rendered = render_bibtex(parsed_entries) if parsed_entries else ""
+            seed_path.write_text(rendered + ("\n" if rendered else ""), encoding="utf-8")
+            plaintext_path.write_text(_render_plaintext_topic(topic.topic, raw_entries), encoding="utf-8")
+            page_path.write_text(
+                _render_topic_page(topic.topic, entry_pairs, seed_path.name),
+                encoding="utf-8",
+            )
+
+            if ingest_store is not None and parsed_entries:
+                ingest_store.ingest_bibtex(
+                    rendered,
+                    source_label=topic.url,
+                    review_status=review_status,
+                )
+                for entry in parsed_entries:
+                    ingest_store.add_entry_topic(
+                        entry.citation_key,
+                        topic_slug=slug,
+                        topic_name=topic.topic,
+                        source_type="talkorigins",
+                        source_url=topic.url,
+                        source_label=topic.url,
+                    )
+                ingest_store.connection.commit()
+
+            seed_set = TalkOriginsSeedSet(
+                topic=topic.topic,
+                slug=slug,
+                url=topic.url,
+                raw_entry_count=len(raw_entries),
+                parsed_entry_count=len(parsed_entries),
+                seed_bib=str(seed_path),
+                plaintext_path=str(plaintext_path),
+                page_path=str(page_path),
+                snapshot_path=str(snapshot_path),
+            )
+            seed_sets.append(seed_set)
+            total_entries += len(parsed_entries)
+            full_entries.extend(parsed_entries)
+            full_plaintext_blocks.append(_render_plaintext_topic(topic.topic, raw_entries).rstrip())
+            jobs.append(
+                {
+                    "name": f"talkorigins:{slug}",
+                    "topic": topic.topic,
+                    "topic_slug": slug,
+                    "topic_name": topic.topic,
+                    "topic_phrase": topic.topic,
+                    "seed_bib": str(seed_path),
+                    "expand": expand,
+                    "status": review_status,
+                    "topic_limit": topic_limit,
+                    "topic_commit_limit": topic_commit_limit,
+                }
+            )
+
+        output_root.mkdir(parents=True, exist_ok=True)
+        manifest_path = (output_root / "talkorigins_manifest.json").resolve()
+        jobs_path = (output_root / "talkorigins_jobs.json").resolve()
+        full_bib_path = (output_root / "talkorigins_full.bib").resolve()
+        full_plaintext_path = (output_root / "talkorigins_full.txt").resolve()
+        site_index_path = (site_dir / "index.html").resolve()
+        full_bib_path.write_text(render_bibtex(full_entries) + ("\n" if full_entries else ""), encoding="utf-8")
+        full_plaintext_path.write_text("\n\n".join(block for block in full_plaintext_blocks if block) + "\n", encoding="utf-8")
+        site_index_path.write_text(
+            _render_site_index(seed_sets, Path(full_bib_path).name, Path(full_plaintext_path).name),
+            encoding="utf-8",
+        )
+        manifest_payload = {
+            "base_url": base_url,
+            "resume": resume,
+            "seed_sets": [asdict(item) for item in seed_sets],
+            "full_bib_path": str(full_bib_path),
+            "full_plaintext_path": str(full_plaintext_path),
+            "site_index_path": str(site_index_path),
+        }
+        manifest_path.write_text(json.dumps(manifest_payload, indent=2, sort_keys=True) + "\n", encoding="utf-8")
+        jobs_path.write_text(json.dumps({"jobs": jobs}, indent=2, sort_keys=True) + "\n", encoding="utf-8")
+
+        return TalkOriginsBatchExport(
+            base_url=base_url,
+            output_dir=str(output_root.resolve()),
+            topic_count=len(seed_sets),
+            entry_count=total_entries,
+            jobs_path=str(jobs_path),
+            manifest_path=str(manifest_path),
+            seed_sets=seed_sets,
+            full_bib_path=str(full_bib_path),
+            full_plaintext_path=str(full_plaintext_path),
+            site_index_path=str(site_index_path),
+        )
+
+    def validate_export(self, manifest_path: str | Path) -> TalkOriginsValidationReport:
+        manifest = json.loads(Path(manifest_path).read_text(encoding="utf-8"))
+        seed_sets = manifest.get("seed_sets", [])
+
+        topic_count = len(seed_sets)
+        raw_total = sum(int(item.get("raw_entry_count", 0)) for item in seed_sets)
+        parsed_total = sum(int(item.get("parsed_entry_count", 0)) for item in seed_sets)
+        missing_author_count = 0
+        missing_title_count = 0
+        missing_year_count = 0
+        suspicious_entry_type_count = 0
+        suspicious_examples: list[dict[str, str]] = []
+        duplicate_groups: dict[str, list[dict[str, str]]] = {}
+
+        for seed_set in seed_sets:
+            seed_bib = seed_set.get("seed_bib")
+            if not isinstance(seed_bib, str) or not seed_bib:
+                continue
+            path = Path(seed_bib)
+            if not path.exists():
+                continue
+            entries = parse_bib_file(path)
+            for entry in entries:
+                if not entry.fields.get("author"):
+                    missing_author_count += 1
+                if not entry.fields.get("title"):
+                    missing_title_count += 1
+                if not entry.fields.get("year"):
+                    missing_year_count += 1
+                if _is_suspicious_entry_type(entry):
+                    suspicious_entry_type_count += 1
+                    if len(suspicious_examples) < 20:
+                        suspicious_examples.append(
+                            {
+                                "citation_key": entry.citation_key,
+                                "entry_type": entry.entry_type,
+                                "title": entry.fields.get("title", ""),
+                                "journal": entry.fields.get("journal", ""),
+                                "publisher": entry.fields.get("publisher", ""),
+                                "howpublished": entry.fields.get("howpublished", ""),
+                            }
+                        )
+                duplicate_key = _duplicate_key(entry)
+                if duplicate_key:
+                    duplicate_groups.setdefault(duplicate_key, []).append(
+                        {
+                            "citation_key": entry.citation_key,
+                            "title": entry.fields.get("title", ""),
+                            "author": entry.fields.get("author", ""),
+                            "year": entry.fields.get("year", ""),
+                            "seed_bib": str(path),
+                        }
+                    )
+
+        parsed_ratio = (parsed_total / raw_total) if raw_total else 0.0
+        duplicate_examples: list[dict[str, object]] = []
+        duplicate_cluster_count = 0
+        duplicate_entry_count = 0
+        for group_key, items in sorted(duplicate_groups.items()):
+            if len(items) < 2:
+                continue
+            duplicate_cluster_count += 1
+            duplicate_entry_count += len(items)
+            if len(duplicate_examples) < 20:
+                duplicate_examples.append(
+                    {
+                        "key": group_key,
+                        "count": len(items),
+                        "items": items[:5],
+                    }
+                )
+        return TalkOriginsValidationReport(
+            manifest_path=str(Path(manifest_path).resolve()),
+            topic_count=topic_count,
+            entry_count=parsed_total,
+            parsed_ratio=parsed_ratio,
+            missing_author_count=missing_author_count,
+            missing_title_count=missing_title_count,
+            missing_year_count=missing_year_count,
+            suspicious_entry_type_count=suspicious_entry_type_count,
+            suspicious_examples=suspicious_examples,
+            duplicate_cluster_count=duplicate_cluster_count,
+            duplicate_entry_count=duplicate_entry_count,
+            duplicate_examples=duplicate_examples,
+        )
+
seed_sets: + current_topic_slug = str(seed_set.get("slug") or _slugify(str(seed_set.get("topic") or ""))) + if topic_slug and current_topic_slug != topic_slug: + continue + seed_bib = seed_set.get("seed_bib") + if not isinstance(seed_bib, str) or not seed_bib: + continue + path = Path(seed_bib) + if not path.exists(): + continue + entries = parse_bib_file(path) + topic_name = str(seed_set.get("topic") or current_topic_slug) + keywords = _suggest_topic_keywords(entries, topic_name) + review_reasons = _topic_phrase_review_reasons(entries, keywords) + suggestions.append( + TalkOriginsTopicPhraseSuggestion( + slug=current_topic_slug, + topic=topic_name, + entry_count=len(entries), + suggested_phrase=" ".join([topic_name, *keywords]).strip(), + keywords=keywords, + review_required=bool(review_reasons), + review_reasons=review_reasons, + ) + ) + + suggestions.sort(key=lambda item: (item.topic.casefold(), item.slug)) + if limit is not None: + suggestions = suggestions[:limit] + return suggestions + + def inspect_duplicate_clusters( + self, + manifest_path: str | Path, + limit: int = 20, + min_count: int = 2, + match: str | None = None, + topic_slug: str | None = None, + preview_canonical: bool = False, + weak_only: bool = False, + ) -> list[TalkOriginsDuplicateCluster]: + duplicate_groups, grouped_entries = _collect_duplicate_groups( + manifest_path, + match=match, + topic_slug=topic_slug, + ) + + clusters: list[TalkOriginsDuplicateCluster] = [] + for group_key, items in sorted(duplicate_groups.items()): + if len(items) < min_count: + continue + canonical_payload = None + if preview_canonical: + canonical = _build_canonical_preview(grouped_entries[group_key]) + weak_reasons = _canonical_weaknesses(canonical) + if weak_only and not weak_reasons: + continue + canonical_payload = { + "citation_key": canonical.citation_key, + "entry_type": canonical.entry_type, + "field_count": len([value for value in canonical.fields.values() if value]), + "fields": dict(sorted(canonical.fields.items())), + "weak_reasons": weak_reasons, + } + elif weak_only: + canonical = _build_canonical_preview(grouped_entries[group_key]) + if not _canonical_weaknesses(canonical): + continue + clusters.append( + TalkOriginsDuplicateCluster( + key=group_key, + count=len(items), + items=sorted( + items, + key=lambda item: ( + item.get("topic_slug", ""), + item.get("year", ""), + item.get("citation_key", ""), + ), + ), + canonical=canonical_payload, + ) + ) + return clusters[:limit] + + def enrich_weak_canonicals( + self, + manifest_path: str | Path, + store: BibliographyStore, + limit: int = 20, + min_count: int = 2, + match: str | None = None, + topic_slug: str | None = None, + apply: bool = False, + review_status: str = "enriched", + allow_unsafe_matches: bool = False, + ) -> list[TalkOriginsEnrichmentResult]: + duplicate_groups, grouped_entries = _collect_duplicate_groups( + manifest_path, + match=match, + topic_slug=topic_slug, + ) + results: list[TalkOriginsEnrichmentResult] = [] + + for group_key, items in sorted(duplicate_groups.items()): + if len(items) < min_count: + continue + canonical = _build_canonical_preview(grouped_entries[group_key]) + weak_reasons_before = _canonical_weaknesses(canonical) + if not weak_reasons_before: + continue + resolution = None + error = "" + try: + resolution = self.resolver.resolve_entry(canonical) + except Exception as exc: + error = str(exc) + + result = TalkOriginsEnrichmentResult( + key=group_key, + citation_key=canonical.citation_key, + weak_reasons_before=weak_reasons_before, + 
resolved=resolution is not None, + applied=False, + source_label=resolution.source_label if resolution is not None else "", + error=error, + ) + + if resolution is not None: + if not allow_unsafe_matches and not _is_safe_enrichment_match(canonical, resolution): + result.resolved = False + result.source_label = resolution.source_label + result.error = "unsafe resolver match" + results.append(result) + if len(results) >= limit: + break + continue + merged, conflicts = merge_entries_with_conflicts(canonical, resolution.entry) + if canonical.entry_type == "misc" and resolution.entry.entry_type != "misc": + merged = BibEntry( + entry_type=resolution.entry.entry_type, + citation_key=merged.citation_key, + fields=merged.fields, + ) + result.conflicts = conflicts + result.weak_reasons_after = _canonical_weaknesses(merged) + if apply: + store_key = _find_store_citation_key(store, canonical) + if store_key: + store.replace_entry( + store_key, + merged, + source_type=resolution.source_type, + source_label=resolution.source_label, + review_status=review_status, + ) + if conflicts: + store.record_conflicts( + store_key, + conflicts, + source_type=resolution.source_type, + source_label=resolution.source_label, + ) + result.citation_key = store_key + result.applied = True + results.append(result) + if len(results) >= limit: + break + + if apply: + store.connection.commit() + return results + + def build_review_export( + self, + manifest_path: str | Path, + store: BibliographyStore, + limit: int = 20, + min_count: int = 2, + match: str | None = None, + topic_slug: str | None = None, + ) -> TalkOriginsReviewExport: + clusters = self.inspect_duplicate_clusters( + manifest_path, + limit=limit, + min_count=min_count, + match=match, + topic_slug=topic_slug, + preview_canonical=True, + weak_only=True, + ) + enrichment_results = self.enrich_weak_canonicals( + manifest_path, + store, + limit=limit, + min_count=min_count, + match=match, + topic_slug=topic_slug, + apply=False, + ) + by_key = {result.key: result for result in enrichment_results} + items: list[dict[str, object]] = [] + for cluster in clusters: + result = by_key.get(cluster.key) + payload = { + "key": cluster.key, + "count": cluster.count, + "items": cluster.items, + "canonical": cluster.canonical, + "enrichment": asdict(result) if result is not None else None, + } + items.append(payload) + return TalkOriginsReviewExport( + manifest_path=str(Path(manifest_path).resolve()), + item_count=len(items), + items=items, + ) + + def apply_review_corrections( + self, + manifest_path: str | Path, + corrections_path: str | Path, + store: BibliographyStore, + default_review_status: str = "reviewed", + ) -> list[TalkOriginsCorrectionResult]: + duplicate_groups, grouped_entries = _collect_duplicate_groups(manifest_path) + payload = json.loads(Path(corrections_path).read_text(encoding="utf-8")) + correction_items = payload.get("corrections", []) + results: list[TalkOriginsCorrectionResult] = [] + + for item in correction_items: + key = str(item.get("key") or "") + if not key: + results.append(TalkOriginsCorrectionResult(key="", citation_key="", applied=False, error="missing key")) + continue + entries = grouped_entries.get(key) + if not entries: + results.append(TalkOriginsCorrectionResult(key=key, citation_key="", applied=False, error="unknown key")) + continue + + canonical = _build_canonical_preview(entries) + store_key = _find_store_citation_key(store, canonical) + if not store_key: + results.append(TalkOriginsCorrectionResult(key=key, 
citation_key=canonical.citation_key, applied=False, error="entry not found in store")) + continue + + corrected = BibEntry( + entry_type=str(item.get("entry_type") or canonical.entry_type), + citation_key=store_key, + fields=dict(canonical.fields), + ) + override_fields = item.get("fields", {}) + if isinstance(override_fields, dict): + for field_name, value in override_fields.items(): + if value is None: + corrected.fields.pop(str(field_name), None) + else: + corrected.fields[str(field_name)] = str(value) + + review_status = str(item.get("review_status") or default_review_status) + store.replace_entry( + store_key, + corrected, + source_type="manual_review", + source_label=f"talkorigins_corrections:{Path(corrections_path).resolve()}", + review_status=review_status, + ) + results.append(TalkOriginsCorrectionResult(key=key, citation_key=store_key, applied=True)) + + store.connection.commit() + return results + + def ingest_export( + self, + manifest_path: str | Path, + store: BibliographyStore, + review_status: str = "draft", + dedupe: bool = True, + ) -> TalkOriginsIngestReport: + manifest = json.loads(Path(manifest_path).read_text(encoding="utf-8")) + seed_sets = manifest.get("seed_sets", []) + topic_count = len(seed_sets) + raw_entry_count = sum(int(item.get("parsed_entry_count", 0)) for item in seed_sets) + + grouped: dict[str, list[tuple[dict[str, object], BibEntry]]] = {} + canonicalized_count = 0 + duplicate_entry_count = 0 + + for seed_set in seed_sets: + seed_bib = seed_set.get("seed_bib") + if not isinstance(seed_bib, str) or not seed_bib: + continue + entries = parse_bib_file(seed_bib) + for entry in entries: + group_key = _duplicate_key(entry) if dedupe else entry.citation_key + if not group_key: + group_key = entry.citation_key + grouped.setdefault(group_key, []).append((seed_set, entry)) + + stored_entry_count = 0 + duplicate_cluster_count = 0 + source_label = str(Path(manifest_path).resolve()) + key_owners: dict[str, str] = {} + existing_rows = store.connection.execute("SELECT citation_key FROM entries").fetchall() + for row in existing_rows: + key_owners[str(row["citation_key"])] = "__existing__" + + for group_key, items in grouped.items(): + if len(items) > 1: + duplicate_cluster_count += 1 + duplicate_entry_count += len(items) + + canonical = _select_canonical_entry([entry for _, entry in items]) + for _, duplicate in items: + if duplicate.citation_key != canonical.citation_key: + canonical = merge_entries(canonical, duplicate) + canonicalized_count += 1 + canonical = _assign_canonical_key(canonical, group_key, key_owners) + + store.upsert_entry( + canonical, + raw_bibtex=render_bibtex([canonical]), + source_type="talkorigins", + source_label=source_label, + review_status=review_status, + ) + stored_entry_count += 1 + + seen_topics: set[str] = set() + for seed_set, _ in items: + topic_slug = str(seed_set.get("slug") or _slugify(str(seed_set.get("topic") or ""))) + if topic_slug in seen_topics: + continue + seen_topics.add(topic_slug) + store.add_entry_topic( + canonical.citation_key, + topic_slug=topic_slug, + topic_name=str(seed_set.get("topic") or topic_slug), + source_type="talkorigins", + source_url=str(seed_set.get("url") or ""), + source_label=source_label, + ) + + store.connection.commit() + return TalkOriginsIngestReport( + manifest_path=str(Path(manifest_path).resolve()), + topic_count=topic_count, + raw_entry_count=raw_entry_count, + stored_entry_count=stored_entry_count, + duplicate_cluster_count=duplicate_cluster_count, + 
duplicate_entry_count=duplicate_entry_count, + canonicalized_count=canonicalized_count, + ) + + def scrape_topics( + self, + base_url: str, + snapshots_dir: Path | None = None, + limit_topics: int | None = None, + resume: bool = True, + ) -> list[TalkOriginsTopic]: + index_html = self.source_client.get_text(base_url) + parser = _TopicIndexParser(base_url) + parser.feed(index_html) + + topics: list[TalkOriginsTopic] = [] + for link in parser.topic_links[:limit_topics]: + slug = _slugify(link["topic"]) + snapshot_path = snapshots_dir / f"{slug}.json" if snapshots_dir is not None else None + snapshot = _load_snapshot(snapshot_path) if resume and snapshot_path is not None else None + if snapshot is not None: + raw_entries = list(snapshot.get("raw_entries", [])) + else: + page_html = self.source_client.get_text(link["url"]) + topic_parser = _TopicPageParser() + topic_parser.feed(page_html) + raw_entries = normalize_topic_entries(topic_parser.preformatted_text()) + if snapshot_path is not None: + snapshot_payload = { + "topic": link["topic"], + "url": link["url"], + "raw_entries": raw_entries, + } + snapshot_path.write_text(json.dumps(snapshot_payload, indent=2, sort_keys=True) + "\n", encoding="utf-8") + topics.append(TalkOriginsTopic(topic=link["topic"], url=link["url"], raw_entries=raw_entries)) + return topics + + def parse_reference_entry(self, raw_entry: str, ordinal: int) -> BibEntry | None: + year_match = YEAR_PATTERN.search(raw_entry) + if year_match is None: + return None + + year = year_match.group(0) + author_part = raw_entry[: year_match.start()].strip(" ,.;:") + remainder = raw_entry[year_match.end() :].strip(" ,.;:") + if not author_part or not remainder: + return None + + title, venue = _split_title_and_venue(remainder) + if not title: + return None + + authors = _normalize_gsa_authors(author_part) + citation_key = _make_citation_key(authors, year, title, ordinal) + entry_type = _guess_entry_type(remainder) + fields = { + "author": authors, + "year": year, + "title": title, + "note": f"talkorigins_source = {{true}}; raw_reference = {{{raw_entry}}}", + } + if entry_type == "book": + normalized = _normalize_incollection_candidate(title, venue) + if normalized is not None: + title = normalized["title"] + fields["title"] = title + entry_type = "incollection" + if normalized.get("editor"): + fields["editor"] = normalized["editor"] + if normalized.get("booktitle"): + fields["booktitle"] = normalized["booktitle"] + if normalized.get("publisher"): + fields["publisher"] = normalized["publisher"] + venue = "" + if venue: + if entry_type == "article": + fields["journal"] = venue + elif entry_type == "inproceedings": + fields["booktitle"] = venue + elif entry_type == "incollection": + fields["booktitle"] = venue + elif entry_type in {"book", "phdthesis", "mastersthesis"}: + fields["publisher"] = venue + else: + fields["howpublished"] = venue + + return BibEntry(entry_type=entry_type, citation_key=citation_key, fields=fields) + + def _augment_entry(self, entry: BibEntry) -> BibEntry: + try: + resolution = self.resolver.resolve_entry(entry) + except Exception: + return entry + if resolution is None: + return entry + return merge_entries(entry, resolution.entry) + + +def normalize_topic_entries(text: str) -> list[str]: + entries: list[str] = [] + previous_authors = "" + current: list[str] = [] + + for raw_line in text.splitlines(): + line = raw_line.strip() + if not line: + if current: + entry_text = " ".join(current) + normalized = _normalize_repeated_authors(entry_text, previous_authors) + 
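# store the finished block and carry its author prefix forward for ditto-style entries
+                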
entries.append(normalized) + previous_authors = _extract_author_prefix(normalized) or previous_authors + current = [] + continue + current.append(WHITESPACE_PATTERN.sub(" ", line)) + + if current: + entry_text = " ".join(current) + normalized = _normalize_repeated_authors(entry_text, previous_authors) + entries.append(normalized) + + return entries + + +def _normalize_repeated_authors(entry_text: str, previous_authors: str) -> str: + if previous_authors and REPEATED_AUTHOR_PATTERN.match(entry_text): + return REPEATED_AUTHOR_PATTERN.sub(f"{previous_authors}, ", entry_text, count=1) + return entry_text + + +def _extract_author_prefix(entry_text: str) -> str: + year_match = YEAR_PATTERN.search(entry_text) + if year_match is None: + return "" + return entry_text[: year_match.start()].strip(" ,;:") + + +def _split_title_and_venue(remainder: str) -> tuple[str, str]: + if ": " in remainder: + title, venue = remainder.split(": ", 1) + return _clean_fragment(title), _clean_fragment(venue) + + parts = [part.strip() for part in remainder.split(". ") if part.strip()] + if not parts: + return "", "" + title = parts[0] + venue = ". ".join(parts[1:]) if len(parts) > 1 else "" + return _clean_fragment(title), _clean_fragment(venue) + + +def _normalize_gsa_authors(author_part: str) -> str: + cleaned = WHITESPACE_PATTERN.sub(" ", author_part.replace("&", " and ")).strip(" ,;:") + if " and " in cleaned and "," not in cleaned: + return cleaned + + fragments = [fragment.strip() for fragment in cleaned.split(",") if fragment.strip()] + if len(fragments) < 2: + return cleaned + + authors: list[str] = [] + index = 0 + while index + 1 < len(fragments): + family = re.sub(r"^(and)\s+", "", fragments[index], flags=re.IGNORECASE).strip() + given = re.sub(r"^(and)\s+", "", fragments[index + 1], flags=re.IGNORECASE).strip() + if family and given: + authors.append(f"{family}, {given}") + index += 2 + + if index < len(fragments): + trailing = re.sub(r"^(and)\s+", "", fragments[index], flags=re.IGNORECASE).strip() + if trailing: + authors.append(trailing) + + return " and ".join(authors) if authors else cleaned + + +def _make_citation_key(authors: str, year: str, title: str, ordinal: int) -> str: + first_author = authors.split(" and ")[0] + family = first_author.split(",", 1)[0] if "," in first_author else first_author.split()[-1] + family = re.sub(r"[^A-Za-z0-9]+", "", family).lower() or "ref" + first_word = re.sub(r"[^A-Za-z0-9]+", "", title.split()[0]).lower() if title.split() else "untitled" + first_word = first_word or "untitled" + return f"{family}{year}{first_word}{ordinal}" + + +def _guess_entry_type(text: str) -> str: + lowered = text.lower() + if "ph.d" in lowered or "dissertation" in lowered or "thesis" in lowered: + return "phdthesis" + if any( + token in lowered + for token in ( + "press", + "publisher", + "publications", + "publication", + "elsevier", + "springer", + "wiley", + "university", + "books", + ) + ): + return "book" + if any(token in lowered for token in ("proceedings", "conference", "symposium", "workshop")): + return "inproceedings" + if any(token in lowered for token in ("journal", "review", "letters", "quarterly", "science", "nature")): + return "article" + return "misc" + + +def _clean_fragment(value: str) -> str: + return WHITESPACE_PATTERN.sub(" ", value.strip(" .;:,\"'")) + + +def _slugify(value: str) -> str: + slug = re.sub(r"[^A-Za-z0-9]+", "-", value.lower()).strip("-") + return slug or "topic" + + +def _normalize_incollection_candidate(title: str, venue: str) -> dict[str, str] | None: + 
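"""Recognize 'Title, in Editors, eds., Book: Publisher' venue strings.
+
+    Returns the incollection fields when the pattern matches, otherwise None.
+    """
+    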
lowered = venue.lower()
+    if ", in " not in lowered:
+        return None
+
+    split_index = lowered.find(", in ")
+    prefix = _clean_fragment(venue[:split_index])
+    container = venue[split_index + len(", in ") :].strip()
+    if not container:
+        return None
+
+    editor_match = re.match(r"^(?P<editors>.+?),\s+eds?\.,\s+(?P<rest>.+)$", container, flags=re.IGNORECASE)
+    if editor_match is None:
+        return None
+
+    editor_text = _normalize_gsa_authors(editor_match.group("editors"))
+    rest = editor_match.group("rest").strip()
+    if ": " in rest:
+        booktitle, publisher = rest.split(": ", 1)
+    else:
+        booktitle, publisher = rest, ""
+
+    normalized_title = title
+    if prefix:
+        normalized_title = _clean_fragment(f"{title}: {prefix}")
+
+    payload = {
+        "title": normalized_title,
+        "editor": editor_text,
+        "booktitle": _clean_fragment(booktitle),
+    }
+    if publisher:
+        payload["publisher"] = _clean_fragment(publisher)
+    return payload
+
+
+def _load_snapshot(path: Path | None) -> dict[str, object] | None:
+    if path is None or not path.exists():
+        return None
+    return json.loads(path.read_text(encoding="utf-8"))
+
+
+def parse_bib_file(path: str | Path) -> list[BibEntry]:
+    from .bibtex import parse_bibtex
+
+    return parse_bibtex(Path(path).read_text(encoding="utf-8"))
+
+
+def _render_plaintext_topic(topic: str, raw_entries: list[str]) -> str:
+    body = "\n\n".join(raw_entries)
+    return f"{topic}\n\n{body}\n" if body else f"{topic}\n"
+
+
+def _render_topic_page(topic: str, entry_pairs: list[tuple[str, BibEntry | None]], seed_filename: str) -> str:
+    entry_blocks: list[str] = []
+    for index, (raw_entry, parsed_entry) in enumerate(entry_pairs, start=1):
+        bibtex_block = ""
+        if parsed_entry is not None:
+            bibtex_block = render_bibtex([parsed_entry])
+        safe_plain = _html_escape(raw_entry)
+        safe_bibtex = _html_escape(bibtex_block)
+        entry_blocks.append(
+            "\n".join(
+                [
+                    '  <div class="entry">',
+                    f'    <p class="raw">{safe_plain}</p>',
+                    f'    <p class="entry-number">Entry {index}</p>',
+                    f'    <pre class="bibtex">{safe_bibtex}</pre>',
+                    "  </div>",
+                ]
+            )
+        )
+
+    # topic pages live under site/topics/, so links climb out for the index and the seed file
+    return "\n".join(
+        [
+            "<!DOCTYPE html>",
+            '<html lang="en">',
+            "<head>",
+            '  <meta charset="utf-8">',
+            f"  <title>{_html_escape(topic)} bibliography</title>",
+            "</head>",
+            "<body>",
+            f"  <h1>{_html_escape(topic)}</h1>",
+            f'  <p><a href="../index.html">Back to index</a> | <a href="../../seeds/{seed_filename}">Seed BibTeX</a></p>',
+            *entry_blocks,
+            "</body>",
+            "</html>",
+        ]
+    ) + "\n"
+
+
+def _render_site_index(seed_sets: list[TalkOriginsSeedSet], full_bib_name: str, full_plaintext_name: str) -> str:
+    items = [
+        f'      <li><a href="topics/{item.slug}.html">{_html_escape(item.topic)}</a> '
+        f'({item.parsed_entry_count} entries) <a href="../seeds/{item.slug}.bib">seed</a></li>'
+        for item in seed_sets
+    ]
+    return "\n".join(
+        [
+            "<!DOCTYPE html>",
+            '<html lang="en">',
+            "<head>",
+            '  <meta charset="utf-8">',
+            "  <title>TalkOrigins bibliography reconstruction</title>",
+            "</head>",
+            "<body>",
+            "  <h1>TalkOrigins bibliography reconstruction</h1>",
+            "  <p>Downloads:",
+            f'    <a href="../{full_bib_name}">{full_bib_name}</a> |',
+            f'    <a href="../{full_plaintext_name}">{full_plaintext_name}</a>',
+            "  </p>",
+            "  <h2>Topics</h2>",
+            "  <ul>",
+            *items,
+            "  </ul>",
+            "</body>",
+            "</html>",
+        ]
+    ) + "\n"
+
+
+def _html_escape(value: str) -> str:
+    # escape the ampersand first so the entity substitutions below are not double-escaped
+    return (
+        value.replace("&", "&amp;")
+        .replace("<", "&lt;")
+        .replace(">", "&gt;")
+        .replace('"', "&quot;")
+    )
+
+
+def _collect_duplicate_groups(
+    manifest_path: str | Path,
+    match: str | None = None,
+    topic_slug: str | None = None,
+) -> tuple[dict[str, list[dict[str, str]]], dict[str, list[BibEntry]]]:
+    manifest = json.loads(Path(manifest_path).read_text(encoding="utf-8"))
+    seed_sets = manifest.get("seed_sets", [])
+    match_text = match.casefold() if match else None
+    duplicate_groups: dict[str, list[dict[str, str]]] = {}
+    grouped_entries: dict[str, list[BibEntry]] = {}
+
+    for seed_set in seed_sets:
+        seed_bib = seed_set.get("seed_bib")
+        if not isinstance(seed_bib, str) or not seed_bib:
+            continue
+        current_topic_slug = str(seed_set.get("slug") or _slugify(str(seed_set.get("topic") or "")))
+        if topic_slug and current_topic_slug != topic_slug:
+            continue
+        path = Path(seed_bib)
+        if not path.exists():
+            continue
+        for entry in parse_bib_file(path):
+            duplicate_key = _duplicate_key(entry)
+            if not duplicate_key:
+                continue
+            item = {
+                "citation_key": entry.citation_key,
+                "title": entry.fields.get("title", ""),
+                "author": entry.fields.get("author", ""),
+                "year": entry.fields.get("year", ""),
+                "seed_bib": str(path),
+                "topic": str(seed_set.get("topic") or ""),
+                "topic_slug": current_topic_slug,
+            }
+            if match_text and not _duplicate_item_matches(item, duplicate_key, match_text):
+                continue
+            duplicate_groups.setdefault(duplicate_key, []).append(item)
+            grouped_entries.setdefault(duplicate_key, []).append(entry)
+
+    return duplicate_groups, grouped_entries
+
+
+def _duplicate_key(entry: BibEntry) -> str:
+    author = _normalize_duplicate_text(entry.fields.get("author", ""))
+    title = _normalize_duplicate_text(entry.fields.get("title", ""))
+    year = entry.fields.get("year", "").strip()
+    if not author or not title or not year:
+        return ""
+    first_author = author.split(" and ")[0]
+    return f"{first_author}|{year}|{title}"
+
+
+def _duplicate_item_matches(item: dict[str, str], duplicate_key: str, match_text: str) -> bool:
+    haystacks = (
+        duplicate_key,
+        item.get("citation_key", ""),
+        item.get("title", ""),
+        item.get("author", ""),
+        item.get("year", ""),
+        item.get("topic", ""),
+        item.get("topic_slug", ""),
+        item.get("seed_bib", ""),
+    )
+    return any(match_text in value.casefold() for value in haystacks if value)
+
+
+def _normalize_duplicate_text(value: str) -> str:
+    normalized = value.lower()
+    normalized = normalized.replace("&", " and ")
+    normalized = re.sub(r"[^a-z0-9\s]+", " ", normalized)
+    normalized = re.sub(r"\s+", " ", normalized).strip()
+    return normalized
+
+
+def _topic_phrase_tokens(value: str) -> list[str]:
+    return [
+        token
+        for token in _normalize_duplicate_text(value).split()
+        if len(token) >= 4 and token not in TOPIC_PHRASE_STOPWORDS
+    ]
+
+
+def _suggest_topic_keywords(entries: list[BibEntry], topic_name: str, max_keywords: int = 4) -> list[str]:
+    topic_terms = set(_topic_phrase_tokens(topic_name))
+    counts: Counter[str] = Counter()
+    for entry in entries:
+        for term in set(_topic_phrase_tokens(entry.fields.get("title", ""))):
+            if term in topic_terms:
+                continue
+            counts[term] += 1
+    ranked = sorted(counts.items(), key=lambda item: (-item[1], item[0]))
+    if len(entries) <= 1:
+        max_keywords = min(max_keywords, 1)
+    elif len(entries) <= 3:
+        max_keywords = min(max_keywords, 2)
+    filtered = [(term, count) for term, count in ranked if count >= 2]
+    selected = filtered if filtered 
else ranked[:max_keywords] + return [term for term, _ in selected[:max_keywords]] + + +def _topic_phrase_review_reasons(entries: list[BibEntry], keywords: list[str]) -> list[str]: + reasons: list[str] = [] + if len(entries) <= 1: + reasons.append("single_entry_topic") + elif len(entries) <= 3: + reasons.append("small_topic") + if not keywords: + reasons.append("no_keyword_signal") + elif len(keywords) == 1: + reasons.append("thin_keyword_signal") + if any(_looks_noisy_keyword(keyword) for keyword in keywords): + reasons.append("noisy_keywords") + return reasons + + +def _looks_noisy_keyword(keyword: str) -> bool: + if len(keyword) <= 3: + return True + if any(char.isdigit() for char in keyword): + return True + noisy_tokens = {"boundry", "colloquium", "edition", "history", "idea", "central", "bearing", "time"} + return keyword in noisy_tokens + + +def _select_canonical_entry(entries: list[BibEntry]) -> BibEntry: + return max( + entries, + key=lambda entry: ( + _entry_richness(entry), + -len(entry.citation_key), + entry.citation_key, + ), + ) + + +def _build_canonical_preview(entries: list[BibEntry]) -> BibEntry: + canonical = _select_canonical_entry(entries) + for duplicate in entries: + if duplicate.citation_key != canonical.citation_key: + canonical = merge_entries(canonical, duplicate) + return canonical + + +def _canonical_weaknesses(entry: BibEntry) -> list[str]: + reasons: list[str] = [] + if entry.entry_type == "misc": + reasons.append("entry_type:misc") + if not entry.fields.get("doi"): + reasons.append("missing:doi") + if _entry_richness(entry) < 6: + reasons.append("low_field_richness") + if entry.entry_type in {"article", "inproceedings", "incollection"} and not ( + entry.fields.get("journal") or entry.fields.get("booktitle") + ): + reasons.append("missing:venue") + return reasons + + +def _find_store_citation_key(store: BibliographyStore, entry: BibEntry) -> str | None: + if store.get_entry(entry.citation_key) is not None: + return entry.citation_key + + first_author = entry.fields.get("author", "").split(" and ")[0].strip() + row = store.connection.execute( + """ + SELECT e.citation_key + FROM entries e + LEFT JOIN entry_creators ec + ON ec.entry_id = e.id AND ec.role = 'author' AND ec.ordinal = 1 + LEFT JOIN creators c + ON c.id = ec.creator_id + WHERE COALESCE(e.title, '') = ? + AND COALESCE(e.year, '') = ? + AND COALESCE(c.full_name, '') = ? 
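-- deterministic pick when several stored rows match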
+ ORDER BY e.citation_key + LIMIT 1 + """, + ( + entry.fields.get("title", ""), + entry.fields.get("year", ""), + first_author, + ), + ).fetchone() + if row is None: + return None + return str(row["citation_key"]) + + +def _is_safe_enrichment_match(base: BibEntry, resolution: object) -> bool: + source_label = getattr(resolution, "source_label", "") + resolved_entry = getattr(resolution, "entry", None) + if not isinstance(source_label, str) or resolved_entry is None: + return False + if ":search:" not in source_label: + return True + + base_title = _normalize_duplicate_text(base.fields.get("title", "")) + resolved_title = _normalize_duplicate_text(resolved_entry.fields.get("title", "")) + if not base_title or base_title != resolved_title: + return False + + base_year = (base.fields.get("year") or "").strip() + resolved_year = (resolved_entry.fields.get("year") or "").strip() + if base_year and resolved_year and base_year == resolved_year: + return True + + base_author = _normalize_duplicate_text(base.fields.get("author", "")) + resolved_author = _normalize_duplicate_text(resolved_entry.fields.get("author", "")) + if not base_author or not resolved_author: + return False + base_first = base_author.split(" and ")[0].split()[0] + resolved_first = resolved_author.split(" and ")[0].split()[0] + return bool(base_first and resolved_first and base_first == resolved_first) + + +def _entry_richness(entry: BibEntry) -> int: + score = 0 + for field_name, value in entry.fields.items(): + if value: + score += 3 if field_name in {"doi", "url", "abstract", "publisher", "journal", "booktitle", "editor"} else 1 + return score + + +def _assign_canonical_key(entry: BibEntry, group_key: str, key_owners: dict[str, str]) -> BibEntry: + base_key = entry.citation_key + owner = key_owners.get(base_key) + if owner is None or owner == group_key: + key_owners[base_key] = group_key + return entry + + suffix = hashlib.sha1(group_key.encode("utf-8")).hexdigest()[:8] + candidate = f"{base_key}_{suffix}" + counter = 2 + while candidate in key_owners and key_owners[candidate] != group_key: + candidate = f"{base_key}_{suffix}_{counter}" + counter += 1 + key_owners[candidate] = group_key + return BibEntry(entry_type=entry.entry_type, citation_key=candidate, fields=dict(entry.fields)) + + +def _is_suspicious_entry_type(entry: BibEntry) -> bool: + journal = entry.fields.get("journal", "").lower() + publisher = entry.fields.get("publisher", "").lower() + howpublished = entry.fields.get("howpublished", "").lower() + if entry.entry_type == "article" and any( + token in journal + for token in ("elsevier", "springer", "press", "publications", "publisher", "university") + ): + return True + if entry.entry_type == "misc" and any( + token in howpublished + for token in ("journal", "review", "letters", "proceedings", "conference", "symposium") + ): + return True + if entry.entry_type == "book" and any( + token in publisher for token in ("journal", "review", "letters", "proceedings", "conference") + ) and not any( + token in publisher for token in ("press", "academic", "elsevier", "springer", "wiley", "university") + ): + return True + if entry.entry_type == "incollection" and not entry.fields.get("booktitle"): + return True + return False + + +class _TopicIndexParser(HTMLParser): + def __init__(self, base_url: str) -> None: + super().__init__() + self.base_url = base_url + self.base_prefix = base_url if base_url.endswith("/") else base_url + "/" + self.topic_links: list[dict[str, str]] = [] + self._current_href: str | None = None + 
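# buffered text of the anchor currently being read; emptied on every closing tag
+        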
self._current_text: list[str] = [] + self._seen_urls: set[str] = set() + + def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None: + if tag != "a": + return + href = dict(attrs).get("href") + if not href or href.startswith("#"): + return + self._current_href = urljoin(self.base_url, href) + self._current_text = [] + + def handle_data(self, data: str) -> None: + if self._current_href is not None: + self._current_text.append(data) + + def handle_endtag(self, tag: str) -> None: + if tag != "a" or self._current_href is None: + return + topic = WHITESPACE_PATTERN.sub(" ", "".join(self._current_text)).strip() + href = self._current_href + self._current_href = None + self._current_text = [] + if not topic or href in self._seen_urls: + return + parsed = urlparse(href) + base_parsed = urlparse(self.base_prefix) + if parsed.netloc and base_parsed.netloc and parsed.netloc != base_parsed.netloc: + return + if not href.startswith(self.base_prefix): + return + if href.rstrip("/").endswith("biblio") or href.endswith("origins.html"): + return + self._seen_urls.add(href) + self.topic_links.append({"topic": topic, "url": href}) + + +class _TopicPageParser(HTMLParser): + def __init__(self) -> None: + super().__init__() + self._bibliography_depth = 0 + self._in_pre = False + self._in_paragraph = False + self._current_paragraph: list[str] = [] + self._parts: list[str] = [] + + def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None: + attributes = dict(attrs) + if tag == "div" and "bibliography" in (attributes.get("class") or "").split(): + self._bibliography_depth += 1 + return + if tag == "pre": + self._in_pre = True + return + if self._bibliography_depth and tag == "p": + self._in_paragraph = True + self._current_paragraph = [] + + def handle_endtag(self, tag: str) -> None: + if tag == "div" and self._bibliography_depth: + self._bibliography_depth -= 1 + return + if tag == "p" and self._in_paragraph: + text = "".join(self._current_paragraph).strip() + if text: + self._parts.append(text) + self._parts.append("\n\n") + self._current_paragraph = [] + self._in_paragraph = False + return + if tag == "pre": + self._in_pre = False + self._parts.append("\n") + + def handle_data(self, data: str) -> None: + if self._bibliography_depth and self._in_paragraph: + self._current_paragraph.append(data) + elif self._in_pre: + self._parts.append(data) + + def preformatted_text(self) -> str: + return "".join(self._parts) diff --git a/tests/test_batch.py b/tests/test_batch.py new file mode 100644 index 0000000..9fe71b6 --- /dev/null +++ b/tests/test_batch.py @@ -0,0 +1,129 @@ +from pathlib import Path + +from citegeist.batch import BatchBootstrapRunner, load_batch_jobs +from citegeist.cli import main +from citegeist.storage import BibliographyStore + + +def test_load_batch_jobs_accepts_object_with_jobs(tmp_path: Path): + path = tmp_path / "jobs.json" + path.write_text( + """ +{ + "jobs": [ + {"name": "topic-only", "topic": "graph topic"}, + {"name": "seed-only", "seed_bib": "seed.bib"} + ] +} +""", + encoding="utf-8", + ) + + jobs = load_batch_jobs(path) + + assert jobs[0]["name"] == "topic-only" + assert jobs[1]["seed_bib"] == str((tmp_path / "seed.bib").resolve()) + + +def test_batch_runner_executes_multiple_jobs(tmp_path: Path): + seed_bib = tmp_path / "seed.bib" + seed_bib.write_text( + """ +@article{seed2024, + author = {Seed, Alice}, + title = {Seed Paper}, + year = {2024} +} +""", + encoding="utf-8", + ) + jobs = [ + {"name": "seed-job", "seed_bib": str(seed_bib), 
"expand": False}, + {"name": "topic-job", "topic": "graph topic", "expand": False, "preview": True}, + ] + + runner = BatchBootstrapRunner() + from citegeist import BibEntry + + runner.bootstrapper.resolver.search_openalex = lambda topic, limit=5: [ # type: ignore[method-assign] + BibEntry(entry_type="article", citation_key="topic2024graph", fields={"title": "Graph Topic Result"}) + ] + runner.bootstrapper.resolver.search_crossref = lambda topic, limit=5: [] # type: ignore[method-assign] + runner.bootstrapper.resolver.search_datacite = lambda topic, limit=5: [] # type: ignore[method-assign] + + store = BibliographyStore() + try: + results = runner.run(store, jobs) + assert [job.job_name for job in results] == ["seed-job", "topic-job"] + assert results[0].result_count == 1 + assert results[1].results[0].citation_key == "topic2024graph" + assert store.get_entry("seed2024") is not None + assert store.get_entry("topic2024graph") is None + finally: + store.close() + + +def test_batch_runner_can_store_topic_phrase_metadata(): + jobs = [ + { + "name": "topic-job", + "topic": "graph topic", + "topic_slug": "graph-methods", + "topic_name": "Graph Methods", + "topic_phrase": "graph networks biology", + "expand": False, + "preview": False, + } + ] + + runner = BatchBootstrapRunner() + from citegeist import BibEntry + + runner.bootstrapper.resolver.search_openalex = lambda topic, limit=5: [ # type: ignore[method-assign] + BibEntry(entry_type="article", citation_key="topic2024graph", fields={"title": "Graph Topic Result"}) + ] + runner.bootstrapper.resolver.search_crossref = lambda topic, limit=5: [] # type: ignore[method-assign] + runner.bootstrapper.resolver.search_datacite = lambda topic, limit=5: [] # type: ignore[method-assign] + + store = BibliographyStore() + try: + runner.run(store, jobs) + topic = store.get_topic("graph-methods") + assert topic is not None + assert topic["name"] == "Graph Methods" + assert topic["expansion_phrase"] == "graph networks biology" + finally: + store.close() + + +def test_bootstrap_batch_cli_runs_json_jobs(tmp_path: Path): + seed_bib = tmp_path / "seed.bib" + seed_bib.write_text( + """ +@article{seed2024, + author = {Seed, Alice}, + title = {Seed Paper}, + year = {2024} +} +""", + encoding="utf-8", + ) + batch_json = tmp_path / "jobs.json" + batch_json.write_text( + f""" +[ + {{"name": "seed-job", "seed_bib": "{seed_bib}", "expand": false}}, + {{"name": "topic-job", "topic": "graph topic", "expand": false, "preview": true}} +] +""", + encoding="utf-8", + ) + + from unittest.mock import patch + + database = tmp_path / "library.sqlite3" + with patch("citegeist.cli.BatchBootstrapRunner.run") as mocked_run: + mocked_run.return_value = [] + exit_code = main(["--db", str(database), "bootstrap-batch", str(batch_json)]) + + assert exit_code == 0 diff --git a/tests/test_bootstrap.py b/tests/test_bootstrap.py new file mode 100644 index 0000000..728d8ac --- /dev/null +++ b/tests/test_bootstrap.py @@ -0,0 +1,175 @@ +from citegeist import BibliographyStore +from citegeist.bootstrap import Bootstrapper +from citegeist.cli import main + + +def test_bootstrap_from_seed_bib_only(): + store = BibliographyStore() + try: + bootstrapper = Bootstrapper() + bootstrapper.crossref_expander.expand_entry_references = lambda _store, _key: [] # type: ignore[method-assign] + bootstrapper.openalex_expander.expand_entry = lambda _store, _key, relation_type="cites", limit=5: [] # type: ignore[method-assign] + + results = bootstrapper.bootstrap( + store, + seed_bibtex=""" +@article{seed2024, + 
author = {Seed, Alice}, + title = {Seed Paper}, + year = {2024} +} +""", + expand=False, + ) + + assert [item.citation_key for item in results] == ["seed2024"] + assert store.get_entry("seed2024") is not None + finally: + store.close() + + +def test_bootstrap_from_topic_only(): + store = BibliographyStore() + try: + bootstrapper = Bootstrapper() + bootstrapper.resolver.search_openalex = lambda topic, limit=5: [] # type: ignore[method-assign] + bootstrapper.resolver.search_crossref = lambda topic, limit=5: [] # type: ignore[method-assign] + bootstrapper.resolver.search_datacite = lambda topic, limit=5: [ # type: ignore[method-assign] + __import__("citegeist").BibEntry( + entry_type="article", + citation_key="topic2024graph", + fields={"title": "Graph Topic Result", "year": "2024"}, + ) + ] + bootstrapper.crossref_expander.expand_entry_references = lambda _store, _key: [] # type: ignore[method-assign] + bootstrapper.openalex_expander.expand_entry = lambda _store, _key, relation_type="cites", limit=5: [] # type: ignore[method-assign] + + results = bootstrapper.bootstrap(store, topic="graph topic", expand=False) + + assert [item.citation_key for item in results] == ["topic2024graph"] + assert store.get_entry("topic2024graph") is not None + assert results[0].score > 0 + finally: + store.close() + + +def test_bootstrap_cli_accepts_seed_and_topic(tmp_path): + seed_bib = tmp_path / "seed.bib" + seed_bib.write_text( + """ +@article{seed2024, + author = {Seed, Alice}, + title = {Seed Paper}, + year = {2024} +} +""", + encoding="utf-8", + ) + + from unittest.mock import patch + + database = tmp_path / "library.sqlite3" + with patch("citegeist.cli.Bootstrapper.bootstrap") as mocked_bootstrap: + mocked_bootstrap.return_value = [] + exit_code = main( + [ + "--db", + str(database), + "bootstrap", + "--seed-bib", + str(seed_bib), + "--topic", + "graph topic", + "--no-expand", + ] + ) + + assert exit_code == 0 + + +def test_bootstrap_ranks_and_deduplicates_topic_candidates(): + store = BibliographyStore() + try: + bootstrapper = Bootstrapper() + from citegeist import BibEntry + + bootstrapper.resolver.search_openalex = lambda topic, limit=5: [ # type: ignore[method-assign] + BibEntry( + entry_type="article", + citation_key="shared2024graph", + fields={"title": "Graph Topic Ranking", "abstract": "graph topic graph"}, + ) + ] + bootstrapper.resolver.search_crossref = lambda topic, limit=5: [ # type: ignore[method-assign] + BibEntry( + entry_type="article", + citation_key="shared2024graph", + fields={"title": "Graph Topic Ranking", "abstract": "graph"}, + ), + BibEntry( + entry_type="article", + citation_key="crossref2024other", + fields={"title": "Less relevant paper"}, + ), + ] + bootstrapper.resolver.search_datacite = lambda topic, limit=5: [] # type: ignore[method-assign] + bootstrapper.crossref_expander.expand_entry_references = lambda _store, _key: [] # type: ignore[method-assign] + bootstrapper.openalex_expander.expand_entry = lambda _store, _key, relation_type="cites", limit=5: [] # type: ignore[method-assign] + + results = bootstrapper.bootstrap(store, topic="graph topic", expand=False, topic_limit=5) + + topic_results = [item for item in results if item.origin == "topic"] + assert [item.citation_key for item in topic_results] == ["shared2024graph", "crossref2024other"] + assert topic_results[0].score > topic_results[1].score + finally: + store.close() + + +def test_bootstrap_preview_does_not_write_to_database(): + store = BibliographyStore() + try: + bootstrapper = Bootstrapper() + from citegeist 
import BibEntry + + bootstrapper.resolver.search_openalex = lambda topic, limit=5: [ # type: ignore[method-assign] + BibEntry(entry_type="article", citation_key="preview2024graph", fields={"title": "Preview Graph Topic"}) + ] + bootstrapper.resolver.search_crossref = lambda topic, limit=5: [] # type: ignore[method-assign] + bootstrapper.resolver.search_datacite = lambda topic, limit=5: [] # type: ignore[method-assign] + + results = bootstrapper.bootstrap(store, topic="graph topic", expand=False, preview_only=True) + + assert [item.citation_key for item in results] == ["preview2024graph"] + assert store.get_entry("preview2024graph") is None + finally: + store.close() + + +def test_bootstrap_topic_commit_limit_restricts_persisted_candidates(): + store = BibliographyStore() + try: + bootstrapper = Bootstrapper() + from citegeist import BibEntry + + bootstrapper.resolver.search_openalex = lambda topic, limit=5: [ # type: ignore[method-assign] + BibEntry(entry_type="article", citation_key="rank1", fields={"title": "Graph Topic One"}), + BibEntry(entry_type="article", citation_key="rank2", fields={"title": "Graph Topic Two"}), + ] + bootstrapper.resolver.search_crossref = lambda topic, limit=5: [] # type: ignore[method-assign] + bootstrapper.resolver.search_datacite = lambda topic, limit=5: [] # type: ignore[method-assign] + bootstrapper.crossref_expander.expand_entry_references = lambda _store, _key: [] # type: ignore[method-assign] + bootstrapper.openalex_expander.expand_entry = lambda _store, _key, relation_type="cites", limit=5: [] # type: ignore[method-assign] + + results = bootstrapper.bootstrap( + store, + topic="graph topic", + expand=False, + topic_limit=5, + topic_commit_limit=1, + ) + + assert [item.citation_key for item in results if item.origin == "topic"] == ["rank1"] + assert store.get_entry("rank1") is not None + assert store.get_entry("rank2") is None + finally: + store.close() diff --git a/tests/test_cli.py b/tests/test_cli.py index 4fed32c..7ab29c9 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -119,7 +119,7 @@ def test_cli_resolve_updates_entry(tmp_path: Path): citation_key="resolvedkey", fields={ "author": "Smith, Jane", - "title": "Graph-first bibliography augmentation", + "title": "Resolved Graph-first bibliography augmentation", "year": "2024", "doi": "10.1000/example-doi", "journal": "Journal of Graph Studies", @@ -138,6 +138,803 @@ def test_cli_resolve_updates_entry(tmp_path: Path): ) assert exit_code == 0 + show = run_cli(tmp_path, "show", "--conflicts", "smith2024graphs") + assert show.returncode == 0 + payload = json.loads(show.stdout) + assert payload["field_conflicts"][0]["field_name"] == "title" + + +def test_cli_resolve_conflicts_updates_status(tmp_path: Path): + bib_path = tmp_path / "input.bib" + bib_path.write_text( + """ +@article{smith2024graphs, + author = {Smith, Jane}, + title = {Graph-first bibliography augmentation}, + year = {2024} +} +""", + encoding="utf-8", + ) + ingest = run_cli(tmp_path, "ingest", str(bib_path)) + assert ingest.returncode == 0 + + from citegeist.storage import BibliographyStore + + database = tmp_path / "library.sqlite3" + store = BibliographyStore(database) + try: + store.record_conflicts( + "smith2024graphs", + [ + { + "field_name": "title", + "current_value": "Graph-first bibliography augmentation", + "proposed_value": "Resolved title", + } + ], + source_type="resolver", + source_label="openalex:search:Graph-first bibliography augmentation", + ) + finally: + store.close() + + result = run_cli(tmp_path, 
"resolve-conflicts", "smith2024graphs", "title", "accepted") + assert result.returncode == 0 + assert "accepted" in result.stdout + + +def test_cli_apply_conflict_updates_entry_value(tmp_path: Path): + bib_path = tmp_path / "input.bib" + bib_path.write_text( + """ +@article{smith2024graphs, + author = {Smith, Jane}, + title = {Graph-first bibliography augmentation}, + year = {2024} +} +""", + encoding="utf-8", + ) + ingest = run_cli(tmp_path, "ingest", str(bib_path)) + assert ingest.returncode == 0 + + from citegeist.storage import BibliographyStore + + database = tmp_path / "library.sqlite3" + store = BibliographyStore(database) + try: + store.record_conflicts( + "smith2024graphs", + [ + { + "field_name": "title", + "current_value": "Graph-first bibliography augmentation", + "proposed_value": "Resolved Graph-first bibliography augmentation", + } + ], + source_type="resolver", + source_label="openalex:search:Graph-first bibliography augmentation", + ) + finally: + store.close() + + result = run_cli(tmp_path, "apply-conflict", "smith2024graphs", "title") + assert result.returncode == 0 + assert "applied" in result.stdout + + show = run_cli(tmp_path, "show", "smith2024graphs") + payload = json.loads(show.stdout) + assert payload["title"] == "Resolved Graph-first bibliography augmentation" + + +def test_cli_discover_oai_outputs_identity_and_sets(): + from unittest.mock import patch + from citegeist.harvest import OaiMetadataFormat, OaiSet + + with patch("citegeist.cli.OaiPmhHarvester.identify") as mocked_identify, patch( + "citegeist.cli.OaiPmhHarvester.list_sets" + ) as mocked_sets, patch("citegeist.cli.OaiPmhHarvester.list_metadata_formats") as mocked_formats: + mocked_identify.return_value = { + "repositoryName": "Example Repository", + "granularity": "YYYY-MM-DD", + } + mocked_formats.return_value = [ + OaiMetadataFormat( + metadata_prefix="oai_dc", + schema="http://www.openarchives.org/OAI/2.0/oai_dc.xsd", + metadata_namespace="http://www.openarchives.org/OAI/2.0/oai_dc/", + ) + ] + mocked_sets.return_value = [ + OaiSet(set_spec="theses", set_name="Theses", set_description="Graduate theses") + ] + exit_code = main(["discover-oai", "https://example.edu/oai"]) + + assert exit_code == 0 + + +def test_cli_bootstrap_preview_mode(tmp_path): + from unittest.mock import patch + + database = tmp_path / "library.sqlite3" + with patch("citegeist.cli.Bootstrapper.bootstrap") as mocked_bootstrap: + mocked_bootstrap.return_value = [] + exit_code = main( + [ + "--db", + str(database), + "bootstrap", + "--topic", + "graph topic", + "--preview", + "--topic-commit-limit", + "2", + ] + ) + + assert exit_code == 0 + _, kwargs = mocked_bootstrap.call_args + assert kwargs["preview_only"] is True + assert kwargs["topic_commit_limit"] == 2 + + +def test_cli_bootstrap_accepts_stored_topic_metadata(tmp_path): + from unittest.mock import patch + + database = tmp_path / "library.sqlite3" + with patch("citegeist.cli.Bootstrapper.bootstrap") as mocked_bootstrap: + mocked_bootstrap.return_value = [] + exit_code = main( + [ + "--db", + str(database), + "bootstrap", + "--topic", + "graph topic", + "--topic-slug", + "graph-methods", + "--topic-name", + "Graph Methods", + "--store-topic-phrase", + "graph networks biology", + ] + ) + + assert exit_code == 0 + _, kwargs = mocked_bootstrap.call_args + assert kwargs["topic_slug"] == "graph-methods" + assert kwargs["topic_name"] == "Graph Methods" + assert kwargs["topic_phrase"] == "graph networks biology" + + +def test_cli_scrape_talkorigins_accepts_output_dir(tmp_path): + 
from unittest.mock import patch + + database = tmp_path / "library.sqlite3" + with patch("citegeist.cli.TalkOriginsScraper.scrape_to_directory") as mocked_scrape: + mocked_scrape.return_value = __import__("citegeist").TalkOriginsBatchExport( + base_url="https://www.talkorigins.org/origins/biblio/", + output_dir=str(tmp_path), + topic_count=1, + entry_count=2, + jobs_path=str(tmp_path / "jobs.json"), + manifest_path=str(tmp_path / "manifest.json"), + seed_sets=[], + ) + exit_code = main( + [ + "--db", + str(database), + "scrape-talkorigins", + str(tmp_path / "talkorigins-out"), + "--limit-topics", + "3", + "--limit-entries-per-topic", + "10", + "--no-resume", + "--no-expand", + ] + ) + + assert exit_code == 0 + + +def test_cli_validate_talkorigins_accepts_manifest(tmp_path): + from unittest.mock import patch + + manifest = tmp_path / "talkorigins_manifest.json" + manifest.write_text("{}", encoding="utf-8") + with patch("citegeist.cli.TalkOriginsScraper.validate_export") as mocked_validate: + mocked_validate.return_value = __import__("citegeist").TalkOriginsValidationReport( + manifest_path=str(manifest), + topic_count=1, + entry_count=2, + parsed_ratio=1.0, + missing_author_count=0, + missing_title_count=0, + missing_year_count=0, + suspicious_entry_type_count=0, + suspicious_examples=[], + duplicate_cluster_count=0, + duplicate_entry_count=0, + duplicate_examples=[], + ) + exit_code = main(["validate-talkorigins", str(manifest)]) + + assert exit_code == 0 + + +def test_cli_suggest_talkorigins_phrases_writes_output(tmp_path): + from unittest.mock import patch + + manifest = tmp_path / "talkorigins_manifest.json" + manifest.write_text("{}", encoding="utf-8") + output = tmp_path / "phrases.json" + with patch("citegeist.cli.TalkOriginsScraper.suggest_topic_phrases") as mocked_suggest: + mocked_suggest.return_value = [ + __import__("citegeist", fromlist=["TalkOriginsTopicPhraseSuggestion"]).TalkOriginsTopicPhraseSuggestion( + slug="abiogenesis", + topic="Abiogenesis", + entry_count=2, + suggested_phrase="Abiogenesis prebiotic chemistry ribozyme", + keywords=["prebiotic", "chemistry", "ribozyme"], + review_required=True, + review_reasons=["small_topic"], + ) + ] + exit_code = main( + [ + "suggest-talkorigins-phrases", + str(manifest), + "--topic", + "abiogenesis", + "--output", + str(output), + ] + ) + + assert exit_code == 0 + payload = json.loads(output.read_text(encoding="utf-8")) + assert payload[0]["slug"] == "abiogenesis" + + +def test_cli_duplicates_talkorigins_accepts_manifest(tmp_path): + from unittest.mock import patch + + manifest = tmp_path / "talkorigins_manifest.json" + manifest.write_text("{}", encoding="utf-8") + with patch("citegeist.cli.TalkOriginsScraper.inspect_duplicate_clusters") as mocked_duplicates: + mocked_duplicates.return_value = [ + __import__("citegeist.talkorigins", fromlist=["TalkOriginsDuplicateCluster"]).TalkOriginsDuplicateCluster( + key="smith|1999|duplicate paper", + count=2, + items=[ + { + "citation_key": "dup1", + "title": "Duplicate Paper", + "author": "Smith, Jane", + "year": "1999", + "seed_bib": "a.bib", + "topic": "Abiogenesis", + "topic_slug": "abiogenesis", + } + ], + canonical={ + "citation_key": "dup1", + "entry_type": "article", + "field_count": 3, + "fields": {"title": "Duplicate Paper"}, + "weak_reasons": [], + }, + ) + ] + exit_code = main( + [ + "duplicates-talkorigins", + str(manifest), + "--topic", + "abiogenesis", + "--match", + "duplicate", + "--preview", + "--weak-only", + ] + ) + + assert exit_code == 0 + + +def 
test_cli_ingest_talkorigins_accepts_manifest(tmp_path): + from unittest.mock import patch + + database = tmp_path / "library.sqlite3" + manifest = tmp_path / "talkorigins_manifest.json" + manifest.write_text("{}", encoding="utf-8") + with patch("citegeist.cli.TalkOriginsScraper.ingest_export") as mocked_ingest: + mocked_ingest.return_value = __import__("citegeist").TalkOriginsIngestReport( + manifest_path=str(manifest), + topic_count=1, + raw_entry_count=2, + stored_entry_count=1, + duplicate_cluster_count=1, + duplicate_entry_count=2, + canonicalized_count=1, + ) + exit_code = main(["--db", str(database), "ingest-talkorigins", str(manifest)]) + + assert exit_code == 0 + + +def test_cli_enrich_talkorigins_accepts_manifest(tmp_path): + from unittest.mock import patch + + database = tmp_path / "library.sqlite3" + manifest = tmp_path / "talkorigins_manifest.json" + manifest.write_text("{}", encoding="utf-8") + with patch("citegeist.cli.TalkOriginsScraper.enrich_weak_canonicals") as mocked_enrich: + mocked_enrich.return_value = [ + __import__("citegeist.talkorigins", fromlist=["TalkOriginsEnrichmentResult"]).TalkOriginsEnrichmentResult( + key="smith|1999|duplicate paper", + citation_key="dup1", + weak_reasons_before=["missing:doi"], + resolved=True, + applied=False, + source_label="crossref:search:Duplicate Paper", + weak_reasons_after=[], + conflicts=[], + error="", + ) + ] + exit_code = main( + [ + "--db", + str(database), + "enrich-talkorigins", + str(manifest), + "--limit", + "5", + "--apply", + "--allow-unsafe-search-matches", + ] + ) + + assert exit_code == 0 + + +def test_cli_review_talkorigins_writes_output(tmp_path): + from unittest.mock import patch + + database = tmp_path / "library.sqlite3" + manifest = tmp_path / "talkorigins_manifest.json" + manifest.write_text("{}", encoding="utf-8") + output = tmp_path / "review.json" + with patch("citegeist.cli.TalkOriginsScraper.build_review_export") as mocked_review: + mocked_review.return_value = __import__("citegeist.talkorigins", fromlist=["TalkOriginsReviewExport"]).TalkOriginsReviewExport( + manifest_path=str(manifest), + item_count=1, + items=[{"key": "smith|1999|duplicate paper", "canonical": {}, "enrichment": {}}], + ) + exit_code = main( + [ + "--db", + str(database), + "review-talkorigins", + str(manifest), + "--output", + str(output), + ] + ) + + assert exit_code == 0 + assert output.exists() + + +def test_cli_apply_talkorigins_corrections_accepts_files(tmp_path): + from unittest.mock import patch + + database = tmp_path / "library.sqlite3" + manifest = tmp_path / "talkorigins_manifest.json" + corrections = tmp_path / "corrections.json" + manifest.write_text("{}", encoding="utf-8") + corrections.write_text('{"corrections": []}', encoding="utf-8") + with patch("citegeist.cli.TalkOriginsScraper.apply_review_corrections") as mocked_apply: + mocked_apply.return_value = [ + __import__("citegeist.talkorigins", fromlist=["TalkOriginsCorrectionResult"]).TalkOriginsCorrectionResult( + key="smith|1999|duplicate paper", + citation_key="dup1", + applied=True, + error="", + ) + ] + exit_code = main( + [ + "--db", + str(database), + "apply-talkorigins-corrections", + str(manifest), + str(corrections), + ] + ) + + assert exit_code == 0 + + +def test_cli_topics_and_topic_entries(tmp_path: Path): + bib_path = tmp_path / "input.bib" + bib_path.write_text( + """ +@article{seed2024, + author = {Seed, Alice}, + title = {Seed Paper}, + year = {2024} +} +""", + encoding="utf-8", + ) + ingest = run_cli(tmp_path, "ingest", str(bib_path)) + assert 
ingest.returncode == 0 + + from citegeist.storage import BibliographyStore + + database = tmp_path / "library.sqlite3" + store = BibliographyStore(database) + try: + store.add_entry_topic( + "seed2024", + topic_slug="graph-methods", + topic_name="Graph Methods", + source_type="talkorigins", + source_url="https://example.org/topics/graph-methods", + source_label="topic-seed", + ) + store.connection.commit() + finally: + store.close() + + topics = run_cli(tmp_path, "topics") + assert topics.returncode == 0 + topics_payload = json.loads(topics.stdout) + assert topics_payload[0]["slug"] == "graph-methods" + + topic_entries = run_cli(tmp_path, "topic-entries", "graph-methods") + assert topic_entries.returncode == 0 + topic_payload = json.loads(topic_entries.stdout) + assert topic_payload["topic"]["slug"] == "graph-methods" + assert topic_payload["entries"][0]["citation_key"] == "seed2024" + + +def test_cli_can_set_topic_phrase(tmp_path: Path): + bib_path = tmp_path / "input.bib" + bib_path.write_text( + """ +@article{seed2024, + author = {Seed, Alice}, + title = {Seed Paper}, + year = {2024} +} +""", + encoding="utf-8", + ) + ingest = run_cli(tmp_path, "ingest", str(bib_path)) + assert ingest.returncode == 0 + + from citegeist.storage import BibliographyStore + + database = tmp_path / "library.sqlite3" + store = BibliographyStore(database) + try: + store.add_entry_topic( + "seed2024", + topic_slug="graph-methods", + topic_name="Graph Methods", + source_type="talkorigins", + source_url="https://example.org/topics/graph-methods", + source_label="topic-seed", + ) + store.connection.commit() + finally: + store.close() + + result = run_cli(tmp_path, "set-topic-phrase", "graph-methods", "graph networks biology") + assert result.returncode == 0 + payload = json.loads(result.stdout) + assert payload["expansion_phrase"] == "graph networks biology" + + +def test_cli_can_apply_topic_phrases_from_json(tmp_path: Path): + bib_path = tmp_path / "input.bib" + bib_path.write_text( + """ +@article{seed2024, + author = {Seed, Alice}, + title = {Seed Paper}, + year = {2024} +} +""", + encoding="utf-8", + ) + ingest = run_cli(tmp_path, "ingest", str(bib_path)) + assert ingest.returncode == 0 + + from citegeist.storage import BibliographyStore + + database = tmp_path / "library.sqlite3" + store = BibliographyStore(database) + try: + store.add_entry_topic( + "seed2024", + topic_slug="graph-methods", + topic_name="Graph Methods", + source_type="talkorigins", + source_url="https://example.org/topics/graph-methods", + source_label="topic-seed", + ) + store.connection.commit() + finally: + store.close() + + phrases_path = tmp_path / "phrases.json" + phrases_path.write_text( + json.dumps( + [ + { + "slug": "graph-methods", + "suggested_phrase": "graph networks biology", + } + ] + ), + encoding="utf-8", + ) + + result = run_cli(tmp_path, "apply-topic-phrases", str(phrases_path)) + assert result.returncode == 0 + payload = json.loads(result.stdout) + assert payload[0]["applied"] is True + + check = run_cli(tmp_path, "topics") + topics_payload = json.loads(check.stdout) + assert topics_payload[0]["expansion_phrase"] == "graph networks biology" + + +def test_cli_can_stage_topic_phrases_from_json(tmp_path: Path): + bib_path = tmp_path / "input.bib" + bib_path.write_text( + """ +@article{seed2024, + author = {Seed, Alice}, + title = {Seed Paper}, + year = {2024} +} +""", + encoding="utf-8", + ) + ingest = run_cli(tmp_path, "ingest", str(bib_path)) + assert ingest.returncode == 0 + + from citegeist.storage import 
BibliographyStore + + database = tmp_path / "library.sqlite3" + store = BibliographyStore(database) + try: + store.add_entry_topic( + "seed2024", + topic_slug="graph-methods", + topic_name="Graph Methods", + source_type="talkorigins", + source_url="https://example.org/topics/graph-methods", + source_label="topic-seed", + ) + store.connection.commit() + finally: + store.close() + + phrases_path = tmp_path / "phrases.json" + phrases_path.write_text( + json.dumps( + [ + { + "slug": "graph-methods", + "suggested_phrase": "graph networks biology", + } + ] + ), + encoding="utf-8", + ) + + result = run_cli(tmp_path, "stage-topic-phrases", str(phrases_path)) + assert result.returncode == 0 + payload = json.loads(result.stdout) + assert payload[0]["staged"] is True + assert payload[0]["phrase_review_status"] == "pending" + + check = run_cli(tmp_path, "topics") + topics_payload = json.loads(check.stdout) + assert topics_payload[0]["suggested_phrase"] == "graph networks biology" + assert topics_payload[0]["expansion_phrase"] is None + assert topics_payload[0]["phrase_review_status"] == "pending" + + +def test_cli_can_review_topic_phrase(tmp_path: Path): + bib_path = tmp_path / "input.bib" + bib_path.write_text( + """ +@article{seed2024, + author = {Seed, Alice}, + title = {Seed Paper}, + year = {2024} +} +""", + encoding="utf-8", + ) + ingest = run_cli(tmp_path, "ingest", str(bib_path)) + assert ingest.returncode == 0 + + from citegeist.storage import BibliographyStore + + database = tmp_path / "library.sqlite3" + store = BibliographyStore(database) + try: + store.add_entry_topic( + "seed2024", + topic_slug="graph-methods", + topic_name="Graph Methods", + source_type="talkorigins", + source_url="https://example.org/topics/graph-methods", + source_label="topic-seed", + ) + store.stage_topic_phrase_suggestion("graph-methods", "graph networks biology") + finally: + store.close() + + result = run_cli( + tmp_path, + "review-topic-phrase", + "graph-methods", + "accepted", + "--notes", + "curated and approved", + ) + assert result.returncode == 0 + payload = json.loads(result.stdout) + assert payload["suggested_phrase"] == "graph networks biology" + assert payload["expansion_phrase"] == "graph networks biology" + assert payload["phrase_review_status"] == "accepted" + assert payload["phrase_review_notes"] == "curated and approved" + + +def test_cli_topics_can_filter_by_phrase_review_status(tmp_path: Path): + bib_path = tmp_path / "input.bib" + bib_path.write_text( + """ +@article{seed2024, + author = {Seed, Alice}, + title = {Seed Paper}, + year = {2024} +} +""", + encoding="utf-8", + ) + ingest = run_cli(tmp_path, "ingest", str(bib_path)) + assert ingest.returncode == 0 + + from citegeist.storage import BibliographyStore + + database = tmp_path / "library.sqlite3" + store = BibliographyStore(database) + try: + store.add_entry_topic( + "seed2024", + topic_slug="graph-methods", + topic_name="Graph Methods", + source_type="talkorigins", + source_url="https://example.org/topics/graph-methods", + source_label="topic-seed", + ) + store.ensure_topic("abiogenesis", "Abiogenesis") + store.stage_topic_phrase_suggestion("graph-methods", "graph networks biology") + store.stage_topic_phrase_suggestion("abiogenesis", "abiogenesis life origin") + store.review_topic_phrase_suggestion("abiogenesis", "accepted") + finally: + store.close() + + result = run_cli(tmp_path, "topics", "--phrase-review-status", "pending") + assert result.returncode == 0 + payload = json.loads(result.stdout) + assert [topic["slug"] for topic in 
payload] == ["graph-methods"] + + +def test_cli_export_topic(tmp_path: Path): + bib_path = tmp_path / "input.bib" + bib_path.write_text( + """ +@article{seed2024, + author = {Seed, Alice}, + title = {Seed Paper}, + year = {2024} +} +""", + encoding="utf-8", + ) + ingest = run_cli(tmp_path, "ingest", str(bib_path)) + assert ingest.returncode == 0 + + from citegeist.storage import BibliographyStore + + database = tmp_path / "library.sqlite3" + store = BibliographyStore(database) + try: + store.add_entry_topic( + "seed2024", + topic_slug="graph-methods", + topic_name="Graph Methods", + source_type="talkorigins", + source_url="https://example.org/topics/graph-methods", + source_label="topic-seed", + ) + store.connection.commit() + finally: + store.close() + + export_path = tmp_path / "graph-methods.bib" + result = run_cli(tmp_path, "export-topic", "graph-methods", "--output", str(export_path)) + assert result.returncode == 0 + exported = export_path.read_text(encoding="utf-8") + assert "@article{seed2024," in exported + + +def test_cli_search_can_filter_by_topic(tmp_path: Path): + bib_path = tmp_path / "input.bib" + bib_path.write_text( + """ +@article{seed2024, + author = {Seed, Alice}, + title = {Graph Methods for Biology}, + year = {2024}, + abstract = {A graph methods paper.} +} + +@article{other2023, + author = {Other, Bob}, + title = {Graph Methods for Chemistry}, + year = {2023}, + abstract = {Another graph methods paper.} +} +""", + encoding="utf-8", + ) + ingest = run_cli(tmp_path, "ingest", str(bib_path)) + assert ingest.returncode == 0 + + from citegeist.storage import BibliographyStore + + database = tmp_path / "library.sqlite3" + store = BibliographyStore(database) + try: + store.add_entry_topic( + "seed2024", + topic_slug="biology", + topic_name="Biology", + source_type="talkorigins", + source_url="https://example.org/topics/biology", + source_label="topic-seed", + ) + store.add_entry_topic( + "other2023", + topic_slug="chemistry", + topic_name="Chemistry", + source_type="talkorigins", + source_url="https://example.org/topics/chemistry", + source_label="topic-seed", + ) + store.connection.commit() + finally: + store.close() + + search = run_cli(tmp_path, "search", "graph", "--topic", "biology") + assert search.returncode == 0 + assert "seed2024" in search.stdout + assert "other2023" not in search.stdout def test_cli_graph_outputs_missing_targets(tmp_path: Path): @@ -239,3 +1036,43 @@ def test_cli_expand_with_mocked_openalex(tmp_path: Path): ) assert exit_code == 0 + + +def test_cli_expand_topic_with_mocked_expander(tmp_path: Path): + from citegeist.expand import TopicExpansionResult + + with patch("citegeist.cli.TopicExpander.expand_topic") as mocked_expand: + mocked_expand.return_value = [ + TopicExpansionResult( + topic_slug="abiogenesis", + source_citation_key="seed2024", + discovered_citation_key="discovered1", + discovered_title="Abiogenesis origin chemistry", + created_entry=True, + relation_type="cites", + source_label="openalex:cites:seed2024", + relevance_score=0.67, + meets_relevance_threshold=True, + assigned_to_topic=True, + ) + ] + database = tmp_path / "library.sqlite3" + exit_code = main( + [ + "--db", + str(database), + "expand-topic", + "abiogenesis", + "--topic-phrase", + "abiogenesis origin chemistry", + "--seed-key", + "seed2024", + "--min-relevance", + "0.3", + "--preview", + ] + ) + + assert exit_code == 0 + _, kwargs = mocked_expand.call_args + assert kwargs["preview_only"] is True diff --git a/tests/test_harvest.py b/tests/test_harvest.py new file mode 
100644
index 0000000..49da298
--- /dev/null
+++ b/tests/test_harvest.py
@@ -0,0 +1,293 @@
+from citegeist import OaiPmhHarvester, parse_bibtex
+from citegeist.cli import main
+
+
+OAI_XML = """
+<OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/">
+  <ListRecords>
+    <record>
+      <header>
+        <identifier>oai:example.edu:123</identifier>
+      </header>
+      <metadata>
+        <oai_dc:dc xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/"
+                   xmlns:dc="http://purl.org/dc/elements/1.1/">
+          <dc:title>Thesis Metadata Harvesting</dc:title>
+          <dc:creator>Doe, Jane</dc:creator>
+          <dc:date>2023-05-01</dc:date>
+          <dc:description>A dissertation about repository harvesting.</dc:description>
+          <dc:identifier>https://example.edu/items/123</dc:identifier>
+          <dc:publisher>Example University</dc:publisher>
+          <dc:type>Text</dc:type>
+          <dc:type>Dissertation</dc:type>
+        </oai_dc:dc>
+      </metadata>
+    </record>
+  </ListRecords>
+</OAI-PMH>
+"""
+
+OAI_XML_PAGE_1 = """
+<OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/">
+  <ListRecords>
+    <record>
+      <header>
+        <identifier>oai:example.edu:123</identifier>
+      </header>
+      <metadata>
+        <oai_dc:dc xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/"
+                   xmlns:dc="http://purl.org/dc/elements/1.1/">
+          <dc:title>First Harvested Thesis</dc:title>
+          <dc:creator>Doe, Jane</dc:creator>
+          <dc:date>2023-05-01</dc:date>
+          <dc:type>Dissertation</dc:type>
+        </oai_dc:dc>
+      </metadata>
+    </record>
+    <resumptionToken>TOKEN123</resumptionToken>
+  </ListRecords>
+</OAI-PMH>
+"""
+
+OAI_XML_PAGE_2 = """
+<OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/">
+  <ListRecords>
+    <record>
+      <header>
+        <identifier>oai:example.edu:456</identifier>
+      </header>
+      <metadata>
+        <oai_dc:dc xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/"
+                   xmlns:dc="http://purl.org/dc/elements/1.1/">
+          <dc:title>Second Harvested Thesis</dc:title>
+          <dc:creator>Smith, John</dc:creator>
+          <dc:date>2022-05-01</dc:date>
+          <dc:type>Dissertation</dc:type>
+        </oai_dc:dc>
+      </metadata>
+    </record>
+  </ListRecords>
+</OAI-PMH>
+"""
+
+OAI_IDENTIFY_XML = """
+<OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/">
+  <Identify>
+    <repositoryName>Example Repository</repositoryName>
+    <baseURL>https://example.edu/oai</baseURL>
+    <protocolVersion>2.0</protocolVersion>
+    <adminEmail>repo@example.edu</adminEmail>
+    <earliestDatestamp>2001-01-01</earliestDatestamp>
+    <deletedRecord>persistent</deletedRecord>
+    <granularity>YYYY-MM-DD</granularity>
+  </Identify>
+</OAI-PMH>
+"""
+
+OAI_LISTSETS_XML = """
+<OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/">
+  <ListSets>
+    <set>
+      <setSpec>theses</setSpec>
+      <setName>Theses and Dissertations</setName>
+      <setDescription>
+        <oai_dc:dc xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/"
+                   xmlns:dc="http://purl.org/dc/elements/1.1/">
+          <dc:description>This set contains graduate theses.</dc:description>
+        </oai_dc:dc>
+      </setDescription>
+    </set>
+  </ListSets>
+</OAI-PMH>
+"""
+
+OAI_METADATA_FORMATS_XML = """
+<OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/">
+  <ListMetadataFormats>
+    <metadataFormat>
+      <metadataPrefix>oai_dc</metadataPrefix>
+      <schema>http://www.openarchives.org/OAI/2.0/oai_dc.xsd</schema>
+      <metadataNamespace>http://www.openarchives.org/OAI/2.0/oai_dc/</metadataNamespace>
+    </metadataFormat>
+    <metadataFormat>
+      <metadataPrefix>mods</metadataPrefix>
+      <schema>http://www.loc.gov/standards/mods/v3/mods-3-7.xsd</schema>
+      <metadataNamespace>http://www.loc.gov/mods/v3</metadataNamespace>
+    </metadataFormat>
+  </ListMetadataFormats>
+</OAI-PMH>
+"""
+
+OAI_MODS_XML = """
+<OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/">
+  <ListRecords>
+    <record>
+      <header>
+        <identifier>oai:example.edu:mods123</identifier>
+      </header>
+      <metadata>
+        <mods xmlns="http://www.loc.gov/mods/v3">
+          <titleInfo>
+            <title>MODS Thesis Title</title>
+          </titleInfo>
+          <name type="personal">
+            <namePart type="family">Doe</namePart>
+            <namePart type="given">Jane</namePart>
+            <role>
+              <roleTerm type="text">author</roleTerm>
+            </role>
+          </name>
+          <originInfo>
+            <publisher>Example University</publisher>
+            <dateIssued>2022</dateIssued>
+          </originInfo>
+          <genre>dissertation</genre>
+          <abstract>MODS abstract text.</abstract>
+          <location>
+            <url>https://example.edu/mods123</url>
+          </location>
+        </mods>
+      </metadata>
+    </record>
+  </ListRecords>
+</OAI-PMH>
    +""" + + +def test_oai_harvester_maps_dublin_core_to_bibentry(): + harvester = OaiPmhHarvester() + harvester.source_client.get_xml = lambda _url: __import__("xml.etree.ElementTree").etree.ElementTree.fromstring(OAI_XML) # type: ignore[method-assign] + + results = harvester.list_records("https://example.edu/oai") + + assert len(results) == 1 + entry = results[0].entry + assert entry.entry_type == "phdthesis" + assert entry.fields["title"] == "Thesis Metadata Harvesting" + assert entry.fields["author"] == "Doe, Jane" + assert entry.fields["oai"] == "oai:example.edu:123" + + +def test_oai_harvester_follows_resumption_tokens(): + harvester = OaiPmhHarvester() + from xml.etree import ElementTree as ET + + payloads = iter([ET.fromstring(OAI_XML_PAGE_1), ET.fromstring(OAI_XML_PAGE_2)]) + harvester.source_client.get_xml = lambda _url: next(payloads) # type: ignore[method-assign] + + results = harvester.list_records("https://example.edu/oai") + + assert [result.identifier for result in results] == [ + "oai:example.edu:123", + "oai:example.edu:456", + ] + assert [result.entry.citation_key for result in results] == [ + "doe2023first1", + "smith2022second2", + ] + + +def test_oai_harvester_passes_date_filters(): + harvester = OaiPmhHarvester() + seen_urls: list[str] = [] + from xml.etree import ElementTree as ET + + def fake_get_xml(url: str): + seen_urls.append(url) + return ET.fromstring(OAI_XML) + + harvester.source_client.get_xml = fake_get_xml # type: ignore[method-assign] + + harvester.list_records( + "https://example.edu/oai", + date_from="2023-01-01", + date_until="2023-12-31", + limit=1, + ) + + assert "from=2023-01-01" in seen_urls[0] + assert "until=2023-12-31" in seen_urls[0] + + +def test_oai_harvester_maps_mods_records(): + harvester = OaiPmhHarvester() + from xml.etree import ElementTree as ET + + harvester.source_client.get_xml = lambda _url: ET.fromstring(OAI_MODS_XML) # type: ignore[method-assign] + + results = harvester.list_records("https://example.edu/oai", metadata_prefix="mods") + + assert len(results) == 1 + entry = results[0].entry + assert entry.entry_type == "phdthesis" + assert entry.fields["title"] == "MODS Thesis Title" + assert entry.fields["author"] == "Doe, Jane" + assert entry.fields["publisher"] == "Example University" + assert entry.fields["abstract"] == "MODS abstract text." 
+ + +def test_oai_harvester_can_identify_repository_and_list_sets(): + harvester = OaiPmhHarvester() + from xml.etree import ElementTree as ET + + payloads = iter( + [ET.fromstring(OAI_IDENTIFY_XML), ET.fromstring(OAI_LISTSETS_XML), ET.fromstring(OAI_METADATA_FORMATS_XML)] + ) + harvester.source_client.get_xml = lambda _url: next(payloads) # type: ignore[method-assign] + + identify = harvester.identify("https://example.edu/oai") + sets = harvester.list_sets("https://example.edu/oai") + formats = harvester.list_metadata_formats("https://example.edu/oai") + + assert identify["repositoryName"] == "Example Repository" + assert identify["granularity"] == "YYYY-MM-DD" + assert sets[0].set_spec == "theses" + assert sets[0].set_name == "Theses and Dissertations" + assert "graduate theses" in sets[0].set_description + assert [item.metadata_prefix for item in formats] == ["oai_dc", "mods"] + + +def test_harvest_oai_cli_ingests_records(tmp_path): + from unittest.mock import patch + + database = tmp_path / "library.sqlite3" + harvester = OaiPmhHarvester() + from xml.etree import ElementTree as ET + + harvester.source_client.get_xml = lambda _url: ET.fromstring(OAI_XML) # type: ignore[method-assign] + harvested = harvester.list_records("https://example.edu/oai") + + with patch("citegeist.cli.OaiPmhHarvester.list_records") as mocked_list: + mocked_list.return_value = harvested + + exit_code = main( + [ + "--db", + str(database), + "harvest-oai", + "https://example.edu/oai", + "--metadata-prefix", + "oai_dc", + "--from", + "2023-01-01", + "--until", + "2023-12-31", + "--limit", + "5", + ] + ) + + assert exit_code == 0 + + from citegeist.storage import BibliographyStore + + store = BibliographyStore(database) + try: + entry = store.list_entries(limit=10)[0] + assert entry["citation_key"] == "doe2023thesis1" + bibtex = store.get_entry_bibtex("doe2023thesis1") + parsed = parse_bibtex(bibtex or "") + assert parsed[0].fields["oai"] == "oai:example.edu:123" + finally: + store.close() diff --git a/tests/test_resolve.py b/tests/test_resolve.py index f4c22b0..226ee10 100644 --- a/tests/test_resolve.py +++ b/tests/test_resolve.py @@ -1,11 +1,13 @@ from xml.etree import ElementTree as ET -from citegeist.bibtex import BibEntry +from citegeist.bibtex import BibEntry, render_bibtex from citegeist.resolve import ( MetadataResolver, _arxiv_atom_entry_to_bib, _crossref_message_to_entry, + _datacite_work_to_entry, _openalex_work_to_entry, + merge_entries_with_conflicts, merge_entries, ) @@ -65,6 +67,31 @@ def test_merge_entries_prefers_existing_values_and_adds_missing_fields(): assert merged.fields["journal"] == "Journal of Graph Studies" +def test_merge_entries_with_conflicts_records_disagreements(): + base = BibEntry( + entry_type="article", + citation_key="smith2024graphs", + fields={"title": "Existing Title", "journal": "Current Journal"}, + ) + resolved = BibEntry( + entry_type="article", + citation_key="resolved", + fields={"title": "Resolved Title", "journal": "Current Journal", "year": "2024"}, + ) + + merged, conflicts = merge_entries_with_conflicts(base, resolved) + + assert merged.fields["title"] == "Existing Title" + assert merged.fields["year"] == "2024" + assert conflicts == [ + { + "field_name": "title", + "current_value": "Existing Title", + "proposed_value": "Resolved Title", + } + ] + + def test_resolver_tries_doi_before_dblp(): resolver = MetadataResolver() calls: list[tuple[str, str]] = [] @@ -77,7 +104,12 @@ def test_resolver_tries_doi_before_dblp(): calls.append(("dblp", value)) return None + def 
fake_datacite(value: str): + calls.append(("datacite", value)) + return None + resolver.resolve_doi = fake_doi # type: ignore[method-assign] + resolver.resolve_datacite_doi = fake_datacite # type: ignore[method-assign] resolver.resolve_dblp = fake_dblp # type: ignore[method-assign] resolver.resolve_entry( @@ -88,7 +120,11 @@ def test_resolver_tries_doi_before_dblp(): ) ) - assert calls == [("doi", "10.1000/example-doi"), ("dblp", "conf/test/Smith24")] + assert calls == [ + ("doi", "10.1000/example-doi"), + ("datacite", "10.1000/example-doi"), + ("dblp", "conf/test/Smith24"), + ] def test_openalex_work_to_entry_maps_basic_fields(): @@ -131,6 +167,8 @@ def test_resolver_can_resolve_openalex_id(): def test_resolver_falls_back_to_openalex_title_search(): resolver = MetadataResolver() + resolver.search_datacite = lambda title, limit=5: [] # type: ignore[method-assign] + resolver.search_crossref = lambda title, limit=5: [] # type: ignore[method-assign] resolver.search_openalex = lambda title, limit=5: [ # type: ignore[method-assign] _openalex_work_to_entry( { @@ -154,3 +192,212 @@ def test_resolver_falls_back_to_openalex_title_search(): assert resolution is not None assert resolution.source_label == "openalex:search:OpenAlex Resolved Work" assert resolution.entry.fields["openalex"] == "W12345" + + +def test_resolver_prefers_exact_crossref_title_match_before_datacite(): + resolver = MetadataResolver() + resolver.search_crossref = lambda title, limit=5: [ # type: ignore[method-assign] + _crossref_message_to_entry( + { + "type": "journal-article", + "title": [title], + "DOI": "10.1126/science.1090005", + "container-title": ["Science"], + "author": [ + {"family": "King", "given": "Mary-Claire"}, + {"family": "Wilson", "given": "A. C."}, + ], + "issued": {"date-parts": [[1975, 4, 11]]}, + } + ) + ] + resolver.search_datacite = lambda title, limit=5: [ # type: ignore[method-assign] + _datacite_work_to_entry( + { + "attributes": { + "doi": "10.5061/dryad.v6wwpzh17", + "titles": [ + { + "title": "Conserved patterns and locomotor-related evolutionary constraints in the hominoid vertebral column" + } + ], + "creators": [ + {"familyName": "Villamil", "givenName": "Catalina I."}, + {"familyName": "Middleton", "givenName": "Emily R."}, + ], + "publicationYear": 2024, + "types": {"resourceTypeGeneral": "Dataset"}, + } + } + ) + ] + + resolution = resolver.resolve_entry( + BibEntry( + entry_type="article", + citation_key="king1975evolution2", + fields={ + "title": "Evolution at two levels in humans and chimpanzees", + "author": "King, M. C. and Wilson, A. 
C.", + "year": "1975", + }, + ) + ) + + assert resolution is not None + assert resolution.source_label == "crossref:search:Evolution at two levels in humans and chimpanzees" + assert resolution.entry.fields["doi"] == "10.1126/science.1090005" + + +def test_resolver_rejects_mismatched_title_search_candidates(): + resolver = MetadataResolver() + resolver.search_crossref = lambda title, limit=5: [] # type: ignore[method-assign] + resolver.search_datacite = lambda title, limit=5: [ # type: ignore[method-assign] + _datacite_work_to_entry( + { + "attributes": { + "doi": "10.5061/dryad.v6wwpzh17", + "titles": [ + { + "title": "Conserved patterns and locomotor-related evolutionary constraints in the hominoid vertebral column" + } + ], + "creators": [ + {"familyName": "Villamil", "givenName": "Catalina I."}, + ], + "publicationYear": 2024, + "types": {"resourceTypeGeneral": "Dataset"}, + } + } + ) + ] + resolver.search_openalex = lambda title, limit=5: [ # type: ignore[method-assign] + _openalex_work_to_entry( + { + "id": "https://openalex.org/W2033360601", + "display_name": "Immunological relatedness of glucose 6-phosphate dehydrogenases from vertebrate and invertebrate species.", + "publication_year": 1978, + "type": "article", + "authorships": [ + {"author": {"display_name": "Yoshikazu Sado"}}, + {"author": {"display_name": "Samuel H. Hori"}}, + ], + "doi": "https://doi.org/10.1266/jjg.53.91", + } + ) + ] + + resolution = resolver.resolve_entry( + BibEntry( + entry_type="article", + citation_key="sarich1967immunological1", + fields={ + "title": "Immunological Time Scale for Homonid Evolution", + "author": "Sarich, V. and Wilson, A.", + "year": "1967", + }, + ) + ) + + assert resolution is None + + +def test_datacite_work_to_entry_maps_basic_fields(): + entry = _datacite_work_to_entry( + { + "attributes": { + "doi": "10.1000/datacite-example", + "titles": [{"title": "Repository Dissertation Record"}], + "creators": [{"familyName": "Doe", "givenName": "Jane"}], + "publicationYear": 2021, + "publisher": "Example University", + "url": "https://example.edu/record/123", + "types": {"resourceTypeGeneral": "Dissertation"}, + "descriptions": [ + { + "descriptionType": "Abstract", + "description": "An abstract from DataCite.", + } + ], + } + } + ) + + assert entry.entry_type == "phdthesis" + assert entry.fields["doi"] == "10.1000/datacite-example" + assert entry.fields["author"] == "Doe, Jane" + assert entry.fields["publisher"] == "Example University" + assert entry.fields["abstract"] == "An abstract from DataCite." 
+ + +def test_resolver_can_resolve_datacite_doi(): + resolver = MetadataResolver() + resolver.source_client.get_json = lambda _url: { # type: ignore[method-assign] + "data": { + "attributes": { + "doi": "10.1000/datacite-example", + "titles": [{"title": "Repository Dissertation Record"}], + "creators": [{"familyName": "Doe", "givenName": "Jane"}], + "publicationYear": 2021, + "types": {"resourceTypeGeneral": "Dissertation"}, + } + } + } + + resolution = resolver.resolve_datacite_doi("10.1000/datacite-example") + + assert resolution is not None + assert resolution.source_label == "datacite:doi:10.1000/datacite-example" + assert resolution.entry.entry_type == "phdthesis" + + +def test_resolver_can_fall_back_to_datacite_title_search(): + resolver = MetadataResolver() + resolver.search_crossref = lambda title, limit=5: [] # type: ignore[method-assign] + resolver.search_datacite = lambda title, limit=5: [ # type: ignore[method-assign] + _datacite_work_to_entry( + { + "attributes": { + "doi": "10.1000/datacite-example", + "titles": [{"title": title}], + "creators": [{"familyName": "Doe", "givenName": "Jane"}], + "publicationYear": 2021, + "types": {"resourceTypeGeneral": "Dissertation"}, + } + } + ) + ] + resolver.search_openalex = lambda title, limit=5: [] # type: ignore[method-assign] + + resolution = resolver.resolve_entry( + BibEntry( + entry_type="misc", + citation_key="draft1", + fields={"title": "Repository Dissertation Record", "author": "Doe, Jane", "year": "2021"}, + ) + ) + + assert resolution is not None + assert resolution.source_label == "datacite:search:Repository Dissertation Record" + assert resolution.entry.fields["doi"] == "10.1000/datacite-example" + + +def test_render_bibtex_tolerates_unmatched_braces_in_field_values(): + rendered = render_bibtex( + [ + BibEntry( + entry_type="misc", + citation_key="broken2026", + fields={ + "author": "Broken, Example", + "title": "Unmatched { braces } example } tail", + "year": "2026", + "note": "Open { brace only", + }, + ) + ] + ) + + assert "@misc{broken2026," in rendered + assert "Unmatched { braces } example ) tail" in rendered + assert "Open ( brace only" in rendered diff --git a/tests/test_sources.py b/tests/test_sources.py index fea995a..f9f0bf2 100644 --- a/tests/test_sources.py +++ b/tests/test_sources.py @@ -28,3 +28,14 @@ def test_source_client_writes_cache_after_fetch(tmp_path: Path): assert payload["ok"] is True assert any(cache_dir.iterdir()) + + +def test_source_client_falls_back_to_latin1_for_text(tmp_path: Path): + client = SourceClient(cache_dir=tmp_path / "cache") + url = "https://example.org/latin1" + + client._fetch_bytes = lambda _url: "café".encode("iso-8859-1") # type: ignore[method-assign] + + payload = client.get_text(url) + + assert payload == "café" diff --git a/tests/test_storage.py b/tests/test_storage.py index 3458f52..f432bfd 100644 --- a/tests/test_storage.py +++ b/tests/test_storage.py @@ -130,3 +130,250 @@ def test_store_traverses_graph_and_surfaces_missing_targets(): assert rows[2]["depth"] == 2 finally: store.close() + + +def test_store_records_and_updates_field_conflicts(): + store = BibliographyStore() + try: + store.ingest_bibtex( + """ +@article{seed2024, + author = {Seed, Alice}, + title = {Seed Paper}, + year = {2024} +} +""" + ) + ok = store.record_conflicts( + "seed2024", + [ + { + "field_name": "title", + "current_value": "Seed Paper", + "proposed_value": "Resolved Seed Paper", + } + ], + source_type="resolver", + source_label="crossref:doi:10.1000/seed", + ) + assert ok is True + conflicts = 
store.get_field_conflicts("seed2024") + assert conflicts[0]["field_name"] == "title" + assert conflicts[0]["status"] == "open" + assert store.set_conflict_status("seed2024", "title", "accepted") == 1 + updated = store.get_field_conflicts("seed2024", status="accepted") + assert len(updated) == 1 + finally: + store.close() + + +def test_store_can_apply_latest_conflict_value(): + store = BibliographyStore() + try: + store.ingest_bibtex( + """ +@article{seed2024, + author = {Seed, Alice}, + title = {Seed Paper}, + year = {2024} +} +""" + ) + store.record_conflicts( + "seed2024", + [ + { + "field_name": "title", + "current_value": "Seed Paper", + "proposed_value": "Resolved Seed Paper", + } + ], + source_type="resolver", + source_label="crossref:doi:10.1000/seed", + ) + + assert store.apply_conflict_value("seed2024", "title") is True + entry = store.get_entry("seed2024") + assert entry is not None + assert entry["title"] == "Resolved Seed Paper" + accepted = store.get_field_conflicts("seed2024", status="accepted") + assert len(accepted) == 1 + finally: + store.close() + + +def test_store_supports_entry_topic_membership(): + store = BibliographyStore() + try: + store.ingest_bibtex( + """ +@article{seed2024, + author = {Seed, Alice}, + title = {Seed Paper}, + year = {2024} +} +""" + ) + + assert store.add_entry_topic( + "seed2024", + topic_slug="graph-methods", + topic_name="Graph Methods", + source_type="talkorigins", + source_url="https://example.org/topics/graph-methods", + source_label="topic-seed", + ) is True + assert store.add_entry_topic( + "seed2024", + topic_slug="semantic-search", + topic_name="Semantic Search", + source_type="talkorigins", + source_url="https://example.org/topics/semantic-search", + source_label="topic-seed", + ) is True + + entry = store.get_entry("seed2024") + assert entry is not None + assert [topic["slug"] for topic in entry["topics"]] == ["graph-methods", "semantic-search"] + + topics = store.list_topics() + assert [topic["slug"] for topic in topics] == ["graph-methods", "semantic-search"] + assert topics[0]["entry_count"] == 1 + topic = store.get_topic("graph-methods") + assert topic is not None + assert topic["name"] == "Graph Methods" + assert topic["expansion_phrase"] is None + topic_entries = store.list_topic_entries("graph-methods") + assert topic_entries[0]["citation_key"] == "seed2024" + finally: + store.close() + + +def test_store_can_set_topic_expansion_phrase(): + store = BibliographyStore() + try: + store.ingest_bibtex( + """ +@article{seed2024, + author = {Seed, Alice}, + title = {Seed Paper}, + year = {2024} +} +""" + ) + store.add_entry_topic( + "seed2024", + topic_slug="graph-methods", + topic_name="Graph Methods", + source_type="talkorigins", + source_url="https://example.org/topics/graph-methods", + source_label="topic-seed", + ) + assert store.set_topic_expansion_phrase("graph-methods", "graph networks biology") is True + + topic = store.get_topic("graph-methods") + assert topic is not None + assert topic["expansion_phrase"] == "graph networks biology" + assert topic["phrase_review_status"] == "unreviewed" + topics = store.list_topics() + assert topics[0]["expansion_phrase"] == "graph networks biology" + finally: + store.close() + + +def test_store_can_stage_and_review_topic_phrase_suggestion(): + store = BibliographyStore() + try: + store.ensure_topic("graph-methods", "Graph Methods") + + assert store.stage_topic_phrase_suggestion( + "graph-methods", + "graph networks biology", + review_notes="generated from local titles", + ) is True + + 
staged = store.get_topic("graph-methods") + assert staged is not None + assert staged["suggested_phrase"] == "graph networks biology" + assert staged["expansion_phrase"] is None + assert staged["phrase_review_status"] == "pending" + assert staged["phrase_review_notes"] == "generated from local titles" + + assert store.review_topic_phrase_suggestion( + "graph-methods", + "accepted", + review_notes="looks good", + ) is True + + reviewed = store.get_topic("graph-methods") + assert reviewed is not None + assert reviewed["suggested_phrase"] == "graph networks biology" + assert reviewed["expansion_phrase"] == "graph networks biology" + assert reviewed["phrase_review_status"] == "accepted" + assert reviewed["phrase_review_notes"] == "looks good" + finally: + store.close() + + +def test_store_can_filter_topics_by_phrase_review_status(): + store = BibliographyStore() + try: + store.ensure_topic("graph-methods", "Graph Methods") + store.ensure_topic("abiogenesis", "Abiogenesis") + store.stage_topic_phrase_suggestion("graph-methods", "graph networks biology") + store.stage_topic_phrase_suggestion("abiogenesis", "abiogenesis life origin") + store.review_topic_phrase_suggestion("abiogenesis", "accepted") + + pending_topics = store.list_topics(phrase_review_status="pending") + accepted_topics = store.list_topics(phrase_review_status="accepted") + + assert [topic["slug"] for topic in pending_topics] == ["graph-methods"] + assert [topic["slug"] for topic in accepted_topics] == ["abiogenesis"] + finally: + store.close() + + +def test_store_search_text_can_filter_by_topic(): + store = BibliographyStore() + try: + store.ingest_bibtex( + """ +@article{seed2024, + author = {Seed, Alice}, + title = {Graph Methods for Biology}, + year = {2024}, + abstract = {A graph methods paper.} +} + +@article{other2023, + author = {Other, Bob}, + title = {Graph Methods for Chemistry}, + year = {2023}, + abstract = {Another graph methods paper.} +} +""" + ) + + store.add_entry_topic( + "seed2024", + topic_slug="biology", + topic_name="Biology", + source_type="talkorigins", + source_url="https://example.org/topics/biology", + source_label="topic-seed", + ) + store.add_entry_topic( + "other2023", + topic_slug="chemistry", + topic_name="Chemistry", + source_type="talkorigins", + source_url="https://example.org/topics/chemistry", + source_label="topic-seed", + ) + store.connection.commit() + + results = store.search_text("graph", topic_slug="biology") + + assert [row["citation_key"] for row in results] == ["seed2024"] + finally: + store.close() diff --git a/tests/test_talkorigins.py b/tests/test_talkorigins.py new file mode 100644 index 0000000..9ca9943 --- /dev/null +++ b/tests/test_talkorigins.py @@ -0,0 +1,1024 @@ +from __future__ import annotations + +import json +from pathlib import Path + +from citegeist.batch import load_batch_jobs +from citegeist.bibtex import BibEntry +from citegeist.storage import BibliographyStore +from citegeist.talkorigins import TalkOriginsScraper, normalize_topic_entries + + +INDEX_HTML = """ + +Abiogenesis +Evolution +Browse + +""" + +ABIOGENESIS_HTML = """ +
    +Smith, J., 1998, First paper title: Journal of Origins, v. 10, p. 1-10.
    +
    +---, 2001, Second paper title: Journal of Origins, v. 12, p. 20-30.
    +
    +""" + +EVOLUTION_HTML = """ +
    +Jones, A., and Roe, B.,
    +2003, Wrapped title across lines:
    +Proceedings of the Example Conference, p. 40-55.
    +
    +""" + + +class FakeSourceClient: + def __init__(self, payloads: dict[str, str]) -> None: + self.payloads = payloads + + def get_text(self, url: str) -> str: + return self.payloads[url] + + +def test_normalize_topic_entries_carries_forward_repeated_authors(): + text = """ +Smith, J., 1998, First paper title: Journal of Origins. + +---, 2001, Second paper title: Journal of Origins. +""" + + entries = normalize_topic_entries(text) + + assert entries[1].startswith("Smith, J., 2001") + + +def test_talkorigins_scraper_writes_seed_bibs_and_jobs(tmp_path: Path): + base_url = "https://www.talkorigins.org/origins/biblio/" + scraper = TalkOriginsScraper( + source_client=FakeSourceClient( + { + base_url: INDEX_HTML, + f"{base_url}abiogenesis.html": ABIOGENESIS_HTML, + f"{base_url}evolution.html": EVOLUTION_HTML, + } + ) + ) + + export = scraper.scrape_to_directory(base_url=base_url, output_dir=tmp_path) + + assert export.topic_count == 2 + assert export.entry_count == 3 + + jobs = json.loads(Path(export.jobs_path).read_text(encoding="utf-8")) + assert jobs["jobs"][0]["name"] == "talkorigins:abiogenesis" + assert Path(jobs["jobs"][0]["seed_bib"]).exists() + + manifest = json.loads(Path(export.manifest_path).read_text(encoding="utf-8")) + assert manifest["seed_sets"][0]["parsed_entry_count"] == 2 + + abiogenesis_bib = Path(export.seed_sets[0].seed_bib).read_text(encoding="utf-8") + abiogenesis_plain = Path(export.seed_sets[0].plaintext_path).read_text(encoding="utf-8") + abiogenesis_page = Path(export.seed_sets[0].page_path).read_text(encoding="utf-8") + full_bib = Path(export.full_bib_path).read_text(encoding="utf-8") + full_plain = Path(export.full_plaintext_path).read_text(encoding="utf-8") + site_index = Path(export.site_index_path).read_text(encoding="utf-8") + assert "@article{smith1998first1," in abiogenesis_bib + assert 'author = "Smith, J"' in abiogenesis_bib + assert "@article{smith2001second2," in abiogenesis_bib + assert "Abiogenesis" in abiogenesis_plain + assert "Show BibTeX" in abiogenesis_page + assert "toggleBibtex" in abiogenesis_page + assert "@article{smith1998first1," in full_bib + assert "Evolution" in full_plain + assert "Full BibTeX bibliography" in site_index + + +def test_talkorigins_parser_prefers_book_for_publisher_like_venues(): + scraper = TalkOriginsScraper(source_client=FakeSourceClient({})) + + entry = scraper.parse_reference_entry( + "Rutten, M. G., 1971, The Origin of Life by Natural Causes: Amsterdam, London, New York, Elsevier.", + 1, + ) + + assert entry is not None + assert entry.entry_type == "book" + assert entry.fields["publisher"] == "Amsterdam, London, New York, Elsevier" + + +def test_talkorigins_parser_promotes_edited_volume_chapter_to_incollection(): + scraper = TalkOriginsScraper(source_client=FakeSourceClient({})) + + entry = scraper.parse_reference_entry( + "Carpenter, C. R., 1958, Territoriality: A Review of Concepts and Problems, in Roe, A., and Simpson, G. G., eds., Behavior and Evolution: New Haven, Yale University Press, p. 224-250.", + 1, + ) + + assert entry is not None + assert entry.entry_type == "incollection" + assert entry.fields["title"] == "Territoriality: A Review of Concepts and Problems" + assert entry.fields["editor"] == "Roe, A. and Simpson, G. G." 
+ assert entry.fields["booktitle"] == "Behavior and Evolution" + assert "Yale University Press" in entry.fields["publisher"] + + +def test_talkorigins_scraper_resume_uses_saved_snapshot(tmp_path: Path): + base_url = "https://www.talkorigins.org/origins/biblio/" + scraper = TalkOriginsScraper( + source_client=FakeSourceClient( + { + base_url: INDEX_HTML, + f"{base_url}abiogenesis.html": ABIOGENESIS_HTML, + f"{base_url}evolution.html": EVOLUTION_HTML, + } + ) + ) + + first_export = scraper.scrape_to_directory(base_url=base_url, output_dir=tmp_path, limit_topics=1) + snapshot_path = Path(first_export.seed_sets[0].snapshot_path) + snapshot = json.loads(snapshot_path.read_text(encoding="utf-8")) + assert snapshot["raw_entries"][0].startswith("Smith, J.") + + scraper_with_broken_page = TalkOriginsScraper( + source_client=FakeSourceClient( + { + base_url: INDEX_HTML, + f"{base_url}abiogenesis.html": "broken", + } + ) + ) + resumed_export = scraper_with_broken_page.scrape_to_directory(base_url=base_url, output_dir=tmp_path, limit_topics=1) + + assert resumed_export.entry_count == 2 + + +def test_talkorigins_validation_reports_suspicious_entries(tmp_path: Path): + base_url = "https://www.talkorigins.org/origins/biblio/" + scraper = TalkOriginsScraper( + source_client=FakeSourceClient( + { + base_url: INDEX_HTML, + f"{base_url}abiogenesis.html": ABIOGENESIS_HTML, + } + ) + ) + + export = scraper.scrape_to_directory(base_url=base_url, output_dir=tmp_path, limit_topics=1) + seed_bib_path = Path(export.seed_sets[0].seed_bib) + seed_bib_path.write_text( + """ +@article{bad1, + author = "Example, A", + year = "1999", + title = "Bad Venue Classification", + journal = "Elsevier" +} +""", + encoding="utf-8", + ) + + report = scraper.validate_export(export.manifest_path) + + assert report.topic_count == 1 + assert report.entry_count == 2 + assert report.suspicious_entry_type_count >= 1 + assert report.suspicious_examples[0]["citation_key"] == "bad1" + + +def test_talkorigins_validation_does_not_flag_legitimate_incollection(tmp_path: Path): + base_url = "https://www.talkorigins.org/origins/biblio/" + scraper = TalkOriginsScraper( + source_client=FakeSourceClient( + { + base_url: INDEX_HTML, + f"{base_url}abiogenesis.html": ABIOGENESIS_HTML, + } + ) + ) + + export = scraper.scrape_to_directory(base_url=base_url, output_dir=tmp_path, limit_topics=1) + seed_bib_path = Path(export.seed_sets[0].seed_bib) + seed_bib_path.write_text( + """ +@incollection{good1, + author = "Example, A", + editor = "Editor, E", + year = "1999", + title = "Good Chapter", + booktitle = "Collected Essays", + publisher = "New Haven, Yale University Press" +} +""", + encoding="utf-8", + ) + + report = scraper.validate_export(export.manifest_path) + + assert all(item["citation_key"] != "good1" for item in report.suspicious_examples) + + +def test_talkorigins_validation_reports_duplicate_clusters(tmp_path: Path): + base_url = "https://www.talkorigins.org/origins/biblio/" + scraper = TalkOriginsScraper( + source_client=FakeSourceClient( + { + base_url: INDEX_HTML, + f"{base_url}abiogenesis.html": ABIOGENESIS_HTML, + f"{base_url}evolution.html": EVOLUTION_HTML, + } + ) + ) + + export = scraper.scrape_to_directory(base_url=base_url, output_dir=tmp_path) + Path(export.seed_sets[0].seed_bib).write_text( + """ +@article{dup1, + author = "Smith, Jane", + year = "1999", + title = "Duplicate Paper", + journal = "Journal A" +} +""", + encoding="utf-8", + ) + Path(export.seed_sets[1].seed_bib).write_text( + """ +@article{dup2, + author = "Smith, Jane", 
+ year = "1999", + title = "Duplicate Paper", + journal = "Journal B" +} +""", + encoding="utf-8", + ) + + report = scraper.validate_export(export.manifest_path) + + assert report.duplicate_cluster_count >= 1 + assert report.duplicate_entry_count >= 2 + assert report.duplicate_examples[0]["items"][0]["citation_key"] in {"dup1", "dup2"} + + +def test_talkorigins_can_suggest_topic_phrases(tmp_path: Path): + base_url = "https://www.talkorigins.org/origins/biblio/" + scraper = TalkOriginsScraper( + source_client=FakeSourceClient( + { + base_url: INDEX_HTML, + f"{base_url}abiogenesis.html": ABIOGENESIS_HTML, + } + ) + ) + + export = scraper.scrape_to_directory(base_url=base_url, output_dir=tmp_path, limit_topics=1) + Path(export.seed_sets[0].seed_bib).write_text( + """ +@article{bio1, + author = "Smith, Jane", + year = "1999", + title = "Prebiotic chemistry and ribozyme catalysis", + journal = "Origins" +} + +@article{bio2, + author = "Smith, Jane", + year = "2001", + title = "Ribozyme networks in prebiotic chemistry", + journal = "Origins" +} +""", + encoding="utf-8", + ) + + suggestions = scraper.suggest_topic_phrases(export.manifest_path) + + assert len(suggestions) == 1 + assert suggestions[0].slug == "abiogenesis" + assert suggestions[0].suggested_phrase.startswith("Abiogenesis ") + assert "chemistry" in suggestions[0].keywords + assert "prebiotic" in suggestions[0].keywords + assert suggestions[0].review_required is True + assert "small_topic" in (suggestions[0].review_reasons or []) + assert "noisy_keywords" not in (suggestions[0].review_reasons or []) + + +def test_talkorigins_duplicate_inspection_filters_by_topic_and_match(tmp_path: Path): + base_url = "https://www.talkorigins.org/origins/biblio/" + scraper = TalkOriginsScraper( + source_client=FakeSourceClient( + { + base_url: INDEX_HTML, + f"{base_url}abiogenesis.html": ABIOGENESIS_HTML, + f"{base_url}evolution.html": EVOLUTION_HTML, + } + ) + ) + + export = scraper.scrape_to_directory(base_url=base_url, output_dir=tmp_path) + Path(export.seed_sets[0].seed_bib).write_text( + """ +@article{dup1, + author = "Smith, Jane", + year = "1999", + title = "Duplicate Paper", + journal = "Journal A" +} + +@article{dup2, + author = "Smith, Jane", + year = "1999", + title = "Duplicate Paper", + journal = "Journal B" +} +""", + encoding="utf-8", + ) + Path(export.seed_sets[1].seed_bib).write_text( + """ +@article{other1, + author = "Jones, Alex", + year = "2001", + title = "Other Topic Paper", + journal = "Journal C" +} +""", + encoding="utf-8", + ) + + clusters = scraper.inspect_duplicate_clusters( + export.manifest_path, + topic_slug="abiogenesis", + match="duplicate", + ) + + assert len(clusters) == 1 + assert clusters[0].count == 2 + assert all(item["topic_slug"] == "abiogenesis" for item in clusters[0].items) + + +def test_talkorigins_duplicate_inspection_can_preview_canonical_choice(tmp_path: Path): + base_url = "https://www.talkorigins.org/origins/biblio/" + scraper = TalkOriginsScraper( + source_client=FakeSourceClient( + { + base_url: INDEX_HTML, + f"{base_url}abiogenesis.html": ABIOGENESIS_HTML, + f"{base_url}evolution.html": EVOLUTION_HTML, + } + ) + ) + + export = scraper.scrape_to_directory(base_url=base_url, output_dir=tmp_path) + Path(export.seed_sets[0].seed_bib).write_text( + """ +@article{dup1, + author = "Smith, Jane", + year = "1999", + title = "Duplicate Paper", + journal = "Journal A" +} +""", + encoding="utf-8", + ) + Path(export.seed_sets[1].seed_bib).write_text( + """ +@article{dup2, + author = "Smith, Jane", + year = 
"1999", + title = "Duplicate Paper", + journal = "Journal B", + doi = "10.1000/dup" +} +""", + encoding="utf-8", + ) + + clusters = scraper.inspect_duplicate_clusters(export.manifest_path, preview_canonical=True) + + assert len(clusters) == 1 + assert clusters[0].canonical is not None + assert clusters[0].canonical["citation_key"] == "dup2" + assert clusters[0].canonical["fields"]["doi"] == "10.1000/dup" + assert clusters[0].canonical["weak_reasons"] == [] + + +def test_talkorigins_duplicate_inspection_can_filter_to_weak_canonicals(tmp_path: Path): + base_url = "https://www.talkorigins.org/origins/biblio/" + scraper = TalkOriginsScraper( + source_client=FakeSourceClient( + { + base_url: INDEX_HTML, + f"{base_url}abiogenesis.html": ABIOGENESIS_HTML, + f"{base_url}evolution.html": EVOLUTION_HTML, + } + ) + ) + + export = scraper.scrape_to_directory(base_url=base_url, output_dir=tmp_path) + Path(export.seed_sets[0].seed_bib).write_text( + """ +@misc{weak1, + author = "Smith, Jane", + year = "1999", + title = "Weak Duplicate" +} + +@misc{weak2, + author = "Smith, Jane", + year = "1999", + title = "Weak Duplicate", + note = "Copied from legacy source" +} +""", + encoding="utf-8", + ) + Path(export.seed_sets[1].seed_bib).write_text( + """ +@article{strong1, + author = "Jones, Alex", + year = "2001", + title = "Strong Duplicate", + journal = "Journal B", + doi = "10.1000/strong" +} + +@article{strong2, + author = "Jones, Alex", + year = "2001", + title = "Strong Duplicate", + journal = "Journal B" +} +""", + encoding="utf-8", + ) + + clusters = scraper.inspect_duplicate_clusters( + export.manifest_path, + preview_canonical=True, + weak_only=True, + ) + + assert len(clusters) == 1 + assert clusters[0].canonical is not None + assert clusters[0].canonical["citation_key"] == "weak2" + assert "entry_type:misc" in clusters[0].canonical["weak_reasons"] + assert "missing:doi" in clusters[0].canonical["weak_reasons"] + + +def test_talkorigins_enrich_weak_canonicals_can_preview_resolution(tmp_path: Path): + base_url = "https://www.talkorigins.org/origins/biblio/" + scraper = TalkOriginsScraper( + source_client=FakeSourceClient( + { + base_url: INDEX_HTML, + f"{base_url}abiogenesis.html": ABIOGENESIS_HTML, + f"{base_url}evolution.html": EVOLUTION_HTML, + } + ) + ) + + export = scraper.scrape_to_directory(base_url=base_url, output_dir=tmp_path) + Path(export.seed_sets[0].seed_bib).write_text( + """ +@misc{weak1, + author = "Smith, Jane", + year = "1999", + title = "Weak Duplicate" +} + +@misc{weak2, + author = "Smith, Jane", + year = "1999", + title = "Weak Duplicate", + note = "Copied from legacy source" +} +""", + encoding="utf-8", + ) + Path(export.seed_sets[1].seed_bib).write_text("", encoding="utf-8") + + from citegeist.resolve import Resolution + + scraper.resolver.resolve_entry = lambda entry: Resolution( # type: ignore[method-assign] + entry=BibEntry( + entry_type="article", + citation_key="resolved", + fields={ + "author": entry.fields["author"], + "title": entry.fields["title"], + "year": entry.fields["year"], + "doi": "10.1000/weak", + "journal": "Journal of Better Metadata", + }, + ), + source_type="resolver", + source_label="crossref:search:Weak Duplicate", + ) + + store = BibliographyStore() + try: + results = scraper.enrich_weak_canonicals(export.manifest_path, store, apply=False) + finally: + store.close() + + assert len(results) == 1 + assert results[0].resolved is True + assert results[0].applied is False + assert results[0].weak_reasons_after == [] + + +def 
test_talkorigins_enrich_weak_canonicals_can_apply_to_store(tmp_path: Path): + base_url = "https://www.talkorigins.org/origins/biblio/" + scraper = TalkOriginsScraper( + source_client=FakeSourceClient( + { + base_url: INDEX_HTML, + f"{base_url}abiogenesis.html": ABIOGENESIS_HTML, + f"{base_url}evolution.html": EVOLUTION_HTML, + } + ) + ) + + export = scraper.scrape_to_directory(base_url=base_url, output_dir=tmp_path) + Path(export.seed_sets[0].seed_bib).write_text( + """ +@misc{weak2, + author = "Smith, Jane", + year = "1999", + title = "Weak Duplicate" +} +""", + encoding="utf-8", + ) + Path(export.seed_sets[1].seed_bib).write_text( + """ +@misc{weak1, + author = "Smith, Jane", + year = "1999", + title = "Weak Duplicate", + note = "Copied from legacy source" +} +""", + encoding="utf-8", + ) + + from citegeist.resolve import Resolution + + scraper.resolver.resolve_entry = lambda entry: Resolution( # type: ignore[method-assign] + entry=BibEntry( + entry_type="article", + citation_key="resolved", + fields={ + "author": entry.fields["author"], + "title": entry.fields["title"], + "year": entry.fields["year"], + "doi": "10.1000/weak", + "journal": "Journal of Better Metadata", + }, + ), + source_type="resolver", + source_label="crossref:search:Weak Duplicate", + ) + + store = BibliographyStore() + try: + scraper.ingest_export(export.manifest_path, store, dedupe=False) + results = scraper.enrich_weak_canonicals(export.manifest_path, store, apply=True) + + assert len(results) == 1 + assert results[0].applied is True + entry = store.get_entry(results[0].citation_key) + assert entry is not None + assert entry["doi"] == "10.1000/weak" + assert entry["journal"] == "Journal of Better Metadata" + assert entry["review_status"] == "enriched" + finally: + store.close() + + +def test_talkorigins_enrich_weak_canonicals_rejects_unsafe_search_match(tmp_path: Path): + base_url = "https://www.talkorigins.org/origins/biblio/" + scraper = TalkOriginsScraper( + source_client=FakeSourceClient( + { + base_url: INDEX_HTML, + f"{base_url}abiogenesis.html": ABIOGENESIS_HTML, + f"{base_url}evolution.html": EVOLUTION_HTML, + } + ) + ) + + export = scraper.scrape_to_directory(base_url=base_url, output_dir=tmp_path) + Path(export.seed_sets[0].seed_bib).write_text( + """ +@misc{weak2, + author = "Adams, D", + year = "1987", + title = "The bigger they are, the harder they fall" +} +""", + encoding="utf-8", + ) + Path(export.seed_sets[1].seed_bib).write_text( + """ +@misc{weak1, + author = "Adams, D", + year = "1987", + title = "The bigger they are, the harder they fall", + note = "Copied from legacy source" +} +""", + encoding="utf-8", + ) + + from citegeist.resolve import Resolution + + scraper.resolver.resolve_entry = lambda entry: Resolution( # type: ignore[method-assign] + entry=BibEntry( + entry_type="misc", + citation_key="resolved", + fields={ + "author": "Kulik, Dean", + "title": "The Nexus Recursive Harmonic Framework: Reality as Unbounded, Observerless Computation (SILR / RHA / CST) Ver 2", + "year": "2026", + "doi": "10.9999/not-a-match", + }, + ), + source_type="resolver", + source_label="datacite:search:The bigger they are, the harder they fall", + ) + + store = BibliographyStore() + try: + scraper.ingest_export(export.manifest_path, store, dedupe=False) + results = scraper.enrich_weak_canonicals(export.manifest_path, store, apply=True) + + assert len(results) == 1 + assert results[0].resolved is False + assert results[0].applied is False + assert results[0].error == "unsafe resolver match" + entry = 
store.get_entry("weak2") or store.get_entry("weak1") + assert entry is not None + assert entry["doi"] is None + finally: + store.close() + + +def test_talkorigins_enrich_weak_canonicals_can_allow_unsafe_search_match(tmp_path: Path): + base_url = "https://www.talkorigins.org/origins/biblio/" + scraper = TalkOriginsScraper( + source_client=FakeSourceClient( + { + base_url: INDEX_HTML, + f"{base_url}abiogenesis.html": ABIOGENESIS_HTML, + f"{base_url}evolution.html": EVOLUTION_HTML, + } + ) + ) + + export = scraper.scrape_to_directory(base_url=base_url, output_dir=tmp_path) + Path(export.seed_sets[0].seed_bib).write_text( + """ +@misc{weak2, + author = "Adams, D", + year = "1987", + title = "The bigger they are, the harder they fall" +} +""", + encoding="utf-8", + ) + Path(export.seed_sets[1].seed_bib).write_text( + """ +@misc{weak1, + author = "Adams, D", + year = "1987", + title = "The bigger they are, the harder they fall", + note = "Copied from legacy source" +} +""", + encoding="utf-8", + ) + + from citegeist.resolve import Resolution + + scraper.resolver.resolve_entry = lambda entry: Resolution( # type: ignore[method-assign] + entry=BibEntry( + entry_type="misc", + citation_key="resolved", + fields={ + "author": "Kulik, Dean", + "title": "The Nexus Recursive Harmonic Framework: Reality as Unbounded, Observerless Computation (SILR / RHA / CST) Ver 2", + "year": "2026", + "doi": "10.9999/not-a-match", + }, + ), + source_type="resolver", + source_label="datacite:search:The bigger they are, the harder they fall", + ) + + store = BibliographyStore() + try: + scraper.ingest_export(export.manifest_path, store, dedupe=False) + results = scraper.enrich_weak_canonicals( + export.manifest_path, + store, + apply=True, + allow_unsafe_matches=True, + ) + + assert len(results) == 1 + assert results[0].resolved is True + assert results[0].applied is True + entry = store.get_entry(results[0].citation_key) + assert entry is not None + assert entry["doi"] == "10.9999/not-a-match" + finally: + store.close() + + +def test_talkorigins_build_review_export_combines_cluster_and_enrichment(tmp_path: Path): + base_url = "https://www.talkorigins.org/origins/biblio/" + scraper = TalkOriginsScraper( + source_client=FakeSourceClient( + { + base_url: INDEX_HTML, + f"{base_url}abiogenesis.html": ABIOGENESIS_HTML, + f"{base_url}evolution.html": EVOLUTION_HTML, + } + ) + ) + + export = scraper.scrape_to_directory(base_url=base_url, output_dir=tmp_path) + Path(export.seed_sets[0].seed_bib).write_text( + """ +@misc{weak1, + author = "Smith, Jane", + year = "1999", + title = "Weak Duplicate" +} + +@misc{weak2, + author = "Smith, Jane", + year = "1999", + title = "Weak Duplicate", + note = "Copied from legacy source" +} +""", + encoding="utf-8", + ) + Path(export.seed_sets[1].seed_bib).write_text("", encoding="utf-8") + + from citegeist.resolve import Resolution + + scraper.resolver.resolve_entry = lambda entry: Resolution( # type: ignore[method-assign] + entry=BibEntry( + entry_type="article", + citation_key="resolved", + fields={ + "author": entry.fields["author"], + "title": entry.fields["title"], + "year": entry.fields["year"], + "doi": "10.1000/weak", + "journal": "Journal of Better Metadata", + }, + ), + source_type="resolver", + source_label="crossref:search:Weak Duplicate", + ) + + store = BibliographyStore() + try: + review = scraper.build_review_export(export.manifest_path, store) + finally: + store.close() + + assert review.item_count == 1 + assert review.items[0]["canonical"]["citation_key"] == "weak2" + assert 
review.items[0]["enrichment"]["resolved"] is True + assert review.items[0]["enrichment"]["applied"] is False + + +def test_talkorigins_apply_review_corrections_updates_store_entry(tmp_path: Path): + base_url = "https://www.talkorigins.org/origins/biblio/" + scraper = TalkOriginsScraper( + source_client=FakeSourceClient( + { + base_url: INDEX_HTML, + f"{base_url}abiogenesis.html": ABIOGENESIS_HTML, + f"{base_url}evolution.html": EVOLUTION_HTML, + } + ) + ) + + export = scraper.scrape_to_directory(base_url=base_url, output_dir=tmp_path) + Path(export.seed_sets[0].seed_bib).write_text( + """ +@misc{weak1, + author = "Smith, Jane", + year = "1999", + title = "Weak Duplicate" +} + +@misc{weak2, + author = "Smith, Jane", + year = "1999", + title = "Weak Duplicate", + note = "Copied from legacy source" +} +""", + encoding="utf-8", + ) + Path(export.seed_sets[1].seed_bib).write_text("", encoding="utf-8") + + corrections_path = tmp_path / "corrections.json" + corrections_path.write_text( + json.dumps( + { + "corrections": [ + { + "key": "smith jane|1999|weak duplicate", + "entry_type": "article", + "review_status": "reviewed", + "fields": { + "journal": "Journal of Better Metadata", + "doi": "10.1000/weak", + "note": None, + }, + } + ] + } + ), + encoding="utf-8", + ) + + store = BibliographyStore() + try: + scraper.ingest_export(export.manifest_path, store, dedupe=True) + results = scraper.apply_review_corrections(export.manifest_path, corrections_path, store) + + assert len(results) == 1 + assert results[0].applied is True + entry = store.get_entry(results[0].citation_key) + assert entry is not None + assert entry["entry_type"] == "article" + assert entry["journal"] == "Journal of Better Metadata" + assert entry["doi"] == "10.1000/weak" + assert entry["review_status"] == "reviewed" + assert entry.get("note") is None + finally: + store.close() + + +def test_talkorigins_scraper_assigns_topics_when_ingesting(tmp_path: Path): + base_url = "https://www.talkorigins.org/origins/biblio/" + scraper = TalkOriginsScraper( + source_client=FakeSourceClient( + { + base_url: INDEX_HTML, + f"{base_url}abiogenesis.html": ABIOGENESIS_HTML, + } + ) + ) + + store = BibliographyStore() + try: + export = scraper.scrape_to_directory( + base_url=base_url, + output_dir=tmp_path, + limit_topics=1, + ingest_store=store, + ) + + assert export.entry_count == 2 + entry = store.get_entry("smith1998first1") + assert entry is not None + assert entry["topics"][0]["slug"] == "abiogenesis" + assert entry["topics"][0]["name"] == "Abiogenesis" + assert store.list_topics()[0]["slug"] == "abiogenesis" + finally: + store.close() + + +def test_talkorigins_ingest_export_consolidates_duplicates_into_one_entry(tmp_path: Path): + base_url = "https://www.talkorigins.org/origins/biblio/" + scraper = TalkOriginsScraper( + source_client=FakeSourceClient( + { + base_url: INDEX_HTML, + f"{base_url}abiogenesis.html": ABIOGENESIS_HTML, + f"{base_url}evolution.html": EVOLUTION_HTML, + } + ) + ) + + export = scraper.scrape_to_directory(base_url=base_url, output_dir=tmp_path) + Path(export.seed_sets[0].seed_bib).write_text( + """ +@article{dup1, + author = "Smith, Jane", + year = "1999", + title = "Duplicate Paper", + journal = "Journal A" +} +""", + encoding="utf-8", + ) + Path(export.seed_sets[1].seed_bib).write_text( + """ +@article{dup2, + author = "Smith, Jane", + year = "1999", + title = "Duplicate Paper", + journal = "Journal B", + doi = "10.1000/dup" +} +""", + encoding="utf-8", + ) + + store = BibliographyStore() + try: + report = 
+        report = scraper.ingest_export(export.manifest_path, store)
+
+        assert report.duplicate_cluster_count >= 1
+        assert report.stored_entry_count == 1
+        assert report.canonicalized_count >= 1
+        entry = store.get_entry("dup2")
+        assert entry is not None
+        assert entry["doi"] == "10.1000/dup"
+        assert [topic["slug"] for topic in entry["topics"]] == ["abiogenesis", "evolution"]
+    finally:
+        store.close()
+
+
+def test_talkorigins_ingest_export_avoids_canonical_key_collisions(tmp_path: Path):
+    base_url = "https://www.talkorigins.org/origins/biblio/"
+    scraper = TalkOriginsScraper(
+        source_client=FakeSourceClient(
+            {
+                base_url: INDEX_HTML,
+                f"{base_url}abiogenesis.html": ABIOGENESIS_HTML,
+                f"{base_url}evolution.html": EVOLUTION_HTML,
+            }
+        )
+    )
+
+    export = scraper.scrape_to_directory(base_url=base_url, output_dir=tmp_path)
+    Path(export.seed_sets[0].seed_bib).write_text(
+        """
+@article{sharedkey,
+    author = "Smith, Jane",
+    year = "1999",
+    title = "First Paper",
+    journal = "Journal A"
+}
+""",
+        encoding="utf-8",
+    )
+    Path(export.seed_sets[1].seed_bib).write_text(
+        """
+@article{sharedkey,
+    author = "Jones, Alex",
+    year = "2001",
+    title = "Second Paper",
+    journal = "Journal B"
+}
+""",
+        encoding="utf-8",
+    )
+
+    store = BibliographyStore()
+    try:
+        report = scraper.ingest_export(export.manifest_path, store)
+
+        assert report.stored_entry_count == 2
+        entries = store.list_entries(limit=10)
+        assert len(entries) == 2
+        assert len({entry["citation_key"] for entry in entries}) == 2
+    finally:
+        store.close()
+
+
+def test_load_batch_jobs_resolves_relative_seed_paths(tmp_path: Path):
+    seed_bib = tmp_path / "seeds" / "topic.bib"
+    seed_bib.parent.mkdir(parents=True)
+    seed_bib.write_text("", encoding="utf-8")
+
+    jobs_json = tmp_path / "jobs.json"
+    jobs_json.write_text(
+        """
+{
+    "jobs": [
+        {"name": "relative-job", "seed_bib": "seeds/topic.bib", "topic": "Abiogenesis"}
+    ]
+}
+""",
+        encoding="utf-8",
+    )
+
+    jobs = load_batch_jobs(jobs_json)
+
+    assert jobs[0]["seed_bib"] == str(seed_bib.resolve())
diff --git a/tests/test_topic_expand.py b/tests/test_topic_expand.py
new file mode 100644
index 0000000..f7cd4a7
--- /dev/null
+++ b/tests/test_topic_expand.py
@@ -0,0 +1,242 @@
+from citegeist.bibtex import BibEntry
+from citegeist.expand import (
+    ExpansionResult,
+    TopicExpander,
+    _meets_topic_assignment_threshold,
+    _topic_relevance_score,
+)
+from citegeist.storage import BibliographyStore
+
+
+class FakeOpenAlexExpander:
+    def __init__(self, results: list[ExpansionResult] | dict[str, list[ExpansionResult]]) -> None:
+        self.results = results
+
+    def expand_entry(self, store, citation_key, relation_type="cites", limit=25):
+        if isinstance(self.results, dict):
+            return list(self.results.get(citation_key, []))
+        return list(self.results)
+
+
+def test_topic_expander_assigns_relevant_discoveries_back_to_topic():
+    store = BibliographyStore()
+    try:
+        store.ingest_bibtex(
+            """
+@article{seed2024,
+    author = {Seed, Alice},
+    title = {Abiogenesis Seed Paper},
+    year = {2024}
+}
+"""
+        )
+        store.add_entry_topic(
+            "seed2024",
+            topic_slug="abiogenesis",
+            topic_name="Abiogenesis",
+            source_type="talkorigins",
+            source_url="https://example.org/topics/abiogenesis",
+            source_label="seed",
+        )
+        store.upsert_entry(
+            BibEntry(
+                entry_type="article",
+                citation_key="discovered1",
+                fields={
+                    "title": "Abiogenesis and origin chemistry",
+                    "abstract": "A study of abiogenesis pathways.",
+                    "year": "2025",
+                },
+            ),
+            source_type="graph_expand",
+            source_label="test",
+            review_status="draft",
+        )
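+        # Descriptive comment (editorial addition): a second discovery whose
+        # metadata does not match the topic phrase and should stay unassigned.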
+        store.upsert_entry(
+            BibEntry(
+                entry_type="article",
+                citation_key="discovered2",
+                fields={
+                    "title": "Galaxy formation dynamics",
+                    "abstract": "Nothing about the topic.",
+                    "year": "2025",
+                },
+            ),
+            source_type="graph_expand",
+            source_label="test",
+            review_status="draft",
+        )
+        store.connection.commit()
+
+        expander = TopicExpander(
+            openalex_expander=FakeOpenAlexExpander(
+                [
+                    ExpansionResult("seed2024", "discovered1", False, "cites", "openalex:cites:seed2024"),
+                    ExpansionResult("seed2024", "discovered2", False, "cites", "openalex:cites:seed2024"),
+                ]
+            )
+        )
+
+        results = expander.expand_topic(
+            store,
+            "abiogenesis",
+            topic_phrase="abiogenesis origin chemistry",
+            min_relevance=0.34,
+        )
+
+        assert len(results) == 2
+        assigned = {item.discovered_citation_key: item.assigned_to_topic for item in results}
+        assert assigned["discovered1"] is True
+        assert assigned["discovered2"] is False
+        topics = store.get_entry_topics("discovered1")
+        assert topics[0]["slug"] == "abiogenesis"
+        assert store.get_entry_topics("discovered2") == []
+    finally:
+        store.close()
+
+
+def test_topic_expander_can_restrict_to_allowed_seed_keys():
+    store = BibliographyStore()
+    try:
+        store.ingest_bibtex(
+            """
+@article{seed2024,
+    author = {Seed, Alice},
+    title = {Abiogenesis Seed Paper},
+    year = {2024}
+}
+
+@article{seed2023,
+    author = {Seed, Bob},
+    title = {Abiogenesis Historical Seed},
+    year = {2023}
+}
+"""
+        )
+        for citation_key in ("seed2024", "seed2023"):
+            store.add_entry_topic(
+                citation_key,
+                topic_slug="abiogenesis",
+                topic_name="Abiogenesis",
+                source_type="talkorigins",
+                source_url="https://example.org/topics/abiogenesis",
+                source_label="seed",
+            )
+        store.upsert_entry(
+            BibEntry(
+                entry_type="article",
+                citation_key="discovered1",
+                fields={
+                    "title": "Abiogenesis origin chemistry",
+                    "abstract": "A study of abiogenesis chemistry.",
+                    "year": "2025",
+                },
+            ),
+            source_type="graph_expand",
+            source_label="test",
+            review_status="draft",
+        )
+        store.connection.commit()
+
+        expander = TopicExpander(
+            openalex_expander=FakeOpenAlexExpander(
+                {"seed2023": [ExpansionResult("seed2023", "discovered1", False, "cites", "openalex:cites:seed2023")]}
+            )
+        )
+
+        results = expander.expand_topic(
+            store,
+            "abiogenesis",
+            topic_phrase="abiogenesis origin chemistry",
+            seed_keys=["seed2024"],
+        )
+
+        assert results == []
+        assert store.get_entry_topics("discovered1") == []
+    finally:
+        store.close()
+
+
+def test_topic_expander_preview_discovers_without_writing():
+    store = BibliographyStore()
+    try:
+        store.ingest_bibtex(
+            """
+@article{seed2024,
+    author = {Seed, Alice},
+    title = {Abiogenesis Seed Paper},
+    year = {2024}
+}
+"""
+        )
+        store.add_entry_topic(
+            "seed2024",
+            topic_slug="abiogenesis",
+            topic_name="Abiogenesis",
+            source_type="talkorigins",
+            source_url="https://example.org/topics/abiogenesis",
+            source_label="seed",
+        )
+        store.connection.commit()
+
+        expander = TopicExpander()
+        expander._preview_discoveries = lambda *_args, **_kwargs: [  # type: ignore[method-assign]
+            (
+                ExpansionResult(
+                    "seed2024",
+                    "preview1",
+                    True,
+                    "cites",
+                    "openalex:cites:seed2024",
+                ),
+                {
+                    "title": "Abiogenesis origin chemistry",
+                    "abstract": "A study of abiogenesis chemistry.",
+                    "year": "2025",
+                },
+            )
+        ]
+
+        results = expander.expand_topic(
+            store,
+            "abiogenesis",
+            topic_phrase="abiogenesis origin chemistry",
+            min_relevance=0.3,
+            preview_only=True,
+        )
+
+        assert len(results) == 1
+        assert results[0].discovered_citation_key == "preview1"
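+        # Descriptive comment (editorial addition): preview mode reports what
+        # would change without persisting entries or topic links to the store.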
+        assert results[0].meets_relevance_threshold is True
+        assert results[0].assigned_to_topic is False
+        assert results[0].created_entry is True
+        assert store.get_entry("preview1") is None
+        assert store.get_entry_topics("preview1") == []
+    finally:
+        store.close()
+
+
+def test_topic_relevance_score_expands_human_evolution_terms():
+    score = _topic_relevance_score(
+        "human evolution",
+        {
+            "title": "Body size and proportions in early hominids",
+            "abstract": "A fossil and paleolithic perspective on primate ancestry.",
+            "journal": "Science",
+        },
+    )
+
+    assert score >= 0.15
+
+
+def test_topic_assignment_requires_title_anchor():
+    entry = {
+        "title": "Phylogenies and the Comparative Method",
+        "abstract": "A comparative framework for primate and hominid evolution.",
+        "journal": "Systematic Zoology",
+    }
+
+    score = _topic_relevance_score("human evolution", entry)
+
+    assert score >= 0.15
+    assert _meets_topic_assignment_threshold("human evolution", entry, min_relevance=0.15, relevance_score=score) is False