Compare commits


No commits in common. "0c6380562b38316170a77821b9d472fba02b194d" and "b74582b72f09f36b63e459c26e3cc7ea3d0696c2" have entirely different histories.

34 changed files with 10571 additions and 238 deletions

231
.gitignore vendored

@@ -1,229 +1,6 @@
# ---> Python
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
.venv/
.cache/
# Translations
*.pyc
*.mo
library.sqlite3
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
# ---> Emacs
# -*- mode: gitignore; -*-
*~
\#*\#
/.emacs.desktop
/.emacs.desktop.lock
*.elc
auto-save-list
tramp
.\#*
# Org-mode
.org-id-locations
*_archive
# flymake-mode
*_flymake.*
# eshell files
/eshell/history
/eshell/lastdir
# elpa packages
/elpa/
# reftex files
*.rel
# AUCTeX auto folder
/auto/
# cask packages
.cask/
dist/
# Flycheck
flycheck_*.el
# server auth directory
/server/
# projectiles files
.projectile
# directory configuration
.dir-locals.el
# network security
/network-security.data
# ---> Rust
# Generated by Cargo
# will have compiled files and executables
debug/
target/
# Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
# More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
Cargo.lock
# These are backup files generated by rustfmt
**/*.rs.bk
# MSVC Windows builds of rustc generate these, which store debugging information
*.pdb

9
LICENSE

@@ -1,9 +0,0 @@
MIT License
Copyright (c) 2026 welsberr
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

16
Makefile Normal file

@@ -0,0 +1,16 @@
PYTHONPATH_SRC=PYTHONPATH=src
VENV_PYTHON=.venv/bin/python

.PHONY: test test-live live-smoke validate-talkorigins

test:
	$(PYTHONPATH_SRC) $(VENV_PYTHON) -m pytest -q

test-live:
	CITEGEIST_LIVE_TESTS=1 CITEGEIST_SOURCE_CACHE=.cache/citegeist $(PYTHONPATH_SRC) $(VENV_PYTHON) -m pytest -m live -q

live-smoke:
	CITEGEIST_SOURCE_CACHE=.cache/citegeist $(PYTHONPATH_SRC) $(VENV_PYTHON) scripts/live_smoke.py

validate-talkorigins:
	$(PYTHONPATH_SRC) $(VENV_PYTHON) -m citegeist validate-talkorigins talkorigins-out/talkorigins_manifest.json

254
README.md

@@ -1,3 +1,253 @@
# CiteGeist
A bibliography workbench based on Bibtex and local SQLite databases, aimed at several common bibliography tasks: ingestion of plain-text references, augmentation of Bibtex entries with metadata, graph representations of citations, graph expansion from a citation set, and more.
# citegeist
`citegeist` is a research-oriented bibliography workbench for building, expanding, and auditing BibTeX libraries.
The aim is not just to store citations. The aim is to help with the harder problem: finding, improving, connecting, and checking the literature around a topic while keeping BibTeX as a first-class output format.
## Repo Description
`citegeist` is a BibTeX-native research tool for citation extraction, metadata enrichment, citation-graph expansion, and semantic search over scholarly sources.
## Scope
The project is intended to support a workflow like this:
1. Start from rough references extracted from papers, notes, syllabi, or dissertations.
2. Convert them into draft BibTeX entries.
3. Enrich and correct those entries using external scholarly metadata sources.
4. Persist entries, identifiers, abstracts, and citation edges in a local database.
5. Traverse the citation graph outward to discover additional relevant works.
6. Search the local corpus semantically using abstracts and extracted full text.
7. Export verified results back into BibTeX for LaTeX use.
## Why A New Codebase
This repository starts cleanly rather than extending the older `bib/` toolkit directly.
The older toolkit is useful as prior art:
- it demonstrates identifier-driven metadata augmentation;
- it caches PDFs and extracted plaintext;
- it shows one workable model for bibliography growth.
But it is not the right long-term base:
- it is Python 2-era code;
- it is shell-script centric;
- it does not provide a normalized database for graph workflows;
- it is not structured as a reusable Python 3 library.
`citegeist` keeps the useful ideas and rebuilds the foundation around a cleaner Python 3 package boundary.
## Current Status
The initial repo includes:
- `pybtex`-backed BibTeX parsing and export in a repo-local virtual environment;
- a SQLite-backed bibliography store;
- a small CLI for ingest, search, inspection, and export;
- review-state tracking on entries, per-field ingest provenance, and field-level conflict review;
- plaintext reference extraction into draft BibTeX for numbered, APA-like, wrapped-line, and simple book-style references;
- identifier-first metadata resolution for DOI, OpenAlex, DBLP, arXiv, and DataCite-backed entries, with OpenAlex/DataCite title-search fallback;
- local citation-graph traversal over stored `cites`, `cited_by`, and `crossref` edges;
- Crossref- and OpenAlex-backed graph expansion that materializes draft related works and edge provenance;
- a dedicated source-client layer with fixture/cache support for live-source development;
- OAI-PMH Dublin Core harvesting for institutional repositories and thesis/dissertation sources;
- OAI-PMH repository discovery via `Identify`, `ListSets`, and `ListMetadataFormats` to target harvests more precisely;
- bibliography bootstrap workflows that can start from a seed `.bib`, a topic phrase, or both;
- batch bootstrap orchestration from JSON job files containing seed BibTeX paths, topic phrases, or both;
- a TalkOrigins scraper that fixes repeated-author plaintext references, emits per-topic seed BibTeX files, and writes a batch JSON specification;
- normalized tables for entries, creators, identifiers, and citation relations;
- full-text-search-ready indexing over title, abstract, and fulltext when SQLite FTS5 is available;
- tests covering parsing, ingestion, relation storage, and search.
The prioritized execution plan lives in [ROADMAP.md](./ROADMAP.md).
## Layout
```text
citegeist/
src/citegeist/
bibtex.py
storage.py
tests/
test_storage.py
pyproject.toml
```
## Quick Start
```bash
cd citegeist
python3 -m virtualenv --always-copy .venv
.venv/bin/pip install -e .
.venv/bin/pip install pytest
mkdir -p .cache/citegeist
PYTHONPATH=src .venv/bin/python - <<'PY'
from citegeist import BibliographyStore
bib = """
@article{smith2024graphs,
author = {Smith, Jane and Doe, Alex},
title = {Graph-first bibliography augmentation},
year = {2024},
abstract = {We study citation graphs for literature discovery.},
references = {miller2023search}
}
@inproceedings{miller2023search,
author = {Miller, Sam},
title = {Semantic search for research corpora},
year = {2023},
abstract = {Dense retrieval improves recall for academic search.}
}
"""
store = BibliographyStore("library.sqlite3")
store.ingest_bibtex(bib)
print(store.get_relations("smith2024graphs"))
print(store.search_text("semantic"))
store.close()
PY
.venv/bin/python -m pytest -q
```
Or use the CLI directly:
```bash
cd citegeist
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 ingest references.bib
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 search "semantic search"
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 search "origin" --topic abiogenesis
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 show --provenance --conflicts smith2024graphs
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 set-status smith2024graphs reviewed
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 resolve-conflicts smith2024graphs title accepted
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 apply-conflict smith2024graphs title
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 bootstrap --seed-bib seed.bib --topic "bayesian nonparametrics"
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 bootstrap --topic "bayesian nonparametrics" --preview --topic-commit-limit 5
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 scrape-talkorigins talkorigins-out --limit-topics 5 --limit-entries-per-topic 20
PYTHONPATH=src .venv/bin/python -m citegeist extract references.txt --output draft.bib
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 resolve smith2024graphs
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 topics
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 topic-entries abiogenesis
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 export-topic abiogenesis --output abiogenesis.bib
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 graph smith2024graphs --relation cites --depth 2 --missing-only
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 expand smith2024graphs --source crossref
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 expand smith2024graphs --source openalex --relation cited_by --limit 10
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 expand-topic abiogenesis --topic-phrase "abiogenesis origin chemistry" --source openalex --relation cites --seed-key seed2024 --min-relevance 0.3 --preview
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 set-topic-phrase abiogenesis "abiogenesis origin chemistry prebiotic"
PYTHONPATH=src .venv/bin/python -m citegeist discover-oai https://example.edu/oai
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 harvest-oai https://example.edu/oai --metadata-prefix mods --from 2024-01-01 --until 2024-12-31 --limit 10
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 export --output reviewed.bib
```
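The `extract` command is also available at the library level through `extract_references`, which returns draft `BibEntry` objects. A minimal sketch with invented reference lines; each reference block must be at least 20 characters long to be treated as a reference, and numbered, APA-like lines are among the supported shapes:
```python
from citegeist import extract_references

# Invented sample input for illustration only.
text = """
[1] Smith, J. (1999). A perfectly ordinary article title. Journal of Examples, 12, 34-56.
[2] Doe, A. (2004). Another sufficiently long reference line. Example Press.
"""

for entry in extract_references(text):
    print(entry.entry_type, entry.citation_key, entry.fields.get("title"))
```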
For live-source development, prefer fixture-backed or cache-backed source clients so resolver and expansion work can be exercised repeatedly without re-hitting upstream APIs on every run.
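A minimal sketch of that setup, following `scripts/live_smoke.py`; the fixture directory path here is an invented example:
```python
from citegeist import MetadataResolver, SourceClient

# Fixtures, when provided, are consulted before any live network call;
# cached responses land under cache_dir for reuse across runs.
client = SourceClient(cache_dir=".cache/citegeist", fixtures_dir="tests/fixtures/sources")
resolver = MetadataResolver(source_client=client)

resolution = resolver.resolve_doi("10.1038/nphys1170")
if resolution is not None:
    print(resolution.source_label, resolution.entry.fields.get("title"))
```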
For large legacy plaintext corpora such as the TalkOrigins bibliography, prefer a two-step workflow:
1. `scrape-talkorigins` to generate cleaned per-topic `seed_bib` files plus a `talkorigins_jobs.json` batch spec.
2. `bootstrap-batch` on that JSON file when you want to ingest, resolve, and expand from the generated seeds. A sample jobs file is sketched below.
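A minimal `bootstrap-batch` jobs file might look like this; the keys mirror what `load_batch_jobs` and `BatchBootstrapRunner` read in `src/citegeist/batch.py`, while the values are illustrative:
```json
{
  "jobs": [
    {
      "name": "abiogenesis_seed",
      "seed_bib": "seeds/abiogenesis.bib",
      "topic": "abiogenesis",
      "topic_slug": "abiogenesis",
      "topic_phrase": "abiogenesis origin chemistry prebiotic",
      "topic_limit": 5,
      "expand": true,
      "status": "draft"
    }
  ]
}
```
Relative `seed_bib` paths are resolved against the directory containing the jobs file.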
The TalkOrigins scrape output now includes:
- `seeds/*.bib` per-topic seed BibTeX files for `bootstrap-batch`
- `plaintext/*.txt` per-topic cleaned GSA-style plaintext with repeated authors expanded
- `site/topics/*.html` reconstructed topic pages with hide/show BibTeX blocks
- `talkorigins_full.txt` and `talkorigins_full.bib` aggregate downloads
- `snapshots/*.json` cached topic payloads so reruns can resume without re-fetching already scraped topics
After a full scrape, run:
```bash
PYTHONPATH=src .venv/bin/python -m citegeist validate-talkorigins talkorigins-out/talkorigins_manifest.json
PYTHONPATH=src .venv/bin/python -m citegeist duplicates-talkorigins talkorigins-out/talkorigins_manifest.json --limit 20
PYTHONPATH=src .venv/bin/python -m citegeist duplicates-talkorigins talkorigins-out/talkorigins_manifest.json --limit 20 --preview --weak-only
PYTHONPATH=src .venv/bin/python -m citegeist suggest-talkorigins-phrases talkorigins-out/talkorigins_manifest.json --output topic-phrases.json
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 stage-topic-phrases topic-phrases.json
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 review-topic-phrase abiogenesis accepted --notes "curated from local corpus"
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 apply-topic-phrases topic-phrases.json
PYTHONPATH=src .venv/bin/python -m citegeist --db talkorigins.sqlite3 enrich-talkorigins talkorigins-out/talkorigins_manifest.json --limit 20
PYTHONPATH=src .venv/bin/python -m citegeist --db talkorigins-copy.sqlite3 enrich-talkorigins talkorigins-out/talkorigins_manifest.json --limit 5 --apply --allow-unsafe-search-matches
PYTHONPATH=src .venv/bin/python -m citegeist --db talkorigins.sqlite3 review-talkorigins talkorigins-out/talkorigins_manifest.json --output talkorigins-review.json
PYTHONPATH=src .venv/bin/python -m citegeist --db talkorigins.sqlite3 apply-talkorigins-corrections talkorigins-out/talkorigins_manifest.json talkorigins-corrections.json
```
The `validate-talkorigins` report summarizes parse coverage and flags suspicious entry-type / venue combinations for manual cleanup.
It also reports duplicate clusters across topic seed files so you can gauge how much deduplication pressure to expect before ingestion.
Use `duplicates-talkorigins` when you want to inspect specific clusters, filter by text, restrict the audit to one topic slug, or preview only weak canonicalization outcomes before importing.
Use `suggest-talkorigins-phrases` to derive candidate stored expansion phrases from the existing TalkOrigins topic corpus itself. The output is deterministic JSON keyed by topic slug, with a suggested phrase plus the extracted keywords that drove it. This is a useful first pass before setting topic phrases in the database or editing generated batch jobs.
Use `stage-topic-phrases` to load those suggestions into the database as review items. Staging stores the candidate in `suggested_phrase` and marks the topic `pending` without changing the active `expansion_phrase`.
Use `review-topic-phrase` to accept or reject one staged suggestion in place. Accepting a suggestion copies it into `expansion_phrase`; rejecting it preserves the review state without changing the live phrase.
Use `apply-topic-phrases` when you want a direct patch path instead of the staged review flow. It accepts either the raw suggestion list or an object with a `topics` list, and will apply `suggested_phrase` or `phrase` to matching topic slugs immediately.
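A sketch of building such a patch file; the top-level `topics` shape follows the description above, but the `topic_slug` field name is an assumption mirroring other commands rather than a confirmed schema:
```python
import json

payload = {
    "topics": [
        # `topic_slug` is an assumed field name; `suggested_phrase` and `phrase`
        # are the two value keys the command is documented to apply.
        {"topic_slug": "abiogenesis", "suggested_phrase": "abiogenesis origin chemistry prebiotic"},
        {"topic_slug": "human-evolution", "phrase": "hominid fossil cranial morphology"},
    ]
}

with open("topic-phrases.json", "w", encoding="utf-8") as handle:
    json.dump(payload, handle, indent=2, sort_keys=True)
```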
Use `enrich-talkorigins` when you want to target those weak canonical entries for resolver-based metadata upgrades before retrying graph expansion on imported topic slices.
Use `review-talkorigins` when you want one JSON review artifact that combines weak canonical clusters with dry-run enrichment outcomes for manual cleanup.
Use `expand-topic` when you already have both a topic phrase and a curated topic seed set in the database: it expands outward from the topic's existing entries, then only assigns discovered works back to that topic if they clear a topic-relevance threshold. Write-enabled assignment is stricter than preview ranking: a candidate must clear the score threshold and show a non-generic title anchor to the topic phrase, so broad methods papers do not get attached just because their abstracts or related terms overlap. On large noisy topics, prefer `--seed-key` to restrict the run to just the trusted seed entries you want to expand from, and use `--preview` first to inspect discovered candidates and relevance scores before writing anything.
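The same flow is reachable at the library level through `TopicExpander` (see `src/citegeist/expand.py` later in this diff). A preview-mode sketch with illustrative database path, slug, and seed key:
```python
from citegeist import BibliographyStore
from citegeist.expand import TopicExpander

store = BibliographyStore("talkorigins.sqlite3")
expander = TopicExpander()

# preview_only=True scores candidates without writing entries, relations,
# or topic assignments.
results = expander.expand_topic(
    store,
    "abiogenesis",
    topic_phrase="abiogenesis origin chemistry",
    source="openalex",
    relation_type="cites",
    seed_keys=["seed2024"],
    min_relevance=0.3,
    preview_only=True,
)
for result in results:
    print(result.discovered_citation_key, round(result.relevance_score, 2), result.meets_relevance_threshold)

store.close()
```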
Use `set-topic-phrase` to store a curated expansion phrase on the topic itself. When a stored phrase exists, `expand-topic` will use it automatically if you do not pass `--topic-phrase`. Batch bootstrap jobs can also set `topic_slug`, `topic_name`, and `topic_phrase` so curated topic metadata is created as part of the run.
Use `topics --phrase-review-status pending` when you want to audit only topics whose staged phrase suggestions still need review.
`--allow-unsafe-search-matches` exists only for bounded experiments on copied databases when you explicitly want to relax trust to exercise downstream expansion behavior.
Correction files are simple JSON:
```json
{
"corrections": [
{
"key": "smith jane|1999|weak duplicate",
"entry_type": "article",
"review_status": "reviewed",
"fields": {
"journal": "Journal of Better Metadata",
"doi": "10.1000/weak",
"note": null
}
}
]
}
```
`fields` values overwrite the canonical entry for that duplicate-cluster key. Set a field to `null` to remove it.
To import the reconstructed corpus into SQLite while collapsing duplicate works across topics into canonical entries:
```bash
PYTHONPATH=src .venv/bin/python -m citegeist --db talkorigins.sqlite3 ingest-talkorigins talkorigins-out/talkorigins_manifest.json
```
That import preserves many-to-many topic membership through the `topics` and `entry_topics` tables.
After import, use `topics`, `topic-entries`, `search --topic`, and `export-topic` to inspect or export topic slices from the consolidated database.
Live-source workflow:
```bash
cd citegeist
export CITEGEIST_SOURCE_CACHE=.cache/citegeist
export CITEGEIST_LIVE_TESTS=1
PYTHONPATH=src .venv/bin/python -m pytest -m live -q
PYTHONPATH=src .venv/bin/python scripts/live_smoke.py
```
By default, live tests are skipped. They only run when `CITEGEIST_LIVE_TESTS=1` is set.
Convenience targets:
```bash
make test
make test-live
make live-smoke
```
## Near-Term Priorities
- source adapters beyond OAI-PMH for additional non-DOI scholarly ecosystems.
See [ROADMAP.md](./ROADMAP.md) for the prioritized phase plan and rationale.
## Naming
The name is intended to be short, distinct, and memorable:
- `cite` for citation work;
- `geist` for the organizing intelligence around the literature.

187
ROADMAP.md Normal file

@@ -0,0 +1,187 @@
# Roadmap
This roadmap prioritizes a usable local research workflow over breadth of integrations.
The first objective is not to support every metadata source. The first objective is to make one end-to-end path work reliably:
1. ingest draft references,
2. normalize and store them,
3. enrich them,
4. traverse citation links,
5. export reviewed BibTeX.
## Prioritization Principles
- prioritize steps that make the system usable by a single researcher on a local machine;
- prioritize deterministic infrastructure before network integrations;
- keep every stage inspectable and auditable;
- treat verification and provenance as core features, not cleanup work;
- defer heavy semantic infrastructure until the local corpus model is stable.
## Current Baseline
Completed:
- lightweight BibTeX parsing;
- SQLite storage for entries, creators, identifiers, and relations;
- local text search using SQLite FTS5 when available;
- tests for ingest, relation storage, and search.
## Phase 1: Core Ingestion And Export
Priority: P0
Goal:
Make `citegeist` useful as a local BibTeX workbench even before online enrichment is added.
Tasks:
- add BibTeX export from the normalized database back into stable, readable BibTeX;
- add a small CLI for `ingest`, `show`, `search`, and `export`;
- store field provenance metadata alongside imported and edited fields;
- add schema support for entry status such as `draft`, `enriched`, `reviewed`, and `exported`;
- add fixture-driven tests for round-tripping BibTeX through ingest and export.
Why this comes first:
- without export, the project is not yet useful in a LaTeX workflow;
- without a CLI, the package is a library demo rather than a tool;
- without provenance and state, later enrichment work becomes hard to audit.
Exit criteria:
- a user can ingest a `.bib` file, inspect entries, search locally, and export a reviewed `.bib`;
- round-trip tests show no unexpected field loss for supported entry types.
## Phase 2: Reference Extraction
Priority: P0
Goal:
Turn raw reference text into draft entries that can enter the main pipeline.
Tasks:
- add parsers for bibliography-section lines and plain-text reference lists;
- define a draft-entry schema for incomplete references with confidence markers;
- support ingestion of OCR- or PDF-derived plaintext bibliography sections;
- add normalization for author names, years, title casing, and page ranges;
- build gold-test fixtures from real, messy reference examples.
Why this is next:
- this addresses the project's first unique bottleneck: getting rough references into structured form;
- enrichment is much more effective once draft references are normalized.
Exit criteria:
- a user can pass a plaintext bibliography section and receive draft BibTeX entries with unresolved fields clearly marked;
- tests cover common article, book, chapter, and proceedings references.
## Phase 3: Metadata Enrichment
Priority: P1
Goal:
Resolve draft or partial entries against external scholarly sources and merge improved metadata safely.
Tasks:
- define a resolver interface with deterministic merge rules;
- implement first-party resolvers for DOI/Crossref, DBLP, and arXiv;
- add identifier-first resolution, then title/author/year fallback search;
- store merge provenance per field and resolution attempt logs;
- flag conflicts rather than silently overwriting disputed values.
Why this is P1 rather than the first phase:
- enrichment quality depends on the ingestion and provenance model being correct first;
- it is easier to test deterministic merge behavior once local workflows already exist.
Exit criteria:
- an incomplete entry can be enriched from at least one authoritative source;
- conflicting fields remain visible for review instead of being lost.
## Phase 4: Citation Graph Expansion
Priority: P1
Goal:
Use citation edges as a discovery engine rather than just metadata storage.
Tasks:
- support explicit `cites` and `cited_by` edge ingestion with source provenance;
- add graph expansion commands starting from one or more seed entries;
- track edge discovery source, timestamp, and confidence;
- add filters for depth, source type, year range, and reviewed status;
- expose unresolved nodes so the user can decide what to enrich next.
Why this matters:
- this is central to literature discovery rather than mere bibliography cleanup;
- it turns the database into a research navigation tool.
Exit criteria:
- starting from one or more seed entries, a user can expand outward through citation edges and persist newly discovered nodes;
- graph traversal results can be exported as BibTeX candidates for review.
## Phase 5: Search And Ranking
Priority: P2
Goal:
Improve discovery quality inside the local corpus.
Tasks:
- refine FTS ranking across title, abstract, keywords, and fulltext;
- add saved search queries and result filters;
- add optional embedding-backed semantic search behind a pluggable interface;
- support hybrid ranking that combines lexical matching, identifiers, and citation proximity;
- add benchmarking fixtures for retrieval quality on a few research topics.
Why this is later:
- FTS is already enough to support early workflows;
- embedding infrastructure is expensive and should wait until the corpus schema stabilizes.
Exit criteria:
- local search is useful on realistic corpora without requiring external services;
- semantic indexing is optional and does not displace the simpler local search path.
## Phase 6: Corpus Acquisition Pipelines
Priority: P2
Goal:
Broaden source acquisition without mixing that complexity into the core model.
Tasks:
- add source adapters for open-access theses and dissertation repositories;
- add support for harvesting publisher citation pages and preprint metadata pages;
- define per-source import provenance and rate-limit behavior;
- separate source-specific scraping logic from normalized entry storage;
- add regression fixtures for representative public sources.
Why this is later:
- acquisition breadth is useful, but only after the core ingest/enrich/review loop is solid;
- source adapters are brittle and should sit on top of a stable model.
Exit criteria:
- new public corpora can be imported through adapters without changing the storage core;
- imported entries retain their source provenance and can be reviewed like any other entry.
## Suggested Next Three Tasks
1. Add a CLI module with `ingest`, `search`, `show`, and `export`.
2. Implement BibTeX export from the normalized store.
3. Add provenance tables and entry review status fields.
These three tasks complete the first usable local workflow and should be treated as the immediate sprint.

20
pyproject.toml Normal file

@@ -0,0 +1,20 @@
[build-system]
requires = ["setuptools>=68"]
build-backend = "setuptools.build_meta"

[project]
name = "citegeist"
version = "0.1.0"
description = "BibTeX-native tooling for bibliography augmentation, citation graphs, and search"
requires-python = ">=3.10"
dependencies = ["pybtex==0.25.1"]

[project.scripts]
citegeist = "citegeist.cli:main"

[tool.pytest.ini_options]
pythonpath = ["src"]
testpaths = ["tests"]
markers = [
    "live: tests that call live external scholarly APIs and are skipped unless explicitly enabled",
]

58
scripts/live_smoke.py Normal file

@@ -0,0 +1,58 @@
from __future__ import annotations

import argparse
import json
import os

from citegeist import MetadataResolver, SourceClient


def build_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(description="Run live smoke checks against scholarly metadata sources")
    parser.add_argument(
        "--cache-dir",
        default=os.environ.get("CITEGEIST_SOURCE_CACHE", ".cache/citegeist"),
        help="Directory for cached live-source responses",
    )
    parser.add_argument(
        "--fixtures-dir",
        default=os.environ.get("CITEGEIST_SOURCE_FIXTURES"),
        help="Optional fixture directory to read before live network calls",
    )
    return parser


def main() -> int:
    args = build_parser().parse_args()
    client = SourceClient(cache_dir=args.cache_dir, fixtures_dir=args.fixtures_dir)
    resolver = MetadataResolver(source_client=client)
    checks = {
        "crossref_doi": resolver.resolve_doi("10.1038/nphys1170"),
        "arxiv_id": resolver.resolve_arxiv("1706.03762"),
        "openalex_search": resolver.search_openalex_best_match(
            title="Attention Is All You Need",
            author_text="Ashish Vaswani",
            year="2017",
        ),
    }
    payload = {}
    for name, resolution in checks.items():
        payload[name] = None
        if resolution is not None:
            payload[name] = {
                "source_label": resolution.source_label,
                "title": resolution.entry.fields.get("title"),
                "year": resolution.entry.fields.get("year"),
                "doi": resolution.entry.fields.get("doi"),
                "openalex": resolution.entry.fields.get("openalex"),
                "arxiv": resolution.entry.fields.get("arxiv"),
            }
    print(json.dumps(payload, indent=2, sort_keys=True))
    return 0


if __name__ == "__main__":
    raise SystemExit(main())

52
src/citegeist/__init__.py Normal file

@@ -0,0 +1,52 @@
from .batch import BatchBootstrapRunner, BatchJobResult, load_batch_jobs
from .bibtex import BibEntry, parse_bibtex
from .bootstrap import BootstrapResult, Bootstrapper
from .expand import CrossrefExpander, OpenAlexExpander
from .extract import extract_references
from .harvest import OaiMetadataFormat, OaiPmhHarvester, OaiSet
from .resolve import MetadataResolver, merge_entries, merge_entries_with_conflicts
from .sources import SourceClient
from .storage import BibliographyStore
from .talkorigins import (
    TalkOriginsBatchExport,
    TalkOriginsDuplicateCluster,
    TalkOriginsEnrichmentResult,
    TalkOriginsIngestReport,
    TalkOriginsReviewExport,
    TalkOriginsScraper,
    TalkOriginsSeedSet,
    TalkOriginsTopicPhraseSuggestion,
    TalkOriginsTopic,
    TalkOriginsValidationReport,
)

__all__ = [
    "BibEntry",
    "BatchBootstrapRunner",
    "BatchJobResult",
    "BibliographyStore",
    "BootstrapResult",
    "Bootstrapper",
    "CrossrefExpander",
    "MetadataResolver",
    "OpenAlexExpander",
    "OaiPmhHarvester",
    "OaiMetadataFormat",
    "OaiSet",
    "SourceClient",
    "TalkOriginsBatchExport",
    "TalkOriginsDuplicateCluster",
    "TalkOriginsEnrichmentResult",
    "TalkOriginsIngestReport",
    "TalkOriginsReviewExport",
    "TalkOriginsScraper",
    "TalkOriginsSeedSet",
    "TalkOriginsTopicPhraseSuggestion",
    "TalkOriginsTopic",
    "TalkOriginsValidationReport",
    "extract_references",
    "load_batch_jobs",
    "merge_entries",
    "merge_entries_with_conflicts",
    "parse_bibtex",
]

4
src/citegeist/__main__.py Normal file

@@ -0,0 +1,4 @@
from .cli import main
raise SystemExit(main())

78
src/citegeist/batch.py Normal file

@@ -0,0 +1,78 @@
from __future__ import annotations

import json
from dataclasses import dataclass
from pathlib import Path

from .bootstrap import BootstrapResult, Bootstrapper
from .storage import BibliographyStore


@dataclass(slots=True)
class BatchJobResult:
    job_name: str
    result_count: int
    results: list[BootstrapResult]


def load_batch_jobs(path: str | Path) -> list[dict]:
    path = Path(path)
    payload = json.loads(path.read_text(encoding="utf-8"))
    if isinstance(payload, dict):
        jobs = payload.get("jobs", [])
    else:
        jobs = payload
    if not isinstance(jobs, list):
        raise ValueError("Batch JSON must be a list of jobs or an object with a 'jobs' list")
    normalized_jobs: list[dict] = []
    for job in jobs:
        if not isinstance(job, dict):
            raise ValueError("Each batch job must be an object")
        normalized = dict(job)
        seed_bib = normalized.get("seed_bib")
        if isinstance(seed_bib, str) and seed_bib:
            seed_path = Path(seed_bib)
            if not seed_path.is_absolute():
                normalized["seed_bib"] = str((path.parent / seed_path).resolve())
        normalized_jobs.append(normalized)
    return normalized_jobs


class BatchBootstrapRunner:
    def __init__(self, bootstrapper: Bootstrapper | None = None) -> None:
        self.bootstrapper = bootstrapper or Bootstrapper()

    def run(self, store: BibliographyStore, jobs: list[dict]) -> list[BatchJobResult]:
        results: list[BatchJobResult] = []
        for index, job in enumerate(jobs, start=1):
            seed_bib = job.get("seed_bib")
            topic = job.get("topic")
            topic_limit = int(job.get("topic_limit", 5))
            topic_commit_limit = job.get("topic_commit_limit")
            expand = bool(job.get("expand", True))
            review_status = str(job.get("status", "draft"))
            preview = bool(job.get("preview", False))
            name = str(job.get("name") or f"job_{index}")
            topic_slug = job.get("topic_slug")
            topic_name = job.get("topic_name")
            topic_phrase = job.get("topic_phrase")
            seed_bibtex = None
            if seed_bib:
                seed_bibtex = Path(seed_bib).read_text(encoding="utf-8")
            job_results = self.bootstrapper.bootstrap(
                store,
                seed_bibtex=seed_bibtex,
                topic=topic,
                topic_limit=topic_limit,
                topic_commit_limit=int(topic_commit_limit) if topic_commit_limit is not None else None,
                expand=expand,
                review_status=review_status,
                preview_only=preview,
                topic_slug=str(topic_slug) if topic_slug else None,
                topic_name=str(topic_name) if topic_name else None,
                topic_phrase=str(topic_phrase) if topic_phrase else None,
            )
            results.append(BatchJobResult(name, len(job_results), job_results))
        return results

116
src/citegeist/bibtex.py Normal file

@@ -0,0 +1,116 @@
from __future__ import annotations

from dataclasses import dataclass
from io import StringIO

try:
    from pybtex.database import BibliographyData, Entry, Person, parse_string
    from pybtex.bibtex.exceptions import BibTeXError
    from pybtex.database.output.bibtex import Writer
except ImportError:  # pragma: no cover - exercised only outside the configured venv
    BibTeXError = None
    BibliographyData = Entry = Person = Writer = None
    parse_string = None


@dataclass(slots=True)
class BibEntry:
    entry_type: str
    citation_key: str
    fields: dict[str, str]


def parse_bibtex(text: str) -> list[BibEntry]:
    _require_pybtex()
    bibliography = parse_string(text, bib_format="bibtex")
    entries: list[BibEntry] = []
    for citation_key, entry in bibliography.entries.items():
        fields = dict(entry.fields.items())
        for role, persons in entry.persons.items():
            fields[role] = " and ".join(str(person) for person in persons)
        entries.append(
            BibEntry(
                entry_type=entry.type,
                citation_key=citation_key,
                fields=fields,
            )
        )
    return entries


def render_bibtex(entries: list[BibEntry]) -> str:
    _require_pybtex()
    bibliography_entries = {}
    for entry in entries:
        fields = {
            key: _sanitize_bibtex_value(value)
            for key, value in entry.fields.items()
            if key not in {"author", "editor"}
        }
        persons = {}
        for role in ("author", "editor"):
            raw_names = entry.fields.get(role)
            if raw_names:
                persons[role] = [Person(name.strip()) for name in raw_names.split(" and ") if name.strip()]
        bibliography_entries[entry.citation_key] = Entry(entry.entry_type, fields=fields, persons=persons)
    buffer = StringIO()
    try:
        Writer().write_stream(BibliographyData(entries=bibliography_entries), buffer)
    except BibTeXError:
        # Fallback: if pybtex still rejects the sanitized values, flatten all
        # braces to parentheses and write a conservative rendering instead.
        conservative_entries = {}
        for entry in entries:
            fields = {
                key: _flatten_bibtex_braces(value)
                for key, value in entry.fields.items()
                if key not in {"author", "editor"}
            }
            persons = {}
            for role in ("author", "editor"):
                raw_names = entry.fields.get(role)
                if raw_names:
                    persons[role] = [Person(name.strip()) for name in raw_names.split(" and ") if name.strip()]
            conservative_entries[entry.citation_key] = Entry(entry.entry_type, fields=fields, persons=persons)
        buffer = StringIO()
        Writer().write_stream(BibliographyData(entries=conservative_entries), buffer)
    return buffer.getvalue().strip()


def _require_pybtex() -> None:
    if parse_string is None or Writer is None:
        raise RuntimeError(
            "pybtex is required. Use the repo-local virtual environment under .venv/ for citegeist commands."
        )


def _sanitize_bibtex_value(value: str) -> str:
    # Convert unmatched braces to parentheses so the pybtex writer does not
    # choke on unbalanced field values; matched brace pairs are preserved.
    parts = list(value)
    open_stack: list[int] = []
    for index, char in enumerate(parts):
        if char == "{":
            open_stack.append(index)
        elif char == "}":
            if open_stack:
                open_stack.pop()
            else:
                parts[index] = ")"
    for index in open_stack:
        parts[index] = "("
    return "".join(parts)


def _flatten_bibtex_braces(value: str) -> str:
    return value.replace("{", "(").replace("}", ")")

145
src/citegeist/bootstrap.py Normal file

@@ -0,0 +1,145 @@
from __future__ import annotations

import re
from dataclasses import dataclass

from .bibtex import BibEntry, parse_bibtex
from .expand import CrossrefExpander, OpenAlexExpander
from .resolve import MetadataResolver
from .storage import BibliographyStore


@dataclass(slots=True)
class BootstrapResult:
    citation_key: str
    origin: str
    created: bool
    score: float = 0.0


class Bootstrapper:
    def __init__(
        self,
        resolver: MetadataResolver | None = None,
        crossref_expander: CrossrefExpander | None = None,
        openalex_expander: OpenAlexExpander | None = None,
    ) -> None:
        self.resolver = resolver or MetadataResolver()
        self.crossref_expander = crossref_expander or CrossrefExpander(self.resolver)
        self.openalex_expander = openalex_expander or OpenAlexExpander(self.resolver)

    def bootstrap(
        self,
        store: BibliographyStore,
        seed_bibtex: str | None = None,
        topic: str | None = None,
        topic_limit: int = 5,
        topic_commit_limit: int | None = None,
        expand: bool = True,
        review_status: str = "draft",
        preview_only: bool = False,
        topic_slug: str | None = None,
        topic_name: str | None = None,
        topic_phrase: str | None = None,
    ) -> list[BootstrapResult]:
        results: list[BootstrapResult] = []
        seed_keys: list[str] = []
        if seed_bibtex:
            for entry in parse_bibtex(seed_bibtex):
                created = store.get_entry(entry.citation_key) is None
                if not preview_only:
                    store.upsert_entry(
                        entry,
                        raw_bibtex=None,
                        source_type="bootstrap",
                        source_label="seed_bibtex",
                        review_status=review_status,
                    )
                seed_keys.append(entry.citation_key)
                results.append(BootstrapResult(entry.citation_key, "seed_bibtex", created))
        if topic:
            if not preview_only and (topic_slug or topic_name or topic_phrase):
                store.ensure_topic(
                    slug=topic_slug or _slugify(topic),
                    name=topic_name or topic,
                    source_type="bootstrap",
                    expansion_phrase=topic_phrase or topic,
                )
            ranked_candidates = self._topic_candidates(topic, seed_keys, topic_limit)
            if topic_commit_limit is not None:
                ranked_candidates = ranked_candidates[:topic_commit_limit]
            for entry, score in ranked_candidates:
                created = store.get_entry(entry.citation_key) is None
                if not preview_only:
                    store.upsert_entry(
                        entry,
                        raw_bibtex=None,
                        source_type="bootstrap",
                        source_label=f"topic:{topic}",
                        review_status=review_status,
                    )
                seed_keys.append(entry.citation_key)
                results.append(BootstrapResult(entry.citation_key, "topic", created, score=score))
        if expand and not preview_only:
            expanded_keys = list(dict.fromkeys(seed_keys))
            for citation_key in expanded_keys:
                for item in self.crossref_expander.expand_entry_references(store, citation_key):
                    results.append(BootstrapResult(item.discovered_citation_key, "crossref_expand", item.created_entry))
                for item in self.openalex_expander.expand_entry(store, citation_key, relation_type="cites", limit=topic_limit):
                    results.append(BootstrapResult(item.discovered_citation_key, "openalex_expand", item.created_entry))
        store.connection.commit()
        return results

    def _topic_candidates(self, topic: str, seed_keys: list[str], limit: int) -> list[tuple[BibEntry, float]]:
        scored: dict[str, tuple[BibEntry, float]] = {}
        for source_name, base_score, entries in (
            ("openalex", 3.0, self.resolver.search_openalex(topic, limit=limit)),
            ("crossref", 2.0, self.resolver.search_crossref(topic, limit=limit)),
            ("datacite", 1.5, self.resolver.search_datacite(topic, limit=limit)),
        ):
            for entry in entries:
                score = base_score + _topic_relevance_score(entry, topic) + _seed_overlap_score(entry, seed_keys)
                existing = scored.get(entry.citation_key)
                if existing is None or score > existing[1]:
                    scored[entry.citation_key] = (entry, score)
        ranked = sorted(
            scored.values(),
            key=lambda item: (-item[1], item[0].citation_key),
        )
        return ranked[:limit]


def _topic_relevance_score(entry: BibEntry, topic: str) -> float:
    topic_terms = _tokenize(topic)
    title_terms = _tokenize(entry.fields.get("title", ""))
    abstract_terms = _tokenize(entry.fields.get("abstract", ""))
    overlap = len(topic_terms & (title_terms | abstract_terms))
    return float(overlap)


def _seed_overlap_score(entry: BibEntry, seed_keys: list[str]) -> float:
    if not seed_keys:
        return 0.0
    title_terms = _tokenize(entry.fields.get("title", ""))
    score = 0.0
    for seed_key in seed_keys:
        seed_terms = _tokenize(seed_key)
        if seed_terms & title_terms:
            score += 0.25
    return score


def _tokenize(value: str) -> set[str]:
    return {token for token in re.split(r"\W+", value.lower()) if token}


def _slugify(value: str) -> str:
    slug = re.sub(r"[^a-z0-9]+", "-", value.lower()).strip("-")
    return slug or "topic"

1199
src/citegeist/cli.py Normal file

File diff suppressed because it is too large

600
src/citegeist/expand.py Normal file

@@ -0,0 +1,600 @@
from __future__ import annotations
import re
from dataclasses import dataclass
from urllib.parse import quote, urlencode
from .bibtex import BibEntry, parse_bibtex
from .resolve import MetadataResolver
from .storage import BibliographyStore
@dataclass(slots=True)
class ExpansionResult:
source_citation_key: str
discovered_citation_key: str
created_entry: bool
relation_type: str
source_label: str
@dataclass(slots=True)
class TopicExpansionResult:
topic_slug: str
source_citation_key: str
discovered_citation_key: str
discovered_title: str
created_entry: bool
relation_type: str
source_label: str
relevance_score: float
meets_relevance_threshold: bool
assigned_to_topic: bool
class CrossrefExpander:
def __init__(self, resolver: MetadataResolver | None = None) -> None:
self.resolver = resolver or MetadataResolver()
def expand_entry_references(
self,
store: BibliographyStore,
citation_key: str,
) -> list[ExpansionResult]:
entry = store.get_entry(citation_key)
if entry is None:
return []
doi = entry.get("doi")
if not doi:
return []
payload = self.resolver.source_client.get_json(
f"https://api.crossref.org/works/{doi}?mailto=welsberr@gmail.com"
)
references = payload.get("message", {}).get("reference", [])
results: list[ExpansionResult] = []
for index, reference in enumerate(references, start=1):
discovered = _crossref_reference_to_entry(reference, citation_key, index)
created = False
if store.get_entry(discovered.citation_key) is None:
store.upsert_entry(
discovered,
raw_bibtex=None,
source_type="graph_expand",
source_label=f"crossref:references:{doi}",
review_status="draft",
)
store.connection.commit()
created = True
store.add_relation(
citation_key,
discovered.citation_key,
"cites",
source_type="graph_expand",
source_label=f"crossref:references:{doi}",
confidence=1.0 if reference.get("DOI") else 0.6,
)
results.append(
ExpansionResult(
source_citation_key=citation_key,
discovered_citation_key=discovered.citation_key,
created_entry=created,
relation_type="cites",
source_label=f"crossref:references:{doi}",
)
)
return results
class OpenAlexExpander:
def __init__(self, resolver: MetadataResolver | None = None) -> None:
self.resolver = resolver or MetadataResolver()
def expand_entry(
self,
store: BibliographyStore,
citation_key: str,
relation_type: str = "cites",
limit: int = 25,
) -> list[ExpansionResult]:
entry = store.get_entry(citation_key)
if entry is None:
return []
openalex_id = entry.get("openalex") or self._lookup_openalex_id(entry)
if not openalex_id:
return []
if not entry.get("openalex"):
bibtex = store.get_entry_bibtex(citation_key)
if bibtex:
seed_entry = parse_bibtex(bibtex)[0]
seed_entry.fields["openalex"] = openalex_id
store.replace_entry(
citation_key,
seed_entry,
source_type="resolver",
source_label=f"openalex:id:{openalex_id}",
review_status=str(entry.get("review_status") or "draft"),
)
filter_name = "cited_by" if relation_type == "cites" else "cites"
query = urlencode({"filter": f"{filter_name}:{openalex_id}", "per-page": limit})
payload = self.resolver.source_client.get_json(f"https://api.openalex.org/works?{query}")
works = payload.get("results", [])
results: list[ExpansionResult] = []
for work in works:
discovered = _openalex_work_to_entry(work)
created = False
if store.get_entry(discovered.citation_key) is None:
store.upsert_entry(
discovered,
raw_bibtex=None,
source_type="graph_expand",
source_label=f"openalex:{relation_type}:{openalex_id}",
review_status="draft",
)
store.connection.commit()
created = True
if relation_type == "cites":
source_key = citation_key
target_key = discovered.citation_key
else:
source_key = discovered.citation_key
target_key = citation_key
store.add_relation(
source_key,
target_key,
"cites",
source_type="graph_expand",
source_label=f"openalex:{relation_type}:{openalex_id}",
confidence=0.9,
)
results.append(
ExpansionResult(
source_citation_key=source_key,
discovered_citation_key=discovered.citation_key,
created_entry=created,
relation_type=relation_type,
source_label=f"openalex:{relation_type}:{openalex_id}",
)
)
return results
def _lookup_openalex_id(self, entry: dict[str, object]) -> str | None:
doi = entry.get("doi")
if not doi:
return None
query = urlencode({"filter": f"doi:https://doi.org/{doi}"})
payload = self.resolver.source_client.get_json(f"https://api.openalex.org/works?{query}")
results = payload.get("results", [])
if not results:
return None
return _normalize_openalex_id(results[0].get("id", ""))
class TopicExpander:
def __init__(
self,
crossref_expander: CrossrefExpander | None = None,
openalex_expander: OpenAlexExpander | None = None,
) -> None:
self.crossref_expander = crossref_expander or CrossrefExpander()
self.openalex_expander = openalex_expander or OpenAlexExpander()
def expand_topic(
self,
store: BibliographyStore,
topic_slug: str,
topic_phrase: str | None = None,
source: str = "openalex",
relation_type: str = "cites",
seed_limit: int = 25,
per_seed_limit: int = 25,
min_relevance: float = 0.2,
seed_keys: list[str] | None = None,
preview_only: bool = False,
) -> list[TopicExpansionResult]:
topic = store.get_topic(topic_slug)
if topic is None:
return []
phrase = (topic_phrase or str(topic.get("name") or topic_slug)).strip()
seeds = store.list_topic_entries(topic_slug, limit=seed_limit)
if seed_keys:
allowed = set(seed_keys)
seeds = [seed for seed in seeds if str(seed["citation_key"]) in allowed]
results: list[TopicExpansionResult] = []
for seed in seeds:
seed_key = str(seed["citation_key"])
if preview_only:
discovered_rows = self._preview_discoveries(
store,
seed_key,
source=source,
relation_type=relation_type,
limit=per_seed_limit,
)
else:
discovered_rows = self._materialized_discoveries(
store,
seed_key,
source=source,
relation_type=relation_type,
limit=per_seed_limit,
)
for row, target_entry in discovered_rows:
score = _topic_relevance_score(phrase, target_entry)
meets_threshold = _meets_topic_assignment_threshold(
phrase,
target_entry,
min_relevance=min_relevance,
relevance_score=score,
)
assigned = False
if not preview_only and meets_threshold and target_entry is not None:
assigned = store.add_entry_topic(
row.discovered_citation_key,
topic_slug=topic_slug,
topic_name=str(topic.get("name") or topic_slug),
source_type="topic_expand",
source_url=str(topic.get("source_url") or ""),
source_label=f"{source}:{relation_type}:{seed_key}",
confidence=score,
)
results.append(
TopicExpansionResult(
topic_slug=topic_slug,
source_citation_key=row.source_citation_key,
discovered_citation_key=row.discovered_citation_key,
discovered_title=str(target_entry.get("title") or ""),
created_entry=row.created_entry,
relation_type=row.relation_type,
source_label=row.source_label,
relevance_score=score,
meets_relevance_threshold=meets_threshold,
assigned_to_topic=assigned,
)
)
store.connection.commit()
return results
def _materialized_discoveries(
self,
store: BibliographyStore,
citation_key: str,
source: str,
relation_type: str,
limit: int,
) -> list[tuple[ExpansionResult, dict[str, object] | None]]:
if source == "crossref":
expansion_rows = self.crossref_expander.expand_entry_references(store, citation_key)
else:
expansion_rows = self.openalex_expander.expand_entry(
store,
citation_key,
relation_type=relation_type,
limit=limit,
)
return [(row, store.get_entry(row.discovered_citation_key)) for row in expansion_rows]
def _preview_discoveries(
self,
store: BibliographyStore,
citation_key: str,
source: str,
relation_type: str,
limit: int,
) -> list[tuple[ExpansionResult, dict[str, object]]]:
if source == "crossref":
return self._preview_crossref_discoveries(store, citation_key, limit)
return self._preview_openalex_discoveries(store, citation_key, relation_type, limit)
def _preview_crossref_discoveries(
self,
store: BibliographyStore,
citation_key: str,
limit: int,
) -> list[tuple[ExpansionResult, dict[str, object]]]:
entry = store.get_entry(citation_key)
if entry is None or not entry.get("doi"):
return []
doi = str(entry["doi"])
payload = self.crossref_expander.resolver.source_client.get_json(
f"https://api.crossref.org/works/{doi}?mailto=welsberr@gmail.com"
)
references = payload.get("message", {}).get("reference", [])[:limit]
rows: list[tuple[ExpansionResult, dict[str, object]]] = []
for index, reference in enumerate(references, start=1):
discovered = _crossref_reference_to_entry(reference, citation_key, index)
rows.append(
(
ExpansionResult(
source_citation_key=citation_key,
discovered_citation_key=discovered.citation_key,
created_entry=store.get_entry(discovered.citation_key) is None,
relation_type="cites",
source_label=f"crossref:references:{doi}",
),
dict(discovered.fields),
)
)
return rows
def _preview_openalex_discoveries(
self,
store: BibliographyStore,
citation_key: str,
relation_type: str,
limit: int,
) -> list[tuple[ExpansionResult, dict[str, object]]]:
entry = store.get_entry(citation_key)
if entry is None:
return []
openalex_id = entry.get("openalex") or self.openalex_expander._lookup_openalex_id(entry)
if not openalex_id:
return []
filter_name = "cited_by" if relation_type == "cites" else "cites"
query = urlencode({"filter": f"{filter_name}:{openalex_id}", "per-page": limit})
payload = self.openalex_expander.resolver.source_client.get_json(f"https://api.openalex.org/works?{query}")
works = payload.get("results", [])
rows: list[tuple[ExpansionResult, dict[str, object]]] = []
for work in works:
discovered = _openalex_work_to_entry(work)
source_key = citation_key if relation_type == "cites" else discovered.citation_key
rows.append(
(
ExpansionResult(
source_citation_key=source_key,
discovered_citation_key=discovered.citation_key,
created_entry=store.get_entry(discovered.citation_key) is None,
relation_type=relation_type,
source_label=f"openalex:{relation_type}:{openalex_id}",
),
dict(discovered.fields),
)
)
return rows
def _crossref_reference_to_entry(reference: dict, source_citation_key: str, ordinal: int) -> BibEntry:
title = (
reference.get("article-title")
or reference.get("volume-title")
or reference.get("journal-title")
or reference.get("unstructured")
or f"Referenced work {ordinal}"
)
year = str(reference.get("year") or "")
author = reference.get("author") or ""
doi = reference.get("DOI") or ""
journal_title = reference.get("journal-title") or ""
fields: dict[str, str] = {
"title": _normalize_text(title),
"note": f"discovered_from = {{{source_citation_key}}}",
}
if year:
fields["year"] = year
if author:
fields["author"] = _normalize_text(author)
if doi:
fields["doi"] = doi
fields["url"] = f"https://doi.org/{doi}"
if journal_title:
fields["journal"] = _normalize_text(journal_title)
citation_key = _reference_citation_key(reference, title, year, ordinal)
entry_type = "article" if journal_title else "misc"
return BibEntry(entry_type=entry_type, citation_key=citation_key, fields=fields)
def _reference_citation_key(reference: dict, title: str, year: str, ordinal: int) -> str:
if doi := reference.get("DOI"):
suffix = re.sub(r"[^A-Za-z0-9]+", "", doi).lower()
return f"doi{suffix}"
author = reference.get("author") or "ref"
family = author.split(",")[0].split()[-1]
family = re.sub(r"[^A-Za-z0-9]+", "", family).lower() or "ref"
first_word = re.sub(r"[^A-Za-z0-9]+", "", title.split()[0]).lower() if title.split() else "untitled"
return f"{family}{year or 'nd'}{first_word}{ordinal}"
def _normalize_text(value: str) -> str:
return " ".join(value.split())
def _topic_relevance_score(topic_phrase: str, entry: dict[str, object] | None) -> float:
if entry is None:
return 0.0
topic_terms = _expanded_keyword_terms(topic_phrase)
if not topic_terms:
return 0.0
title_terms = _expanded_keyword_terms(str(entry.get("title") or ""))
abstract_terms = _expanded_keyword_terms(str(entry.get("abstract") or ""))
keyword_terms = _expanded_keyword_terms(str(entry.get("keywords") or ""))
venue_terms = _expanded_keyword_terms(" ".join(str(entry.get(field) or "") for field in ("journal", "booktitle")))
score = 0.0
score += 0.6 * _term_overlap_ratio(topic_terms, title_terms)
score += 0.25 * _term_overlap_ratio(topic_terms, abstract_terms)
score += 0.1 * _term_overlap_ratio(topic_terms, keyword_terms)
score += 0.05 * _term_overlap_ratio(topic_terms, venue_terms)
phrase = _normalize_text(topic_phrase.casefold())
title = _normalize_text(str(entry.get("title") or "").casefold())
if phrase and title and phrase in title:
score = max(score, 0.75)
return min(score, 1.0)
def _meets_topic_assignment_threshold(
topic_phrase: str,
entry: dict[str, object] | None,
min_relevance: float,
relevance_score: float | None = None,
) -> bool:
if entry is None:
return False
score = relevance_score if relevance_score is not None else _topic_relevance_score(topic_phrase, entry)
if score < min_relevance:
return False
title_anchor = _title_topic_anchor_ratio(topic_phrase, str(entry.get("title") or ""))
return title_anchor >= 0.2
def _keyword_terms(text: str) -> set[str]:
return {
_normalize_keyword(term)
for term in re.findall(r"[A-Za-z0-9]+", text.casefold())
if len(term) >= 4
}
def _expanded_keyword_terms(text: str) -> set[str]:
terms = _keyword_terms(text)
expanded = set(terms)
for term in terms:
expanded.update(_related_topic_terms(term))
return expanded
def _title_topic_anchor_ratio(topic_phrase: str, title: str) -> float:
normalized_phrase = _normalize_text(topic_phrase.casefold())
normalized_title = _normalize_text(title.casefold())
if normalized_phrase and normalized_title and normalized_phrase in normalized_title:
return 1.0
topic_terms = _core_topic_terms(topic_phrase)
title_terms = _keyword_terms(title)
if not topic_terms or not title_terms:
return 0.0
overlap = topic_terms & title_terms
if overlap:
return max(0.25, len(overlap) / len(topic_terms))
return 0.0
def _core_topic_terms(topic_phrase: str) -> set[str]:
generic_terms = {"evolution", "origin", "origins", "science", "study", "studies"}
return {term for term in _keyword_terms(topic_phrase) if term not in generic_terms}
def _term_overlap_ratio(topic_terms: set[str], candidate_terms: set[str]) -> float:
if not topic_terms or not candidate_terms:
return 0.0
return len(topic_terms & candidate_terms) / len(topic_terms)
def _normalize_keyword(term: str) -> str:
normalized = term.casefold()
for suffix in ("isms", "ists", "ation", "ment", "ings", "ness", "isms", "ism", "ist", "ing", "ers", "er", "ies", "ied", "ed", "es", "s"):
if len(normalized) > len(suffix) + 3 and normalized.endswith(suffix):
if suffix in {"ies", "ied"}:
return normalized[: -len(suffix)] + "y"
return normalized[: -len(suffix)]
return normalized
def _related_topic_terms(term: str) -> set[str]:
related_groups = (
{"human", "hominid", "hominin", "homo"},
{"chimpanzee", "chimp", "pan", "ape", "apes", "primate"},
{"primate", "primate", "ape", "apes", "hominid", "hominin"},
{"evolution", "evolutionary", "phylogeny", "phylogen", "ancestor", "ancestral"},
{"origin", "origins", "abiogenesis", "prebiotic"},
{"morphometry", "morphology", "cranial", "dental", "skeletal", "body"},
{"paleolithic", "paleoanthropology", "paleoanthropological", "pleistocene", "fossil"},
)
for group in related_groups:
if term in group:
return group - {term}
return set()
def _openalex_work_to_entry(work: dict) -> BibEntry:
title = _normalize_text(work.get("display_name", "") or "Untitled work")
year = str(work.get("publication_year") or "")
doi = _normalize_openalex_doi(work.get("doi"))
openalex_id = _normalize_openalex_id(work.get("id", ""))
authors = " and ".join(_openalex_author_name(item) for item in work.get("authorships", []))
source = ((work.get("primary_location") or {}).get("source") or {}).get("display_name", "")
work_type = work.get("type", "")
fields: dict[str, str] = {"title": title}
if year:
fields["year"] = year
if authors:
fields["author"] = authors
if doi:
fields["doi"] = doi
fields["url"] = f"https://doi.org/{doi}"
if openalex_id:
fields["openalex"] = openalex_id
if abstract := work.get("abstract_inverted_index"):
fields["abstract"] = _openalex_abstract_text(abstract)
if source:
if work_type == "article":
fields["journal"] = source
else:
fields["booktitle"] = source
citation_key = _openalex_citation_key(openalex_id, authors, year, title)
entry_type = _openalex_type_to_bibtype(work_type)
return BibEntry(entry_type=entry_type, citation_key=citation_key, fields=fields)
def _openalex_author_name(authorship: dict) -> str:
author = authorship.get("author") or {}
name = author.get("display_name", "")
return _normalize_text(name)
def _openalex_abstract_text(inverted_index: dict) -> str:
positions: dict[int, str] = {}
for word, indexes in inverted_index.items():
for index in indexes:
positions[int(index)] = word
return " ".join(word for _, word in sorted(positions.items()))
def _openalex_type_to_bibtype(work_type: str) -> str:
mapping = {
"article": "article",
"book": "book",
"book-chapter": "incollection",
"dissertation": "phdthesis",
"proceedings-article": "inproceedings",
}
return mapping.get(work_type, "misc")
def _openalex_citation_key(openalex_id: str, authors: str, year: str, title: str) -> str:
if openalex_id:
return f"openalex{re.sub(r'[^A-Za-z0-9]+', '', openalex_id).lower()}"
author = authors.split(" and ")[0] if authors else "ref"
family = re.sub(r"[^A-Za-z0-9]+", "", author.split()[-1]).lower() or "ref"
first_word = re.sub(r"[^A-Za-z0-9]+", "", title.split()[0]).lower() if title.split() else "untitled"
return f"{family}{year or 'nd'}{first_word}"
def _normalize_openalex_id(value: str) -> str:
if not value:
return ""
return value.rsplit("/", 1)[-1]
def _normalize_openalex_doi(value: str | None) -> str:
if not value:
return ""
if value.startswith("https://doi.org/"):
return value[len("https://doi.org/") :]
return value
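For orientation, a few hand-traced values from the term helpers above; these are illustrative walk-throughs of the definitions, not fixture output:

_normalize_keyword("origins")  # -> "origin": the trailing "s" is stripped
_keyword_terms("Human evolution and hominin origins")
# -> {"human", "evolution", "hominin", "origin"}; "and" is dropped (< 4 chars)
_related_topic_terms("human")  # -> {"hominid", "hominin", "homo"}
_title_topic_anchor_ratio("human evolution", "The evolution of human bipedalism")
# -> 1.0: "evolution" is discarded as a generic term, so "human" is the only
#    core topic term, and it appears among the stemmed title terms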

201
src/citegeist/extract.py Normal file
View File

@@ -0,0 +1,201 @@
from __future__ import annotations
import re
from .bibtex import BibEntry
YEAR_PATTERN = re.compile(r"\b(19|20)\d{2}\b")
YEAR_PAREN_PATTERN = re.compile(r"\((19|20)\d{2}\)")
REF_START_PATTERN = re.compile(r"^(?:\[\d+\]|\d+\.|\(\d+\))\s*")
def extract_references(text: str) -> list[BibEntry]:
entries: list[BibEntry] = []
for index, line in enumerate(_iter_reference_blocks(text), start=1):
parsed = _parse_reference_line(line, index)
if parsed is not None:
entries.append(parsed)
return entries
def render_extracted_bibtex(text: str) -> str:
from .bibtex import render_bibtex
return render_bibtex(extract_references(text))
def _iter_reference_blocks(text: str) -> list[str]:
lines: list[str] = []
current: list[str] = []
for raw_line in text.splitlines():
line = raw_line.strip()
if not line:
if current:
lines.append(" ".join(current))
current = []
continue
starts_new = bool(REF_START_PATTERN.match(line))
line = REF_START_PATTERN.sub("", line)
normalized = " ".join(line.split())
if len(normalized) < 20:
continue
if starts_new and current:
lines.append(" ".join(current))
current = [normalized]
else:
current.append(normalized)
if current:
lines.append(" ".join(current))
return lines
def _parse_reference_line(line: str, ordinal: int) -> BibEntry | None:
for parser in (_parse_apa_style_reference, _parse_publisher_style_reference, _parse_plain_year_reference):
parsed = parser(line, ordinal)
if parsed is not None:
return parsed
return None
def _parse_apa_style_reference(line: str, ordinal: int) -> BibEntry | None:
year_match = YEAR_PAREN_PATTERN.search(line)
if year_match is None:
return None
year = year_match.group(0).strip("()")
author_part = line[: year_match.start()].strip(" .")
remainder = line[year_match.end() :].strip(" .")
if not author_part or not remainder:
return None
segments = _segments_after_year(remainder)
if not segments:
return None
title = _clean_title(segments[0])
venue = segments[1] if len(segments) > 1 else ""
authors = _normalize_authors(author_part)
return _build_entry(line, ordinal, authors, year, title, venue)
def _parse_publisher_style_reference(line: str, ordinal: int) -> BibEntry | None:
year_match = YEAR_PATTERN.search(line)
if year_match is None:
return None
prefix = line[: year_match.start()].strip(" .,;")
if "." not in prefix:
return None
head, publisher = prefix.rsplit(".", 1)
if "." not in head:
return None
author_part, title = head.split(".", 1)
authors = _normalize_authors(author_part)
title = _clean_title(title)
publisher = publisher.strip(" .,;")
if not authors or not title or not publisher:
return None
citation_key = _make_citation_key(authors, year_match.group(0), title, ordinal)
return BibEntry(
entry_type="book",
citation_key=citation_key,
fields={
"author": authors,
"year": year_match.group(0),
"title": title,
"publisher": publisher,
"note": f"extracted_reference = {{true}}; raw_reference = {{{line}}}",
},
)
def _parse_plain_year_reference(line: str, ordinal: int) -> BibEntry | None:
year_match = YEAR_PATTERN.search(line)
if year_match is None:
return None
year = year_match.group(0)
author_part = line[: year_match.start()].strip(" .")
remainder = line[year_match.end() :].strip(" .")
if not author_part or not remainder:
return None
segments = _segments_after_year(remainder)
if not segments:
return None
title = _clean_title(segments[0])
venue = segments[1] if len(segments) > 1 else ""
authors = _normalize_authors(author_part)
return _build_entry(line, ordinal, authors, year, title, venue)
def _normalize_authors(author_part: str) -> str:
normalized = author_part.replace(" & ", " and ")
normalized = re.sub(r"\bet al\.?$", "and others", normalized)
normalized = re.sub(r"\s+and\s+", " and ", normalized)
normalized = re.sub(r"\s*,\s*", ", ", normalized)
return normalized.strip(" .")
def _segments_after_year(remainder: str) -> list[str]:
return [segment.strip(" .") for segment in remainder.split(". ") if segment.strip(" .")]
def _clean_title(title: str) -> str:
cleaned = title.strip(" .\"'")
cleaned = re.sub(r"\s+", " ", cleaned)
return cleaned
def _build_entry(
raw_line: str,
ordinal: int,
authors: str,
year: str,
title: str,
venue: str,
) -> BibEntry:
citation_key = _make_citation_key(authors, year, title, ordinal)
entry_type = _guess_entry_type(venue)
fields: dict[str, str] = {
"author": authors,
"year": year,
"title": title,
"note": f"extracted_reference = {{true}}; raw_reference = {{{raw_line}}}",
}
if venue:
if entry_type == "article":
fields["journal"] = venue
elif entry_type == "inproceedings":
fields["booktitle"] = venue
else:
fields["howpublished"] = venue
return BibEntry(entry_type=entry_type, citation_key=citation_key, fields=fields)
def _make_citation_key(authors: str, year: str, title: str, ordinal: int) -> str:
first_author = authors.split(" and ")[0]
family_name = first_author.split(",")[0] if "," in first_author else first_author.split()[-1]
family_name = re.sub(r"[^A-Za-z0-9]+", "", family_name).lower() or "ref"
first_word = re.sub(r"[^A-Za-z0-9]+", "", title.split()[0]).lower() if title.split() else "untitled"
if not first_word:
first_word = "untitled"
return f"{family_name}{year}{first_word}{ordinal}"
def _guess_entry_type(venue: str) -> str:
lowered = venue.lower()
if any(token in lowered for token in ("journal", "transactions", "review", "letters")):
return "article"
if any(token in lowered for token in ("proceedings", "conference", "workshop", "symposium")):
return "inproceedings"
if any(token in lowered for token in ("press", "publisher", "university")):
return "book"
return "misc"

317
src/citegeist/harvest.py Normal file
View File

@@ -0,0 +1,317 @@
from __future__ import annotations
from dataclasses import dataclass
from urllib.parse import urlencode
import xml.etree.ElementTree as ET
from .bibtex import BibEntry
from .sources import SourceClient
NS = {
"oai": "http://www.openarchives.org/OAI/2.0/",
"oai_dc": "http://www.openarchives.org/OAI/2.0/oai_dc/",
"dc": "http://purl.org/dc/elements/1.1/",
"mods": "http://www.loc.gov/mods/v3",
}
@dataclass(slots=True)
class HarvestResult:
base_url: str
identifier: str
entry: BibEntry
@dataclass(slots=True)
class OaiSet:
set_spec: str
set_name: str
set_description: str = ""
@dataclass(slots=True)
class OaiMetadataFormat:
metadata_prefix: str
schema: str
metadata_namespace: str
class OaiPmhHarvester:
def __init__(self, source_client: SourceClient | None = None) -> None:
self.source_client = source_client or SourceClient()
def identify(self, base_url: str) -> dict[str, str]:
root = self.source_client.get_xml(f"{base_url}?{urlencode({'verb': 'Identify'})}")
identify = root.find(".//oai:Identify", NS)
if identify is None:
return {}
payload: dict[str, str] = {}
for field_name in (
"repositoryName",
"baseURL",
"protocolVersion",
"adminEmail",
"earliestDatestamp",
"deletedRecord",
"granularity",
):
payload[field_name] = _node_text(identify.find(f"oai:{field_name}", NS))
return payload
def list_sets(self, base_url: str) -> list[OaiSet]:
root = self.source_client.get_xml(f"{base_url}?{urlencode({'verb': 'ListSets'})}")
sets = root.findall(".//oai:set", NS)
results: list[OaiSet] = []
for node in sets:
results.append(
OaiSet(
set_spec=_node_text(node.find("oai:setSpec", NS)),
set_name=_node_text(node.find("oai:setName", NS)),
set_description=_flatten_set_description(node.find("oai:setDescription", NS)),
)
)
return results
def list_metadata_formats(self, base_url: str, identifier: str | None = None) -> list[OaiMetadataFormat]:
params = {"verb": "ListMetadataFormats"}
if identifier:
params["identifier"] = identifier
root = self.source_client.get_xml(f"{base_url}?{urlencode(params)}")
formats = root.findall(".//oai:metadataFormat", NS)
results: list[OaiMetadataFormat] = []
for node in formats:
results.append(
OaiMetadataFormat(
metadata_prefix=_node_text(node.find("oai:metadataPrefix", NS)),
schema=_node_text(node.find("oai:schema", NS)),
metadata_namespace=_node_text(node.find("oai:metadataNamespace", NS)),
)
)
return results
def list_records(
self,
base_url: str,
metadata_prefix: str = "oai_dc",
set_spec: str | None = None,
date_from: str | None = None,
date_until: str | None = None,
limit: int | None = None,
) -> list[HarvestResult]:
results: list[HarvestResult] = []
params = {"verb": "ListRecords", "metadataPrefix": metadata_prefix}
if set_spec:
params["set"] = set_spec
if date_from:
params["from"] = date_from
if date_until:
params["until"] = date_until
ordinal = 1
next_url = f"{base_url}?{urlencode(params)}"
while next_url:
root = self.source_client.get_xml(next_url)
records = root.findall(".//oai:record", NS)
for record in records:
parsed = self._record_to_result(base_url, record, ordinal)
ordinal += 1
if parsed is not None:
results.append(parsed)
if limit is not None and len(results) >= limit:
return results
next_url = self._resumption_url(base_url, root)
return results
def get_record(
self,
base_url: str,
identifier: str,
metadata_prefix: str = "oai_dc",
) -> HarvestResult | None:
params = {
"verb": "GetRecord",
"metadataPrefix": metadata_prefix,
"identifier": identifier,
}
root = self.source_client.get_xml(f"{base_url}?{urlencode(params)}")
record = root.find(".//oai:record", NS)
if record is None:
return None
return self._record_to_result(base_url, record, 1)
def _record_to_result(self, base_url: str, record: ET.Element, ordinal: int) -> HarvestResult | None:
identifier = _node_text(record.find("./oai:header/oai:identifier", NS))
metadata_node = record.find("./oai:metadata/*", NS)
if metadata_node is None or not identifier:
return None
entry = _metadata_node_to_entry(base_url, identifier, metadata_node, ordinal)
return HarvestResult(base_url=base_url, identifier=identifier, entry=entry)
def _resumption_url(self, base_url: str, root: ET.Element) -> str | None:
token = _node_text(root.find(".//oai:resumptionToken", NS))
if not token:
return None
return f"{base_url}?{urlencode({'verb': 'ListRecords', 'resumptionToken': token})}"
def _oai_dc_to_entry(base_url: str, identifier: str, metadata: ET.Element, ordinal: int) -> BibEntry:
titles = _all_text(metadata.findall("dc:title", NS))
creators = _all_text(metadata.findall("dc:creator", NS))
dates = _all_text(metadata.findall("dc:date", NS))
descriptions = _all_text(metadata.findall("dc:description", NS))
identifiers = _all_text(metadata.findall("dc:identifier", NS))
publishers = _all_text(metadata.findall("dc:publisher", NS))
types = [value.lower() for value in _all_text(metadata.findall("dc:type", NS))]
title = titles[0] if titles else "Untitled record"
year = _first_year(dates)
entry_type = _guess_oai_entry_type(types)
fields: dict[str, str] = {
"title": title,
"oai": identifier,
"url": _best_identifier_url(identifiers) or f"{base_url}?verb=GetRecord&identifier={identifier}&metadataPrefix=oai_dc",
"note": "harvested_from = {oai_pmh}",
}
if creators:
fields["author"] = " and ".join(creators)
if year:
fields["year"] = year
if descriptions:
fields["abstract"] = descriptions[0]
if publishers:
fields["publisher"] = publishers[0]
citation_key = _oai_citation_key(creators, year, title, ordinal)
return BibEntry(entry_type=entry_type, citation_key=citation_key, fields=fields)
def _mods_to_entry(base_url: str, identifier: str, metadata: ET.Element, ordinal: int) -> BibEntry:
title = _node_text(metadata.find(".//mods:titleInfo/mods:title", NS)) or "Untitled record"
sub_title = _node_text(metadata.find(".//mods:titleInfo/mods:subTitle", NS))
if sub_title:
title = f"{title}: {sub_title}"
creators: list[str] = []
for name in metadata.findall(".//mods:name", NS):
role_terms = [term.text or "" for term in name.findall(".//mods:roleTerm", NS)]
if role_terms and not any(term.lower() == "author" for term in role_terms):
continue
parts = [_node_text(part) for part in name.findall("./mods:namePart", NS)]
parts = [part for part in parts if part]
if parts:
creators.append(", ".join(parts) if len(parts) == 2 else " ".join(parts))
year = ""
for date_node in metadata.findall(".//mods:originInfo/mods:dateIssued", NS):
text = _node_text(date_node)
if len(text) >= 4 and text[:4].isdigit():
year = text[:4]
break
publisher = _node_text(metadata.find(".//mods:originInfo/mods:publisher", NS))
abstract = _node_text(metadata.find(".//mods:abstract", NS))
genre = _node_text(metadata.find(".//mods:genre", NS)).lower()
related_title = _node_text(metadata.find(".//mods:relatedItem/mods:titleInfo/mods:title", NS))
url = _node_text(metadata.find(".//mods:location/mods:url", NS))
entry_type = "phdthesis" if "thesis" in genre or "dissertation" in genre else "misc"
if entry_type != "phdthesis" and related_title:
entry_type = "article"
fields: dict[str, str] = {
"title": title,
"oai": identifier,
"url": url or f"{base_url}?verb=GetRecord&identifier={identifier}&metadataPrefix=mods",
"note": "harvested_from = {oai_pmh_mods}",
}
if creators:
fields["author"] = " and ".join(creators)
if year:
fields["year"] = year
if publisher:
fields["publisher"] = publisher
if abstract:
fields["abstract"] = abstract
if related_title:
fields["journal"] = related_title
citation_key = _oai_citation_key(creators, year, title, ordinal)
return BibEntry(entry_type=entry_type, citation_key=citation_key, fields=fields)
def _metadata_node_to_entry(base_url: str, identifier: str, metadata: ET.Element, ordinal: int) -> BibEntry:
if metadata.tag.endswith("dc"):
return _oai_dc_to_entry(base_url, identifier, metadata, ordinal)
if metadata.tag.endswith("mods"):
return _mods_to_entry(base_url, identifier, metadata, ordinal)
return BibEntry(
entry_type="misc",
citation_key=_oai_citation_key([], "", identifier, ordinal),
fields={
"title": identifier,
"oai": identifier,
"url": f"{base_url}?verb=GetRecord&identifier={identifier}",
"note": f"unsupported_oai_metadata = {{{metadata.tag}}}",
},
)
def _node_text(node: ET.Element | None) -> str:
if node is None or node.text is None:
return ""
return " ".join(node.text.split())
def _all_text(nodes: list[ET.Element]) -> list[str]:
values = []
for node in nodes:
value = _node_text(node)
if value:
values.append(value)
return values
def _first_year(dates: list[str]) -> str:
for date in dates:
if len(date) >= 4 and date[:4].isdigit():
return date[:4]
return ""
def _guess_oai_entry_type(types: list[str]) -> str:
joined = " ".join(types)
if "thesis" in joined or "dissertation" in joined:
return "phdthesis"
if "article" in joined:
return "article"
if "book" in joined:
return "book"
return "misc"
def _best_identifier_url(identifiers: list[str]) -> str:
for identifier in identifiers:
if identifier.startswith("http://") or identifier.startswith("https://"):
return identifier
return ""
def _oai_citation_key(creators: list[str], year: str, title: str, ordinal: int) -> str:
author = creators[0] if creators else "oai"
family = author.split(",")[0] if "," in author else author.split()[-1]
family = "".join(ch for ch in family.lower() if ch.isalnum()) or "oai"
first_word = "".join(ch for ch in title.split()[0].lower() if ch.isalnum()) if title.split() else "untitled"
return f"{family}{year or 'nd'}{first_word}{ordinal}"
def _flatten_set_description(node: ET.Element | None) -> str:
if node is None:
return ""
parts = []
for child in node.iter():
if child.text and child.text.strip():
parts.append(" ".join(child.text.split()))
return " ".join(parts)

567
src/citegeist/resolve.py Normal file
View File

@@ -0,0 +1,567 @@
from __future__ import annotations
import re
import urllib.parse
import xml.etree.ElementTree as ET
from dataclasses import dataclass
from .bibtex import BibEntry, parse_bibtex
from .sources import SourceClient
@dataclass(slots=True)
class Resolution:
entry: BibEntry
source_type: str
source_label: str
class MetadataResolver:
def __init__(
self,
user_agent: str = "citegeist/0.1 (local research tool)",
source_client: SourceClient | None = None,
) -> None:
self.user_agent = user_agent
self.source_client = source_client or SourceClient(user_agent=user_agent)
def resolve_entry(self, entry: BibEntry) -> Resolution | None:
if doi := entry.fields.get("doi"):
resolved = self.resolve_doi(doi)
if resolved is not None:
return resolved
resolved = self.resolve_datacite_doi(doi)
if resolved is not None:
return resolved
if openalex_id := entry.fields.get("openalex"):
resolved = self.resolve_openalex(openalex_id)
if resolved is not None:
return resolved
if dblp_key := entry.fields.get("dblp"):
resolved = self.resolve_dblp(dblp_key)
if resolved is not None:
return resolved
if arxiv_id := entry.fields.get("arxiv"):
resolved = self.resolve_arxiv(arxiv_id)
if resolved is not None:
return resolved
if title := entry.fields.get("title"):
resolved = self.search_crossref_best_match(
title=title,
author_text=entry.fields.get("author", ""),
year=entry.fields.get("year", ""),
)
if resolved is not None:
return resolved
resolved = self.search_datacite_best_match(
title=title,
author_text=entry.fields.get("author", ""),
year=entry.fields.get("year", ""),
)
if resolved is not None:
return resolved
resolved = self.search_openalex_best_match(
title=title,
author_text=entry.fields.get("author", ""),
year=entry.fields.get("year", ""),
)
if resolved is not None:
return resolved
return None
def resolve_doi(self, doi: str) -> Resolution | None:
encoded = urllib.parse.quote(doi, safe="")
payload = self.source_client.get_json(f"https://api.crossref.org/works/{encoded}")
message = payload.get("message", {})
if not message:
return None
return Resolution(
entry=_crossref_message_to_entry(message),
source_type="resolver",
source_label=f"crossref:doi:{doi}",
)
def search_crossref(self, title: str, limit: int = 5) -> list[BibEntry]:
query = urllib.parse.urlencode({"query.title": title, "rows": limit})
payload = self.source_client.get_json(f"https://api.crossref.org/works?{query}")
items = payload.get("message", {}).get("items", [])
return [_crossref_message_to_entry(item) for item in items]
def search_crossref_best_match(
self,
title: str,
author_text: str = "",
year: str = "",
) -> Resolution | None:
candidate = _select_best_title_match(
self.search_crossref(title, limit=5),
title=title,
author_text=author_text,
year=year,
)
if candidate is None:
return None
return Resolution(
entry=candidate,
source_type="resolver",
source_label=f"crossref:search:{title}",
)
def resolve_dblp(self, dblp_key: str) -> Resolution | None:
encoded_key = urllib.parse.quote(dblp_key, safe="/:")
text = self.source_client.get_text(f"https://dblp.org/rec/{encoded_key}.bib")
entries = parse_bibtex(text)
if not entries:
return None
return Resolution(
entry=entries[0],
source_type="resolver",
source_label=f"dblp:key:{dblp_key}",
)
def search_dblp(self, query_text: str, limit: int = 5) -> list[BibEntry]:
query = urllib.parse.urlencode({"q": query_text, "format": "json", "h": limit})
payload = self.source_client.get_json(f"https://dblp.org/search/publ/api?{query}")
hits = payload.get("result", {}).get("hits", {}).get("hit", [])
if isinstance(hits, dict):
hits = [hits]
results: list[BibEntry] = []
for hit in hits:
info = hit.get("info", {})
dblp_key = info.get("key")
if dblp_key:
resolved = self.resolve_dblp(dblp_key)
if resolved is not None:
results.append(resolved.entry)
return results
def resolve_arxiv(self, arxiv_id: str) -> Resolution | None:
query = urllib.parse.urlencode({"id_list": arxiv_id})
root = self.source_client.get_xml(f"https://export.arxiv.org/api/query?{query}")
namespace = {"atom": "http://www.w3.org/2005/Atom"}
entry = root.find("atom:entry", namespace)
if entry is None:
return None
return Resolution(
entry=_arxiv_atom_entry_to_bib(entry, arxiv_id),
source_type="resolver",
source_label=f"arxiv:id:{arxiv_id}",
)
def resolve_openalex(self, openalex_id: str) -> Resolution | None:
normalized_id = _normalize_openalex_id(openalex_id)
payload = self.source_client.get_json(f"https://api.openalex.org/works/{normalized_id}")
if not payload:
return None
return Resolution(
entry=_openalex_work_to_entry(payload),
source_type="resolver",
source_label=f"openalex:id:{normalized_id}",
)
def resolve_datacite_doi(self, doi: str) -> Resolution | None:
encoded = urllib.parse.quote(doi, safe="")
payload = self.source_client.get_json(f"https://api.datacite.org/dois/{encoded}")
data = payload.get("data", {})
if not data:
return None
return Resolution(
entry=_datacite_work_to_entry(data),
source_type="resolver",
source_label=f"datacite:doi:{doi}",
)
def search_datacite(self, title: str, limit: int = 5) -> list[BibEntry]:
query = urllib.parse.urlencode({"query": title, "page[size]": limit})
payload = self.source_client.get_json(f"https://api.datacite.org/dois?{query}")
return [_datacite_work_to_entry(item) for item in payload.get("data", [])]
def search_datacite_best_match(
self,
title: str,
author_text: str = "",
year: str = "",
) -> Resolution | None:
candidate = _select_best_title_match(
self.search_datacite(title, limit=5),
title=title,
author_text=author_text,
year=year,
)
if candidate is None:
return None
return Resolution(
entry=candidate,
source_type="resolver",
source_label=f"datacite:search:{title}",
)
def search_openalex(self, title: str, limit: int = 5) -> list[BibEntry]:
query = urllib.parse.urlencode({"search": title, "per-page": limit})
payload = self.source_client.get_json(f"https://api.openalex.org/works?{query}")
return [_openalex_work_to_entry(item) for item in payload.get("results", [])]
def search_openalex_best_match(
self,
title: str,
author_text: str = "",
year: str = "",
) -> Resolution | None:
candidate = _select_best_title_match(
self.search_openalex(title, limit=5),
title=title,
author_text=author_text,
year=year,
)
if candidate is None:
return None
return Resolution(
entry=candidate,
source_type="resolver",
source_label=f"openalex:search:{title}",
)
def merge_entries(base: BibEntry, resolved: BibEntry) -> BibEntry:
merged, _ = merge_entries_with_conflicts(base, resolved)
return merged
def merge_entries_with_conflicts(base: BibEntry, resolved: BibEntry) -> tuple[BibEntry, list[dict[str, str]]]:
merged_fields = dict(base.fields)
conflicts: list[dict[str, str]] = []
for key, value in resolved.fields.items():
if not value:
continue
current_value = merged_fields.get(key, "")
if current_value and current_value != value:
conflicts.append(
{
"field_name": key,
"current_value": current_value,
"proposed_value": value,
}
)
continue
if key not in merged_fields or not merged_fields[key]:
merged_fields[key] = value
return (
BibEntry(
entry_type=base.entry_type or resolved.entry_type,
citation_key=base.citation_key,
fields=merged_fields,
),
conflicts,
)
def _crossref_message_to_entry(message: dict) -> BibEntry:
entry_type = _crossref_type_to_bibtype(message.get("type", "article"))
title_values = message.get("title", [])
title = title_values[0] if title_values else ""
year = _extract_crossref_year(message)
authors = " and ".join(_crossref_person_to_name(person) for person in message.get("author", []))
venue = ""
if container_title := message.get("container-title", []):
venue = container_title[0]
fields: dict[str, str] = {}
if authors:
fields["author"] = authors
if title:
fields["title"] = title
if year:
fields["year"] = year
if doi := message.get("DOI"):
fields["doi"] = doi
if url := message.get("URL"):
fields["url"] = url
if abstract := message.get("abstract"):
fields["abstract"] = abstract
if venue:
if entry_type == "article":
fields["journal"] = venue
else:
fields["booktitle"] = venue
if volume := message.get("volume"):
fields["volume"] = str(volume)
if issue := message.get("issue"):
fields["number"] = str(issue)
if pages := message.get("page"):
fields["pages"] = str(pages)
citation_key = _make_resolution_key(fields.get("author", "crossref"), year or "n.d.", title or "untitled")
return BibEntry(entry_type=entry_type, citation_key=citation_key, fields=fields)
def _arxiv_atom_entry_to_bib(node: ET.Element, arxiv_id: str) -> BibEntry:
ns = {
"atom": "http://www.w3.org/2005/Atom",
"arxiv": "http://arxiv.org/schemas/atom",
}
title = _node_text(node.find("atom:title", ns))
summary = _node_text(node.find("atom:summary", ns))
published = _node_text(node.find("atom:published", ns))
year = published[:4] if published else ""
authors = " and ".join(
_node_text(author.find("atom:name", ns)) for author in node.findall("atom:author", ns)
)
doi = _node_text(node.find("arxiv:doi", ns))
fields: dict[str, str] = {
"title": title,
"author": authors,
"year": year,
"arxiv": arxiv_id,
"url": f"https://arxiv.org/abs/{arxiv_id}",
"pdf": f"https://arxiv.org/pdf/{arxiv_id}.pdf",
}
if summary:
fields["abstract"] = summary
if doi:
fields["doi"] = doi
return BibEntry(entry_type="article", citation_key=f"arxiv{arxiv_id.replace('.', '').replace('/', '')}", fields=fields)
def _crossref_type_to_bibtype(crossref_type: str) -> str:
mapping = {
"journal-article": "article",
"proceedings-article": "inproceedings",
"book-chapter": "incollection",
"book": "book",
"proceedings": "proceedings",
}
return mapping.get(crossref_type, "misc")
def _extract_crossref_year(message: dict) -> str:
for field_name in ("published-print", "published-online", "issued", "created"):
date_parts = message.get(field_name, {}).get("date-parts", [])
if date_parts and date_parts[0]:
return str(date_parts[0][0])
return ""
def _crossref_person_to_name(person: dict) -> str:
family = person.get("family", "")
given = person.get("given", "")
if family and given:
return f"{family}, {given}"
return family or given
def _node_text(node: ET.Element | None) -> str:
if node is None or node.text is None:
return ""
return " ".join(node.text.split())
def _make_resolution_key(author_text: str, year: str, title: str) -> str:
first_author = author_text.split(" and ")[0]
family_name = first_author.split(",")[0] if "," in first_author else first_author.split()[-1]
family_name = "".join(ch for ch in family_name.lower() if ch.isalnum()) or "ref"
first_word = "".join(ch for ch in title.split()[0].lower() if ch.isalnum()) if title.split() else "untitled"
return f"{family_name}{year}{first_word}"
def _openalex_work_to_entry(work: dict) -> BibEntry:
title = work.get("display_name", "") or "Untitled work"
year = str(work.get("publication_year") or "")
doi = _normalize_openalex_doi(work.get("doi"))
openalex_id = _normalize_openalex_id(work.get("id", ""))
authors = " and ".join(_openalex_author_name(item) for item in work.get("authorships", []))
source = ((work.get("primary_location") or {}).get("source") or {}).get("display_name", "")
work_type = work.get("type", "")
fields: dict[str, str] = {}
if authors:
fields["author"] = authors
if title:
fields["title"] = title
if year:
fields["year"] = year
if doi:
fields["doi"] = doi
fields["url"] = f"https://doi.org/{doi}"
if openalex_id:
fields["openalex"] = openalex_id
fields.setdefault("url", f"https://openalex.org/{openalex_id}")
if abstract := work.get("abstract_inverted_index"):
fields["abstract"] = _openalex_abstract_text(abstract)
if source:
if work_type == "article":
fields["journal"] = source
else:
fields["booktitle"] = source
citation_key = f"openalex{re.sub(r'[^A-Za-z0-9]+', '', openalex_id).lower()}" if openalex_id else _make_resolution_key(authors or "openalex", year or "n.d.", title or "untitled")
return BibEntry(entry_type=_openalex_type_to_bibtype(work_type), citation_key=citation_key, fields=fields)
def _openalex_author_name(authorship: dict) -> str:
author = authorship.get("author") or {}
return " ".join(str(author.get("display_name", "")).split())
def _openalex_abstract_text(inverted_index: dict) -> str:
positions: dict[int, str] = {}
for word, indexes in inverted_index.items():
for index in indexes:
positions[int(index)] = word
return " ".join(word for _, word in sorted(positions.items()))
def _openalex_type_to_bibtype(work_type: str) -> str:
mapping = {
"article": "article",
"book": "book",
"book-chapter": "incollection",
"dissertation": "phdthesis",
"proceedings-article": "inproceedings",
}
return mapping.get(work_type, "misc")
def _normalize_openalex_id(value: str) -> str:
if not value:
return ""
return value.rsplit("/", 1)[-1]
def _normalize_openalex_doi(value: str | None) -> str:
if not value:
return ""
if value.startswith("https://doi.org/"):
return value[len("https://doi.org/") :]
return value
def _normalize_match_text(value: str) -> str:
lowered = value.lower()
lowered = re.sub(r"\W+", " ", lowered)
return " ".join(lowered.split())
def _select_best_title_match(
candidates: list[BibEntry],
title: str,
author_text: str = "",
year: str = "",
) -> BibEntry | None:
if not candidates:
return None
title_norm = _normalize_match_text(title)
author_tokens = _author_match_tokens(author_text)
year_text = str(year or "").strip()
for candidate in candidates:
candidate_title = _normalize_match_text(candidate.fields.get("title", ""))
if candidate_title != title_norm:
continue
candidate_year = str(candidate.fields.get("year", "") or "").strip()
if year_text and candidate_year and year_text != candidate_year:
continue
if author_tokens and not _candidate_matches_author_tokens(candidate, author_tokens):
continue
return candidate
return None
def _author_match_tokens(author_text: str) -> set[str]:
normalized = _normalize_match_text(author_text)
if not normalized:
return set()
tokens = {
token
for token in re.findall(r"[a-z0-9]+", normalized)
if len(token) >= 2 and token not in {"and", "et", "al"}
}
return tokens
def _candidate_matches_author_tokens(candidate: BibEntry, author_tokens: set[str]) -> bool:
candidate_author = _normalize_match_text(candidate.fields.get("author", ""))
if not candidate_author:
return False
candidate_tokens = set(re.findall(r"[a-z0-9]+", candidate_author))
return bool(author_tokens & candidate_tokens)
def _datacite_work_to_entry(data: dict) -> BibEntry:
attributes = data.get("attributes", {})
doi = str(attributes.get("doi") or "")
titles = attributes.get("titles") or []
creators = attributes.get("creators") or []
descriptions = attributes.get("descriptions") or []
publisher = str(attributes.get("publisher") or "")
year = str(attributes.get("publicationYear") or "")
url = str(attributes.get("url") or "")
types = attributes.get("types") or {}
title = titles[0].get("title", "") if titles else ""
author_names = " and ".join(_datacite_creator_name(creator) for creator in creators if _datacite_creator_name(creator))
abstract = _datacite_abstract(descriptions)
entry_type = _datacite_type_to_bibtype(str(types.get("resourceTypeGeneral") or ""))
fields: dict[str, str] = {}
if title:
fields["title"] = title
if author_names:
fields["author"] = author_names
if year:
fields["year"] = year
if doi:
fields["doi"] = doi
if url:
fields["url"] = url
elif doi:
fields["url"] = f"https://doi.org/{doi}"
if publisher:
fields["publisher"] = publisher
if abstract:
fields["abstract"] = abstract
citation_key = _make_resolution_key(author_names or "datacite", year or "n.d.", title or doi or "untitled")
return BibEntry(entry_type=entry_type, citation_key=citation_key, fields=fields)
def _datacite_creator_name(creator: dict) -> str:
family = str(creator.get("familyName") or "")
given = str(creator.get("givenName") or "")
if family and given:
return f"{family}, {given}"
return str(creator.get("name") or family or given)
def _datacite_abstract(descriptions: list[dict]) -> str:
for description in descriptions:
if str(description.get("descriptionType") or "").lower() == "abstract":
return str(description.get("description") or "")
return ""
def _datacite_type_to_bibtype(resource_type: str) -> str:
lowered = resource_type.lower()
mapping = {
"audiovisual": "misc",
"book": "book",
"bookchapter": "incollection",
"collection": "misc",
"computationalnotebook": "misc",
"conferencepaper": "inproceedings",
"dataset": "misc",
"dissertation": "phdthesis",
"image": "misc",
"journalarticle": "article",
"model": "misc",
"report": "techreport",
"software": "misc",
"text": "misc",
}
return mapping.get(lowered, "misc")
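A minimal sketch of the resolution cascade and conflict-aware merge above; the draft entry is invented, and resolve_entry tries DOI, OpenAlex, DBLP, and arXiv identifiers before falling back to title search:

from citegeist.bibtex import BibEntry
from citegeist.resolve import MetadataResolver, merge_entries_with_conflicts

resolver = MetadataResolver()
draft = BibEntry(
    entry_type="article",
    citation_key="vaswani2017attention",
    fields={"title": "Attention Is All You Need", "year": "2017"},
)
resolution = resolver.resolve_entry(draft)
if resolution is not None:
    merged, conflicts = merge_entries_with_conflicts(draft, resolution.entry)
    # merged keeps the draft's citation key and fills only empty fields;
    # disagreements land in `conflicts` instead of being overwritten.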

86
src/citegeist/sources.py Normal file
View File

@@ -0,0 +1,86 @@
from __future__ import annotations
import hashlib
import json
import urllib.request
import xml.etree.ElementTree as ET
from pathlib import Path
class SourceClient:
def __init__(
self,
user_agent: str = "citegeist/0.1 (local research tool)",
cache_dir: str | Path | None = None,
fixtures_dir: str | Path | None = None,
) -> None:
self.user_agent = user_agent
self.cache_dir = Path(cache_dir) if cache_dir else None
self.fixtures_dir = Path(fixtures_dir) if fixtures_dir else None
def get_json(self, url: str) -> dict:
cached = self._read_cached(url, "json")
if cached is not None:
return json.loads(cached)
payload = self._fetch_bytes(url)
self._write_cache(url, "json", payload)
return json.loads(payload.decode("utf-8"))
def get_text(self, url: str) -> str:
cached = self._read_cached(url, "txt")
if cached is not None:
return self._decode_text(cached)
payload = self._fetch_bytes(url)
self._write_cache(url, "txt", payload)
return self._decode_text(payload)
def get_xml(self, url: str) -> ET.Element:
cached = self._read_cached(url, "xml")
if cached is not None:
return ET.fromstring(cached)
payload = self._fetch_bytes(url)
self._write_cache(url, "xml", payload)
return ET.fromstring(payload)
def _fetch_bytes(self, url: str) -> bytes:
with urllib.request.urlopen(self._request(url)) as response:
return response.read()
def _request(self, url: str) -> urllib.request.Request:
return urllib.request.Request(
url,
headers={
"User-Agent": self.user_agent,
},
)
def _cache_key(self, url: str, suffix: str) -> str:
digest = hashlib.sha1(url.encode("utf-8")).hexdigest()
return f"{digest}.{suffix}"
def _read_cached(self, url: str, suffix: str) -> bytes | None:
for root in (self.fixtures_dir, self.cache_dir):
if root is None:
continue
path = root / self._cache_key(url, suffix)
if path.exists():
return path.read_bytes()
return None
def _write_cache(self, url: str, suffix: str, payload: bytes) -> None:
if self.cache_dir is None:
return
self.cache_dir.mkdir(parents=True, exist_ok=True)
path = self.cache_dir / self._cache_key(url, suffix)
path.write_bytes(payload)
def _decode_text(self, payload: bytes) -> str:
for encoding in ("utf-8", "utf-8-sig", "iso-8859-1", "latin-1"):
try:
return payload.decode(encoding)
except UnicodeDecodeError:
continue
return payload.decode("utf-8", errors="replace")
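A short sketch of the lookup order implemented above: the fixtures directory is consulted before the cache, and only a miss on both hits the network (paths and DOI are illustrative):

from citegeist.sources import SourceClient

client = SourceClient(
    cache_dir=".cache/citegeist",           # written after each live fetch
    fixtures_dir="tests/fixtures/sources",  # read-only, checked first
)
payload = client.get_json("https://api.crossref.org/works/10.1000/example")
# Responses are keyed by sha1(url) plus a suffix ("<sha1>.json" here), so the
# same URL always maps to the same cache file across runs.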

1135
src/citegeist/storage.py Normal file

File diff suppressed because it is too large

1485
src/citegeist/talkorigins.py Normal file

File diff suppressed because it is too large

15
tests/conftest.py Normal file
View File

@@ -0,0 +1,15 @@
from __future__ import annotations
import os
import pytest
def pytest_collection_modifyitems(config: pytest.Config, items: list[pytest.Item]) -> None:
if os.environ.get("CITEGEIST_LIVE_TESTS") == "1":
return
skip_live = pytest.mark.skip(reason="set CITEGEIST_LIVE_TESTS=1 to run live-source tests")
for item in items:
if "live" in item.keywords:
item.add_marker(skip_live)
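Given this hook, live tests opt in with the `live` marker and run only when the environment variable is set, e.g.:

import pytest

@pytest.mark.live
def test_hits_real_endpoint() -> None:
    ...

# run them with: CITEGEIST_LIVE_TESTS=1 pytest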

129
tests/test_batch.py Normal file
View File

@@ -0,0 +1,129 @@
from pathlib import Path
from citegeist.batch import BatchBootstrapRunner, load_batch_jobs
from citegeist.cli import main
from citegeist.storage import BibliographyStore
def test_load_batch_jobs_accepts_object_with_jobs(tmp_path: Path):
path = tmp_path / "jobs.json"
path.write_text(
"""
{
"jobs": [
{"name": "topic-only", "topic": "graph topic"},
{"name": "seed-only", "seed_bib": "seed.bib"}
]
}
""",
encoding="utf-8",
)
jobs = load_batch_jobs(path)
assert jobs[0]["name"] == "topic-only"
assert jobs[1]["seed_bib"] == str((tmp_path / "seed.bib").resolve())
def test_batch_runner_executes_multiple_jobs(tmp_path: Path):
seed_bib = tmp_path / "seed.bib"
seed_bib.write_text(
"""
@article{seed2024,
author = {Seed, Alice},
title = {Seed Paper},
year = {2024}
}
""",
encoding="utf-8",
)
jobs = [
{"name": "seed-job", "seed_bib": str(seed_bib), "expand": False},
{"name": "topic-job", "topic": "graph topic", "expand": False, "preview": True},
]
runner = BatchBootstrapRunner()
from citegeist import BibEntry
runner.bootstrapper.resolver.search_openalex = lambda topic, limit=5: [ # type: ignore[method-assign]
BibEntry(entry_type="article", citation_key="topic2024graph", fields={"title": "Graph Topic Result"})
]
runner.bootstrapper.resolver.search_crossref = lambda topic, limit=5: [] # type: ignore[method-assign]
runner.bootstrapper.resolver.search_datacite = lambda topic, limit=5: [] # type: ignore[method-assign]
store = BibliographyStore()
try:
results = runner.run(store, jobs)
assert [job.job_name for job in results] == ["seed-job", "topic-job"]
assert results[0].result_count == 1
assert results[1].results[0].citation_key == "topic2024graph"
assert store.get_entry("seed2024") is not None
assert store.get_entry("topic2024graph") is None
finally:
store.close()
def test_batch_runner_can_store_topic_phrase_metadata():
jobs = [
{
"name": "topic-job",
"topic": "graph topic",
"topic_slug": "graph-methods",
"topic_name": "Graph Methods",
"topic_phrase": "graph networks biology",
"expand": False,
"preview": False,
}
]
runner = BatchBootstrapRunner()
from citegeist import BibEntry
runner.bootstrapper.resolver.search_openalex = lambda topic, limit=5: [ # type: ignore[method-assign]
BibEntry(entry_type="article", citation_key="topic2024graph", fields={"title": "Graph Topic Result"})
]
runner.bootstrapper.resolver.search_crossref = lambda topic, limit=5: [] # type: ignore[method-assign]
runner.bootstrapper.resolver.search_datacite = lambda topic, limit=5: [] # type: ignore[method-assign]
store = BibliographyStore()
try:
runner.run(store, jobs)
topic = store.get_topic("graph-methods")
assert topic is not None
assert topic["name"] == "Graph Methods"
assert topic["expansion_phrase"] == "graph networks biology"
finally:
store.close()
def test_bootstrap_batch_cli_runs_json_jobs(tmp_path: Path):
seed_bib = tmp_path / "seed.bib"
seed_bib.write_text(
"""
@article{seed2024,
author = {Seed, Alice},
title = {Seed Paper},
year = {2024}
}
""",
encoding="utf-8",
)
batch_json = tmp_path / "jobs.json"
batch_json.write_text(
f"""
[
{{"name": "seed-job", "seed_bib": "{seed_bib}", "expand": false}},
{{"name": "topic-job", "topic": "graph topic", "expand": false, "preview": true}}
]
""",
encoding="utf-8",
)
from unittest.mock import patch
database = tmp_path / "library.sqlite3"
with patch("citegeist.cli.BatchBootstrapRunner.run") as mocked_run:
mocked_run.return_value = []
exit_code = main(["--db", str(database), "bootstrap-batch", str(batch_json)])
assert exit_code == 0

175
tests/test_bootstrap.py Normal file
View File

@@ -0,0 +1,175 @@
from citegeist import BibliographyStore
from citegeist.bootstrap import Bootstrapper
from citegeist.cli import main
def test_bootstrap_from_seed_bib_only():
store = BibliographyStore()
try:
bootstrapper = Bootstrapper()
bootstrapper.crossref_expander.expand_entry_references = lambda _store, _key: [] # type: ignore[method-assign]
bootstrapper.openalex_expander.expand_entry = lambda _store, _key, relation_type="cites", limit=5: [] # type: ignore[method-assign]
results = bootstrapper.bootstrap(
store,
seed_bibtex="""
@article{seed2024,
author = {Seed, Alice},
title = {Seed Paper},
year = {2024}
}
""",
expand=False,
)
assert [item.citation_key for item in results] == ["seed2024"]
assert store.get_entry("seed2024") is not None
finally:
store.close()
def test_bootstrap_from_topic_only():
store = BibliographyStore()
try:
bootstrapper = Bootstrapper()
bootstrapper.resolver.search_openalex = lambda topic, limit=5: [] # type: ignore[method-assign]
bootstrapper.resolver.search_crossref = lambda topic, limit=5: [] # type: ignore[method-assign]
from citegeist import BibEntry
bootstrapper.resolver.search_datacite = lambda topic, limit=5: [ # type: ignore[method-assign]
BibEntry(
entry_type="article",
citation_key="topic2024graph",
fields={"title": "Graph Topic Result", "year": "2024"},
)
]
bootstrapper.crossref_expander.expand_entry_references = lambda _store, _key: [] # type: ignore[method-assign]
bootstrapper.openalex_expander.expand_entry = lambda _store, _key, relation_type="cites", limit=5: [] # type: ignore[method-assign]
results = bootstrapper.bootstrap(store, topic="graph topic", expand=False)
assert [item.citation_key for item in results] == ["topic2024graph"]
assert store.get_entry("topic2024graph") is not None
assert results[0].score > 0
finally:
store.close()
def test_bootstrap_cli_accepts_seed_and_topic(tmp_path):
seed_bib = tmp_path / "seed.bib"
seed_bib.write_text(
"""
@article{seed2024,
author = {Seed, Alice},
title = {Seed Paper},
year = {2024}
}
""",
encoding="utf-8",
)
from unittest.mock import patch
database = tmp_path / "library.sqlite3"
with patch("citegeist.cli.Bootstrapper.bootstrap") as mocked_bootstrap:
mocked_bootstrap.return_value = []
exit_code = main(
[
"--db",
str(database),
"bootstrap",
"--seed-bib",
str(seed_bib),
"--topic",
"graph topic",
"--no-expand",
]
)
assert exit_code == 0
def test_bootstrap_ranks_and_deduplicates_topic_candidates():
store = BibliographyStore()
try:
bootstrapper = Bootstrapper()
from citegeist import BibEntry
bootstrapper.resolver.search_openalex = lambda topic, limit=5: [ # type: ignore[method-assign]
BibEntry(
entry_type="article",
citation_key="shared2024graph",
fields={"title": "Graph Topic Ranking", "abstract": "graph topic graph"},
)
]
bootstrapper.resolver.search_crossref = lambda topic, limit=5: [ # type: ignore[method-assign]
BibEntry(
entry_type="article",
citation_key="shared2024graph",
fields={"title": "Graph Topic Ranking", "abstract": "graph"},
),
BibEntry(
entry_type="article",
citation_key="crossref2024other",
fields={"title": "Less relevant paper"},
),
]
bootstrapper.resolver.search_datacite = lambda topic, limit=5: [] # type: ignore[method-assign]
bootstrapper.crossref_expander.expand_entry_references = lambda _store, _key: [] # type: ignore[method-assign]
bootstrapper.openalex_expander.expand_entry = lambda _store, _key, relation_type="cites", limit=5: [] # type: ignore[method-assign]
results = bootstrapper.bootstrap(store, topic="graph topic", expand=False, topic_limit=5)
topic_results = [item for item in results if item.origin == "topic"]
assert [item.citation_key for item in topic_results] == ["shared2024graph", "crossref2024other"]
assert topic_results[0].score > topic_results[1].score
finally:
store.close()
def test_bootstrap_preview_does_not_write_to_database():
store = BibliographyStore()
try:
bootstrapper = Bootstrapper()
from citegeist import BibEntry
bootstrapper.resolver.search_openalex = lambda topic, limit=5: [ # type: ignore[method-assign]
BibEntry(entry_type="article", citation_key="preview2024graph", fields={"title": "Preview Graph Topic"})
]
bootstrapper.resolver.search_crossref = lambda topic, limit=5: [] # type: ignore[method-assign]
bootstrapper.resolver.search_datacite = lambda topic, limit=5: [] # type: ignore[method-assign]
results = bootstrapper.bootstrap(store, topic="graph topic", expand=False, preview_only=True)
assert [item.citation_key for item in results] == ["preview2024graph"]
assert store.get_entry("preview2024graph") is None
finally:
store.close()
def test_bootstrap_topic_commit_limit_restricts_persisted_candidates():
store = BibliographyStore()
try:
bootstrapper = Bootstrapper()
from citegeist import BibEntry
bootstrapper.resolver.search_openalex = lambda topic, limit=5: [ # type: ignore[method-assign]
BibEntry(entry_type="article", citation_key="rank1", fields={"title": "Graph Topic One"}),
BibEntry(entry_type="article", citation_key="rank2", fields={"title": "Graph Topic Two"}),
]
bootstrapper.resolver.search_crossref = lambda topic, limit=5: [] # type: ignore[method-assign]
bootstrapper.resolver.search_datacite = lambda topic, limit=5: [] # type: ignore[method-assign]
bootstrapper.crossref_expander.expand_entry_references = lambda _store, _key: [] # type: ignore[method-assign]
bootstrapper.openalex_expander.expand_entry = lambda _store, _key, relation_type="cites", limit=5: [] # type: ignore[method-assign]
results = bootstrapper.bootstrap(
store,
topic="graph topic",
expand=False,
topic_limit=5,
topic_commit_limit=1,
)
assert [item.citation_key for item in results if item.origin == "topic"] == ["rank1"]
assert store.get_entry("rank1") is not None
assert store.get_entry("rank2") is None
finally:
store.close()

1078
tests/test_cli.py Normal file

File diff suppressed because it is too large

69
tests/test_expand.py Normal file
View File

@@ -0,0 +1,69 @@
from citegeist.expand import CrossrefExpander, _crossref_reference_to_entry
from citegeist.storage import BibliographyStore
def test_crossref_reference_to_entry_prefers_doi_key():
entry = _crossref_reference_to_entry(
{
"DOI": "10.1000/example-ref",
"article-title": "Discovered Reference",
"author": "Doe, Alex",
"year": "2022",
"journal-title": "Journal of Discovery",
},
"seed2024",
1,
)
assert entry.citation_key == "doi101000exampleref"
assert entry.fields["doi"] == "10.1000/example-ref"
assert entry.fields["journal"] == "Journal of Discovery"
def test_crossref_expander_creates_draft_nodes_and_relations():
store = BibliographyStore()
try:
store.ingest_bibtex(
"""
@article{seed2024,
author = {Seed, Alice},
title = {Seed Paper},
year = {2024},
doi = {10.1000/seed-doi}
}
"""
)
expander = CrossrefExpander()
expander.resolver.source_client.get_json = lambda _url: { # type: ignore[method-assign]
"message": {
"reference": [
{
"DOI": "10.1000/example-ref",
"article-title": "Discovered Reference",
"author": "Doe, Alex",
"year": "2022",
"journal-title": "Journal of Discovery",
},
{
"unstructured": "Unstructured reference string",
"year": "2021",
},
]
}
}
results = expander.expand_entry_references(store, "seed2024")
assert [result.discovered_citation_key for result in results] == [
"doi101000exampleref",
"ref2021unstructured2",
]
discovered = store.get_entry("doi101000exampleref")
assert discovered is not None
assert discovered["review_status"] == "draft"
assert store.get_relations("seed2024") == ["doi101000exampleref", "ref2021unstructured2"]
relation_provenance = store.get_relation_provenance("seed2024")
assert relation_provenance[0]["source_type"] == "graph_expand"
finally:
store.close()

65
tests/test_extract.py Normal file
View File

@@ -0,0 +1,65 @@
from citegeist import extract_references, parse_bibtex
from citegeist.cli import main
SAMPLE_REFERENCES = """
[1] Smith, Jane and Doe, Alex. 2024. Graph-first bibliography augmentation. Journal of Research Systems.
[2] Miller, Sam. 2023. Semantic search for research corpora. Proceedings of the Retrieval Workshop.
"""
APA_AND_BOOK_REFERENCES = """
Brown, T., & Green, P. (2021). Retrieval methods for scholarly corpora. Journal of Information Retrieval.
Nguyen, An. Research Design for Literature Mapping. Example University Press, 2020.
"""
WRAPPED_REFERENCES = """
[1] Taylor, Ann. 2022. Multi-line reference extraction
for bibliography pipelines. Journal of Parsing Systems.
[2] Chen, Bo. 2021. Another entry. Proceedings of the Mining Workshop.
"""
def test_extract_references_builds_draft_entries():
entries = extract_references(SAMPLE_REFERENCES)
assert [entry.citation_key for entry in entries] == [
"smith2024graphfirst1",
"miller2023semantic2",
]
assert entries[0].entry_type == "article"
assert entries[0].fields["journal"] == "Journal of Research Systems"
assert entries[1].entry_type == "inproceedings"
assert entries[1].fields["booktitle"] == "Proceedings of the Retrieval Workshop"
def test_extract_cli_writes_bibtex(tmp_path):
input_path = tmp_path / "references.txt"
output_path = tmp_path / "draft.bib"
input_path.write_text(SAMPLE_REFERENCES, encoding="utf-8")
exit_code = main(["extract", str(input_path), "--output", str(output_path)])
assert exit_code == 0
exported = output_path.read_text(encoding="utf-8")
parsed = {entry.citation_key: entry for entry in parse_bibtex(exported)}
assert parsed["smith2024graphfirst1"].fields["journal"] == "Journal of Research Systems"
assert parsed["miller2023semantic2"].fields["booktitle"] == "Proceedings of the Retrieval Workshop"
def test_extract_references_supports_apa_and_book_styles():
entries = extract_references(APA_AND_BOOK_REFERENCES)
assert [entry.entry_type for entry in entries] == ["article", "book"]
assert entries[0].fields["journal"] == "Journal of Information Retrieval"
assert entries[0].fields["author"] == "Brown, T., and Green, P"
assert entries[1].fields["publisher"] == "Example University Press"
assert entries[1].fields["title"] == "Research Design for Literature Mapping"
def test_extract_references_joins_wrapped_reference_lines():
entries = extract_references(WRAPPED_REFERENCES)
assert len(entries) == 2
assert entries[0].fields["title"] == "Multi-line reference extraction for bibliography pipelines"
assert entries[0].fields["journal"] == "Journal of Parsing Systems"

293
tests/test_harvest.py Normal file
View File

@@ -0,0 +1,293 @@
from citegeist import OaiPmhHarvester, parse_bibtex
from citegeist.cli import main
OAI_XML = """<?xml version="1.0" encoding="UTF-8"?>
<OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/"
xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/"
xmlns:dc="http://purl.org/dc/elements/1.1/">
<ListRecords>
<record>
<header>
<identifier>oai:example.edu:123</identifier>
</header>
<metadata>
<oai_dc:dc>
<dc:title>Thesis Metadata Harvesting</dc:title>
<dc:creator>Doe, Jane</dc:creator>
<dc:date>2023-05-01</dc:date>
<dc:description>A dissertation about repository harvesting.</dc:description>
<dc:identifier>https://example.edu/items/123</dc:identifier>
<dc:publisher>Example University</dc:publisher>
<dc:type>Text</dc:type>
<dc:type>Dissertation</dc:type>
</oai_dc:dc>
</metadata>
</record>
</ListRecords>
</OAI-PMH>
"""
OAI_XML_PAGE_1 = """<?xml version="1.0" encoding="UTF-8"?>
<OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/"
xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/"
xmlns:dc="http://purl.org/dc/elements/1.1/">
<ListRecords>
<record>
<header>
<identifier>oai:example.edu:123</identifier>
</header>
<metadata>
<oai_dc:dc>
<dc:title>First Harvested Thesis</dc:title>
<dc:creator>Doe, Jane</dc:creator>
<dc:date>2023-05-01</dc:date>
<dc:type>Dissertation</dc:type>
</oai_dc:dc>
</metadata>
</record>
<resumptionToken>TOKEN123</resumptionToken>
</ListRecords>
</OAI-PMH>
"""
OAI_XML_PAGE_2 = """<?xml version="1.0" encoding="UTF-8"?>
<OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/"
xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/"
xmlns:dc="http://purl.org/dc/elements/1.1/">
<ListRecords>
<record>
<header>
<identifier>oai:example.edu:456</identifier>
</header>
<metadata>
<oai_dc:dc>
<dc:title>Second Harvested Thesis</dc:title>
<dc:creator>Smith, John</dc:creator>
<dc:date>2022-05-01</dc:date>
<dc:type>Dissertation</dc:type>
</oai_dc:dc>
</metadata>
</record>
</ListRecords>
</OAI-PMH>
"""
OAI_IDENTIFY_XML = """<?xml version="1.0" encoding="UTF-8"?>
<OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/">
<Identify>
<repositoryName>Example Repository</repositoryName>
<baseURL>https://example.edu/oai</baseURL>
<protocolVersion>2.0</protocolVersion>
<adminEmail>repo@example.edu</adminEmail>
<earliestDatestamp>2001-01-01</earliestDatestamp>
<deletedRecord>persistent</deletedRecord>
<granularity>YYYY-MM-DD</granularity>
</Identify>
</OAI-PMH>
"""
OAI_LISTSETS_XML = """<?xml version="1.0" encoding="UTF-8"?>
<OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/">
<ListSets>
<set>
<setSpec>theses</setSpec>
<setName>Theses and Dissertations</setName>
<setDescription>
<description>This set contains graduate theses.</description>
</setDescription>
</set>
</ListSets>
</OAI-PMH>
"""
OAI_METADATA_FORMATS_XML = """<?xml version="1.0" encoding="UTF-8"?>
<OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/">
<ListMetadataFormats>
<metadataFormat>
<metadataPrefix>oai_dc</metadataPrefix>
<schema>http://www.openarchives.org/OAI/2.0/oai_dc.xsd</schema>
<metadataNamespace>http://www.openarchives.org/OAI/2.0/oai_dc/</metadataNamespace>
</metadataFormat>
<metadataFormat>
<metadataPrefix>mods</metadataPrefix>
<schema>http://www.loc.gov/standards/mods/v3/mods-3-7.xsd</schema>
<metadataNamespace>http://www.loc.gov/mods/v3</metadataNamespace>
</metadataFormat>
</ListMetadataFormats>
</OAI-PMH>
"""
OAI_MODS_XML = """<?xml version="1.0" encoding="UTF-8"?>
<OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/"
xmlns:mods="http://www.loc.gov/mods/v3">
<ListRecords>
<record>
<header>
<identifier>oai:example.edu:mods123</identifier>
</header>
<metadata>
<mods:mods>
<mods:titleInfo>
<mods:title>MODS Thesis Title</mods:title>
</mods:titleInfo>
<mods:name>
<mods:namePart>Doe</mods:namePart>
<mods:namePart>Jane</mods:namePart>
<mods:role>
<mods:roleTerm>author</mods:roleTerm>
</mods:role>
</mods:name>
<mods:originInfo>
<mods:publisher>Example University</mods:publisher>
<mods:dateIssued>2022</mods:dateIssued>
</mods:originInfo>
<mods:genre>dissertation</mods:genre>
<mods:abstract>MODS abstract text.</mods:abstract>
<mods:location>
<mods:url>https://example.edu/mods123</mods:url>
</mods:location>
</mods:mods>
</metadata>
</record>
</ListRecords>
</OAI-PMH>
"""
def test_oai_harvester_maps_dublin_core_to_bibentry():
harvester = OaiPmhHarvester()
from xml.etree import ElementTree as ET
harvester.source_client.get_xml = lambda _url: ET.fromstring(OAI_XML) # type: ignore[method-assign]
results = harvester.list_records("https://example.edu/oai")
assert len(results) == 1
entry = results[0].entry
assert entry.entry_type == "phdthesis"
assert entry.fields["title"] == "Thesis Metadata Harvesting"
assert entry.fields["author"] == "Doe, Jane"
assert entry.fields["oai"] == "oai:example.edu:123"
def test_oai_harvester_follows_resumption_tokens():
harvester = OaiPmhHarvester()
from xml.etree import ElementTree as ET
payloads = iter([ET.fromstring(OAI_XML_PAGE_1), ET.fromstring(OAI_XML_PAGE_2)])
harvester.source_client.get_xml = lambda _url: next(payloads) # type: ignore[method-assign]
results = harvester.list_records("https://example.edu/oai")
assert [result.identifier for result in results] == [
"oai:example.edu:123",
"oai:example.edu:456",
]
assert [result.entry.citation_key for result in results] == [
"doe2023first1",
"smith2022second2",
]
def test_oai_harvester_passes_date_filters():
harvester = OaiPmhHarvester()
seen_urls: list[str] = []
from xml.etree import ElementTree as ET
def fake_get_xml(url: str):
seen_urls.append(url)
return ET.fromstring(OAI_XML)
harvester.source_client.get_xml = fake_get_xml # type: ignore[method-assign]
harvester.list_records(
"https://example.edu/oai",
date_from="2023-01-01",
date_until="2023-12-31",
limit=1,
)
assert "from=2023-01-01" in seen_urls[0]
assert "until=2023-12-31" in seen_urls[0]
def test_oai_harvester_maps_mods_records():
harvester = OaiPmhHarvester()
from xml.etree import ElementTree as ET
harvester.source_client.get_xml = lambda _url: ET.fromstring(OAI_MODS_XML) # type: ignore[method-assign]
results = harvester.list_records("https://example.edu/oai", metadata_prefix="mods")
assert len(results) == 1
entry = results[0].entry
assert entry.entry_type == "phdthesis"
assert entry.fields["title"] == "MODS Thesis Title"
assert entry.fields["author"] == "Doe, Jane"
assert entry.fields["publisher"] == "Example University"
assert entry.fields["abstract"] == "MODS abstract text."
def test_oai_harvester_can_identify_repository_and_list_sets():
harvester = OaiPmhHarvester()
from xml.etree import ElementTree as ET
payloads = iter(
[ET.fromstring(OAI_IDENTIFY_XML), ET.fromstring(OAI_LISTSETS_XML), ET.fromstring(OAI_METADATA_FORMATS_XML)]
)
harvester.source_client.get_xml = lambda _url: next(payloads) # type: ignore[method-assign]
identify = harvester.identify("https://example.edu/oai")
sets = harvester.list_sets("https://example.edu/oai")
formats = harvester.list_metadata_formats("https://example.edu/oai")
assert identify["repositoryName"] == "Example Repository"
assert identify["granularity"] == "YYYY-MM-DD"
assert sets[0].set_spec == "theses"
assert sets[0].set_name == "Theses and Dissertations"
assert "graduate theses" in sets[0].set_description
assert [item.metadata_prefix for item in formats] == ["oai_dc", "mods"]
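# End-to-end CLI check: list_records is patched to return pre-harvested
# results, and the harvest-oai subcommand should ingest them into the SQLite
# store given via --db.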
def test_harvest_oai_cli_ingests_records(tmp_path):
from unittest.mock import patch
database = tmp_path / "library.sqlite3"
harvester = OaiPmhHarvester()
from xml.etree import ElementTree as ET
harvester.source_client.get_xml = lambda _url: ET.fromstring(OAI_XML) # type: ignore[method-assign]
harvested = harvester.list_records("https://example.edu/oai")
with patch("citegeist.cli.OaiPmhHarvester.list_records") as mocked_list:
mocked_list.return_value = harvested
exit_code = main(
[
"--db",
str(database),
"harvest-oai",
"https://example.edu/oai",
"--metadata-prefix",
"oai_dc",
"--from",
"2023-01-01",
"--until",
"2023-12-31",
"--limit",
"5",
]
)
assert exit_code == 0
from citegeist.storage import BibliographyStore
store = BibliographyStore(database)
try:
entry = store.list_entries(limit=10)[0]
assert entry["citation_key"] == "doe2023thesis1"
bibtex = store.get_entry_bibtex("doe2023thesis1")
parsed = parse_bibtex(bibtex or "")
assert parsed[0].fields["oai"] == "oai:example.edu:123"
finally:
store.close()


@ -0,0 +1,52 @@
from __future__ import annotations
import os
import pytest
from citegeist import MetadataResolver, SourceClient
pytestmark = pytest.mark.live
def _live_client() -> SourceClient:
cache_dir = os.environ.get("CITEGEIST_SOURCE_CACHE", ".cache/citegeist")
return SourceClient(
cache_dir=cache_dir,
fixtures_dir=os.environ.get("CITEGEIST_SOURCE_FIXTURES"),
)
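# Each live test resolves a well-known real identifier (a Nature Physics DOI,
# the "Attention Is All You Need" arXiv ID) and asserts only on fields that
# should be stable across API responses.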
def test_live_crossref_doi_resolution():
resolver = MetadataResolver(source_client=_live_client())
resolution = resolver.resolve_doi("10.1038/nphys1170")
assert resolution is not None
assert resolution.entry.fields.get("doi") == "10.1038/nphys1170"
assert resolution.entry.fields.get("title")
def test_live_arxiv_resolution():
resolver = MetadataResolver(source_client=_live_client())
resolution = resolver.resolve_arxiv("1706.03762")
assert resolution is not None
assert resolution.entry.fields.get("arxiv") == "1706.03762"
assert resolution.entry.fields.get("title")
def test_live_openalex_title_search():
resolver = MetadataResolver(source_client=_live_client())
resolution = resolver.search_openalex_best_match(
title="Attention Is All You Need",
author_text="Ashish Vaswani",
year="2017",
)
assert resolution is not None
assert resolution.entry.fields.get("title")
assert resolution.entry.fields.get("openalex")


@ -0,0 +1,84 @@
from citegeist.expand import OpenAlexExpander, _openalex_work_to_entry
from citegeist.storage import BibliographyStore
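# OpenAlex serves abstracts as an inverted index (word -> positions); the
# mapper is expected to flatten that back into running text, as asserted below.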
def test_openalex_work_to_entry_maps_basic_fields():
entry = _openalex_work_to_entry(
{
"id": "https://openalex.org/W12345",
"doi": "https://doi.org/10.1000/example-openalex",
"display_name": "OpenAlex Discovered Work",
"publication_year": 2022,
"type": "article",
"authorships": [{"author": {"display_name": "Jane Smith"}}],
"primary_location": {"source": {"display_name": "Journal of Graph Discovery"}},
"abstract_inverted_index": {"Graph": [0], "discovery": [1]},
}
)
assert entry.citation_key == "openalexw12345"
assert entry.fields["openalex"] == "W12345"
assert entry.fields["doi"] == "10.1000/example-openalex"
assert entry.fields["journal"] == "Journal of Graph Discovery"
assert entry.fields["abstract"] == "Graph discovery"
def test_openalex_expander_adds_outgoing_and_incoming_edges():
store = BibliographyStore()
try:
store.ingest_bibtex(
"""
@article{seed2024,
author = {Seed, Alice},
title = {Seed Paper},
year = {2024},
doi = {10.1000/seed-doi}
}
"""
)
expander = OpenAlexExpander()
payloads = iter(
[
{
"results": [
{
"id": "https://openalex.org/WSEED",
}
]
},
{
"results": [
{
"id": "https://openalex.org/WDISCOVERED",
"display_name": "Referenced OpenAlex Work",
"publication_year": 2021,
"type": "article",
"authorships": [{"author": {"display_name": "Bob Known"}}],
"primary_location": {"source": {"display_name": "OpenAlex Journal"}},
}
]
},
{
"results": [
{
"id": "https://openalex.org/WCITING",
"display_name": "Citing OpenAlex Work",
"publication_year": 2025,
"type": "article",
"authorships": [{"author": {"display_name": "Carol Citing"}}],
}
]
},
]
)
expander.resolver.source_client.get_json = lambda _url: next(payloads) # type: ignore[method-assign]
outgoing = expander.expand_entry(store, "seed2024", relation_type="cites", limit=5)
incoming = expander.expand_entry(store, "seed2024", relation_type="cited_by", limit=5)
assert outgoing[0].discovered_citation_key == "openalexwdiscovered"
assert incoming[0].source_citation_key == "openalexwciting"
assert "openalexwdiscovered" in store.get_relations("seed2024", "cites")
assert "seed2024" in store.get_relations("openalexwciting", "cites")
finally:
store.close()

403
tests/test_resolve.py Normal file

@ -0,0 +1,403 @@
from xml.etree import ElementTree as ET
from citegeist.bibtex import BibEntry, render_bibtex
from citegeist.resolve import (
MetadataResolver,
_arxiv_atom_entry_to_bib,
_crossref_message_to_entry,
_datacite_work_to_entry,
_openalex_work_to_entry,
merge_entries_with_conflicts,
merge_entries,
)
def test_crossref_message_to_entry_maps_basic_fields():
entry = _crossref_message_to_entry(
{
"type": "journal-article",
"title": ["Graph-first bibliography augmentation"],
"DOI": "10.1000/example-doi",
"URL": "https://doi.org/10.1000/example-doi",
"container-title": ["Journal of Graph Studies"],
"author": [{"family": "Smith", "given": "Jane"}],
"issued": {"date-parts": [[2024, 5, 1]]},
}
)
assert entry.entry_type == "article"
assert entry.fields["author"] == "Smith, Jane"
assert entry.fields["journal"] == "Journal of Graph Studies"
assert entry.fields["year"] == "2024"
def test_arxiv_atom_entry_to_bib_maps_basic_fields():
xml = ET.fromstring(
"""
<entry xmlns="http://www.w3.org/2005/Atom" xmlns:arxiv="http://arxiv.org/schemas/atom">
<title>Semantic search for research corpora</title>
<summary>Dense retrieval improves recall.</summary>
<published>2023-01-15T00:00:00Z</published>
<author><name>Miller, Sam</name></author>
<arxiv:doi>10.1000/arxiv-example</arxiv:doi>
</entry>
"""
)
entry = _arxiv_atom_entry_to_bib(xml, "2301.12345")
assert entry.fields["author"] == "Miller, Sam"
assert entry.fields["arxiv"] == "2301.12345"
assert entry.fields["doi"] == "10.1000/arxiv-example"
def test_merge_entries_prefers_existing_values_and_adds_missing_fields():
base = BibEntry(
entry_type="article",
citation_key="smith2024graphs",
fields={"title": "Graph-first bibliography augmentation", "doi": "10.1000/example-doi"},
)
resolved = BibEntry(
entry_type="article",
citation_key="otherkey",
fields={"title": "Different title", "journal": "Journal of Graph Studies"},
)
merged = merge_entries(base, resolved)
assert merged.fields["title"] == "Graph-first bibliography augmentation"
assert merged.fields["journal"] == "Journal of Graph Studies"
def test_merge_entries_with_conflicts_records_disagreements():
base = BibEntry(
entry_type="article",
citation_key="smith2024graphs",
fields={"title": "Existing Title", "journal": "Current Journal"},
)
resolved = BibEntry(
entry_type="article",
citation_key="resolved",
fields={"title": "Resolved Title", "journal": "Current Journal", "year": "2024"},
)
merged, conflicts = merge_entries_with_conflicts(base, resolved)
assert merged.fields["title"] == "Existing Title"
assert merged.fields["year"] == "2024"
assert conflicts == [
{
"field_name": "title",
"current_value": "Existing Title",
"proposed_value": "Resolved Title",
}
]
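# Identifier-based resolution appears to run in a fixed order (DOI, presumably
# via Crossref, then DataCite, then DBLP) before any title search; the fakes
# here just record the calls.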
def test_resolver_tries_doi_before_dblp():
resolver = MetadataResolver()
calls: list[tuple[str, str]] = []
def fake_doi(value: str):
calls.append(("doi", value))
return None
def fake_dblp(value: str):
calls.append(("dblp", value))
return None
def fake_datacite(value: str):
calls.append(("datacite", value))
return None
resolver.resolve_doi = fake_doi # type: ignore[method-assign]
resolver.resolve_datacite_doi = fake_datacite # type: ignore[method-assign]
resolver.resolve_dblp = fake_dblp # type: ignore[method-assign]
resolver.resolve_entry(
BibEntry(
entry_type="article",
citation_key="smith2024graphs",
fields={"doi": "10.1000/example-doi", "dblp": "conf/test/Smith24"},
)
)
assert calls == [
("doi", "10.1000/example-doi"),
("datacite", "10.1000/example-doi"),
("dblp", "conf/test/Smith24"),
]
def test_openalex_work_to_entry_maps_basic_fields():
entry = _openalex_work_to_entry(
{
"id": "https://openalex.org/W12345",
"doi": "https://doi.org/10.1000/example-openalex",
"display_name": "OpenAlex Resolved Work",
"publication_year": 2022,
"type": "article",
"authorships": [{"author": {"display_name": "Jane Smith"}}],
"primary_location": {"source": {"display_name": "Journal of Open Graphs"}},
"abstract_inverted_index": {"OpenAlex": [0], "resolved": [1]},
}
)
assert entry.citation_key == "openalexw12345"
assert entry.fields["openalex"] == "W12345"
assert entry.fields["doi"] == "10.1000/example-openalex"
assert entry.fields["journal"] == "Journal of Open Graphs"
assert entry.fields["abstract"] == "OpenAlex resolved"
def test_resolver_can_resolve_openalex_id():
resolver = MetadataResolver()
resolver.source_client.get_json = lambda _url: { # type: ignore[method-assign]
"id": "https://openalex.org/W12345",
"display_name": "OpenAlex Resolved Work",
"publication_year": 2022,
"type": "article",
"authorships": [{"author": {"display_name": "Jane Smith"}}],
}
resolution = resolver.resolve_openalex("W12345")
assert resolution is not None
assert resolution.source_label == "openalex:id:W12345"
assert resolution.entry.fields["openalex"] == "W12345"
def test_resolver_falls_back_to_openalex_title_search():
resolver = MetadataResolver()
resolver.search_datacite = lambda title, limit=5: [] # type: ignore[method-assign]
resolver.search_crossref = lambda title, limit=5: [] # type: ignore[method-assign]
resolver.search_openalex = lambda title, limit=5: [ # type: ignore[method-assign]
_openalex_work_to_entry(
{
"id": "https://openalex.org/W12345",
"display_name": title,
"publication_year": 2022,
"type": "article",
"authorships": [{"author": {"display_name": "Jane Smith"}}],
}
)
]
resolution = resolver.resolve_entry(
BibEntry(
entry_type="article",
citation_key="smith2022openalex",
fields={"title": "OpenAlex Resolved Work", "author": "Jane Smith", "year": "2022"},
)
)
assert resolution is not None
assert resolution.source_label == "openalex:search:OpenAlex Resolved Work"
assert resolution.entry.fields["openalex"] == "W12345"
def test_resolver_prefers_exact_crossref_title_match_before_datacite():
resolver = MetadataResolver()
resolver.search_crossref = lambda title, limit=5: [ # type: ignore[method-assign]
_crossref_message_to_entry(
{
"type": "journal-article",
"title": [title],
"DOI": "10.1126/science.1090005",
"container-title": ["Science"],
"author": [
{"family": "King", "given": "Mary-Claire"},
{"family": "Wilson", "given": "A. C."},
],
"issued": {"date-parts": [[1975, 4, 11]]},
}
)
]
resolver.search_datacite = lambda title, limit=5: [ # type: ignore[method-assign]
_datacite_work_to_entry(
{
"attributes": {
"doi": "10.5061/dryad.v6wwpzh17",
"titles": [
{
"title": "Conserved patterns and locomotor-related evolutionary constraints in the hominoid vertebral column"
}
],
"creators": [
{"familyName": "Villamil", "givenName": "Catalina I."},
{"familyName": "Middleton", "givenName": "Emily R."},
],
"publicationYear": 2024,
"types": {"resourceTypeGeneral": "Dataset"},
}
}
)
]
resolution = resolver.resolve_entry(
BibEntry(
entry_type="article",
citation_key="king1975evolution2",
fields={
"title": "Evolution at two levels in humans and chimpanzees",
"author": "King, M. C. and Wilson, A. C.",
"year": "1975",
},
)
)
assert resolution is not None
assert resolution.source_label == "crossref:search:Evolution at two levels in humans and chimpanzees"
assert resolution.entry.fields["doi"] == "10.1126/science.1090005"
def test_resolver_rejects_mismatched_title_search_candidates():
resolver = MetadataResolver()
resolver.search_crossref = lambda title, limit=5: [] # type: ignore[method-assign]
resolver.search_datacite = lambda title, limit=5: [ # type: ignore[method-assign]
_datacite_work_to_entry(
{
"attributes": {
"doi": "10.5061/dryad.v6wwpzh17",
"titles": [
{
"title": "Conserved patterns and locomotor-related evolutionary constraints in the hominoid vertebral column"
}
],
"creators": [
{"familyName": "Villamil", "givenName": "Catalina I."},
],
"publicationYear": 2024,
"types": {"resourceTypeGeneral": "Dataset"},
}
}
)
]
resolver.search_openalex = lambda title, limit=5: [ # type: ignore[method-assign]
_openalex_work_to_entry(
{
"id": "https://openalex.org/W2033360601",
"display_name": "Immunological relatedness of glucose 6-phosphate dehydrogenases from vertebrate and invertebrate species.",
"publication_year": 1978,
"type": "article",
"authorships": [
{"author": {"display_name": "Yoshikazu Sado"}},
{"author": {"display_name": "Samuel H. Hori"}},
],
"doi": "https://doi.org/10.1266/jjg.53.91",
}
)
]
resolution = resolver.resolve_entry(
BibEntry(
entry_type="article",
citation_key="sarich1967immunological1",
fields={
"title": "Immunological Time Scale for Homonid Evolution",
"author": "Sarich, V. and Wilson, A.",
"year": "1967",
},
)
)
assert resolution is None
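# DataCite's resourceTypeGeneral "Dissertation" maps to the BibTeX phdthesis
# entry type, and the Abstract description becomes the abstract field.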
def test_datacite_work_to_entry_maps_basic_fields():
entry = _datacite_work_to_entry(
{
"attributes": {
"doi": "10.1000/datacite-example",
"titles": [{"title": "Repository Dissertation Record"}],
"creators": [{"familyName": "Doe", "givenName": "Jane"}],
"publicationYear": 2021,
"publisher": "Example University",
"url": "https://example.edu/record/123",
"types": {"resourceTypeGeneral": "Dissertation"},
"descriptions": [
{
"descriptionType": "Abstract",
"description": "An abstract from DataCite.",
}
],
}
}
)
assert entry.entry_type == "phdthesis"
assert entry.fields["doi"] == "10.1000/datacite-example"
assert entry.fields["author"] == "Doe, Jane"
assert entry.fields["publisher"] == "Example University"
assert entry.fields["abstract"] == "An abstract from DataCite."
def test_resolver_can_resolve_datacite_doi():
resolver = MetadataResolver()
resolver.source_client.get_json = lambda _url: { # type: ignore[method-assign]
"data": {
"attributes": {
"doi": "10.1000/datacite-example",
"titles": [{"title": "Repository Dissertation Record"}],
"creators": [{"familyName": "Doe", "givenName": "Jane"}],
"publicationYear": 2021,
"types": {"resourceTypeGeneral": "Dissertation"},
}
}
}
resolution = resolver.resolve_datacite_doi("10.1000/datacite-example")
assert resolution is not None
assert resolution.source_label == "datacite:doi:10.1000/datacite-example"
assert resolution.entry.entry_type == "phdthesis"
def test_resolver_can_fall_back_to_datacite_title_search():
resolver = MetadataResolver()
resolver.search_crossref = lambda title, limit=5: [] # type: ignore[method-assign]
resolver.search_datacite = lambda title, limit=5: [ # type: ignore[method-assign]
_datacite_work_to_entry(
{
"attributes": {
"doi": "10.1000/datacite-example",
"titles": [{"title": title}],
"creators": [{"familyName": "Doe", "givenName": "Jane"}],
"publicationYear": 2021,
"types": {"resourceTypeGeneral": "Dissertation"},
}
}
)
]
resolver.search_openalex = lambda title, limit=5: [] # type: ignore[method-assign]
resolution = resolver.resolve_entry(
BibEntry(
entry_type="misc",
citation_key="draft1",
fields={"title": "Repository Dissertation Record", "author": "Doe, Jane", "year": "2021"},
)
)
assert resolution is not None
assert resolution.source_label == "datacite:search:Repository Dissertation Record"
assert resolution.entry.fields["doi"] == "10.1000/datacite-example"
def test_render_bibtex_tolerates_unmatched_braces_in_field_values():
rendered = render_bibtex(
[
BibEntry(
entry_type="misc",
citation_key="broken2026",
fields={
"author": "Broken, Example",
"title": "Unmatched { braces } example } tail",
"year": "2026",
"note": "Open { brace only",
},
)
]
)
assert "@misc{broken2026," in rendered
assert "Unmatched { braces } example ) tail" in rendered
assert "Open ( brace only" in rendered

41
tests/test_sources.py Normal file

@ -0,0 +1,41 @@
from pathlib import Path
from citegeist.sources import SourceClient
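# SourceClient lookup order under test: a fixtures directory takes precedence
# over the network, fetched responses are written to the cache, and
# _fetch_bytes is the seam stubbed out to avoid real HTTP.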
def test_source_client_reads_fixture_before_network(tmp_path: Path):
fixtures_dir = tmp_path / "fixtures"
fixtures_dir.mkdir()
client = SourceClient(cache_dir=tmp_path / "cache", fixtures_dir=fixtures_dir)
url = "https://api.crossref.org/works/10.1000/example"
fixture_path = fixtures_dir / client._cache_key(url, "json") # noqa: SLF001
fixture_path.write_text('{"message": {"DOI": "10.1000/example"}}', encoding="utf-8")
payload = client.get_json(url)
assert payload["message"]["DOI"] == "10.1000/example"
def test_source_client_writes_cache_after_fetch(tmp_path: Path):
cache_dir = tmp_path / "cache"
client = SourceClient(cache_dir=cache_dir)
url = "https://example.org/test"
client._fetch_bytes = lambda _url: b'{"ok": true}' # type: ignore[method-assign]
payload = client.get_json(url)
assert payload["ok"] is True
assert any(cache_dir.iterdir())
def test_source_client_falls_back_to_latin1_for_text(tmp_path: Path):
client = SourceClient(cache_dir=tmp_path / "cache")
url = "https://example.org/latin1"
client._fetch_bytes = lambda _url: "café".encode("iso-8859-1") # type: ignore[method-assign]
payload = client.get_text(url)
assert payload == "café"

379
tests/test_storage.py Normal file

@ -0,0 +1,379 @@
from citegeist import BibliographyStore, parse_bibtex
SAMPLE_BIB = """
@article{smith2024graphs,
author = {Smith, Jane and Doe, Alex},
title = {Graph-first bibliography augmentation},
year = {2024},
doi = {10.1000/graph.2024.1},
abstract = {We study citation graphs for literature discovery.},
references = {miller2023search}
}
@inproceedings{miller2023search,
author = {Miller, Sam},
title = {Semantic search for research corpora},
year = {2023},
abstract = {Dense retrieval improves recall for academic search.}
}
"""
def test_parse_bibtex_extracts_entries_and_fields():
entries = parse_bibtex(SAMPLE_BIB)
assert [entry.citation_key for entry in entries] == ["smith2024graphs", "miller2023search"]
assert entries[0].fields["title"] == "Graph-first bibliography augmentation"
assert entries[0].fields["references"] == "miller2023search"
def test_store_ingests_entries_relations_and_search_text():
store = BibliographyStore()
try:
store.ingest_bibtex(
SAMPLE_BIB,
fulltext_by_key={
"smith2024graphs": "This paper links citation graphs with semantic search over abstracts."
},
)
entry = store.get_entry("smith2024graphs")
assert entry is not None
assert entry["doi"] == "10.1000/graph.2024.1"
assert store.get_relations("smith2024graphs") == ["miller2023search"]
results = store.search_text("semantic")
assert [row["citation_key"] for row in results][:2] == [
"miller2023search",
"smith2024graphs",
]
finally:
store.close()
def test_store_exports_bibtex_from_normalized_rows():
store = BibliographyStore()
try:
store.ingest_bibtex(SAMPLE_BIB)
exported = store.export_bibtex()
parsed = {entry.citation_key: entry for entry in parse_bibtex(exported)}
assert "@article{smith2024graphs," in exported
assert "@inproceedings{miller2023search," in exported
assert parsed["smith2024graphs"].fields["author"] == "Smith, Jane and Doe, Alex"
assert parsed["smith2024graphs"].fields["references"] == "miller2023search"
finally:
store.close()
def test_store_records_provenance_and_review_status():
store = BibliographyStore()
try:
store.ingest_bibtex(SAMPLE_BIB, source_label="fixtures/sample.bib", review_status="draft")
entry = store.get_entry("smith2024graphs")
assert entry is not None
assert entry["review_status"] == "draft"
provenance = store.get_field_provenance("smith2024graphs")
assert provenance
assert provenance[0]["source_type"] == "bibtex"
assert provenance[0]["source_label"] == "fixtures/sample.bib"
assert store.set_entry_status("smith2024graphs", "reviewed") is True
updated = store.get_entry("smith2024graphs")
assert updated is not None
assert updated["review_status"] == "reviewed"
finally:
store.close()
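# traverse_graph walks "cites" edges up to max_depth and surfaces dangling
# targets (missing2022) with target_exists=False instead of dropping them.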
def test_store_traverses_graph_and_surfaces_missing_targets():
store = BibliographyStore()
try:
store.ingest_bibtex(
"""
@article{seed2024,
author = {Seed, Alice},
title = {Seed Paper},
year = {2024},
references = {known2023, missing2022}
}
@article{known2023,
author = {Known, Bob},
title = {Known Paper},
year = {2023},
references = {leaf2021}
}
@article{leaf2021,
author = {Leaf, Carol},
title = {Leaf Paper},
year = {2021}
}
""",
review_status="reviewed",
)
rows = store.traverse_graph(["seed2024"], relation_types=["cites"], max_depth=2)
assert [row["target_citation_key"] for row in rows] == [
"known2023",
"missing2022",
"leaf2021",
]
assert rows[1]["target_exists"] is False
assert rows[2]["depth"] == 2
finally:
store.close()
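# Field conflicts are stored as open proposals next to the entry; the next two
# tests cover the review transitions and apply_conflict_value, which copies the
# latest proposed value into the entry and marks the conflict accepted.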
def test_store_records_and_updates_field_conflicts():
store = BibliographyStore()
try:
store.ingest_bibtex(
"""
@article{seed2024,
author = {Seed, Alice},
title = {Seed Paper},
year = {2024}
}
"""
)
ok = store.record_conflicts(
"seed2024",
[
{
"field_name": "title",
"current_value": "Seed Paper",
"proposed_value": "Resolved Seed Paper",
}
],
source_type="resolver",
source_label="crossref:doi:10.1000/seed",
)
assert ok is True
conflicts = store.get_field_conflicts("seed2024")
assert conflicts[0]["field_name"] == "title"
assert conflicts[0]["status"] == "open"
assert store.set_conflict_status("seed2024", "title", "accepted") == 1
updated = store.get_field_conflicts("seed2024", status="accepted")
assert len(updated) == 1
finally:
store.close()
def test_store_can_apply_latest_conflict_value():
store = BibliographyStore()
try:
store.ingest_bibtex(
"""
@article{seed2024,
author = {Seed, Alice},
title = {Seed Paper},
year = {2024}
}
"""
)
store.record_conflicts(
"seed2024",
[
{
"field_name": "title",
"current_value": "Seed Paper",
"proposed_value": "Resolved Seed Paper",
}
],
source_type="resolver",
source_label="crossref:doi:10.1000/seed",
)
assert store.apply_conflict_value("seed2024", "title") is True
entry = store.get_entry("seed2024")
assert entry is not None
assert entry["title"] == "Resolved Seed Paper"
accepted = store.get_field_conflicts("seed2024", status="accepted")
assert len(accepted) == 1
finally:
store.close()
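# Topics are many-to-many: add_entry_topic attaches a slug/name plus
# provenance, and list_topics aggregates an entry_count per topic.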
def test_store_supports_entry_topic_membership():
store = BibliographyStore()
try:
store.ingest_bibtex(
"""
@article{seed2024,
author = {Seed, Alice},
title = {Seed Paper},
year = {2024}
}
"""
)
assert store.add_entry_topic(
"seed2024",
topic_slug="graph-methods",
topic_name="Graph Methods",
source_type="talkorigins",
source_url="https://example.org/topics/graph-methods",
source_label="topic-seed",
) is True
assert store.add_entry_topic(
"seed2024",
topic_slug="semantic-search",
topic_name="Semantic Search",
source_type="talkorigins",
source_url="https://example.org/topics/semantic-search",
source_label="topic-seed",
) is True
entry = store.get_entry("seed2024")
assert entry is not None
assert [topic["slug"] for topic in entry["topics"]] == ["graph-methods", "semantic-search"]
topics = store.list_topics()
assert [topic["slug"] for topic in topics] == ["graph-methods", "semantic-search"]
assert topics[0]["entry_count"] == 1
topic = store.get_topic("graph-methods")
assert topic is not None
assert topic["name"] == "Graph Methods"
assert topic["expansion_phrase"] is None
topic_entries = store.list_topic_entries("graph-methods")
assert topic_entries[0]["citation_key"] == "seed2024"
finally:
store.close()
def test_store_can_set_topic_expansion_phrase():
store = BibliographyStore()
try:
store.ingest_bibtex(
"""
@article{seed2024,
author = {Seed, Alice},
title = {Seed Paper},
year = {2024}
}
"""
)
store.add_entry_topic(
"seed2024",
topic_slug="graph-methods",
topic_name="Graph Methods",
source_type="talkorigins",
source_url="https://example.org/topics/graph-methods",
source_label="topic-seed",
)
assert store.set_topic_expansion_phrase("graph-methods", "graph networks biology") is True
topic = store.get_topic("graph-methods")
assert topic is not None
assert topic["expansion_phrase"] == "graph networks biology"
assert topic["phrase_review_status"] == "unreviewed"
topics = store.list_topics()
assert topics[0]["expansion_phrase"] == "graph networks biology"
finally:
store.close()
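# Phrase suggestions go through a review workflow: staged as "pending", then
# promoted into expansion_phrase only once reviewed as "accepted".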
def test_store_can_stage_and_review_topic_phrase_suggestion():
store = BibliographyStore()
try:
store.ensure_topic("graph-methods", "Graph Methods")
assert store.stage_topic_phrase_suggestion(
"graph-methods",
"graph networks biology",
review_notes="generated from local titles",
) is True
staged = store.get_topic("graph-methods")
assert staged is not None
assert staged["suggested_phrase"] == "graph networks biology"
assert staged["expansion_phrase"] is None
assert staged["phrase_review_status"] == "pending"
assert staged["phrase_review_notes"] == "generated from local titles"
assert store.review_topic_phrase_suggestion(
"graph-methods",
"accepted",
review_notes="looks good",
) is True
reviewed = store.get_topic("graph-methods")
assert reviewed is not None
assert reviewed["suggested_phrase"] == "graph networks biology"
assert reviewed["expansion_phrase"] == "graph networks biology"
assert reviewed["phrase_review_status"] == "accepted"
assert reviewed["phrase_review_notes"] == "looks good"
finally:
store.close()
def test_store_can_filter_topics_by_phrase_review_status():
store = BibliographyStore()
try:
store.ensure_topic("graph-methods", "Graph Methods")
store.ensure_topic("abiogenesis", "Abiogenesis")
store.stage_topic_phrase_suggestion("graph-methods", "graph networks biology")
store.stage_topic_phrase_suggestion("abiogenesis", "abiogenesis life origin")
store.review_topic_phrase_suggestion("abiogenesis", "accepted")
pending_topics = store.list_topics(phrase_review_status="pending")
accepted_topics = store.list_topics(phrase_review_status="accepted")
assert [topic["slug"] for topic in pending_topics] == ["graph-methods"]
assert [topic["slug"] for topic in accepted_topics] == ["abiogenesis"]
finally:
store.close()
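# search_text also accepts a topic_slug filter that restricts full-text matches
# to entries assigned to that topic.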
def test_store_search_text_can_filter_by_topic():
store = BibliographyStore()
try:
store.ingest_bibtex(
"""
@article{seed2024,
author = {Seed, Alice},
title = {Graph Methods for Biology},
year = {2024},
abstract = {A graph methods paper.}
}
@article{other2023,
author = {Other, Bob},
title = {Graph Methods for Chemistry},
year = {2023},
abstract = {Another graph methods paper.}
}
"""
)
store.add_entry_topic(
"seed2024",
topic_slug="biology",
topic_name="Biology",
source_type="talkorigins",
source_url="https://example.org/topics/biology",
source_label="topic-seed",
)
store.add_entry_topic(
"other2023",
topic_slug="chemistry",
topic_name="Chemistry",
source_type="talkorigins",
source_url="https://example.org/topics/chemistry",
source_label="topic-seed",
)
store.connection.commit()
results = store.search_text("graph", topic_slug="biology")
assert [row["citation_key"] for row in results] == ["seed2024"]
finally:
store.close()

1024
tests/test_talkorigins.py Normal file

File diff suppressed because it is too large

242
tests/test_topic_expand.py Normal file

@ -0,0 +1,242 @@
from citegeist.bibtex import BibEntry
from citegeist.expand import (
ExpansionResult,
TopicExpander,
_meets_topic_assignment_threshold,
_topic_relevance_score,
)
from citegeist.storage import BibliographyStore
class FakeOpenAlexExpander:
def __init__(self, results: list[ExpansionResult] | dict[str, list[ExpansionResult]]) -> None:
self.results = results
def expand_entry(self, store, citation_key, relation_type="cites", limit=25):
if isinstance(self.results, dict):
return list(self.results.get(citation_key, []))
return list(self.results)
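# The fake expander scripts discovery results (optionally per seed key) without
# touching OpenAlex; relevance scoring and topic assignment stay real.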
def test_topic_expander_assigns_relevant_discoveries_back_to_topic():
store = BibliographyStore()
try:
store.ingest_bibtex(
"""
@article{seed2024,
author = {Seed, Alice},
title = {Abiogenesis Seed Paper},
year = {2024}
}
"""
)
store.add_entry_topic(
"seed2024",
topic_slug="abiogenesis",
topic_name="Abiogenesis",
source_type="talkorigins",
source_url="https://example.org/topics/abiogenesis",
source_label="seed",
)
store.upsert_entry(
BibEntry(
entry_type="article",
citation_key="discovered1",
fields={
"title": "Abiogenesis and origin chemistry",
"abstract": "A study of abiogenesis pathways.",
"year": "2025",
},
),
source_type="graph_expand",
source_label="test",
review_status="draft",
)
store.upsert_entry(
BibEntry(
entry_type="article",
citation_key="discovered2",
fields={
"title": "Galaxy formation dynamics",
"abstract": "Nothing about the topic.",
"year": "2025",
},
),
source_type="graph_expand",
source_label="test",
review_status="draft",
)
store.connection.commit()
expander = TopicExpander(
openalex_expander=FakeOpenAlexExpander(
[
ExpansionResult("seed2024", "discovered1", False, "cites", "openalex:cites:seed2024"),
ExpansionResult("seed2024", "discovered2", False, "cites", "openalex:cites:seed2024"),
]
)
)
results = expander.expand_topic(
store,
"abiogenesis",
topic_phrase="abiogenesis origin chemistry",
min_relevance=0.34,
)
assert len(results) == 2
assigned = {item.discovered_citation_key: item.assigned_to_topic for item in results}
assert assigned["discovered1"] is True
assert assigned["discovered2"] is False
topics = store.get_entry_topics("discovered1")
assert topics[0]["slug"] == "abiogenesis"
assert store.get_entry_topics("discovered2") == []
finally:
store.close()
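# seed_keys restricts which topic members get expanded; discoveries reachable
# only from excluded seeds must not be created or assigned.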
def test_topic_expander_can_restrict_to_allowed_seed_keys():
store = BibliographyStore()
try:
store.ingest_bibtex(
"""
@article{seed2024,
author = {Seed, Alice},
title = {Abiogenesis Seed Paper},
year = {2024}
}
@article{seed2023,
author = {Seed, Bob},
title = {Abiogenesis Historical Seed},
year = {2023}
}
"""
)
for citation_key in ("seed2024", "seed2023"):
store.add_entry_topic(
citation_key,
topic_slug="abiogenesis",
topic_name="Abiogenesis",
source_type="talkorigins",
source_url="https://example.org/topics/abiogenesis",
source_label="seed",
)
store.upsert_entry(
BibEntry(
entry_type="article",
citation_key="discovered1",
fields={
"title": "Abiogenesis origin chemistry",
"abstract": "A study of abiogenesis chemistry.",
"year": "2025",
},
),
source_type="graph_expand",
source_label="test",
review_status="draft",
)
store.connection.commit()
expander = TopicExpander(
openalex_expander=FakeOpenAlexExpander(
{"seed2023": [ExpansionResult("seed2023", "discovered1", False, "cites", "openalex:cites:seed2023")]}
)
)
results = expander.expand_topic(
store,
"abiogenesis",
topic_phrase="abiogenesis origin chemistry",
seed_keys=["seed2024"],
)
assert results == []
assert store.get_entry_topics("discovered1") == []
finally:
store.close()
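# preview_only should report what would be discovered without persisting
# entries or topic assignments; _preview_discoveries is stubbed here as the
# discovery seam.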
def test_topic_expander_preview_discovers_without_writing():
store = BibliographyStore()
try:
store.ingest_bibtex(
"""
@article{seed2024,
author = {Seed, Alice},
title = {Abiogenesis Seed Paper},
year = {2024}
}
"""
)
store.add_entry_topic(
"seed2024",
topic_slug="abiogenesis",
topic_name="Abiogenesis",
source_type="talkorigins",
source_url="https://example.org/topics/abiogenesis",
source_label="seed",
)
store.connection.commit()
expander = TopicExpander()
expander._preview_discoveries = lambda *_args, **_kwargs: [ # type: ignore[method-assign]
(
ExpansionResult(
"seed2024",
"preview1",
True,
"cites",
"openalex:cites:seed2024",
),
{
"title": "Abiogenesis origin chemistry",
"abstract": "A study of abiogenesis chemistry.",
"year": "2025",
},
)
]
results = expander.expand_topic(
store,
"abiogenesis",
topic_phrase="abiogenesis origin chemistry",
min_relevance=0.3,
preview_only=True,
)
assert len(results) == 1
assert results[0].discovered_citation_key == "preview1"
assert results[0].meets_relevance_threshold is True
assert results[0].assigned_to_topic is False
assert results[0].created_entry is True
assert store.get_entry("preview1") is None
assert store.get_entry_topics("preview1") == []
finally:
store.close()
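# The scoring helpers are heuristic: _topic_relevance_score evidently expands
# "human evolution" into related terms (hominid, fossil, primate), while the
# assignment threshold additionally requires an anchor term in the title.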
def test_topic_relevance_score_expands_human_evolution_terms():
score = _topic_relevance_score(
"human evolution",
{
"title": "Body size and proportions in early hominids",
"abstract": "A fossil and paleolithic perspective on primate ancestry.",
"journal": "Science",
},
)
assert score >= 0.15
def test_topic_assignment_requires_title_anchor():
entry = {
"title": "Phylogenies and the Comparative Method",
"abstract": "A comparative framework for primate and hominid evolution.",
"journal": "Systematic Zoology",
}
score = _topic_relevance_score("human evolution", entry)
assert score >= 0.15
assert _meets_topic_assignment_threshold("human evolution", entry, min_relevance=0.15, relevance_score=score) is False