Compare commits


No commits in common. "0c6380562b38316170a77821b9d472fba02b194d" and "b74582b72f09f36b63e459c26e3cc7ea3d0696c2" have entirely different histories.

34 changed files with 10571 additions and 238 deletions

231
.gitignore vendored

@@ -1,229 +1,6 @@
# ---> Python
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
.venv/
.cache/
# Translations
*.pyc
*.mo
library.sqlite3
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
# ---> Emacs
# -*- mode: gitignore; -*-
*~
\#*\#
/.emacs.desktop
/.emacs.desktop.lock
*.elc
auto-save-list
tramp
.\#*
# Org-mode
.org-id-locations
*_archive
# flymake-mode
*_flymake.*
# eshell files
/eshell/history
/eshell/lastdir
# elpa packages
/elpa/
# reftex files
*.rel
# AUCTeX auto folder
/auto/
# cask packages
.cask/
dist/
# Flycheck
flycheck_*.el
# server auth directory
/server/
# projectiles files
.projectile
# directory configuration
.dir-locals.el
# network security
/network-security.data
# ---> Rust
# Generated by Cargo
# will have compiled files and executables
debug/
target/
# Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
# More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
Cargo.lock
# These are backup files generated by rustfmt
**/*.rs.bk
# MSVC Windows builds of rustc generate these, which store debugging information
*.pdb

9
LICENSE

@@ -1,9 +0,0 @@
MIT License
Copyright (c) 2026 welsberr
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

16
Makefile Normal file

@@ -0,0 +1,16 @@
PYTHONPATH_SRC=PYTHONPATH=src
VENV_PYTHON=.venv/bin/python

.PHONY: test test-live live-smoke validate-talkorigins

test:
	$(PYTHONPATH_SRC) $(VENV_PYTHON) -m pytest -q

test-live:
	CITEGEIST_LIVE_TESTS=1 CITEGEIST_SOURCE_CACHE=.cache/citegeist $(PYTHONPATH_SRC) $(VENV_PYTHON) -m pytest -m live -q

live-smoke:
	CITEGEIST_SOURCE_CACHE=.cache/citegeist $(PYTHONPATH_SRC) $(VENV_PYTHON) scripts/live_smoke.py

validate-talkorigins:
	$(PYTHONPATH_SRC) $(VENV_PYTHON) -m citegeist validate-talkorigins talkorigins-out/talkorigins_manifest.json

254
README.md

@@ -1,3 +1,253 @@
# CiteGeist
A bibliography workbench based on Bibtex and local SQLite databases, aimed at several common bibliography tasks: ingestion of plain-text references, augmentation of Bibtex entries with metadata, graph representations of citations, graph expansion from a citation set, and more.
# citegeist
`citegeist` is a research-oriented bibliography workbench for building, expanding, and auditing BibTeX libraries.
The aim is not just to store citations. The aim is to help with the harder problem: finding, improving, connecting, and checking the literature around a topic while keeping BibTeX as a first-class output format.
## Repo Description
`citegeist` is a BibTeX-native research tool for citation extraction, metadata enrichment, citation-graph expansion, and semantic search over scholarly sources.
## Scope
The project is intended to support a workflow like this:
1. Start from rough references extracted from papers, notes, syllabi, or dissertations.
2. Convert them into draft BibTeX entries.
3. Enrich and correct those entries using external scholarly metadata sources.
4. Persist entries, identifiers, abstracts, and citation edges in a local database.
5. Traverse the citation graph outward to discover additional relevant works.
6. Search the local corpus semantically using abstracts and extracted full text.
7. Export verified results back into BibTeX for LaTeX use.
## Why A New Codebase
This repository starts cleanly rather than extending the older `bib/` toolkit directly.
The older toolkit is useful as prior art:
- it demonstrates identifier-driven metadata augmentation;
- it caches PDFs and extracted plaintext;
- it shows one workable model for bibliography growth.
But it is not the right long-term base:
- it is Python 2-era code;
- it is shell-script centric;
- it does not provide a normalized database for graph workflows;
- it is not structured as a reusable Python 3 library.
`citegeist` keeps the useful ideas and rebuilds the foundation around a cleaner Python 3 package boundary.
## Current Status
The initial repo includes:
- `pybtex`-backed BibTeX parsing and export in a repo-local virtual environment;
- a SQLite-backed bibliography store;
- a small CLI for ingest, search, inspection, and export;
- review-state tracking on entries, per-field ingest provenance, and field-level conflict review;
- plaintext reference extraction into draft BibTeX for numbered, APA-like, wrapped-line, and simple book-style references;
- identifier-first metadata resolution for DOI, OpenAlex, DBLP, arXiv, and DataCite-backed entries, with OpenAlex/DataCite title-search fallback;
- local citation-graph traversal over stored `cites`, `cited_by`, and `crossref` edges;
- Crossref- and OpenAlex-backed graph expansion that materializes draft related works and edge provenance;
- a dedicated source-client layer with fixture/cache support for live-source development;
- OAI-PMH Dublin Core harvesting for institutional repositories and thesis/dissertation sources;
- OAI-PMH repository discovery via `Identify`, `ListSets`, and `ListMetadataFormats` to target harvests more precisely;
- bibliography bootstrap workflows that can start from a seed `.bib`, a topic phrase, or both;
- batch bootstrap orchestration from JSON job files containing seed BibTeX paths, topic phrases, or both;
- a TalkOrigins scraper that fixes repeated-author plaintext references, emits per-topic seed BibTeX files, and writes a batch JSON specification;
- normalized tables for entries, creators, identifiers, and citation relations;
- full-text-search-ready indexing over title, abstract, and fulltext when SQLite FTS5 is available;
- tests covering parsing, ingestion, relation storage, and search.
The prioritized execution plan lives in [ROADMAP.md](./ROADMAP.md).
## Layout
```text
citegeist/
src/citegeist/
bibtex.py
storage.py
tests/
test_storage.py
pyproject.toml
```
## Quick Start
```bash
cd citegeist
python3 -m virtualenv --always-copy .venv
.venv/bin/pip install -e .
.venv/bin/pip install pytest
mkdir -p .cache/citegeist
PYTHONPATH=src .venv/bin/python - <<'PY'
from citegeist import BibliographyStore
bib = """
@article{smith2024graphs,
author = {Smith, Jane and Doe, Alex},
title = {Graph-first bibliography augmentation},
year = {2024},
abstract = {We study citation graphs for literature discovery.},
references = {miller2023search}
}
@inproceedings{miller2023search,
author = {Miller, Sam},
title = {Semantic search for research corpora},
year = {2023},
abstract = {Dense retrieval improves recall for academic search.}
}
"""
store = BibliographyStore("library.sqlite3")
store.ingest_bibtex(bib)
print(store.get_relations("smith2024graphs"))
print(store.search_text("semantic"))
store.close()
PY
.venv/bin/python -m pytest -q
```
Or use the CLI directly:
```bash
cd citegeist
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 ingest references.bib
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 search "semantic search"
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 search "origin" --topic abiogenesis
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 show --provenance --conflicts smith2024graphs
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 set-status smith2024graphs reviewed
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 resolve-conflicts smith2024graphs title accepted
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 apply-conflict smith2024graphs title
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 bootstrap --seed-bib seed.bib --topic "bayesian nonparametrics"
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 bootstrap --topic "bayesian nonparametrics" --preview --topic-commit-limit 5
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 scrape-talkorigins talkorigins-out --limit-topics 5 --limit-entries-per-topic 20
PYTHONPATH=src .venv/bin/python -m citegeist extract references.txt --output draft.bib
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 resolve smith2024graphs
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 topics
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 topic-entries abiogenesis
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 export-topic abiogenesis --output abiogenesis.bib
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 graph smith2024graphs --relation cites --depth 2 --missing-only
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 expand smith2024graphs --source crossref
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 expand smith2024graphs --source openalex --relation cited_by --limit 10
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 expand-topic abiogenesis --topic-phrase "abiogenesis origin chemistry" --source openalex --relation cites --seed-key seed2024 --min-relevance 0.3 --preview
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 set-topic-phrase abiogenesis "abiogenesis origin chemistry prebiotic"
PYTHONPATH=src .venv/bin/python -m citegeist discover-oai https://example.edu/oai
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 harvest-oai https://example.edu/oai --metadata-prefix mods --from 2024-01-01 --until 2024-12-31 --limit 10
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 export --output reviewed.bib
```
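The `extract` command is also available at the library level through `extract_references`, which returns draft `BibEntry` objects. A minimal sketch with invented reference lines; each reference block must be at least 20 characters long to be treated as a reference, and numbered, APA-like lines are among the supported shapes:
```python
from citegeist import extract_references

# Invented sample input for illustration only.
text = """
[1] Smith, J. (1999). A perfectly ordinary article title. Journal of Examples, 12, 34-56.
[2] Doe, A. (2004). Another sufficiently long reference line. Example Press.
"""

for entry in extract_references(text):
    print(entry.entry_type, entry.citation_key, entry.fields.get("title"))
```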
For live-source development, prefer fixture-backed or cache-backed source clients so resolver and expansion work can be exercised repeatedly without re-hitting upstream APIs on every run.
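A minimal sketch of that setup, following `scripts/live_smoke.py`; the fixture directory path here is an invented example:
```python
from citegeist import MetadataResolver, SourceClient

# Fixtures, when provided, are consulted before any live network call;
# cached responses land under cache_dir for reuse across runs.
client = SourceClient(cache_dir=".cache/citegeist", fixtures_dir="tests/fixtures/sources")
resolver = MetadataResolver(source_client=client)

resolution = resolver.resolve_doi("10.1038/nphys1170")
if resolution is not None:
    print(resolution.source_label, resolution.entry.fields.get("title"))
```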
For large legacy plaintext corpora such as the TalkOrigins bibliography, prefer a two-step workflow:
1. `scrape-talkorigins` to generate cleaned per-topic `seed_bib` files plus a `talkorigins_jobs.json` batch spec.
2. `bootstrap-batch` on that JSON file when you want to ingest, resolve, and expand from the generated seeds. A sample jobs file is sketched below.
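A minimal `bootstrap-batch` jobs file might look like this; the keys mirror what `load_batch_jobs` and `BatchBootstrapRunner` read in `src/citegeist/batch.py`, while the values are illustrative:
```json
{
  "jobs": [
    {
      "name": "abiogenesis_seed",
      "seed_bib": "seeds/abiogenesis.bib",
      "topic": "abiogenesis",
      "topic_slug": "abiogenesis",
      "topic_phrase": "abiogenesis origin chemistry prebiotic",
      "topic_limit": 5,
      "expand": true,
      "status": "draft"
    }
  ]
}
```
Relative `seed_bib` paths are resolved against the directory containing the jobs file.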
The TalkOrigins scrape output now includes:
- `seeds/*.bib` per-topic seed BibTeX files for `bootstrap-batch`
- `plaintext/*.txt` per-topic cleaned GSA-style plaintext with repeated authors expanded
- `site/topics/*.html` reconstructed topic pages with hide/show BibTeX blocks
- `talkorigins_full.txt` and `talkorigins_full.bib` aggregate downloads
- `snapshots/*.json` cached topic payloads so reruns can resume without re-fetching already scraped topics
After a full scrape, run:
```bash
PYTHONPATH=src .venv/bin/python -m citegeist validate-talkorigins talkorigins-out/talkorigins_manifest.json
PYTHONPATH=src .venv/bin/python -m citegeist duplicates-talkorigins talkorigins-out/talkorigins_manifest.json --limit 20
PYTHONPATH=src .venv/bin/python -m citegeist duplicates-talkorigins talkorigins-out/talkorigins_manifest.json --limit 20 --preview --weak-only
PYTHONPATH=src .venv/bin/python -m citegeist suggest-talkorigins-phrases talkorigins-out/talkorigins_manifest.json --output topic-phrases.json
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 stage-topic-phrases topic-phrases.json
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 review-topic-phrase abiogenesis accepted --notes "curated from local corpus"
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 apply-topic-phrases topic-phrases.json
PYTHONPATH=src .venv/bin/python -m citegeist --db talkorigins.sqlite3 enrich-talkorigins talkorigins-out/talkorigins_manifest.json --limit 20
PYTHONPATH=src .venv/bin/python -m citegeist --db talkorigins-copy.sqlite3 enrich-talkorigins talkorigins-out/talkorigins_manifest.json --limit 5 --apply --allow-unsafe-search-matches
PYTHONPATH=src .venv/bin/python -m citegeist --db talkorigins.sqlite3 review-talkorigins talkorigins-out/talkorigins_manifest.json --output talkorigins-review.json
PYTHONPATH=src .venv/bin/python -m citegeist --db talkorigins.sqlite3 apply-talkorigins-corrections talkorigins-out/talkorigins_manifest.json talkorigins-corrections.json
```
The `validate-talkorigins` report summarizes parse coverage and flags suspicious entry-type / venue combinations for manual cleanup.
It also reports duplicate clusters across topic seed files so you can gauge how much deduplication pressure to expect before ingestion.
Use `duplicates-talkorigins` when you want to inspect specific clusters, filter by text, restrict the audit to one topic slug, or preview only weak canonicalization outcomes before importing.
Use `suggest-talkorigins-phrases` to derive candidate stored expansion phrases from the existing TalkOrigins topic corpus itself. The output is deterministic JSON keyed by topic slug, with a suggested phrase plus the extracted keywords that drove it. This is a useful first pass before setting topic phrases in the database or editing generated batch jobs.
Use `stage-topic-phrases` to load those suggestions into the database as review items. Staging stores the candidate in `suggested_phrase` and marks the topic `pending` without changing the active `expansion_phrase`.
Use `review-topic-phrase` to accept or reject one staged suggestion in place. Accepting a suggestion copies it into `expansion_phrase`; rejecting it preserves the review state without changing the live phrase.
Use `apply-topic-phrases` when you want a direct patch path instead of the staged review flow. It accepts either the raw suggestion list or an object with a `topics` list, and will apply `suggested_phrase` or `phrase` to matching topic slugs immediately.
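A sketch of building such a patch file; the top-level `topics` shape follows the description above, but the `topic_slug` field name is an assumption mirroring other commands rather than a confirmed schema:
```python
import json

payload = {
    "topics": [
        # `topic_slug` is an assumed field name; `suggested_phrase` and `phrase`
        # are the two value keys the command is documented to apply.
        {"topic_slug": "abiogenesis", "suggested_phrase": "abiogenesis origin chemistry prebiotic"},
        {"topic_slug": "human-evolution", "phrase": "hominid fossil cranial morphology"},
    ]
}

with open("topic-phrases.json", "w", encoding="utf-8") as handle:
    json.dump(payload, handle, indent=2, sort_keys=True)
```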
Use `enrich-talkorigins` when you want to target those weak canonical entries for resolver-based metadata upgrades before retrying graph expansion on imported topic slices.
Use `review-talkorigins` when you want one JSON review artifact that combines weak canonical clusters with dry-run enrichment outcomes for manual cleanup.
Use `expand-topic` when you already have both a topic phrase and a curated topic seed set in the database: it expands outward from the topic's existing entries, then only assigns discovered works back to that topic if they clear a topic-relevance threshold. Write-enabled assignment is stricter than preview ranking: a candidate must clear the score threshold and show a non-generic title anchor to the topic phrase, so broad methods papers do not get attached just because their abstracts or related terms overlap. On large noisy topics, prefer `--seed-key` to restrict the run to just the trusted seed entries you want to expand from, and use `--preview` first to inspect discovered candidates and relevance scores before writing anything.
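The same flow is reachable at the library level through `TopicExpander` (see `src/citegeist/expand.py` later in this diff). A preview-mode sketch with illustrative database path, slug, and seed key:
```python
from citegeist import BibliographyStore
from citegeist.expand import TopicExpander

store = BibliographyStore("talkorigins.sqlite3")
expander = TopicExpander()

# preview_only=True scores candidates without writing entries, relations,
# or topic assignments.
results = expander.expand_topic(
    store,
    "abiogenesis",
    topic_phrase="abiogenesis origin chemistry",
    source="openalex",
    relation_type="cites",
    seed_keys=["seed2024"],
    min_relevance=0.3,
    preview_only=True,
)
for result in results:
    print(result.discovered_citation_key, round(result.relevance_score, 2), result.meets_relevance_threshold)

store.close()
```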
Use `set-topic-phrase` to store a curated expansion phrase on the topic itself. When a stored phrase exists, `expand-topic` will use it automatically if you do not pass `--topic-phrase`. Batch bootstrap jobs can also set `topic_slug`, `topic_name`, and `topic_phrase` so curated topic metadata is created as part of the run.
Use `topics --phrase-review-status pending` when you want to audit only topics whose staged phrase suggestions still need review.
`--allow-unsafe-search-matches` exists only for bounded experiments on copied databases when you explicitly want to relax trust to exercise downstream expansion behavior.
Correction files are simple JSON:
```json
{
"corrections": [
{
"key": "smith jane|1999|weak duplicate",
"entry_type": "article",
"review_status": "reviewed",
"fields": {
"journal": "Journal of Better Metadata",
"doi": "10.1000/weak",
"note": null
}
}
]
}
```
`fields` values overwrite the canonical entry for that duplicate-cluster key. Set a field to `null` to remove it.
To import the reconstructed corpus into SQLite while collapsing duplicate works across topics into canonical entries:
```bash
PYTHONPATH=src .venv/bin/python -m citegeist --db talkorigins.sqlite3 ingest-talkorigins talkorigins-out/talkorigins_manifest.json
```
That import preserves many-to-many topic membership through the `topics` and `entry_topics` tables.
After import, use `topics`, `topic-entries`, `search --topic`, and `export-topic` to inspect or export topic slices from the consolidated database.
Live-source workflow:
```bash
cd citegeist
export CITEGEIST_SOURCE_CACHE=.cache/citegeist
export CITEGEIST_LIVE_TESTS=1
PYTHONPATH=src .venv/bin/python -m pytest -m live -q
PYTHONPATH=src .venv/bin/python scripts/live_smoke.py
```
By default, live tests are skipped. They only run when `CITEGEIST_LIVE_TESTS=1` is set.
Convenience targets:
```bash
make test
make test-live
make live-smoke
```
## Near-Term Priorities
- source adapters beyond OAI-PMH for additional non-DOI scholarly ecosystems.
See [ROADMAP.md](./ROADMAP.md) for the prioritized phase plan and rationale.
## Naming
The name is intended to be short, distinct, and memorable:
- `cite` for citation work;
- `geist` for the organizing intelligence around the literature.

187
ROADMAP.md Normal file

@@ -0,0 +1,187 @@
# Roadmap
This roadmap prioritizes a usable local research workflow over breadth of integrations.
The first objective is not to support every metadata source. The first objective is to make one end-to-end path work reliably:
1. ingest draft references,
2. normalize and store them,
3. enrich them,
4. traverse citation links,
5. export reviewed BibTeX.
## Prioritization Principles
- prioritize steps that make the system usable by a single researcher on a local machine;
- prioritize deterministic infrastructure before network integrations;
- keep every stage inspectable and auditable;
- treat verification and provenance as core features, not cleanup work;
- defer heavy semantic infrastructure until the local corpus model is stable.
## Current Baseline
Completed:
- lightweight BibTeX parsing;
- SQLite storage for entries, creators, identifiers, and relations;
- local text search using SQLite FTS5 when available;
- tests for ingest, relation storage, and search.
## Phase 1: Core Ingestion And Export
Priority: P0
Goal:
Make `citegeist` useful as a local BibTeX workbench even before online enrichment is added.
Tasks:
- add BibTeX export from the normalized database back into stable, readable BibTeX;
- add a small CLI for `ingest`, `show`, `search`, and `export`;
- store field provenance metadata alongside imported and edited fields;
- add schema support for entry status such as `draft`, `enriched`, `reviewed`, and `exported`;
- add fixture-driven tests for round-tripping BibTeX through ingest and export.
Why this comes first:
- without export, the project is not yet useful in a LaTeX workflow;
- without a CLI, the package is a library demo rather than a tool;
- without provenance and state, later enrichment work becomes hard to audit.
Exit criteria:
- a user can ingest a `.bib` file, inspect entries, search locally, and export a reviewed `.bib`;
- round-trip tests show no unexpected field loss for supported entry types.
## Phase 2: Reference Extraction
Priority: P0
Goal:
Turn raw reference text into draft entries that can enter the main pipeline.
Tasks:
- add parsers for bibliography-section lines and plain-text reference lists;
- define a draft-entry schema for incomplete references with confidence markers;
- support ingestion of OCR- or PDF-derived plaintext bibliography sections;
- add normalization for author names, years, title casing, and page ranges;
- build gold-test fixtures from real, messy reference examples.
Why this is next:
- this addresses the project's first unique bottleneck: getting rough references into structured form;
- enrichment is much more effective once draft references are normalized.
Exit criteria:
- a user can pass a plaintext bibliography section and receive draft BibTeX entries with unresolved fields clearly marked;
- tests cover common article, book, chapter, and proceedings references.
## Phase 3: Metadata Enrichment
Priority: P1
Goal:
Resolve draft or partial entries against external scholarly sources and merge improved metadata safely.
Tasks:
- define a resolver interface with deterministic merge rules;
- implement first-party resolvers for DOI/Crossref, DBLP, and arXiv;
- add identifier-first resolution, then title/author/year fallback search;
- store merge provenance per field and resolution attempt logs;
- flag conflicts rather than silently overwriting disputed values.
Why this is P1 rather than the first phase:
- enrichment quality depends on the ingestion and provenance model being correct first;
- it is easier to test deterministic merge behavior once local workflows already exist.
Exit criteria:
- an incomplete entry can be enriched from at least one authoritative source;
- conflicting fields remain visible for review instead of being lost.
## Phase 4: Citation Graph Expansion
Priority: P1
Goal:
Use citation edges as a discovery engine rather than just metadata storage.
Tasks:
- support explicit `cites` and `cited_by` edge ingestion with source provenance;
- add graph expansion commands starting from one or more seed entries;
- track edge discovery source, timestamp, and confidence;
- add filters for depth, source type, year range, and reviewed status;
- expose unresolved nodes so the user can decide what to enrich next.
Why this matters:
- this is central to literature discovery rather than mere bibliography cleanup;
- it turns the database into a research navigation tool.
Exit criteria:
- starting from one or more seed entries, a user can expand outward through citation edges and persist newly discovered nodes;
- graph traversal results can be exported as BibTeX candidates for review.
## Phase 5: Search And Ranking
Priority: P2
Goal:
Improve discovery quality inside the local corpus.
Tasks:
- refine FTS ranking across title, abstract, keywords, and fulltext;
- add saved search queries and result filters;
- add optional embedding-backed semantic search behind a pluggable interface;
- support hybrid ranking that combines lexical matching, identifiers, and citation proximity;
- add benchmarking fixtures for retrieval quality on a few research topics.
Why this is later:
- FTS is already enough to support early workflows;
- embedding infrastructure is expensive and should wait until the corpus schema stabilizes.
Exit criteria:
- local search is useful on realistic corpora without requiring external services;
- semantic indexing is optional and does not displace the simpler local search path.
## Phase 6: Corpus Acquisition Pipelines
Priority: P2
Goal:
Broaden source acquisition without mixing that complexity into the core model.
Tasks:
- add source adapters for open-access theses and dissertation repositories;
- add support for harvesting publisher citation pages and preprint metadata pages;
- define per-source import provenance and rate-limit behavior;
- separate source-specific scraping logic from normalized entry storage;
- add regression fixtures for representative public sources.
Why this is later:
- acquisition breadth is useful, but only after the core ingest/enrich/review loop is solid;
- source adapters are brittle and should sit on top of a stable model.
Exit criteria:
- new public corpora can be imported through adapters without changing the storage core;
- imported entries retain their source provenance and can be reviewed like any other entry.
## Suggested Next Three Tasks
1. Add a CLI module with `ingest`, `search`, `show`, and `export`.
2. Implement BibTeX export from the normalized store.
3. Add provenance tables and entry review status fields.
These three tasks complete the first usable local workflow and should be treated as the immediate sprint.

20
pyproject.toml Normal file

@@ -0,0 +1,20 @@
[build-system]
requires = ["setuptools>=68"]
build-backend = "setuptools.build_meta"

[project]
name = "citegeist"
version = "0.1.0"
description = "BibTeX-native tooling for bibliography augmentation, citation graphs, and search"
requires-python = ">=3.10"
dependencies = ["pybtex==0.25.1"]

[project.scripts]
citegeist = "citegeist.cli:main"

[tool.pytest.ini_options]
pythonpath = ["src"]
testpaths = ["tests"]
markers = [
    "live: tests that call live external scholarly APIs and are skipped unless explicitly enabled",
]

58
scripts/live_smoke.py Normal file

@@ -0,0 +1,58 @@
from __future__ import annotations

import argparse
import json
import os

from citegeist import MetadataResolver, SourceClient


def build_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(description="Run live smoke checks against scholarly metadata sources")
    parser.add_argument(
        "--cache-dir",
        default=os.environ.get("CITEGEIST_SOURCE_CACHE", ".cache/citegeist"),
        help="Directory for cached live-source responses",
    )
    parser.add_argument(
        "--fixtures-dir",
        default=os.environ.get("CITEGEIST_SOURCE_FIXTURES"),
        help="Optional fixture directory to read before live network calls",
    )
    return parser


def main() -> int:
    args = build_parser().parse_args()
    client = SourceClient(cache_dir=args.cache_dir, fixtures_dir=args.fixtures_dir)
    resolver = MetadataResolver(source_client=client)
    checks = {
        "crossref_doi": resolver.resolve_doi("10.1038/nphys1170"),
        "arxiv_id": resolver.resolve_arxiv("1706.03762"),
        "openalex_search": resolver.search_openalex_best_match(
            title="Attention Is All You Need",
            author_text="Ashish Vaswani",
            year="2017",
        ),
    }
    payload = {}
    for name, resolution in checks.items():
        payload[name] = None
        if resolution is not None:
            payload[name] = {
                "source_label": resolution.source_label,
                "title": resolution.entry.fields.get("title"),
                "year": resolution.entry.fields.get("year"),
                "doi": resolution.entry.fields.get("doi"),
                "openalex": resolution.entry.fields.get("openalex"),
                "arxiv": resolution.entry.fields.get("arxiv"),
            }
    print(json.dumps(payload, indent=2, sort_keys=True))
    return 0


if __name__ == "__main__":
    raise SystemExit(main())

52
src/citegeist/__init__.py Normal file

@@ -0,0 +1,52 @@
from .batch import BatchBootstrapRunner, BatchJobResult, load_batch_jobs
from .bibtex import BibEntry, parse_bibtex
from .bootstrap import BootstrapResult, Bootstrapper
from .expand import CrossrefExpander, OpenAlexExpander
from .extract import extract_references
from .harvest import OaiMetadataFormat, OaiPmhHarvester, OaiSet
from .resolve import MetadataResolver, merge_entries, merge_entries_with_conflicts
from .sources import SourceClient
from .storage import BibliographyStore
from .talkorigins import (
    TalkOriginsBatchExport,
    TalkOriginsDuplicateCluster,
    TalkOriginsEnrichmentResult,
    TalkOriginsIngestReport,
    TalkOriginsReviewExport,
    TalkOriginsScraper,
    TalkOriginsSeedSet,
    TalkOriginsTopicPhraseSuggestion,
    TalkOriginsTopic,
    TalkOriginsValidationReport,
)

__all__ = [
    "BibEntry",
    "BatchBootstrapRunner",
    "BatchJobResult",
    "BibliographyStore",
    "BootstrapResult",
    "Bootstrapper",
    "CrossrefExpander",
    "MetadataResolver",
    "OpenAlexExpander",
    "OaiPmhHarvester",
    "OaiMetadataFormat",
    "OaiSet",
    "SourceClient",
    "TalkOriginsBatchExport",
    "TalkOriginsDuplicateCluster",
    "TalkOriginsEnrichmentResult",
    "TalkOriginsIngestReport",
    "TalkOriginsReviewExport",
    "TalkOriginsScraper",
    "TalkOriginsSeedSet",
    "TalkOriginsTopicPhraseSuggestion",
    "TalkOriginsTopic",
    "TalkOriginsValidationReport",
    "extract_references",
    "load_batch_jobs",
    "merge_entries",
    "merge_entries_with_conflicts",
    "parse_bibtex",
]

4
src/citegeist/__main__.py Normal file

@@ -0,0 +1,4 @@
from .cli import main
raise SystemExit(main())

78
src/citegeist/batch.py Normal file

@@ -0,0 +1,78 @@
from __future__ import annotations

import json
from dataclasses import dataclass
from pathlib import Path

from .bootstrap import BootstrapResult, Bootstrapper
from .storage import BibliographyStore


@dataclass(slots=True)
class BatchJobResult:
    job_name: str
    result_count: int
    results: list[BootstrapResult]


def load_batch_jobs(path: str | Path) -> list[dict]:
    path = Path(path)
    payload = json.loads(path.read_text(encoding="utf-8"))
    if isinstance(payload, dict):
        jobs = payload.get("jobs", [])
    else:
        jobs = payload
    if not isinstance(jobs, list):
        raise ValueError("Batch JSON must be a list of jobs or an object with a 'jobs' list")
    normalized_jobs: list[dict] = []
    for job in jobs:
        if not isinstance(job, dict):
            raise ValueError("Each batch job must be an object")
        normalized = dict(job)
        seed_bib = normalized.get("seed_bib")
        if isinstance(seed_bib, str) and seed_bib:
            seed_path = Path(seed_bib)
            if not seed_path.is_absolute():
                normalized["seed_bib"] = str((path.parent / seed_path).resolve())
        normalized_jobs.append(normalized)
    return normalized_jobs


class BatchBootstrapRunner:
    def __init__(self, bootstrapper: Bootstrapper | None = None) -> None:
        self.bootstrapper = bootstrapper or Bootstrapper()

    def run(self, store: BibliographyStore, jobs: list[dict]) -> list[BatchJobResult]:
        results: list[BatchJobResult] = []
        for index, job in enumerate(jobs, start=1):
            seed_bib = job.get("seed_bib")
            topic = job.get("topic")
            topic_limit = int(job.get("topic_limit", 5))
            topic_commit_limit = job.get("topic_commit_limit")
            expand = bool(job.get("expand", True))
            review_status = str(job.get("status", "draft"))
            preview = bool(job.get("preview", False))
            name = str(job.get("name") or f"job_{index}")
            topic_slug = job.get("topic_slug")
            topic_name = job.get("topic_name")
            topic_phrase = job.get("topic_phrase")
            seed_bibtex = None
            if seed_bib:
                seed_bibtex = Path(seed_bib).read_text(encoding="utf-8")
            job_results = self.bootstrapper.bootstrap(
                store,
                seed_bibtex=seed_bibtex,
                topic=topic,
                topic_limit=topic_limit,
                topic_commit_limit=int(topic_commit_limit) if topic_commit_limit is not None else None,
                expand=expand,
                review_status=review_status,
                preview_only=preview,
                topic_slug=str(topic_slug) if topic_slug else None,
                topic_name=str(topic_name) if topic_name else None,
                topic_phrase=str(topic_phrase) if topic_phrase else None,
            )
            results.append(BatchJobResult(name, len(job_results), job_results))
        return results

116
src/citegeist/bibtex.py Normal file

@@ -0,0 +1,116 @@
from __future__ import annotations

from dataclasses import dataclass
from io import StringIO

try:
    from pybtex.database import BibliographyData, Entry, Person, parse_string
    from pybtex.bibtex.exceptions import BibTeXError
    from pybtex.database.output.bibtex import Writer
except ImportError:  # pragma: no cover - exercised only outside the configured venv
    BibTeXError = None
    BibliographyData = Entry = Person = Writer = None
    parse_string = None


@dataclass(slots=True)
class BibEntry:
    entry_type: str
    citation_key: str
    fields: dict[str, str]


def parse_bibtex(text: str) -> list[BibEntry]:
    _require_pybtex()
    bibliography = parse_string(text, bib_format="bibtex")
    entries: list[BibEntry] = []
    for citation_key, entry in bibliography.entries.items():
        fields = dict(entry.fields.items())
        for role, persons in entry.persons.items():
            fields[role] = " and ".join(str(person) for person in persons)
        entries.append(
            BibEntry(
                entry_type=entry.type,
                citation_key=citation_key,
                fields=fields,
            )
        )
    return entries


def render_bibtex(entries: list[BibEntry]) -> str:
    _require_pybtex()
    bibliography_entries = {}
    for entry in entries:
        fields = {
            key: _sanitize_bibtex_value(value)
            for key, value in entry.fields.items()
            if key not in {"author", "editor"}
        }
        persons = {}
        for role in ("author", "editor"):
            raw_names = entry.fields.get(role)
            if raw_names:
                persons[role] = [Person(name.strip()) for name in raw_names.split(" and ") if name.strip()]
        bibliography_entries[entry.citation_key] = Entry(entry.entry_type, fields=fields, persons=persons)
    buffer = StringIO()
    try:
        Writer().write_stream(BibliographyData(entries=bibliography_entries), buffer)
    except BibTeXError:
        # Fallback: if pybtex still rejects the sanitized values, flatten all
        # braces to parentheses and write a conservative rendering instead.
        conservative_entries = {}
        for entry in entries:
            fields = {
                key: _flatten_bibtex_braces(value)
                for key, value in entry.fields.items()
                if key not in {"author", "editor"}
            }
            persons = {}
            for role in ("author", "editor"):
                raw_names = entry.fields.get(role)
                if raw_names:
                    persons[role] = [Person(name.strip()) for name in raw_names.split(" and ") if name.strip()]
            conservative_entries[entry.citation_key] = Entry(entry.entry_type, fields=fields, persons=persons)
        buffer = StringIO()
        Writer().write_stream(BibliographyData(entries=conservative_entries), buffer)
    return buffer.getvalue().strip()


def _require_pybtex() -> None:
    if parse_string is None or Writer is None:
        raise RuntimeError(
            "pybtex is required. Use the repo-local virtual environment under .venv/ for citegeist commands."
        )


def _sanitize_bibtex_value(value: str) -> str:
    # Convert unmatched braces to parentheses so the pybtex writer does not
    # choke on unbalanced field values; matched brace pairs are preserved.
    parts = list(value)
    open_stack: list[int] = []
    for index, char in enumerate(parts):
        if char == "{":
            open_stack.append(index)
        elif char == "}":
            if open_stack:
                open_stack.pop()
            else:
                parts[index] = ")"
    for index in open_stack:
        parts[index] = "("
    return "".join(parts)


def _flatten_bibtex_braces(value: str) -> str:
    return value.replace("{", "(").replace("}", ")")

145
src/citegeist/bootstrap.py Normal file

@@ -0,0 +1,145 @@
from __future__ import annotations

import re
from dataclasses import dataclass

from .bibtex import BibEntry, parse_bibtex
from .expand import CrossrefExpander, OpenAlexExpander
from .resolve import MetadataResolver
from .storage import BibliographyStore


@dataclass(slots=True)
class BootstrapResult:
    citation_key: str
    origin: str
    created: bool
    score: float = 0.0


class Bootstrapper:
    def __init__(
        self,
        resolver: MetadataResolver | None = None,
        crossref_expander: CrossrefExpander | None = None,
        openalex_expander: OpenAlexExpander | None = None,
    ) -> None:
        self.resolver = resolver or MetadataResolver()
        self.crossref_expander = crossref_expander or CrossrefExpander(self.resolver)
        self.openalex_expander = openalex_expander or OpenAlexExpander(self.resolver)

    def bootstrap(
        self,
        store: BibliographyStore,
        seed_bibtex: str | None = None,
        topic: str | None = None,
        topic_limit: int = 5,
        topic_commit_limit: int | None = None,
        expand: bool = True,
        review_status: str = "draft",
        preview_only: bool = False,
        topic_slug: str | None = None,
        topic_name: str | None = None,
        topic_phrase: str | None = None,
    ) -> list[BootstrapResult]:
        results: list[BootstrapResult] = []
        seed_keys: list[str] = []
        if seed_bibtex:
            for entry in parse_bibtex(seed_bibtex):
                created = store.get_entry(entry.citation_key) is None
                if not preview_only:
                    store.upsert_entry(
                        entry,
                        raw_bibtex=None,
                        source_type="bootstrap",
                        source_label="seed_bibtex",
                        review_status=review_status,
                    )
                seed_keys.append(entry.citation_key)
                results.append(BootstrapResult(entry.citation_key, "seed_bibtex", created))
        if topic:
            if not preview_only and (topic_slug or topic_name or topic_phrase):
                store.ensure_topic(
                    slug=topic_slug or _slugify(topic),
                    name=topic_name or topic,
                    source_type="bootstrap",
                    expansion_phrase=topic_phrase or topic,
                )
            ranked_candidates = self._topic_candidates(topic, seed_keys, topic_limit)
            if topic_commit_limit is not None:
                ranked_candidates = ranked_candidates[:topic_commit_limit]
            for entry, score in ranked_candidates:
                created = store.get_entry(entry.citation_key) is None
                if not preview_only:
                    store.upsert_entry(
                        entry,
                        raw_bibtex=None,
                        source_type="bootstrap",
                        source_label=f"topic:{topic}",
                        review_status=review_status,
                    )
                seed_keys.append(entry.citation_key)
                results.append(BootstrapResult(entry.citation_key, "topic", created, score=score))
        if expand and not preview_only:
            expanded_keys = list(dict.fromkeys(seed_keys))
            for citation_key in expanded_keys:
                for item in self.crossref_expander.expand_entry_references(store, citation_key):
                    results.append(BootstrapResult(item.discovered_citation_key, "crossref_expand", item.created_entry))
                for item in self.openalex_expander.expand_entry(store, citation_key, relation_type="cites", limit=topic_limit):
                    results.append(BootstrapResult(item.discovered_citation_key, "openalex_expand", item.created_entry))
        store.connection.commit()
        return results

    def _topic_candidates(self, topic: str, seed_keys: list[str], limit: int) -> list[tuple[BibEntry, float]]:
        scored: dict[str, tuple[BibEntry, float]] = {}
        for source_name, base_score, entries in (
            ("openalex", 3.0, self.resolver.search_openalex(topic, limit=limit)),
            ("crossref", 2.0, self.resolver.search_crossref(topic, limit=limit)),
            ("datacite", 1.5, self.resolver.search_datacite(topic, limit=limit)),
        ):
            for entry in entries:
                score = base_score + _topic_relevance_score(entry, topic) + _seed_overlap_score(entry, seed_keys)
                existing = scored.get(entry.citation_key)
                if existing is None or score > existing[1]:
                    scored[entry.citation_key] = (entry, score)
        ranked = sorted(
            scored.values(),
            key=lambda item: (-item[1], item[0].citation_key),
        )
        return ranked[:limit]


def _topic_relevance_score(entry: BibEntry, topic: str) -> float:
    topic_terms = _tokenize(topic)
    title_terms = _tokenize(entry.fields.get("title", ""))
    abstract_terms = _tokenize(entry.fields.get("abstract", ""))
    overlap = len(topic_terms & (title_terms | abstract_terms))
    return float(overlap)


def _seed_overlap_score(entry: BibEntry, seed_keys: list[str]) -> float:
    if not seed_keys:
        return 0.0
    title_terms = _tokenize(entry.fields.get("title", ""))
    score = 0.0
    for seed_key in seed_keys:
        seed_terms = _tokenize(seed_key)
        if seed_terms & title_terms:
            score += 0.25
    return score


def _tokenize(value: str) -> set[str]:
    return {token for token in re.split(r"\W+", value.lower()) if token}


def _slugify(value: str) -> str:
    slug = re.sub(r"[^a-z0-9]+", "-", value.lower()).strip("-")
    return slug or "topic"

1199
src/citegeist/cli.py Normal file

File diff suppressed because it is too large

600
src/citegeist/expand.py Normal file

@@ -0,0 +1,600 @@
from __future__ import annotations
import re
from dataclasses import dataclass
from urllib.parse import quote, urlencode
from .bibtex import BibEntry, parse_bibtex
from .resolve import MetadataResolver
from .storage import BibliographyStore
@dataclass(slots=True)
class ExpansionResult:
source_citation_key: str
discovered_citation_key: str
created_entry: bool
relation_type: str
source_label: str
@dataclass(slots=True)
class TopicExpansionResult:
topic_slug: str
source_citation_key: str
discovered_citation_key: str
discovered_title: str
created_entry: bool
relation_type: str
source_label: str
relevance_score: float
meets_relevance_threshold: bool
assigned_to_topic: bool
class CrossrefExpander:
def __init__(self, resolver: MetadataResolver | None = None) -> None:
self.resolver = resolver or MetadataResolver()
def expand_entry_references(
self,
store: BibliographyStore,
citation_key: str,
) -> list[ExpansionResult]:
entry = store.get_entry(citation_key)
if entry is None:
return []
doi = entry.get("doi")
if not doi:
return []
payload = self.resolver.source_client.get_json(
f"https://api.crossref.org/works/{doi}?mailto=welsberr@gmail.com"
)
references = payload.get("message", {}).get("reference", [])
results: list[ExpansionResult] = []
for index, reference in enumerate(references, start=1):
discovered = _crossref_reference_to_entry(reference, citation_key, index)
created = False
if store.get_entry(discovered.citation_key) is None:
store.upsert_entry(
discovered,
raw_bibtex=None,
source_type="graph_expand",
source_label=f"crossref:references:{doi}",
review_status="draft",
)
store.connection.commit()
created = True
store.add_relation(
citation_key,
discovered.citation_key,
"cites",
source_type="graph_expand",
source_label=f"crossref:references:{doi}",
confidence=1.0 if reference.get("DOI") else 0.6,
)
results.append(
ExpansionResult(
source_citation_key=citation_key,
discovered_citation_key=discovered.citation_key,
created_entry=created,
relation_type="cites",
source_label=f"crossref:references:{doi}",
)
)
return results
class OpenAlexExpander:
def __init__(self, resolver: MetadataResolver | None = None) -> None:
self.resolver = resolver or MetadataResolver()
def expand_entry(
self,
store: BibliographyStore,
citation_key: str,
relation_type: str = "cites",
limit: int = 25,
) -> list[ExpansionResult]:
entry = store.get_entry(citation_key)
if entry is None:
return []
openalex_id = entry.get("openalex") or self._lookup_openalex_id(entry)
if not openalex_id:
return []
if not entry.get("openalex"):
bibtex = store.get_entry_bibtex(citation_key)
if bibtex:
seed_entry = parse_bibtex(bibtex)[0]
seed_entry.fields["openalex"] = openalex_id
store.replace_entry(
citation_key,
seed_entry,
source_type="resolver",
source_label=f"openalex:id:{openalex_id}",
review_status=str(entry.get("review_status") or "draft"),
)
filter_name = "cited_by" if relation_type == "cites" else "cites"
query = urlencode({"filter": f"{filter_name}:{openalex_id}", "per-page": limit})
payload = self.resolver.source_client.get_json(f"https://api.openalex.org/works?{query}")
works = payload.get("results", [])
results: list[ExpansionResult] = []
for work in works:
discovered = _openalex_work_to_entry(work)
created = False
if store.get_entry(discovered.citation_key) is None:
store.upsert_entry(
discovered,
raw_bibtex=None,
source_type="graph_expand",
source_label=f"openalex:{relation_type}:{openalex_id}",
review_status="draft",
)
store.connection.commit()
created = True
if relation_type == "cites":
source_key = citation_key
target_key = discovered.citation_key
else:
source_key = discovered.citation_key
target_key = citation_key
store.add_relation(
source_key,
target_key,
"cites",
source_type="graph_expand",
source_label=f"openalex:{relation_type}:{openalex_id}",
confidence=0.9,
)
results.append(
ExpansionResult(
source_citation_key=source_key,
discovered_citation_key=discovered.citation_key,
created_entry=created,
relation_type=relation_type,
source_label=f"openalex:{relation_type}:{openalex_id}",
)
)
return results
def _lookup_openalex_id(self, entry: dict[str, object]) -> str | None:
doi = entry.get("doi")
if not doi:
return None
query = urlencode({"filter": f"doi:https://doi.org/{doi}"})
payload = self.resolver.source_client.get_json(f"https://api.openalex.org/works?{query}")
results = payload.get("results", [])
if not results:
return None
return _normalize_openalex_id(results[0].get("id", ""))
class TopicExpander:
def __init__(
self,
crossref_expander: CrossrefExpander | None = None,
openalex_expander: OpenAlexExpander | None = None,
) -> None:
self.crossref_expander = crossref_expander or CrossrefExpander()
self.openalex_expander = openalex_expander or OpenAlexExpander()
def expand_topic(
self,
store: BibliographyStore,
topic_slug: str,
topic_phrase: str | None = None,
source: str = "openalex",
relation_type: str = "cites",
seed_limit: int = 25,
per_seed_limit: int = 25,
min_relevance: float = 0.2,
seed_keys: list[str] | None = None,
preview_only: bool = False,
) -> list[TopicExpansionResult]:
topic = store.get_topic(topic_slug)
if topic is None:
return []
phrase = (topic_phrase or str(topic.get("name") or topic_slug)).strip()
seeds = store.list_topic_entries(topic_slug, limit=seed_limit)
if seed_keys:
allowed = set(seed_keys)
seeds = [seed for seed in seeds if str(seed["citation_key"]) in allowed]
results: list[TopicExpansionResult] = []
for seed in seeds:
seed_key = str(seed["citation_key"])
if preview_only:
discovered_rows = self._preview_discoveries(
store,
seed_key,
source=source,
relation_type=relation_type,
limit=per_seed_limit,
)
else:
discovered_rows = self._materialized_discoveries(
store,
seed_key,
source=source,
relation_type=relation_type,
limit=per_seed_limit,
)
for row, target_entry in discovered_rows:
score = _topic_relevance_score(phrase, target_entry)
meets_threshold = _meets_topic_assignment_threshold(
phrase,
target_entry,
min_relevance=min_relevance,
relevance_score=score,
)
assigned = False
if not preview_only and meets_threshold and target_entry is not None:
assigned = store.add_entry_topic(
row.discovered_citation_key,
topic_slug=topic_slug,
topic_name=str(topic.get("name") or topic_slug),
source_type="topic_expand",
source_url=str(topic.get("source_url") or ""),
source_label=f"{source}:{relation_type}:{seed_key}",
confidence=score,
)
results.append(
TopicExpansionResult(
topic_slug=topic_slug,
source_citation_key=row.source_citation_key,
discovered_citation_key=row.discovered_citation_key,
discovered_title=str(target_entry.get("title") or ""),
created_entry=row.created_entry,
relation_type=row.relation_type,
source_label=row.source_label,
relevance_score=score,
meets_relevance_threshold=meets_threshold,
assigned_to_topic=assigned,
)
)
store.connection.commit()
return results
def _materialized_discoveries(
self,
store: BibliographyStore,
citation_key: str,
source: str,
relation_type: str,
limit: int,
) -> list[tuple[ExpansionResult, dict[str, object] | None]]:
if source == "crossref":
expansion_rows = self.crossref_expander.expand_entry_references(store, citation_key)
else:
expansion_rows = self.openalex_expander.expand_entry(
store,
citation_key,
relation_type=relation_type,
limit=limit,
)
return [(row, store.get_entry(row.discovered_citation_key)) for row in expansion_rows]
def _preview_discoveries(
self,
store: BibliographyStore,
citation_key: str,
source: str,
relation_type: str,
limit: int,
) -> list[tuple[ExpansionResult, dict[str, object]]]:
if source == "crossref":
return self._preview_crossref_discoveries(store, citation_key, limit)
return self._preview_openalex_discoveries(store, citation_key, relation_type, limit)
def _preview_crossref_discoveries(
self,
store: BibliographyStore,
citation_key: str,
limit: int,
) -> list[tuple[ExpansionResult, dict[str, object]]]:
entry = store.get_entry(citation_key)
if entry is None or not entry.get("doi"):
return []
doi = str(entry["doi"])
payload = self.crossref_expander.resolver.source_client.get_json(
f"https://api.crossref.org/works/{doi}?mailto=welsberr@gmail.com"
)
references = payload.get("message", {}).get("reference", [])[:limit]
rows: list[tuple[ExpansionResult, dict[str, object]]] = []
for index, reference in enumerate(references, start=1):
discovered = _crossref_reference_to_entry(reference, citation_key, index)
rows.append(
(
ExpansionResult(
source_citation_key=citation_key,
discovered_citation_key=discovered.citation_key,
created_entry=store.get_entry(discovered.citation_key) is None,
relation_type="cites",
source_label=f"crossref:references:{doi}",
),
dict(discovered.fields),
)
)
return rows
def _preview_openalex_discoveries(
self,
store: BibliographyStore,
citation_key: str,
relation_type: str,
limit: int,
) -> list[tuple[ExpansionResult, dict[str, object]]]:
entry = store.get_entry(citation_key)
if entry is None:
return []
openalex_id = entry.get("openalex") or self.openalex_expander._lookup_openalex_id(entry)
if not openalex_id:
return []
filter_name = "cited_by" if relation_type == "cites" else "cites"
query = urlencode({"filter": f"{filter_name}:{openalex_id}", "per-page": limit})
payload = self.openalex_expander.resolver.source_client.get_json(f"https://api.openalex.org/works?{query}")
works = payload.get("results", [])
rows: list[tuple[ExpansionResult, dict[str, object]]] = []
for work in works:
discovered = _openalex_work_to_entry(work)
source_key = citation_key if relation_type == "cites" else discovered.citation_key
rows.append(
(
ExpansionResult(
source_citation_key=source_key,
discovered_citation_key=discovered.citation_key,
created_entry=store.get_entry(discovered.citation_key) is None,
relation_type=relation_type,
source_label=f"openalex:{relation_type}:{openalex_id}",
),
dict(discovered.fields),
)
)
return rows
def _crossref_reference_to_entry(reference: dict, source_citation_key: str, ordinal: int) -> BibEntry:
title = (
reference.get("article-title")
or reference.get("volume-title")
or reference.get("journal-title")
or reference.get("unstructured")
or f"Referenced work {ordinal}"
)
year = str(reference.get("year") or "")
author = reference.get("author") or ""
doi = reference.get("DOI") or ""
journal_title = reference.get("journal-title") or ""
fields: dict[str, str] = {
"title": _normalize_text(title),
"note": f"discovered_from = {{{source_citation_key}}}",
}
if year:
fields["year"] = year
if author:
fields["author"] = _normalize_text(author)
if doi:
fields["doi"] = doi
fields["url"] = f"https://doi.org/{doi}"
if journal_title:
fields["journal"] = _normalize_text(journal_title)
citation_key = _reference_citation_key(reference, title, year, ordinal)
entry_type = "article" if journal_title else "misc"
return BibEntry(entry_type=entry_type, citation_key=citation_key, fields=fields)
def _reference_citation_key(reference: dict, title: str, year: str, ordinal: int) -> str:
if doi := reference.get("DOI"):
suffix = re.sub(r"[^A-Za-z0-9]+", "", doi).lower()
return f"doi{suffix}"
author = reference.get("author") or "ref"
family = author.split(",")[0].split()[-1]
family = re.sub(r"[^A-Za-z0-9]+", "", family).lower() or "ref"
first_word = re.sub(r"[^A-Za-z0-9]+", "", title.split()[0]).lower() if title.split() else "untitled"
return f"{family}{year or 'nd'}{first_word}{ordinal}"
def _normalize_text(value: str) -> str:
return " ".join(value.split())
def _topic_relevance_score(topic_phrase: str, entry: dict[str, object] | None) -> float:
if entry is None:
return 0.0
topic_terms = _expanded_keyword_terms(topic_phrase)
if not topic_terms:
return 0.0
title_terms = _expanded_keyword_terms(str(entry.get("title") or ""))
abstract_terms = _expanded_keyword_terms(str(entry.get("abstract") or ""))
keyword_terms = _expanded_keyword_terms(str(entry.get("keywords") or ""))
venue_terms = _expanded_keyword_terms(" ".join(str(entry.get(field) or "") for field in ("journal", "booktitle")))
score = 0.0
score += 0.6 * _term_overlap_ratio(topic_terms, title_terms)
score += 0.25 * _term_overlap_ratio(topic_terms, abstract_terms)
score += 0.1 * _term_overlap_ratio(topic_terms, keyword_terms)
score += 0.05 * _term_overlap_ratio(topic_terms, venue_terms)
phrase = _normalize_text(topic_phrase.casefold())
title = _normalize_text(str(entry.get("title") or "").casefold())
if phrase and title and phrase in title:
score = max(score, 0.75)
return min(score, 1.0)
def _meets_topic_assignment_threshold(
topic_phrase: str,
entry: dict[str, object] | None,
min_relevance: float,
relevance_score: float | None = None,
) -> bool:
if entry is None:
return False
score = relevance_score if relevance_score is not None else _topic_relevance_score(topic_phrase, entry)
if score < min_relevance:
return False
title_anchor = _title_topic_anchor_ratio(topic_phrase, str(entry.get("title") or ""))
return title_anchor >= 0.2
def _keyword_terms(text: str) -> set[str]:
return {
_normalize_keyword(term)
for term in re.findall(r"[A-Za-z0-9]+", text.casefold())
if len(term) >= 4
}
def _expanded_keyword_terms(text: str) -> set[str]:
terms = _keyword_terms(text)
expanded = set(terms)
for term in terms:
expanded.update(_related_topic_terms(term))
return expanded
def _title_topic_anchor_ratio(topic_phrase: str, title: str) -> float:
normalized_phrase = _normalize_text(topic_phrase.casefold())
normalized_title = _normalize_text(title.casefold())
if normalized_phrase and normalized_title and normalized_phrase in normalized_title:
return 1.0
topic_terms = _core_topic_terms(topic_phrase)
title_terms = _keyword_terms(title)
if not topic_terms or not title_terms:
return 0.0
overlap = topic_terms & title_terms
if overlap:
return max(0.25, len(overlap) / len(topic_terms))
return 0.0
def _core_topic_terms(topic_phrase: str) -> set[str]:
generic_terms = {"evolution", "origin", "origins", "science", "study", "studies"}
return {term for term in _keyword_terms(topic_phrase) if term not in generic_terms}
def _term_overlap_ratio(topic_terms: set[str], candidate_terms: set[str]) -> float:
if not topic_terms or not candidate_terms:
return 0.0
return len(topic_terms & candidate_terms) / len(topic_terms)
def _normalize_keyword(term: str) -> str:
normalized = term.casefold()
for suffix in ("isms", "ists", "ation", "ment", "ings", "ness", "isms", "ism", "ist", "ing", "ers", "er", "ies", "ied", "ed", "es", "s"):
if len(normalized) > len(suffix) + 3 and normalized.endswith(suffix):
if suffix in {"ies", "ied"}:
return normalized[: -len(suffix)] + "y"
return normalized[: -len(suffix)]
return normalized
def _related_topic_terms(term: str) -> set[str]:
related_groups = (
{"human", "hominid", "hominin", "homo"},
{"chimpanzee", "chimp", "pan", "ape", "apes", "primate"},
{"primate", "primate", "ape", "apes", "hominid", "hominin"},
{"evolution", "evolutionary", "phylogeny", "phylogen", "ancestor", "ancestral"},
{"origin", "origins", "abiogenesis", "prebiotic"},
{"morphometry", "morphology", "cranial", "dental", "skeletal", "body"},
{"paleolithic", "paleoanthropology", "paleoanthropological", "pleistocene", "fossil"},
)
for group in related_groups:
if term in group:
return group - {term}
return set()
def _openalex_work_to_entry(work: dict) -> BibEntry:
title = _normalize_text(work.get("display_name", "") or "Untitled work")
year = str(work.get("publication_year") or "")
doi = _normalize_openalex_doi(work.get("doi"))
openalex_id = _normalize_openalex_id(work.get("id", ""))
authors = " and ".join(_openalex_author_name(item) for item in work.get("authorships", []))
source = ((work.get("primary_location") or {}).get("source") or {}).get("display_name", "")
work_type = work.get("type", "")
fields: dict[str, str] = {"title": title}
if year:
fields["year"] = year
if authors:
fields["author"] = authors
if doi:
fields["doi"] = doi
fields["url"] = f"https://doi.org/{doi}"
if openalex_id:
fields["openalex"] = openalex_id
if abstract := work.get("abstract_inverted_index"):
fields["abstract"] = _openalex_abstract_text(abstract)
if source:
if work_type == "article":
fields["journal"] = source
else:
fields["booktitle"] = source
citation_key = _openalex_citation_key(openalex_id, authors, year, title)
entry_type = _openalex_type_to_bibtype(work_type)
return BibEntry(entry_type=entry_type, citation_key=citation_key, fields=fields)
def _openalex_author_name(authorship: dict) -> str:
author = authorship.get("author") or {}
name = author.get("display_name", "")
return _normalize_text(name)
def _openalex_abstract_text(inverted_index: dict) -> str:
positions: dict[int, str] = {}
for word, indexes in inverted_index.items():
for index in indexes:
positions[int(index)] = word
return " ".join(word for _, word in sorted(positions.items()))
def _openalex_type_to_bibtype(work_type: str) -> str:
mapping = {
"article": "article",
"book": "book",
"book-chapter": "incollection",
"dissertation": "phdthesis",
"proceedings-article": "inproceedings",
}
return mapping.get(work_type, "misc")
def _openalex_citation_key(openalex_id: str, authors: str, year: str, title: str) -> str:
if openalex_id:
return f"openalex{re.sub(r'[^A-Za-z0-9]+', '', openalex_id).lower()}"
author = authors.split(" and ")[0] if authors else "ref"
family = re.sub(r"[^A-Za-z0-9]+", "", author.split()[-1]).lower() or "ref"
first_word = re.sub(r"[^A-Za-z0-9]+", "", title.split()[0]).lower() if title.split() else "untitled"
return f"{family}{year or 'nd'}{first_word}"
def _normalize_openalex_id(value: str) -> str:
if not value:
return ""
return value.rsplit("/", 1)[-1]
def _normalize_openalex_doi(value: str | None) -> str:
if not value:
return ""
if value.startswith("https://doi.org/"):
return value[len("https://doi.org/") :]
return value
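For orientation, a few hand-traced values from the term helpers above; these are illustrative walk-throughs of the definitions, not fixture output:

_normalize_keyword("origins")  # -> "origin": the trailing "s" is stripped
_keyword_terms("Human evolution and hominin origins")
# -> {"human", "evolution", "hominin", "origin"}; "and" is dropped (< 4 chars)
_related_topic_terms("human")  # -> {"hominid", "hominin", "homo"}
_title_topic_anchor_ratio("human evolution", "The evolution of human bipedalism")
# -> 1.0: "evolution" is discarded as a generic term, so "human" is the only
#    core topic term, and it appears among the stemmed title terms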

201
src/citegeist/extract.py Normal file
View File

@@ -0,0 +1,201 @@
from __future__ import annotations
import re
from .bibtex import BibEntry
YEAR_PATTERN = re.compile(r"\b(19|20)\d{2}\b")
YEAR_PAREN_PATTERN = re.compile(r"\((19|20)\d{2}\)")
REF_START_PATTERN = re.compile(r"^(?:\[\d+\]|\d+\.|\(\d+\))\s*")
def extract_references(text: str) -> list[BibEntry]:
entries: list[BibEntry] = []
for index, line in enumerate(_iter_reference_blocks(text), start=1):
parsed = _parse_reference_line(line, index)
if parsed is not None:
entries.append(parsed)
return entries
def render_extracted_bibtex(text: str) -> str:
from .bibtex import render_bibtex
return render_bibtex(extract_references(text))
def _iter_reference_blocks(text: str) -> list[str]:
lines: list[str] = []
current: list[str] = []
for raw_line in text.splitlines():
line = raw_line.strip()
if not line:
if current:
lines.append(" ".join(current))
current = []
continue
starts_new = bool(REF_START_PATTERN.match(line))
line = REF_START_PATTERN.sub("", line)
normalized = " ".join(line.split())
if len(normalized) < 20:
continue
if starts_new and current:
lines.append(" ".join(current))
current = [normalized]
else:
current.append(normalized)
if current:
lines.append(" ".join(current))
return lines
def _parse_reference_line(line: str, ordinal: int) -> BibEntry | None:
for parser in (_parse_apa_style_reference, _parse_publisher_style_reference, _parse_plain_year_reference):
parsed = parser(line, ordinal)
if parsed is not None:
return parsed
return None
def _parse_apa_style_reference(line: str, ordinal: int) -> BibEntry | None:
year_match = YEAR_PAREN_PATTERN.search(line)
if year_match is None:
return None
year = year_match.group(0).strip("()")
author_part = line[: year_match.start()].strip(" .")
remainder = line[year_match.end() :].strip(" .")
if not author_part or not remainder:
return None
segments = _segments_after_year(remainder)
if not segments:
return None
title = _clean_title(segments[0])
venue = segments[1] if len(segments) > 1 else ""
authors = _normalize_authors(author_part)
return _build_entry(line, ordinal, authors, year, title, venue)
def _parse_publisher_style_reference(line: str, ordinal: int) -> BibEntry | None:
year_match = YEAR_PATTERN.search(line)
if year_match is None:
return None
prefix = line[: year_match.start()].strip(" .,;")
if "." not in prefix:
return None
head, publisher = prefix.rsplit(".", 1)
if "." not in head:
return None
author_part, title = head.split(".", 1)
authors = _normalize_authors(author_part)
title = _clean_title(title)
publisher = publisher.strip(" .,;")
if not authors or not title or not publisher:
return None
citation_key = _make_citation_key(authors, year_match.group(0), title, ordinal)
return BibEntry(
entry_type="book",
citation_key=citation_key,
fields={
"author": authors,
"year": year_match.group(0),
"title": title,
"publisher": publisher,
"note": f"extracted_reference = {{true}}; raw_reference = {{{line}}}",
},
)
def _parse_plain_year_reference(line: str, ordinal: int) -> BibEntry | None:
year_match = YEAR_PATTERN.search(line)
if year_match is None:
return None
year = year_match.group(0)
author_part = line[: year_match.start()].strip(" .")
remainder = line[year_match.end() :].strip(" .")
if not author_part or not remainder:
return None
segments = _segments_after_year(remainder)
if not segments:
return None
title = _clean_title(segments[0])
venue = segments[1] if len(segments) > 1 else ""
authors = _normalize_authors(author_part)
return _build_entry(line, ordinal, authors, year, title, venue)
def _normalize_authors(author_part: str) -> str:
normalized = author_part.replace(" & ", " and ")
normalized = re.sub(r"\bet al\.?$", "and others", normalized)
normalized = re.sub(r"\s+and\s+", " and ", normalized)
normalized = re.sub(r"\s*,\s*", ", ", normalized)
return normalized.strip(" .")
def _segments_after_year(remainder: str) -> list[str]:
return [segment.strip(" .") for segment in remainder.split(". ") if segment.strip(" .")]
def _clean_title(title: str) -> str:
cleaned = title.strip(" .\"'")
cleaned = re.sub(r"\s+", " ", cleaned)
return cleaned
def _build_entry(
raw_line: str,
ordinal: int,
authors: str,
year: str,
title: str,
venue: str,
) -> BibEntry:
citation_key = _make_citation_key(authors, year, title, ordinal)
entry_type = _guess_entry_type(venue)
fields: dict[str, str] = {
"author": authors,
"year": year,
"title": title,
"note": f"extracted_reference = {{true}}; raw_reference = {{{raw_line}}}",
}
if venue:
if entry_type == "article":
fields["journal"] = venue
elif entry_type == "inproceedings":
fields["booktitle"] = venue
else:
fields["howpublished"] = venue
return BibEntry(entry_type=entry_type, citation_key=citation_key, fields=fields)
def _make_citation_key(authors: str, year: str, title: str, ordinal: int) -> str:
first_author = authors.split(" and ")[0]
family_name = first_author.split(",")[0] if "," in first_author else first_author.split()[-1]
family_name = re.sub(r"[^A-Za-z0-9]+", "", family_name).lower() or "ref"
first_word = re.sub(r"[^A-Za-z0-9]+", "", title.split()[0]).lower() if title.split() else "untitled"
if not first_word:
first_word = "untitled"
return f"{family_name}{year}{first_word}{ordinal}"
def _guess_entry_type(venue: str) -> str:
lowered = venue.lower()
if any(token in lowered for token in ("journal", "transactions", "review", "letters")):
return "article"
if any(token in lowered for token in ("proceedings", "conference", "workshop", "symposium")):
return "inproceedings"
if any(token in lowered for token in ("press", "publisher", "university")):
return "book"
return "misc"

317
src/citegeist/harvest.py Normal file
View File

@@ -0,0 +1,317 @@
from __future__ import annotations
from dataclasses import dataclass
from urllib.parse import urlencode
import xml.etree.ElementTree as ET
from .bibtex import BibEntry
from .sources import SourceClient
NS = {
"oai": "http://www.openarchives.org/OAI/2.0/",
"oai_dc": "http://www.openarchives.org/OAI/2.0/oai_dc/",
"dc": "http://purl.org/dc/elements/1.1/",
"mods": "http://www.loc.gov/mods/v3",
}
@dataclass(slots=True)
class HarvestResult:
base_url: str
identifier: str
entry: BibEntry
@dataclass(slots=True)
class OaiSet:
set_spec: str
set_name: str
set_description: str = ""
@dataclass(slots=True)
class OaiMetadataFormat:
metadata_prefix: str
schema: str
metadata_namespace: str
class OaiPmhHarvester:
def __init__(self, source_client: SourceClient | None = None) -> None:
self.source_client = source_client or SourceClient()
def identify(self, base_url: str) -> dict[str, str]:
root = self.source_client.get_xml(f"{base_url}?{urlencode({'verb': 'Identify'})}")
identify = root.find(".//oai:Identify", NS)
if identify is None:
return {}
payload: dict[str, str] = {}
for field_name in (
"repositoryName",
"baseURL",
"protocolVersion",
"adminEmail",
"earliestDatestamp",
"deletedRecord",
"granularity",
):
payload[field_name] = _node_text(identify.find(f"oai:{field_name}", NS))
return payload
def list_sets(self, base_url: str) -> list[OaiSet]:
root = self.source_client.get_xml(f"{base_url}?{urlencode({'verb': 'ListSets'})}")
sets = root.findall(".//oai:set", NS)
results: list[OaiSet] = []
for node in sets:
results.append(
OaiSet(
set_spec=_node_text(node.find("oai:setSpec", NS)),
set_name=_node_text(node.find("oai:setName", NS)),
set_description=_flatten_set_description(node.find("oai:setDescription", NS)),
)
)
return results
def list_metadata_formats(self, base_url: str, identifier: str | None = None) -> list[OaiMetadataFormat]:
params = {"verb": "ListMetadataFormats"}
if identifier:
params["identifier"] = identifier
root = self.source_client.get_xml(f"{base_url}?{urlencode(params)}")
formats = root.findall(".//oai:metadataFormat", NS)
results: list[OaiMetadataFormat] = []
for node in formats:
results.append(
OaiMetadataFormat(
metadata_prefix=_node_text(node.find("oai:metadataPrefix", NS)),
schema=_node_text(node.find("oai:schema", NS)),
metadata_namespace=_node_text(node.find("oai:metadataNamespace", NS)),
)
)
return results
def list_records(
self,
base_url: str,
metadata_prefix: str = "oai_dc",
set_spec: str | None = None,
date_from: str | None = None,
date_until: str | None = None,
limit: int | None = None,
) -> list[HarvestResult]:
results: list[HarvestResult] = []
params = {"verb": "ListRecords", "metadataPrefix": metadata_prefix}
if set_spec:
params["set"] = set_spec
if date_from:
params["from"] = date_from
if date_until:
params["until"] = date_until
ordinal = 1
next_url = f"{base_url}?{urlencode(params)}"
while next_url:
root = self.source_client.get_xml(next_url)
records = root.findall(".//oai:record", NS)
for record in records:
parsed = self._record_to_result(base_url, record, ordinal)
ordinal += 1
if parsed is not None:
results.append(parsed)
if limit is not None and len(results) >= limit:
return results
next_url = self._resumption_url(base_url, root)
return results
def get_record(
self,
base_url: str,
identifier: str,
metadata_prefix: str = "oai_dc",
) -> HarvestResult | None:
params = {
"verb": "GetRecord",
"metadataPrefix": metadata_prefix,
"identifier": identifier,
}
root = self.source_client.get_xml(f"{base_url}?{urlencode(params)}")
record = root.find(".//oai:record", NS)
if record is None:
return None
return self._record_to_result(base_url, record, 1)
def _record_to_result(self, base_url: str, record: ET.Element, ordinal: int) -> HarvestResult | None:
identifier = _node_text(record.find("./oai:header/oai:identifier", NS))
metadata_node = record.find("./oai:metadata/*", NS)
if metadata_node is None or not identifier:
return None
entry = _metadata_node_to_entry(base_url, identifier, metadata_node, ordinal)
return HarvestResult(base_url=base_url, identifier=identifier, entry=entry)
def _resumption_url(self, base_url: str, root: ET.Element) -> str | None:
token = _node_text(root.find(".//oai:resumptionToken", NS))
if not token:
return None
return f"{base_url}?{urlencode({'verb': 'ListRecords', 'resumptionToken': token})}"
def _oai_dc_to_entry(base_url: str, identifier: str, metadata: ET.Element, ordinal: int) -> BibEntry:
titles = _all_text(metadata.findall("dc:title", NS))
creators = _all_text(metadata.findall("dc:creator", NS))
dates = _all_text(metadata.findall("dc:date", NS))
descriptions = _all_text(metadata.findall("dc:description", NS))
identifiers = _all_text(metadata.findall("dc:identifier", NS))
publishers = _all_text(metadata.findall("dc:publisher", NS))
types = [value.lower() for value in _all_text(metadata.findall("dc:type", NS))]
title = titles[0] if titles else "Untitled record"
year = _first_year(dates)
entry_type = _guess_oai_entry_type(types)
fields: dict[str, str] = {
"title": title,
"oai": identifier,
"url": _best_identifier_url(identifiers) or f"{base_url}?verb=GetRecord&identifier={identifier}&metadataPrefix=oai_dc",
"note": "harvested_from = {oai_pmh}",
}
if creators:
fields["author"] = " and ".join(creators)
if year:
fields["year"] = year
if descriptions:
fields["abstract"] = descriptions[0]
if publishers:
fields["publisher"] = publishers[0]
citation_key = _oai_citation_key(creators, year, title, ordinal)
return BibEntry(entry_type=entry_type, citation_key=citation_key, fields=fields)
def _mods_to_entry(base_url: str, identifier: str, metadata: ET.Element, ordinal: int) -> BibEntry:
title = _node_text(metadata.find(".//mods:titleInfo/mods:title", NS)) or "Untitled record"
sub_title = _node_text(metadata.find(".//mods:titleInfo/mods:subTitle", NS))
if sub_title:
title = f"{title}: {sub_title}"
creators: list[str] = []
for name in metadata.findall(".//mods:name", NS):
role_terms = [term.text or "" for term in name.findall(".//mods:roleTerm", NS)]
if role_terms and not any(term.lower() == "author" for term in role_terms):
continue
parts = [_node_text(part) for part in name.findall("./mods:namePart", NS)]
parts = [part for part in parts if part]
if parts:
creators.append(", ".join(parts) if len(parts) == 2 else " ".join(parts))
year = ""
for date_node in metadata.findall(".//mods:originInfo/mods:dateIssued", NS):
text = _node_text(date_node)
if len(text) >= 4 and text[:4].isdigit():
year = text[:4]
break
publisher = _node_text(metadata.find(".//mods:originInfo/mods:publisher", NS))
abstract = _node_text(metadata.find(".//mods:abstract", NS))
genre = _node_text(metadata.find(".//mods:genre", NS)).lower()
related_title = _node_text(metadata.find(".//mods:relatedItem/mods:titleInfo/mods:title", NS))
url = _node_text(metadata.find(".//mods:location/mods:url", NS))
entry_type = "phdthesis" if "thesis" in genre or "dissertation" in genre else "misc"
if entry_type != "phdthesis" and related_title:
entry_type = "article"
fields: dict[str, str] = {
"title": title,
"oai": identifier,
"url": url or f"{base_url}?verb=GetRecord&identifier={identifier}&metadataPrefix=mods",
"note": "harvested_from = {oai_pmh_mods}",
}
if creators:
fields["author"] = " and ".join(creators)
if year:
fields["year"] = year
if publisher:
fields["publisher"] = publisher
if abstract:
fields["abstract"] = abstract
if related_title:
fields["journal"] = related_title
citation_key = _oai_citation_key(creators, year, title, ordinal)
return BibEntry(entry_type=entry_type, citation_key=citation_key, fields=fields)
def _metadata_node_to_entry(base_url: str, identifier: str, metadata: ET.Element, ordinal: int) -> BibEntry:
if metadata.tag.endswith("dc"):
return _oai_dc_to_entry(base_url, identifier, metadata, ordinal)
if metadata.tag.endswith("mods"):
return _mods_to_entry(base_url, identifier, metadata, ordinal)
return BibEntry(
entry_type="misc",
citation_key=_oai_citation_key([], "", identifier, ordinal),
fields={
"title": identifier,
"oai": identifier,
"url": f"{base_url}?verb=GetRecord&identifier={identifier}",
"note": f"unsupported_oai_metadata = {{{metadata.tag}}}",
},
)
def _node_text(node: ET.Element | None) -> str:
if node is None or node.text is None:
return ""
return " ".join(node.text.split())
def _all_text(nodes: list[ET.Element]) -> list[str]:
values = []
for node in nodes:
value = _node_text(node)
if value:
values.append(value)
return values
def _first_year(dates: list[str]) -> str:
for date in dates:
if len(date) >= 4 and date[:4].isdigit():
return date[:4]
return ""
def _guess_oai_entry_type(types: list[str]) -> str:
joined = " ".join(types)
if "thesis" in joined or "dissertation" in joined:
return "phdthesis"
if "article" in joined:
return "article"
if "book" in joined:
return "book"
return "misc"
def _best_identifier_url(identifiers: list[str]) -> str:
for identifier in identifiers:
if identifier.startswith("http://") or identifier.startswith("https://"):
return identifier
return ""
def _oai_citation_key(creators: list[str], year: str, title: str, ordinal: int) -> str:
author = creators[0] if creators else "oai"
family = author.split(",")[0] if "," in author else author.split()[-1]
family = "".join(ch for ch in family.lower() if ch.isalnum()) or "oai"
first_word = "".join(ch for ch in title.split()[0].lower() if ch.isalnum()) if title.split() else "untitled"
return f"{family}{year or 'nd'}{first_word}{ordinal}"
def _flatten_set_description(node: ET.Element | None) -> str:
if node is None:
return ""
parts = []
for child in node.iter():
if child.text and child.text.strip():
parts.append(" ".join(child.text.split()))
return " ".join(parts)

567
src/citegeist/resolve.py Normal file
View File

@@ -0,0 +1,567 @@
from __future__ import annotations
import re
import urllib.parse
import xml.etree.ElementTree as ET
from dataclasses import dataclass
from .bibtex import BibEntry, parse_bibtex
from .sources import SourceClient
@dataclass(slots=True)
class Resolution:
entry: BibEntry
source_type: str
source_label: str
class MetadataResolver:
def __init__(
self,
user_agent: str = "citegeist/0.1 (local research tool)",
source_client: SourceClient | None = None,
) -> None:
self.user_agent = user_agent
self.source_client = source_client or SourceClient(user_agent=user_agent)
def resolve_entry(self, entry: BibEntry) -> Resolution | None:
if doi := entry.fields.get("doi"):
resolved = self.resolve_doi(doi)
if resolved is not None:
return resolved
resolved = self.resolve_datacite_doi(doi)
if resolved is not None:
return resolved
if openalex_id := entry.fields.get("openalex"):
resolved = self.resolve_openalex(openalex_id)
if resolved is not None:
return resolved
if dblp_key := entry.fields.get("dblp"):
resolved = self.resolve_dblp(dblp_key)
if resolved is not None:
return resolved
if arxiv_id := entry.fields.get("arxiv"):
resolved = self.resolve_arxiv(arxiv_id)
if resolved is not None:
return resolved
if title := entry.fields.get("title"):
resolved = self.search_crossref_best_match(
title=title,
author_text=entry.fields.get("author", ""),
year=entry.fields.get("year", ""),
)
if resolved is not None:
return resolved
resolved = self.search_datacite_best_match(
title=title,
author_text=entry.fields.get("author", ""),
year=entry.fields.get("year", ""),
)
if resolved is not None:
return resolved
resolved = self.search_openalex_best_match(
title=title,
author_text=entry.fields.get("author", ""),
year=entry.fields.get("year", ""),
)
if resolved is not None:
return resolved
return None
def resolve_doi(self, doi: str) -> Resolution | None:
encoded = urllib.parse.quote(doi, safe="")
payload = self.source_client.get_json(f"https://api.crossref.org/works/{encoded}")
message = payload.get("message", {})
if not message:
return None
return Resolution(
entry=_crossref_message_to_entry(message),
source_type="resolver",
source_label=f"crossref:doi:{doi}",
)
def search_crossref(self, title: str, limit: int = 5) -> list[BibEntry]:
query = urllib.parse.urlencode({"query.title": title, "rows": limit})
payload = self.source_client.get_json(f"https://api.crossref.org/works?{query}")
items = payload.get("message", {}).get("items", [])
return [_crossref_message_to_entry(item) for item in items]
def search_crossref_best_match(
self,
title: str,
author_text: str = "",
year: str = "",
) -> Resolution | None:
candidate = _select_best_title_match(
self.search_crossref(title, limit=5),
title=title,
author_text=author_text,
year=year,
)
if candidate is None:
return None
return Resolution(
entry=candidate,
source_type="resolver",
source_label=f"crossref:search:{title}",
)
def resolve_dblp(self, dblp_key: str) -> Resolution | None:
encoded_key = urllib.parse.quote(dblp_key, safe="/:")
text = self.source_client.get_text(f"https://dblp.org/rec/{encoded_key}.bib")
entries = parse_bibtex(text)
if not entries:
return None
return Resolution(
entry=entries[0],
source_type="resolver",
source_label=f"dblp:key:{dblp_key}",
)
def search_dblp(self, query_text: str, limit: int = 5) -> list[BibEntry]:
query = urllib.parse.urlencode({"q": query_text, "format": "json", "h": limit})
payload = self.source_client.get_json(f"https://dblp.org/search/publ/api?{query}")
hits = payload.get("result", {}).get("hits", {}).get("hit", [])
if isinstance(hits, dict):
hits = [hits]
results: list[BibEntry] = []
for hit in hits:
info = hit.get("info", {})
dblp_key = info.get("key")
if dblp_key:
resolved = self.resolve_dblp(dblp_key)
if resolved is not None:
results.append(resolved.entry)
return results
def resolve_arxiv(self, arxiv_id: str) -> Resolution | None:
query = urllib.parse.urlencode({"id_list": arxiv_id})
root = self.source_client.get_xml(f"https://export.arxiv.org/api/query?{query}")
namespace = {"atom": "http://www.w3.org/2005/Atom"}
entry = root.find("atom:entry", namespace)
if entry is None:
return None
return Resolution(
entry=_arxiv_atom_entry_to_bib(entry, arxiv_id),
source_type="resolver",
source_label=f"arxiv:id:{arxiv_id}",
)
def resolve_openalex(self, openalex_id: str) -> Resolution | None:
normalized_id = _normalize_openalex_id(openalex_id)
payload = self.source_client.get_json(f"https://api.openalex.org/works/{normalized_id}")
if not payload:
return None
return Resolution(
entry=_openalex_work_to_entry(payload),
source_type="resolver",
source_label=f"openalex:id:{normalized_id}",
)
def resolve_datacite_doi(self, doi: str) -> Resolution | None:
encoded = urllib.parse.quote(doi, safe="")
payload = self.source_client.get_json(f"https://api.datacite.org/dois/{encoded}")
data = payload.get("data", {})
if not data:
return None
return Resolution(
entry=_datacite_work_to_entry(data),
source_type="resolver",
source_label=f"datacite:doi:{doi}",
)
def search_datacite(self, title: str, limit: int = 5) -> list[BibEntry]:
query = urllib.parse.urlencode({"query": title, "page[size]": limit})
payload = self.source_client.get_json(f"https://api.datacite.org/dois?{query}")
return [_datacite_work_to_entry(item) for item in payload.get("data", [])]
def search_datacite_best_match(
self,
title: str,
author_text: str = "",
year: str = "",
) -> Resolution | None:
candidate = _select_best_title_match(
self.search_datacite(title, limit=5),
title=title,
author_text=author_text,
year=year,
)
if candidate is None:
return None
return Resolution(
entry=candidate,
source_type="resolver",
source_label=f"datacite:search:{title}",
)
def search_openalex(self, title: str, limit: int = 5) -> list[BibEntry]:
query = urllib.parse.urlencode({"search": title, "per-page": limit})
payload = self.source_client.get_json(f"https://api.openalex.org/works?{query}")
return [_openalex_work_to_entry(item) for item in payload.get("results", [])]
def search_openalex_best_match(
self,
title: str,
author_text: str = "",
year: str = "",
) -> Resolution | None:
candidate = _select_best_title_match(
self.search_openalex(title, limit=5),
title=title,
author_text=author_text,
year=year,
)
if candidate is None:
return None
return Resolution(
entry=candidate,
source_type="resolver",
source_label=f"openalex:search:{title}",
)
def merge_entries(base: BibEntry, resolved: BibEntry) -> BibEntry:
merged, _ = merge_entries_with_conflicts(base, resolved)
return merged
def merge_entries_with_conflicts(base: BibEntry, resolved: BibEntry) -> tuple[BibEntry, list[dict[str, str]]]:
merged_fields = dict(base.fields)
conflicts: list[dict[str, str]] = []
for key, value in resolved.fields.items():
if not value:
continue
current_value = merged_fields.get(key, "")
if current_value and current_value != value:
conflicts.append(
{
"field_name": key,
"current_value": current_value,
"proposed_value": value,
}
)
continue
if key not in merged_fields or not merged_fields[key]:
merged_fields[key] = value
return (
BibEntry(
entry_type=base.entry_type or resolved.entry_type,
citation_key=base.citation_key,
fields=merged_fields,
),
conflicts,
)
def _crossref_message_to_entry(message: dict) -> BibEntry:
entry_type = _crossref_type_to_bibtype(message.get("type", "article"))
title_values = message.get("title", [])
title = title_values[0] if title_values else ""
year = _extract_crossref_year(message)
authors = " and ".join(_crossref_person_to_name(person) for person in message.get("author", []))
venue = ""
if container_title := message.get("container-title", []):
venue = container_title[0]
fields: dict[str, str] = {}
if authors:
fields["author"] = authors
if title:
fields["title"] = title
if year:
fields["year"] = year
if doi := message.get("DOI"):
fields["doi"] = doi
if url := message.get("URL"):
fields["url"] = url
if abstract := message.get("abstract"):
fields["abstract"] = abstract
if venue:
if entry_type == "article":
fields["journal"] = venue
else:
fields["booktitle"] = venue
if volume := message.get("volume"):
fields["volume"] = str(volume)
if issue := message.get("issue"):
fields["number"] = str(issue)
if pages := message.get("page"):
fields["pages"] = str(pages)
citation_key = _make_resolution_key(fields.get("author", "crossref"), year or "n.d.", title or "untitled")
return BibEntry(entry_type=entry_type, citation_key=citation_key, fields=fields)
def _arxiv_atom_entry_to_bib(node: ET.Element, arxiv_id: str) -> BibEntry:
ns = {
"atom": "http://www.w3.org/2005/Atom",
"arxiv": "http://arxiv.org/schemas/atom",
}
title = _node_text(node.find("atom:title", ns))
summary = _node_text(node.find("atom:summary", ns))
published = _node_text(node.find("atom:published", ns))
year = published[:4] if published else ""
authors = " and ".join(
_node_text(author.find("atom:name", ns)) for author in node.findall("atom:author", ns)
)
doi = _node_text(node.find("arxiv:doi", ns))
fields: dict[str, str] = {
"title": title,
"author": authors,
"year": year,
"arxiv": arxiv_id,
"url": f"https://arxiv.org/abs/{arxiv_id}",
"pdf": f"https://arxiv.org/pdf/{arxiv_id}.pdf",
}
if summary:
fields["abstract"] = summary
if doi:
fields["doi"] = doi
return BibEntry(entry_type="article", citation_key=f"arxiv{arxiv_id.replace('.', '').replace('/', '')}", fields=fields)
def _crossref_type_to_bibtype(crossref_type: str) -> str:
mapping = {
"journal-article": "article",
"proceedings-article": "inproceedings",
"book-chapter": "incollection",
"book": "book",
"proceedings": "proceedings",
}
return mapping.get(crossref_type, "misc")
def _extract_crossref_year(message: dict) -> str:
for field_name in ("published-print", "published-online", "issued", "created"):
date_parts = message.get(field_name, {}).get("date-parts", [])
if date_parts and date_parts[0]:
return str(date_parts[0][0])
return ""
def _crossref_person_to_name(person: dict) -> str:
family = person.get("family", "")
given = person.get("given", "")
if family and given:
return f"{family}, {given}"
return family or given
def _node_text(node: ET.Element | None) -> str:
if node is None or node.text is None:
return ""
return " ".join(node.text.split())
def _make_resolution_key(author_text: str, year: str, title: str) -> str:
first_author = author_text.split(" and ")[0]
family_name = first_author.split(",")[0] if "," in first_author else first_author.split()[-1]
family_name = "".join(ch for ch in family_name.lower() if ch.isalnum()) or "ref"
first_word = "".join(ch for ch in title.split()[0].lower() if ch.isalnum()) if title.split() else "untitled"
return f"{family_name}{year}{first_word}"
def _openalex_work_to_entry(work: dict) -> BibEntry:
title = work.get("display_name", "") or "Untitled work"
year = str(work.get("publication_year") or "")
doi = _normalize_openalex_doi(work.get("doi"))
openalex_id = _normalize_openalex_id(work.get("id", ""))
authors = " and ".join(_openalex_author_name(item) for item in work.get("authorships", []))
source = ((work.get("primary_location") or {}).get("source") or {}).get("display_name", "")
work_type = work.get("type", "")
fields: dict[str, str] = {}
if authors:
fields["author"] = authors
if title:
fields["title"] = title
if year:
fields["year"] = year
if doi:
fields["doi"] = doi
fields["url"] = f"https://doi.org/{doi}"
if openalex_id:
fields["openalex"] = openalex_id
fields.setdefault("url", f"https://openalex.org/{openalex_id}")
if abstract := work.get("abstract_inverted_index"):
fields["abstract"] = _openalex_abstract_text(abstract)
if source:
if work_type == "article":
fields["journal"] = source
else:
fields["booktitle"] = source
citation_key = f"openalex{re.sub(r'[^A-Za-z0-9]+', '', openalex_id).lower()}" if openalex_id else _make_resolution_key(authors or "openalex", year or "n.d.", title or "untitled")
return BibEntry(entry_type=_openalex_type_to_bibtype(work_type), citation_key=citation_key, fields=fields)
def _openalex_author_name(authorship: dict) -> str:
author = authorship.get("author") or {}
return " ".join(str(author.get("display_name", "")).split())
def _openalex_abstract_text(inverted_index: dict) -> str:
positions: dict[int, str] = {}
for word, indexes in inverted_index.items():
for index in indexes:
positions[int(index)] = word
return " ".join(word for _, word in sorted(positions.items()))
def _openalex_type_to_bibtype(work_type: str) -> str:
mapping = {
"article": "article",
"book": "book",
"book-chapter": "incollection",
"dissertation": "phdthesis",
"proceedings-article": "inproceedings",
}
return mapping.get(work_type, "misc")
def _normalize_openalex_id(value: str) -> str:
if not value:
return ""
return value.rsplit("/", 1)[-1]
def _normalize_openalex_doi(value: str | None) -> str:
if not value:
return ""
if value.startswith("https://doi.org/"):
return value[len("https://doi.org/") :]
return value
def _normalize_match_text(value: str) -> str:
lowered = value.lower()
lowered = re.sub(r"\W+", " ", lowered)
return " ".join(lowered.split())
def _select_best_title_match(
candidates: list[BibEntry],
title: str,
author_text: str = "",
year: str = "",
) -> BibEntry | None:
if not candidates:
return None
title_norm = _normalize_match_text(title)
author_tokens = _author_match_tokens(author_text)
year_text = str(year or "").strip()
for candidate in candidates:
candidate_title = _normalize_match_text(candidate.fields.get("title", ""))
if candidate_title != title_norm:
continue
candidate_year = str(candidate.fields.get("year", "") or "").strip()
if year_text and candidate_year and year_text != candidate_year:
continue
if author_tokens and not _candidate_matches_author_tokens(candidate, author_tokens):
continue
return candidate
return None
def _author_match_tokens(author_text: str) -> set[str]:
normalized = _normalize_match_text(author_text)
if not normalized:
return set()
tokens = {
token
for token in re.findall(r"[a-z0-9]+", normalized)
if len(token) >= 2 and token not in {"and", "et", "al"}
}
return tokens
def _candidate_matches_author_tokens(candidate: BibEntry, author_tokens: set[str]) -> bool:
candidate_author = _normalize_match_text(candidate.fields.get("author", ""))
if not candidate_author:
return False
candidate_tokens = set(re.findall(r"[a-z0-9]+", candidate_author))
return bool(author_tokens & candidate_tokens)
def _datacite_work_to_entry(data: dict) -> BibEntry:
attributes = data.get("attributes", {})
doi = str(attributes.get("doi") or "")
titles = attributes.get("titles") or []
creators = attributes.get("creators") or []
descriptions = attributes.get("descriptions") or []
publisher = str(attributes.get("publisher") or "")
year = str(attributes.get("publicationYear") or "")
url = str(attributes.get("url") or "")
types = attributes.get("types") or {}
title = titles[0].get("title", "") if titles else ""
author_names = " and ".join(_datacite_creator_name(creator) for creator in creators if _datacite_creator_name(creator))
abstract = _datacite_abstract(descriptions)
entry_type = _datacite_type_to_bibtype(str(types.get("resourceTypeGeneral") or ""))
fields: dict[str, str] = {}
if title:
fields["title"] = title
if author_names:
fields["author"] = author_names
if year:
fields["year"] = year
if doi:
fields["doi"] = doi
if url:
fields["url"] = url
elif doi:
fields["url"] = f"https://doi.org/{doi}"
if publisher:
fields["publisher"] = publisher
if abstract:
fields["abstract"] = abstract
citation_key = _make_resolution_key(author_names or "datacite", year or "n.d.", title or doi or "untitled")
return BibEntry(entry_type=entry_type, citation_key=citation_key, fields=fields)
def _datacite_creator_name(creator: dict) -> str:
family = str(creator.get("familyName") or "")
given = str(creator.get("givenName") or "")
if family and given:
return f"{family}, {given}"
return str(creator.get("name") or family or given)
def _datacite_abstract(descriptions: list[dict]) -> str:
for description in descriptions:
if str(description.get("descriptionType") or "").lower() == "abstract":
return str(description.get("description") or "")
return ""
def _datacite_type_to_bibtype(resource_type: str) -> str:
lowered = resource_type.lower()
mapping = {
"audiovisual": "misc",
"book": "book",
"bookchapter": "incollection",
"collection": "misc",
"computationalnotebook": "misc",
"conferencepaper": "inproceedings",
"dataset": "misc",
"dissertation": "phdthesis",
"image": "misc",
"journalarticle": "article",
"model": "misc",
"report": "techreport",
"software": "misc",
"text": "misc",
}
return mapping.get(lowered, "misc")
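A minimal sketch of the resolution cascade and conflict-aware merge above; the draft entry is invented, and resolve_entry tries DOI, OpenAlex, DBLP, and arXiv identifiers before falling back to title search:

from citegeist.bibtex import BibEntry
from citegeist.resolve import MetadataResolver, merge_entries_with_conflicts

resolver = MetadataResolver()
draft = BibEntry(
    entry_type="article",
    citation_key="vaswani2017attention",
    fields={"title": "Attention Is All You Need", "year": "2017"},
)
resolution = resolver.resolve_entry(draft)
if resolution is not None:
    merged, conflicts = merge_entries_with_conflicts(draft, resolution.entry)
    # merged keeps the draft's citation key and fills only empty fields;
    # disagreements land in `conflicts` instead of being overwritten.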

86
src/citegeist/sources.py Normal file
View File

@@ -0,0 +1,86 @@
from __future__ import annotations
import hashlib
import json
import urllib.request
import xml.etree.ElementTree as ET
from pathlib import Path
class SourceClient:
def __init__(
self,
user_agent: str = "citegeist/0.1 (local research tool)",
cache_dir: str | Path | None = None,
fixtures_dir: str | Path | None = None,
) -> None:
self.user_agent = user_agent
self.cache_dir = Path(cache_dir) if cache_dir else None
self.fixtures_dir = Path(fixtures_dir) if fixtures_dir else None
def get_json(self, url: str) -> dict:
cached = self._read_cached(url, "json")
if cached is not None:
return json.loads(cached)
payload = self._fetch_bytes(url)
self._write_cache(url, "json", payload)
return json.loads(payload.decode("utf-8"))
def get_text(self, url: str) -> str:
cached = self._read_cached(url, "txt")
if cached is not None:
return self._decode_text(cached)
payload = self._fetch_bytes(url)
self._write_cache(url, "txt", payload)
return self._decode_text(payload)
def get_xml(self, url: str) -> ET.Element:
cached = self._read_cached(url, "xml")
if cached is not None:
return ET.fromstring(cached)
payload = self._fetch_bytes(url)
self._write_cache(url, "xml", payload)
return ET.fromstring(payload)
def _fetch_bytes(self, url: str) -> bytes:
with urllib.request.urlopen(self._request(url)) as response:
return response.read()
def _request(self, url: str) -> urllib.request.Request:
return urllib.request.Request(
url,
headers={
"User-Agent": self.user_agent,
},
)
def _cache_key(self, url: str, suffix: str) -> str:
digest = hashlib.sha1(url.encode("utf-8")).hexdigest()
return f"{digest}.{suffix}"
def _read_cached(self, url: str, suffix: str) -> bytes | None:
for root in (self.fixtures_dir, self.cache_dir):
if root is None:
continue
path = root / self._cache_key(url, suffix)
if path.exists():
return path.read_bytes()
return None
def _write_cache(self, url: str, suffix: str, payload: bytes) -> None:
if self.cache_dir is None:
return
self.cache_dir.mkdir(parents=True, exist_ok=True)
path = self.cache_dir / self._cache_key(url, suffix)
path.write_bytes(payload)
def _decode_text(self, payload: bytes) -> str:
for encoding in ("utf-8", "utf-8-sig", "iso-8859-1", "latin-1"):
try:
return payload.decode(encoding)
except UnicodeDecodeError:
continue
return payload.decode("utf-8", errors="replace")
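A short sketch of the lookup order implemented above: the fixtures directory is consulted before the cache, and only a miss on both hits the network (paths and DOI are illustrative):

from citegeist.sources import SourceClient

client = SourceClient(
    cache_dir=".cache/citegeist",           # written after each live fetch
    fixtures_dir="tests/fixtures/sources",  # read-only, checked first
)
payload = client.get_json("https://api.crossref.org/works/10.1000/example")
# Responses are keyed by sha1(url) plus a suffix ("<sha1>.json" here), so the
# same URL always maps to the same cache file across runs.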

1135
src/citegeist/storage.py Normal file

File diff suppressed because it is too large

1485
src/citegeist/talkorigins.py Normal file

File diff suppressed because it is too large

15
tests/conftest.py Normal file
View File

@@ -0,0 +1,15 @@
from __future__ import annotations
import os
import pytest
def pytest_collection_modifyitems(config: pytest.Config, items: list[pytest.Item]) -> None:
if os.environ.get("CITEGEIST_LIVE_TESTS") == "1":
return
skip_live = pytest.mark.skip(reason="set CITEGEIST_LIVE_TESTS=1 to run live-source tests")
for item in items:
if "live" in item.keywords:
item.add_marker(skip_live)
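Given this hook, live tests opt in with the `live` marker and run only when the environment variable is set, e.g.:

import pytest

@pytest.mark.live
def test_hits_real_endpoint() -> None:
    ...

# run them with: CITEGEIST_LIVE_TESTS=1 pytest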

129
tests/test_batch.py Normal file
View File

@@ -0,0 +1,129 @@
from pathlib import Path
from citegeist.batch import BatchBootstrapRunner, load_batch_jobs
from citegeist.cli import main
from citegeist.storage import BibliographyStore
def test_load_batch_jobs_accepts_object_with_jobs(tmp_path: Path):
path = tmp_path / "jobs.json"
path.write_text(
"""
{
"jobs": [
{"name": "topic-only", "topic": "graph topic"},
{"name": "seed-only", "seed_bib": "seed.bib"}
]
}
""",
encoding="utf-8",
)
jobs = load_batch_jobs(path)
assert jobs[0]["name"] == "topic-only"
assert jobs[1]["seed_bib"] == str((tmp_path / "seed.bib").resolve())
def test_batch_runner_executes_multiple_jobs(tmp_path: Path):
seed_bib = tmp_path / "seed.bib"
seed_bib.write_text(
"""
@article{seed2024,
author = {Seed, Alice},
title = {Seed Paper},
year = {2024}
}
""",
encoding="utf-8",
)
jobs = [
{"name": "seed-job", "seed_bib": str(seed_bib), "expand": False},
{"name": "topic-job", "topic": "graph topic", "expand": False, "preview": True},
]
runner = BatchBootstrapRunner()
from citegeist import BibEntry
runner.bootstrapper.resolver.search_openalex = lambda topic, limit=5: [ # type: ignore[method-assign]
BibEntry(entry_type="article", citation_key="topic2024graph", fields={"title": "Graph Topic Result"})
]
runner.bootstrapper.resolver.search_crossref = lambda topic, limit=5: [] # type: ignore[method-assign]
runner.bootstrapper.resolver.search_datacite = lambda topic, limit=5: [] # type: ignore[method-assign]
store = BibliographyStore()
try:
results = runner.run(store, jobs)
assert [job.job_name for job in results] == ["seed-job", "topic-job"]
assert results[0].result_count == 1
assert results[1].results[0].citation_key == "topic2024graph"
assert store.get_entry("seed2024") is not None
assert store.get_entry("topic2024graph") is None
finally:
store.close()
def test_batch_runner_can_store_topic_phrase_metadata():
jobs = [
{
"name": "topic-job",
"topic": "graph topic",
"topic_slug": "graph-methods",
"topic_name": "Graph Methods",
"topic_phrase": "graph networks biology",
"expand": False,
"preview": False,
}
]
runner = BatchBootstrapRunner()
from citegeist import BibEntry
runner.bootstrapper.resolver.search_openalex = lambda topic, limit=5: [ # type: ignore[method-assign]
BibEntry(entry_type="article", citation_key="topic2024graph", fields={"title": "Graph Topic Result"})
]
runner.bootstrapper.resolver.search_crossref = lambda topic, limit=5: [] # type: ignore[method-assign]
runner.bootstrapper.resolver.search_datacite = lambda topic, limit=5: [] # type: ignore[method-assign]
store = BibliographyStore()
try:
runner.run(store, jobs)
topic = store.get_topic("graph-methods")
assert topic is not None
assert topic["name"] == "Graph Methods"
assert topic["expansion_phrase"] == "graph networks biology"
finally:
store.close()
def test_bootstrap_batch_cli_runs_json_jobs(tmp_path: Path):
seed_bib = tmp_path / "seed.bib"
seed_bib.write_text(
"""
@article{seed2024,
author = {Seed, Alice},
title = {Seed Paper},
year = {2024}
}
""",
encoding="utf-8",
)
batch_json = tmp_path / "jobs.json"
batch_json.write_text(
f"""
[
{{"name": "seed-job", "seed_bib": "{seed_bib}", "expand": false}},
{{"name": "topic-job", "topic": "graph topic", "expand": false, "preview": true}}
]
""",
encoding="utf-8",
)
from unittest.mock import patch
database = tmp_path / "library.sqlite3"
with patch("citegeist.cli.BatchBootstrapRunner.run") as mocked_run:
mocked_run.return_value = []
exit_code = main(["--db", str(database), "bootstrap-batch", str(batch_json)])
assert exit_code == 0

175
tests/test_bootstrap.py Normal file
View File

@@ -0,0 +1,175 @@
from citegeist import BibliographyStore
from citegeist.bootstrap import Bootstrapper
from citegeist.cli import main
def test_bootstrap_from_seed_bib_only():
store = BibliographyStore()
try:
bootstrapper = Bootstrapper()
bootstrapper.crossref_expander.expand_entry_references = lambda _store, _key: [] # type: ignore[method-assign]
bootstrapper.openalex_expander.expand_entry = lambda _store, _key, relation_type="cites", limit=5: [] # type: ignore[method-assign]
results = bootstrapper.bootstrap(
store,
seed_bibtex="""
@article{seed2024,
author = {Seed, Alice},
title = {Seed Paper},
year = {2024}
}
""",
expand=False,
)
assert [item.citation_key for item in results] == ["seed2024"]
assert store.get_entry("seed2024") is not None
finally:
store.close()
def test_bootstrap_from_topic_only():
store = BibliographyStore()
try:
bootstrapper = Bootstrapper()
bootstrapper.resolver.search_openalex = lambda topic, limit=5: [] # type: ignore[method-assign]
bootstrapper.resolver.search_crossref = lambda topic, limit=5: [] # type: ignore[method-assign]
from citegeist import BibEntry
bootstrapper.resolver.search_datacite = lambda topic, limit=5: [ # type: ignore[method-assign]
BibEntry(
entry_type="article",
citation_key="topic2024graph",
fields={"title": "Graph Topic Result", "year": "2024"},
)
]
bootstrapper.crossref_expander.expand_entry_references = lambda _store, _key: [] # type: ignore[method-assign]
bootstrapper.openalex_expander.expand_entry = lambda _store, _key, relation_type="cites", limit=5: [] # type: ignore[method-assign]
results = bootstrapper.bootstrap(store, topic="graph topic", expand=False)
assert [item.citation_key for item in results] == ["topic2024graph"]
assert store.get_entry("topic2024graph") is not None
assert results[0].score > 0
finally:
store.close()
def test_bootstrap_cli_accepts_seed_and_topic(tmp_path):
seed_bib = tmp_path / "seed.bib"
seed_bib.write_text(
"""
@article{seed2024,
author = {Seed, Alice},
title = {Seed Paper},
year = {2024}
}
""",
encoding="utf-8",
)
from unittest.mock import patch
database = tmp_path / "library.sqlite3"
with patch("citegeist.cli.Bootstrapper.bootstrap") as mocked_bootstrap:
mocked_bootstrap.return_value = []
exit_code = main(
[
"--db",
str(database),
"bootstrap",
"--seed-bib",
str(seed_bib),
"--topic",
"graph topic",
"--no-expand",
]
)
assert exit_code == 0
def test_bootstrap_ranks_and_deduplicates_topic_candidates():
store = BibliographyStore()
try:
bootstrapper = Bootstrapper()
from citegeist import BibEntry
bootstrapper.resolver.search_openalex = lambda topic, limit=5: [ # type: ignore[method-assign]
BibEntry(
entry_type="article",
citation_key="shared2024graph",
fields={"title": "Graph Topic Ranking", "abstract": "graph topic graph"},
)
]
bootstrapper.resolver.search_crossref = lambda topic, limit=5: [ # type: ignore[method-assign]
BibEntry(
entry_type="article",
citation_key="shared2024graph",
fields={"title": "Graph Topic Ranking", "abstract": "graph"},
),
BibEntry(
entry_type="article",
citation_key="crossref2024other",
fields={"title": "Less relevant paper"},
),
]
bootstrapper.resolver.search_datacite = lambda topic, limit=5: [] # type: ignore[method-assign]
bootstrapper.crossref_expander.expand_entry_references = lambda _store, _key: [] # type: ignore[method-assign]
bootstrapper.openalex_expander.expand_entry = lambda _store, _key, relation_type="cites", limit=5: [] # type: ignore[method-assign]
results = bootstrapper.bootstrap(store, topic="graph topic", expand=False, topic_limit=5)
topic_results = [item for item in results if item.origin == "topic"]
assert [item.citation_key for item in topic_results] == ["shared2024graph", "crossref2024other"]
assert topic_results[0].score > topic_results[1].score
finally:
store.close()
def test_bootstrap_preview_does_not_write_to_database():
store = BibliographyStore()
try:
bootstrapper = Bootstrapper()
from citegeist import BibEntry
bootstrapper.resolver.search_openalex = lambda topic, limit=5: [ # type: ignore[method-assign]
BibEntry(entry_type="article", citation_key="preview2024graph", fields={"title": "Preview Graph Topic"})
]
bootstrapper.resolver.search_crossref = lambda topic, limit=5: [] # type: ignore[method-assign]
bootstrapper.resolver.search_datacite = lambda topic, limit=5: [] # type: ignore[method-assign]
results = bootstrapper.bootstrap(store, topic="graph topic", expand=False, preview_only=True)
assert [item.citation_key for item in results] == ["preview2024graph"]
assert store.get_entry("preview2024graph") is None
finally:
store.close()
def test_bootstrap_topic_commit_limit_restricts_persisted_candidates():
store = BibliographyStore()
try:
bootstrapper = Bootstrapper()
from citegeist import BibEntry
bootstrapper.resolver.search_openalex = lambda topic, limit=5: [ # type: ignore[method-assign]
BibEntry(entry_type="article", citation_key="rank1", fields={"title": "Graph Topic One"}),
BibEntry(entry_type="article", citation_key="rank2", fields={"title": "Graph Topic Two"}),
]
bootstrapper.resolver.search_crossref = lambda topic, limit=5: [] # type: ignore[method-assign]
bootstrapper.resolver.search_datacite = lambda topic, limit=5: [] # type: ignore[method-assign]
bootstrapper.crossref_expander.expand_entry_references = lambda _store, _key: [] # type: ignore[method-assign]
bootstrapper.openalex_expander.expand_entry = lambda _store, _key, relation_type="cites", limit=5: [] # type: ignore[method-assign]
results = bootstrapper.bootstrap(
store,
topic="graph topic",
expand=False,
topic_limit=5,
topic_commit_limit=1,
)
assert [item.citation_key for item in results if item.origin == "topic"] == ["rank1"]
assert store.get_entry("rank1") is not None
assert store.get_entry("rank2") is None
finally:
store.close()

1078
tests/test_cli.py Normal file

File diff suppressed because it is too large

69
tests/test_expand.py Normal file
View File

@@ -0,0 +1,69 @@
from citegeist.expand import CrossrefExpander, _crossref_reference_to_entry
from citegeist.storage import BibliographyStore
def test_crossref_reference_to_entry_prefers_doi_key():
entry = _crossref_reference_to_entry(
{
"DOI": "10.1000/example-ref",
"article-title": "Discovered Reference",
"author": "Doe, Alex",
"year": "2022",
"journal-title": "Journal of Discovery",
},
"seed2024",
1,
)
assert entry.citation_key == "doi101000exampleref"
assert entry.fields["doi"] == "10.1000/example-ref"
assert entry.fields["journal"] == "Journal of Discovery"
def test_crossref_expander_creates_draft_nodes_and_relations():
store = BibliographyStore()
try:
store.ingest_bibtex(
"""
@article{seed2024,
author = {Seed, Alice},
title = {Seed Paper},
year = {2024},
doi = {10.1000/seed-doi}
}
"""
)
expander = CrossrefExpander()
expander.resolver.source_client.get_json = lambda _url: { # type: ignore[method-assign]
"message": {
"reference": [
{
"DOI": "10.1000/example-ref",
"article-title": "Discovered Reference",
"author": "Doe, Alex",
"year": "2022",
"journal-title": "Journal of Discovery",
},
{
"unstructured": "Unstructured reference string",
"year": "2021",
},
]
}
}
results = expander.expand_entry_references(store, "seed2024")
assert [result.discovered_citation_key for result in results] == [
"doi101000exampleref",
"ref2021unstructured2",
]
discovered = store.get_entry("doi101000exampleref")
assert discovered is not None
assert discovered["review_status"] == "draft"
assert store.get_relations("seed2024") == ["doi101000exampleref", "ref2021unstructured2"]
relation_provenance = store.get_relation_provenance("seed2024")
assert relation_provenance[0]["source_type"] == "graph_expand"
finally:
store.close()

65
tests/test_extract.py Normal file
View File

@@ -0,0 +1,65 @@
from citegeist import extract_references, parse_bibtex
from citegeist.cli import main
SAMPLE_REFERENCES = """
[1] Smith, Jane and Doe, Alex. 2024. Graph-first bibliography augmentation. Journal of Research Systems.
[2] Miller, Sam. 2023. Semantic search for research corpora. Proceedings of the Retrieval Workshop.
"""
APA_AND_BOOK_REFERENCES = """
Brown, T., & Green, P. (2021). Retrieval methods for scholarly corpora. Journal of Information Retrieval.
Nguyen, An. Research Design for Literature Mapping. Example University Press, 2020.
"""
WRAPPED_REFERENCES = """
[1] Taylor, Ann. 2022. Multi-line reference extraction
for bibliography pipelines. Journal of Parsing Systems.
[2] Chen, Bo. 2021. Another entry. Proceedings of the Mining Workshop.
"""
def test_extract_references_builds_draft_entries():
entries = extract_references(SAMPLE_REFERENCES)
assert [entry.citation_key for entry in entries] == [
"smith2024graphfirst1",
"miller2023semantic2",
]
assert entries[0].entry_type == "article"
assert entries[0].fields["journal"] == "Journal of Research Systems"
assert entries[1].entry_type == "inproceedings"
assert entries[1].fields["booktitle"] == "Proceedings of the Retrieval Workshop"
def test_extract_cli_writes_bibtex(tmp_path):
input_path = tmp_path / "references.txt"
output_path = tmp_path / "draft.bib"
input_path.write_text(SAMPLE_REFERENCES, encoding="utf-8")
exit_code = main(["extract", str(input_path), "--output", str(output_path)])
assert exit_code == 0
exported = output_path.read_text(encoding="utf-8")
parsed = {entry.citation_key: entry for entry in parse_bibtex(exported)}
assert parsed["smith2024graphfirst1"].fields["journal"] == "Journal of Research Systems"
assert parsed["miller2023semantic2"].fields["booktitle"] == "Proceedings of the Retrieval Workshop"
def test_extract_references_supports_apa_and_book_styles():
entries = extract_references(APA_AND_BOOK_REFERENCES)
assert [entry.entry_type for entry in entries] == ["article", "book"]
assert entries[0].fields["journal"] == "Journal of Information Retrieval"
assert entries[0].fields["author"] == "Brown, T., and Green, P"
assert entries[1].fields["publisher"] == "Example University Press"
assert entries[1].fields["title"] == "Research Design for Literature Mapping"
def test_extract_references_joins_wrapped_reference_lines():
entries = extract_references(WRAPPED_REFERENCES)
assert len(entries) == 2
assert entries[0].fields["title"] == "Multi-line reference extraction for bibliography pipelines"
assert entries[0].fields["journal"] == "Journal of Parsing Systems"

293
tests/test_harvest.py Normal file
View File

@@ -0,0 +1,293 @@
from citegeist import OaiPmhHarvester, parse_bibtex
from citegeist.cli import main
OAI_XML = """<?xml version="1.0" encoding="UTF-8"?>
<OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/"
xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/"
xmlns:dc="http://purl.org/dc/elements/1.1/">
<ListRecords>
<record>
<header>
<identifier>oai:example.edu:123</identifier>
</header>
<metadata>
<oai_dc:dc>
<dc:title>Thesis Metadata Harvesting</dc:title>
<dc:creator>Doe, Jane</dc:creator>
<dc:date>2023-05-01</dc:date>
<dc:description>A dissertation about repository harvesting.</dc:description>
<dc:identifier>https://example.edu/items/123</dc:identifier>
<dc:publisher>Example University</dc:publisher>
<dc:type>Text</dc:type>
<dc:type>Dissertation</dc:type>
</oai_dc:dc>
</metadata>
</record>
</ListRecords>
</OAI-PMH>
"""
OAI_XML_PAGE_1 = """<?xml version="1.0" encoding="UTF-8"?>
<OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/"
xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/"
xmlns:dc="http://purl.org/dc/elements/1.1/">
<ListRecords>
<record>
<header>
<identifier>oai:example.edu:123</identifier>
</header>
<metadata>
<oai_dc:dc>
<dc:title>First Harvested Thesis</dc:title>
<dc:creator>Doe, Jane</dc:creator>
<dc:date>2023-05-01</dc:date>
<dc:type>Dissertation</dc:type>
</oai_dc:dc>
</metadata>
</record>
<resumptionToken>TOKEN123</resumptionToken>
</ListRecords>
</OAI-PMH>
"""
OAI_XML_PAGE_2 = """<?xml version="1.0" encoding="UTF-8"?>
<OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/"
xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/"
xmlns:dc="http://purl.org/dc/elements/1.1/">
<ListRecords>
<record>
<header>
<identifier>oai:example.edu:456</identifier>
</header>
<metadata>
<oai_dc:dc>
<dc:title>Second Harvested Thesis</dc:title>
<dc:creator>Smith, John</dc:creator>
<dc:date>2022-05-01</dc:date>
<dc:type>Dissertation</dc:type>
</oai_dc:dc>
</metadata>
</record>
</ListRecords>
</OAI-PMH>
"""
OAI_IDENTIFY_XML = """<?xml version="1.0" encoding="UTF-8"?>
<OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/">
<Identify>
<repositoryName>Example Repository</repositoryName>
<baseURL>https://example.edu/oai</baseURL>
<protocolVersion>2.0</protocolVersion>
<adminEmail>repo@example.edu</adminEmail>
<earliestDatestamp>2001-01-01</earliestDatestamp>
<deletedRecord>persistent</deletedRecord>
<granularity>YYYY-MM-DD</granularity>
</Identify>
</OAI-PMH>
"""
OAI_LISTSETS_XML = """<?xml version="1.0" encoding="UTF-8"?>
<OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/">
<ListSets>
<set>
<setSpec>theses</setSpec>
<setName>Theses and Dissertations</setName>
<setDescription>
<description>This set contains graduate theses.</description>
</setDescription>
</set>
</ListSets>
</OAI-PMH>
"""
OAI_METADATA_FORMATS_XML = """<?xml version="1.0" encoding="UTF-8"?>
<OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/">
<ListMetadataFormats>
<metadataFormat>
<metadataPrefix>oai_dc</metadataPrefix>
<schema>http://www.openarchives.org/OAI/2.0/oai_dc.xsd</schema>
<metadataNamespace>http://www.openarchives.org/OAI/2.0/oai_dc/</metadataNamespace>
</metadataFormat>
<metadataFormat>
<metadataPrefix>mods</metadataPrefix>
<schema>http://www.loc.gov/standards/mods/v3/mods-3-7.xsd</schema>
<metadataNamespace>http://www.loc.gov/mods/v3</metadataNamespace>
</metadataFormat>
</ListMetadataFormats>
</OAI-PMH>
"""
OAI_MODS_XML = """<?xml version="1.0" encoding="UTF-8"?>
<OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/"
xmlns:mods="http://www.loc.gov/mods/v3">
<ListRecords>
<record>
<header>
<identifier>oai:example.edu:mods123</identifier>
</header>
<metadata>
<mods:mods>
<mods:titleInfo>
<mods:title>MODS Thesis Title</mods:title>
</mods:titleInfo>
<mods:name>
<mods:namePart>Doe</mods:namePart>
<mods:namePart>Jane</mods:namePart>
<mods:role>
<mods:roleTerm>author</mods:roleTerm>
</mods:role>
</mods:name>
<mods:originInfo>
<mods:publisher>Example University</mods:publisher>
<mods:dateIssued>2022</mods:dateIssued>
</mods:originInfo>
<mods:genre>dissertation</mods:genre>
<mods:abstract>MODS abstract text.</mods:abstract>
<mods:location>
<mods:url>https://example.edu/mods123</mods:url>
</mods:location>
</mods:mods>
</metadata>
</record>
</ListRecords>
</OAI-PMH>
"""
def test_oai_harvester_maps_dublin_core_to_bibentry():
harvester = OaiPmhHarvester()
from xml.etree import ElementTree as ET
harvester.source_client.get_xml = lambda _url: ET.fromstring(OAI_XML) # type: ignore[method-assign]
results = harvester.list_records("https://example.edu/oai")
assert len(results) == 1
entry = results[0].entry
assert entry.entry_type == "phdthesis"
assert entry.fields["title"] == "Thesis Metadata Harvesting"
assert entry.fields["author"] == "Doe, Jane"
assert entry.fields["oai"] == "oai:example.edu:123"
def test_oai_harvester_follows_resumption_tokens():
harvester = OaiPmhHarvester()
from xml.etree import ElementTree as ET
payloads = iter([ET.fromstring(OAI_XML_PAGE_1), ET.fromstring(OAI_XML_PAGE_2)])
harvester.source_client.get_xml = lambda _url: next(payloads) # type: ignore[method-assign]
results = harvester.list_records("https://example.edu/oai")
assert [result.identifier for result in results] == [
"oai:example.edu:123",
"oai:example.edu:456",
]
assert [result.entry.citation_key for result in results] == [
"doe2023first1",
"smith2022second2",
]
def test_oai_harvester_passes_date_filters():
harvester = OaiPmhHarvester()
seen_urls: list[str] = []
from xml.etree import ElementTree as ET
def fake_get_xml(url: str):
seen_urls.append(url)
return ET.fromstring(OAI_XML)
harvester.source_client.get_xml = fake_get_xml # type: ignore[method-assign]
harvester.list_records(
"https://example.edu/oai",
date_from="2023-01-01",
date_until="2023-12-31",
limit=1,
)
assert "from=2023-01-01" in seen_urls[0]
assert "until=2023-12-31" in seen_urls[0]
def test_oai_harvester_maps_mods_records():
harvester = OaiPmhHarvester()
from xml.etree import ElementTree as ET
harvester.source_client.get_xml = lambda _url: ET.fromstring(OAI_MODS_XML) # type: ignore[method-assign]
results = harvester.list_records("https://example.edu/oai", metadata_prefix="mods")
assert len(results) == 1
entry = results[0].entry
assert entry.entry_type == "phdthesis"
assert entry.fields["title"] == "MODS Thesis Title"
assert entry.fields["author"] == "Doe, Jane"
assert entry.fields["publisher"] == "Example University"
assert entry.fields["abstract"] == "MODS abstract text."
def test_oai_harvester_can_identify_repository_and_list_sets():
harvester = OaiPmhHarvester()
from xml.etree import ElementTree as ET
payloads = iter(
[ET.fromstring(OAI_IDENTIFY_XML), ET.fromstring(OAI_LISTSETS_XML), ET.fromstring(OAI_METADATA_FORMATS_XML)]
)
harvester.source_client.get_xml = lambda _url: next(payloads) # type: ignore[method-assign]
identify = harvester.identify("https://example.edu/oai")
sets = harvester.list_sets("https://example.edu/oai")
formats = harvester.list_metadata_formats("https://example.edu/oai")
assert identify["repositoryName"] == "Example Repository"
assert identify["granularity"] == "YYYY-MM-DD"
assert sets[0].set_spec == "theses"
assert sets[0].set_name == "Theses and Dissertations"
assert "graduate theses" in sets[0].set_description
assert [item.metadata_prefix for item in formats] == ["oai_dc", "mods"]
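# End-to-end CLI check: list_records is patched to return pre-harvested
# results, and the harvest-oai subcommand should ingest them into the SQLite
# store given via --db.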
def test_harvest_oai_cli_ingests_records(tmp_path):
from unittest.mock import patch
database = tmp_path / "library.sqlite3"
harvester = OaiPmhHarvester()
from xml.etree import ElementTree as ET
harvester.source_client.get_xml = lambda _url: ET.fromstring(OAI_XML) # type: ignore[method-assign]
harvested = harvester.list_records("https://example.edu/oai")
with patch("citegeist.cli.OaiPmhHarvester.list_records") as mocked_list:
mocked_list.return_value = harvested
exit_code = main(
[
"--db",
str(database),
"harvest-oai",
"https://example.edu/oai",
"--metadata-prefix",
"oai_dc",
"--from",
"2023-01-01",
"--until",
"2023-12-31",
"--limit",
"5",
]
)
assert exit_code == 0
from citegeist.storage import BibliographyStore
store = BibliographyStore(database)
try:
entry = store.list_entries(limit=10)[0]
assert entry["citation_key"] == "doe2023thesis1"
bibtex = store.get_entry_bibtex("doe2023thesis1")
parsed = parse_bibtex(bibtex or "")
assert parsed[0].fields["oai"] == "oai:example.edu:123"
finally:
store.close()


@ -0,0 +1,52 @@
from __future__ import annotations
import os
import pytest
from citegeist import MetadataResolver, SourceClient
pytestmark = pytest.mark.live
def _live_client() -> SourceClient:
cache_dir = os.environ.get("CITEGEIST_SOURCE_CACHE", ".cache/citegeist")
return SourceClient(
cache_dir=cache_dir,
fixtures_dir=os.environ.get("CITEGEIST_SOURCE_FIXTURES"),
)
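# Each live test resolves a well-known real identifier (a Nature Physics DOI,
# the "Attention Is All You Need" arXiv ID) and asserts only on fields that
# should be stable across API responses.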
def test_live_crossref_doi_resolution():
resolver = MetadataResolver(source_client=_live_client())
resolution = resolver.resolve_doi("10.1038/nphys1170")
assert resolution is not None
assert resolution.entry.fields.get("doi") == "10.1038/nphys1170"
assert resolution.entry.fields.get("title")
def test_live_arxiv_resolution():
resolver = MetadataResolver(source_client=_live_client())
resolution = resolver.resolve_arxiv("1706.03762")
assert resolution is not None
assert resolution.entry.fields.get("arxiv") == "1706.03762"
assert resolution.entry.fields.get("title")
def test_live_openalex_title_search():
resolver = MetadataResolver(source_client=_live_client())
resolution = resolver.search_openalex_best_match(
title="Attention Is All You Need",
author_text="Ashish Vaswani",
year="2017",
)
assert resolution is not None
assert resolution.entry.fields.get("title")
assert resolution.entry.fields.get("openalex")


@ -0,0 +1,84 @@
from citegeist.expand import OpenAlexExpander, _openalex_work_to_entry
from citegeist.storage import BibliographyStore
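# OpenAlex serves abstracts as an inverted index (word -> positions); the
# mapper is expected to flatten that back into running text, as asserted below.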
def test_openalex_work_to_entry_maps_basic_fields():
entry = _openalex_work_to_entry(
{
"id": "https://openalex.org/W12345",
"doi": "https://doi.org/10.1000/example-openalex",
"display_name": "OpenAlex Discovered Work",
"publication_year": 2022,
"type": "article",
"authorships": [{"author": {"display_name": "Jane Smith"}}],
"primary_location": {"source": {"display_name": "Journal of Graph Discovery"}},
"abstract_inverted_index": {"Graph": [0], "discovery": [1]},
}
)
assert entry.citation_key == "openalexw12345"
assert entry.fields["openalex"] == "W12345"
assert entry.fields["doi"] == "10.1000/example-openalex"
assert entry.fields["journal"] == "Journal of Graph Discovery"
assert entry.fields["abstract"] == "Graph discovery"
def test_openalex_expander_adds_outgoing_and_incoming_edges():
store = BibliographyStore()
try:
store.ingest_bibtex(
"""
@article{seed2024,
author = {Seed, Alice},
title = {Seed Paper},
year = {2024},
doi = {10.1000/seed-doi}
}
"""
)
expander = OpenAlexExpander()
payloads = iter(
[
{
"results": [
{
"id": "https://openalex.org/WSEED",
}
]
},
{
"results": [
{
"id": "https://openalex.org/WDISCOVERED",
"display_name": "Referenced OpenAlex Work",
"publication_year": 2021,
"type": "article",
"authorships": [{"author": {"display_name": "Bob Known"}}],
"primary_location": {"source": {"display_name": "OpenAlex Journal"}},
}
]
},
{
"results": [
{
"id": "https://openalex.org/WCITING",
"display_name": "Citing OpenAlex Work",
"publication_year": 2025,
"type": "article",
"authorships": [{"author": {"display_name": "Carol Citing"}}],
}
]
},
]
)
expander.resolver.source_client.get_json = lambda _url: next(payloads) # type: ignore[method-assign]
outgoing = expander.expand_entry(store, "seed2024", relation_type="cites", limit=5)
incoming = expander.expand_entry(store, "seed2024", relation_type="cited_by", limit=5)
assert outgoing[0].discovered_citation_key == "openalexwdiscovered"
assert incoming[0].source_citation_key == "openalexwciting"
assert "openalexwdiscovered" in store.get_relations("seed2024", "cites")
assert "seed2024" in store.get_relations("openalexwciting", "cites")
finally:
store.close()

403
tests/test_resolve.py Normal file

@ -0,0 +1,403 @@
from xml.etree import ElementTree as ET
from citegeist.bibtex import BibEntry, render_bibtex
from citegeist.resolve import (
MetadataResolver,
_arxiv_atom_entry_to_bib,
_crossref_message_to_entry,
_datacite_work_to_entry,
_openalex_work_to_entry,
merge_entries_with_conflicts,
merge_entries,
)
def test_crossref_message_to_entry_maps_basic_fields():
entry = _crossref_message_to_entry(
{
"type": "journal-article",
"title": ["Graph-first bibliography augmentation"],
"DOI": "10.1000/example-doi",
"URL": "https://doi.org/10.1000/example-doi",
"container-title": ["Journal of Graph Studies"],
"author": [{"family": "Smith", "given": "Jane"}],
"issued": {"date-parts": [[2024, 5, 1]]},
}
)
assert entry.entry_type == "article"
assert entry.fields["author"] == "Smith, Jane"
assert entry.fields["journal"] == "Journal of Graph Studies"
assert entry.fields["year"] == "2024"
def test_arxiv_atom_entry_to_bib_maps_basic_fields():
xml = ET.fromstring(
"""
<entry xmlns="http://www.w3.org/2005/Atom" xmlns:arxiv="http://arxiv.org/schemas/atom">
<title>Semantic search for research corpora</title>
<summary>Dense retrieval improves recall.</summary>
<published>2023-01-15T00:00:00Z</published>
<author><name>Miller, Sam</name></author>
<arxiv:doi>10.1000/arxiv-example</arxiv:doi>
</entry>
"""
)
entry = _arxiv_atom_entry_to_bib(xml, "2301.12345")
assert entry.fields["author"] == "Miller, Sam"
assert entry.fields["arxiv"] == "2301.12345"
assert entry.fields["doi"] == "10.1000/arxiv-example"
def test_merge_entries_prefers_existing_values_and_adds_missing_fields():
base = BibEntry(
entry_type="article",
citation_key="smith2024graphs",
fields={"title": "Graph-first bibliography augmentation", "doi": "10.1000/example-doi"},
)
resolved = BibEntry(
entry_type="article",
citation_key="otherkey",
fields={"title": "Different title", "journal": "Journal of Graph Studies"},
)
merged = merge_entries(base, resolved)
assert merged.fields["title"] == "Graph-first bibliography augmentation"
assert merged.fields["journal"] == "Journal of Graph Studies"
def test_merge_entries_with_conflicts_records_disagreements():
base = BibEntry(
entry_type="article",
citation_key="smith2024graphs",
fields={"title": "Existing Title", "journal": "Current Journal"},
)
resolved = BibEntry(
entry_type="article",
citation_key="resolved",
fields={"title": "Resolved Title", "journal": "Current Journal", "year": "2024"},
)
merged, conflicts = merge_entries_with_conflicts(base, resolved)
assert merged.fields["title"] == "Existing Title"
assert merged.fields["year"] == "2024"
assert conflicts == [
{
"field_name": "title",
"current_value": "Existing Title",
"proposed_value": "Resolved Title",
}
]
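# Identifier-based resolution appears to run in a fixed order (DOI, presumably
# via Crossref, then DataCite, then DBLP) before any title search; the fakes
# here just record the calls.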
def test_resolver_tries_doi_before_dblp():
resolver = MetadataResolver()
calls: list[tuple[str, str]] = []
def fake_doi(value: str):
calls.append(("doi", value))
return None
def fake_dblp(value: str):
calls.append(("dblp", value))
return None
def fake_datacite(value: str):
calls.append(("datacite", value))
return None
resolver.resolve_doi = fake_doi # type: ignore[method-assign]
resolver.resolve_datacite_doi = fake_datacite # type: ignore[method-assign]
resolver.resolve_dblp = fake_dblp # type: ignore[method-assign]
resolver.resolve_entry(
BibEntry(
entry_type="article",
citation_key="smith2024graphs",
fields={"doi": "10.1000/example-doi", "dblp": "conf/test/Smith24"},
)
)
assert calls == [
("doi", "10.1000/example-doi"),
("datacite", "10.1000/example-doi"),
("dblp", "conf/test/Smith24"),
]
def test_openalex_work_to_entry_maps_basic_fields():
entry = _openalex_work_to_entry(
{
"id": "https://openalex.org/W12345",
"doi": "https://doi.org/10.1000/example-openalex",
"display_name": "OpenAlex Resolved Work",
"publication_year": 2022,
"type": "article",
"authorships": [{"author": {"display_name": "Jane Smith"}}],
"primary_location": {"source": {"display_name": "Journal of Open Graphs"}},
"abstract_inverted_index": {"OpenAlex": [0], "resolved": [1]},
}
)
assert entry.citation_key == "openalexw12345"
assert entry.fields["openalex"] == "W12345"
assert entry.fields["doi"] == "10.1000/example-openalex"
assert entry.fields["journal"] == "Journal of Open Graphs"
assert entry.fields["abstract"] == "OpenAlex resolved"
def test_resolver_can_resolve_openalex_id():
resolver = MetadataResolver()
resolver.source_client.get_json = lambda _url: { # type: ignore[method-assign]
"id": "https://openalex.org/W12345",
"display_name": "OpenAlex Resolved Work",
"publication_year": 2022,
"type": "article",
"authorships": [{"author": {"display_name": "Jane Smith"}}],
}
resolution = resolver.resolve_openalex("W12345")
assert resolution is not None
assert resolution.source_label == "openalex:id:W12345"
assert resolution.entry.fields["openalex"] == "W12345"
def test_resolver_falls_back_to_openalex_title_search():
resolver = MetadataResolver()
resolver.search_datacite = lambda title, limit=5: [] # type: ignore[method-assign]
resolver.search_crossref = lambda title, limit=5: [] # type: ignore[method-assign]
resolver.search_openalex = lambda title, limit=5: [ # type: ignore[method-assign]
_openalex_work_to_entry(
{
"id": "https://openalex.org/W12345",
"display_name": title,
"publication_year": 2022,
"type": "article",
"authorships": [{"author": {"display_name": "Jane Smith"}}],
}
)
]
resolution = resolver.resolve_entry(
BibEntry(
entry_type="article",
citation_key="smith2022openalex",
fields={"title": "OpenAlex Resolved Work", "author": "Jane Smith", "year": "2022"},
)
)
assert resolution is not None
assert resolution.source_label == "openalex:search:OpenAlex Resolved Work"
assert resolution.entry.fields["openalex"] == "W12345"
def test_resolver_prefers_exact_crossref_title_match_before_datacite():
resolver = MetadataResolver()
resolver.search_crossref = lambda title, limit=5: [ # type: ignore[method-assign]
_crossref_message_to_entry(
{
"type": "journal-article",
"title": [title],
"DOI": "10.1126/science.1090005",
"container-title": ["Science"],
"author": [
{"family": "King", "given": "Mary-Claire"},
{"family": "Wilson", "given": "A. C."},
],
"issued": {"date-parts": [[1975, 4, 11]]},
}
)
]
resolver.search_datacite = lambda title, limit=5: [ # type: ignore[method-assign]
_datacite_work_to_entry(
{
"attributes": {
"doi": "10.5061/dryad.v6wwpzh17",
"titles": [
{
"title": "Conserved patterns and locomotor-related evolutionary constraints in the hominoid vertebral column"
}
],
"creators": [
{"familyName": "Villamil", "givenName": "Catalina I."},
{"familyName": "Middleton", "givenName": "Emily R."},
],
"publicationYear": 2024,
"types": {"resourceTypeGeneral": "Dataset"},
}
}
)
]
resolution = resolver.resolve_entry(
BibEntry(
entry_type="article",
citation_key="king1975evolution2",
fields={
"title": "Evolution at two levels in humans and chimpanzees",
"author": "King, M. C. and Wilson, A. C.",
"year": "1975",
},
)
)
assert resolution is not None
assert resolution.source_label == "crossref:search:Evolution at two levels in humans and chimpanzees"
assert resolution.entry.fields["doi"] == "10.1126/science.1090005"
def test_resolver_rejects_mismatched_title_search_candidates():
resolver = MetadataResolver()
resolver.search_crossref = lambda title, limit=5: [] # type: ignore[method-assign]
resolver.search_datacite = lambda title, limit=5: [ # type: ignore[method-assign]
_datacite_work_to_entry(
{
"attributes": {
"doi": "10.5061/dryad.v6wwpzh17",
"titles": [
{
"title": "Conserved patterns and locomotor-related evolutionary constraints in the hominoid vertebral column"
}
],
"creators": [
{"familyName": "Villamil", "givenName": "Catalina I."},
],
"publicationYear": 2024,
"types": {"resourceTypeGeneral": "Dataset"},
}
}
)
]
resolver.search_openalex = lambda title, limit=5: [ # type: ignore[method-assign]
_openalex_work_to_entry(
{
"id": "https://openalex.org/W2033360601",
"display_name": "Immunological relatedness of glucose 6-phosphate dehydrogenases from vertebrate and invertebrate species.",
"publication_year": 1978,
"type": "article",
"authorships": [
{"author": {"display_name": "Yoshikazu Sado"}},
{"author": {"display_name": "Samuel H. Hori"}},
],
"doi": "https://doi.org/10.1266/jjg.53.91",
}
)
]
resolution = resolver.resolve_entry(
BibEntry(
entry_type="article",
citation_key="sarich1967immunological1",
fields={
"title": "Immunological Time Scale for Homonid Evolution",
"author": "Sarich, V. and Wilson, A.",
"year": "1967",
},
)
)
assert resolution is None
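# DataCite's resourceTypeGeneral "Dissertation" maps to the BibTeX phdthesis
# entry type, and the Abstract description becomes the abstract field.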
def test_datacite_work_to_entry_maps_basic_fields():
entry = _datacite_work_to_entry(
{
"attributes": {
"doi": "10.1000/datacite-example",
"titles": [{"title": "Repository Dissertation Record"}],
"creators": [{"familyName": "Doe", "givenName": "Jane"}],
"publicationYear": 2021,
"publisher": "Example University",
"url": "https://example.edu/record/123",
"types": {"resourceTypeGeneral": "Dissertation"},
"descriptions": [
{
"descriptionType": "Abstract",
"description": "An abstract from DataCite.",
}
],
}
}
)
assert entry.entry_type == "phdthesis"
assert entry.fields["doi"] == "10.1000/datacite-example"
assert entry.fields["author"] == "Doe, Jane"
assert entry.fields["publisher"] == "Example University"
assert entry.fields["abstract"] == "An abstract from DataCite."
def test_resolver_can_resolve_datacite_doi():
resolver = MetadataResolver()
resolver.source_client.get_json = lambda _url: { # type: ignore[method-assign]
"data": {
"attributes": {
"doi": "10.1000/datacite-example",
"titles": [{"title": "Repository Dissertation Record"}],
"creators": [{"familyName": "Doe", "givenName": "Jane"}],
"publicationYear": 2021,
"types": {"resourceTypeGeneral": "Dissertation"},
}
}
}
resolution = resolver.resolve_datacite_doi("10.1000/datacite-example")
assert resolution is not None
assert resolution.source_label == "datacite:doi:10.1000/datacite-example"
assert resolution.entry.entry_type == "phdthesis"
def test_resolver_can_fall_back_to_datacite_title_search():
resolver = MetadataResolver()
resolver.search_crossref = lambda title, limit=5: [] # type: ignore[method-assign]
resolver.search_datacite = lambda title, limit=5: [ # type: ignore[method-assign]
_datacite_work_to_entry(
{
"attributes": {
"doi": "10.1000/datacite-example",
"titles": [{"title": title}],
"creators": [{"familyName": "Doe", "givenName": "Jane"}],
"publicationYear": 2021,
"types": {"resourceTypeGeneral": "Dissertation"},
}
}
)
]
resolver.search_openalex = lambda title, limit=5: [] # type: ignore[method-assign]
resolution = resolver.resolve_entry(
BibEntry(
entry_type="misc",
citation_key="draft1",
fields={"title": "Repository Dissertation Record", "author": "Doe, Jane", "year": "2021"},
)
)
assert resolution is not None
assert resolution.source_label == "datacite:search:Repository Dissertation Record"
assert resolution.entry.fields["doi"] == "10.1000/datacite-example"
def test_render_bibtex_tolerates_unmatched_braces_in_field_values():
rendered = render_bibtex(
[
BibEntry(
entry_type="misc",
citation_key="broken2026",
fields={
"author": "Broken, Example",
"title": "Unmatched { braces } example } tail",
"year": "2026",
"note": "Open { brace only",
},
)
]
)
assert "@misc{broken2026," in rendered
assert "Unmatched { braces } example ) tail" in rendered
assert "Open ( brace only" in rendered

41
tests/test_sources.py Normal file

@ -0,0 +1,41 @@
from pathlib import Path
from citegeist.sources import SourceClient
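# SourceClient lookup order under test: a fixtures directory takes precedence
# over the network, fetched responses are written to the cache, and
# _fetch_bytes is the seam stubbed out to avoid real HTTP.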
def test_source_client_reads_fixture_before_network(tmp_path: Path):
fixtures_dir = tmp_path / "fixtures"
fixtures_dir.mkdir()
client = SourceClient(cache_dir=tmp_path / "cache", fixtures_dir=fixtures_dir)
url = "https://api.crossref.org/works/10.1000/example"
fixture_path = fixtures_dir / client._cache_key(url, "json") # noqa: SLF001
fixture_path.write_text('{"message": {"DOI": "10.1000/example"}}', encoding="utf-8")
payload = client.get_json(url)
assert payload["message"]["DOI"] == "10.1000/example"
def test_source_client_writes_cache_after_fetch(tmp_path: Path):
cache_dir = tmp_path / "cache"
client = SourceClient(cache_dir=cache_dir)
url = "https://example.org/test"
client._fetch_bytes = lambda _url: b'{"ok": true}' # type: ignore[method-assign]
payload = client.get_json(url)
assert payload["ok"] is True
assert any(cache_dir.iterdir())
def test_source_client_falls_back_to_latin1_for_text(tmp_path: Path):
client = SourceClient(cache_dir=tmp_path / "cache")
url = "https://example.org/latin1"
client._fetch_bytes = lambda _url: "café".encode("iso-8859-1") # type: ignore[method-assign]
payload = client.get_text(url)
assert payload == "café"

379
tests/test_storage.py Normal file

@ -0,0 +1,379 @@
from citegeist import BibliographyStore, parse_bibtex
SAMPLE_BIB = """
@article{smith2024graphs,
author = {Smith, Jane and Doe, Alex},
title = {Graph-first bibliography augmentation},
year = {2024},
doi = {10.1000/graph.2024.1},
abstract = {We study citation graphs for literature discovery.},
references = {miller2023search}
}
@inproceedings{miller2023search,
author = {Miller, Sam},
title = {Semantic search for research corpora},
year = {2023},
abstract = {Dense retrieval improves recall for academic search.}
}
"""
def test_parse_bibtex_extracts_entries_and_fields():
entries = parse_bibtex(SAMPLE_BIB)
assert [entry.citation_key for entry in entries] == ["smith2024graphs", "miller2023search"]
assert entries[0].fields["title"] == "Graph-first bibliography augmentation"
assert entries[0].fields["references"] == "miller2023search"
def test_store_ingests_entries_relations_and_search_text():
store = BibliographyStore()
try:
store.ingest_bibtex(
SAMPLE_BIB,
fulltext_by_key={
"smith2024graphs": "This paper links citation graphs with semantic search over abstracts."
},
)
entry = store.get_entry("smith2024graphs")
assert entry is not None
assert entry["doi"] == "10.1000/graph.2024.1"
assert store.get_relations("smith2024graphs") == ["miller2023search"]
results = store.search_text("semantic")
assert [row["citation_key"] for row in results][:2] == [
"miller2023search",
"smith2024graphs",
]
finally:
store.close()
def test_store_exports_bibtex_from_normalized_rows():
store = BibliographyStore()
try:
store.ingest_bibtex(SAMPLE_BIB)
exported = store.export_bibtex()
parsed = {entry.citation_key: entry for entry in parse_bibtex(exported)}
assert "@article{smith2024graphs," in exported
assert "@inproceedings{miller2023search," in exported
assert parsed["smith2024graphs"].fields["author"] == "Smith, Jane and Doe, Alex"
assert parsed["smith2024graphs"].fields["references"] == "miller2023search"
finally:
store.close()
def test_store_records_provenance_and_review_status():
store = BibliographyStore()
try:
store.ingest_bibtex(SAMPLE_BIB, source_label="fixtures/sample.bib", review_status="draft")
entry = store.get_entry("smith2024graphs")
assert entry is not None
assert entry["review_status"] == "draft"
provenance = store.get_field_provenance("smith2024graphs")
assert provenance
assert provenance[0]["source_type"] == "bibtex"
assert provenance[0]["source_label"] == "fixtures/sample.bib"
assert store.set_entry_status("smith2024graphs", "reviewed") is True
updated = store.get_entry("smith2024graphs")
assert updated is not None
assert updated["review_status"] == "reviewed"
finally:
store.close()
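# traverse_graph walks "cites" edges up to max_depth and surfaces dangling
# targets (missing2022) with target_exists=False instead of dropping them.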
def test_store_traverses_graph_and_surfaces_missing_targets():
store = BibliographyStore()
try:
store.ingest_bibtex(
"""
@article{seed2024,
author = {Seed, Alice},
title = {Seed Paper},
year = {2024},
references = {known2023, missing2022}
}
@article{known2023,
author = {Known, Bob},
title = {Known Paper},
year = {2023},
references = {leaf2021}
}
@article{leaf2021,
author = {Leaf, Carol},
title = {Leaf Paper},
year = {2021}
}
""",
review_status="reviewed",
)
rows = store.traverse_graph(["seed2024"], relation_types=["cites"], max_depth=2)
assert [row["target_citation_key"] for row in rows] == [
"known2023",
"missing2022",
"leaf2021",
]
assert rows[1]["target_exists"] is False
assert rows[2]["depth"] == 2
finally:
store.close()
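# Field conflicts are stored as open proposals next to the entry; the next two
# tests cover the review transitions and apply_conflict_value, which copies the
# latest proposed value into the entry and marks the conflict accepted.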
def test_store_records_and_updates_field_conflicts():
store = BibliographyStore()
try:
store.ingest_bibtex(
"""
@article{seed2024,
author = {Seed, Alice},
title = {Seed Paper},
year = {2024}
}
"""
)
ok = store.record_conflicts(
"seed2024",
[
{
"field_name": "title",
"current_value": "Seed Paper",
"proposed_value": "Resolved Seed Paper",
}
],
source_type="resolver",
source_label="crossref:doi:10.1000/seed",
)
assert ok is True
conflicts = store.get_field_conflicts("seed2024")
assert conflicts[0]["field_name"] == "title"
assert conflicts[0]["status"] == "open"
assert store.set_conflict_status("seed2024", "title", "accepted") == 1
updated = store.get_field_conflicts("seed2024", status="accepted")
assert len(updated) == 1
finally:
store.close()
def test_store_can_apply_latest_conflict_value():
store = BibliographyStore()
try:
store.ingest_bibtex(
"""
@article{seed2024,
author = {Seed, Alice},
title = {Seed Paper},
year = {2024}
}
"""
)
store.record_conflicts(
"seed2024",
[
{
"field_name": "title",
"current_value": "Seed Paper",
"proposed_value": "Resolved Seed Paper",
}
],
source_type="resolver",
source_label="crossref:doi:10.1000/seed",
)
assert store.apply_conflict_value("seed2024", "title") is True
entry = store.get_entry("seed2024")
assert entry is not None
assert entry["title"] == "Resolved Seed Paper"
accepted = store.get_field_conflicts("seed2024", status="accepted")
assert len(accepted) == 1
finally:
store.close()
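# Topics are many-to-many: add_entry_topic attaches a slug/name plus
# provenance, and list_topics aggregates an entry_count per topic.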
def test_store_supports_entry_topic_membership():
store = BibliographyStore()
try:
store.ingest_bibtex(
"""
@article{seed2024,
author = {Seed, Alice},
title = {Seed Paper},
year = {2024}
}
"""
)
assert store.add_entry_topic(
"seed2024",
topic_slug="graph-methods",
topic_name="Graph Methods",
source_type="talkorigins",
source_url="https://example.org/topics/graph-methods",
source_label="topic-seed",
) is True
assert store.add_entry_topic(
"seed2024",
topic_slug="semantic-search",
topic_name="Semantic Search",
source_type="talkorigins",
source_url="https://example.org/topics/semantic-search",
source_label="topic-seed",
) is True
entry = store.get_entry("seed2024")
assert entry is not None
assert [topic["slug"] for topic in entry["topics"]] == ["graph-methods", "semantic-search"]
topics = store.list_topics()
assert [topic["slug"] for topic in topics] == ["graph-methods", "semantic-search"]
assert topics[0]["entry_count"] == 1
topic = store.get_topic("graph-methods")
assert topic is not None
assert topic["name"] == "Graph Methods"
assert topic["expansion_phrase"] is None
topic_entries = store.list_topic_entries("graph-methods")
assert topic_entries[0]["citation_key"] == "seed2024"
finally:
store.close()
def test_store_can_set_topic_expansion_phrase():
store = BibliographyStore()
try:
store.ingest_bibtex(
"""
@article{seed2024,
author = {Seed, Alice},
title = {Seed Paper},
year = {2024}
}
"""
)
store.add_entry_topic(
"seed2024",
topic_slug="graph-methods",
topic_name="Graph Methods",
source_type="talkorigins",
source_url="https://example.org/topics/graph-methods",
source_label="topic-seed",
)
assert store.set_topic_expansion_phrase("graph-methods", "graph networks biology") is True
topic = store.get_topic("graph-methods")
assert topic is not None
assert topic["expansion_phrase"] == "graph networks biology"
assert topic["phrase_review_status"] == "unreviewed"
topics = store.list_topics()
assert topics[0]["expansion_phrase"] == "graph networks biology"
finally:
store.close()
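# Phrase suggestions go through a review workflow: staged as "pending", then
# promoted into expansion_phrase only once reviewed as "accepted".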
def test_store_can_stage_and_review_topic_phrase_suggestion():
store = BibliographyStore()
try:
store.ensure_topic("graph-methods", "Graph Methods")
assert store.stage_topic_phrase_suggestion(
"graph-methods",
"graph networks biology",
review_notes="generated from local titles",
) is True
staged = store.get_topic("graph-methods")
assert staged is not None
assert staged["suggested_phrase"] == "graph networks biology"
assert staged["expansion_phrase"] is None
assert staged["phrase_review_status"] == "pending"
assert staged["phrase_review_notes"] == "generated from local titles"
assert store.review_topic_phrase_suggestion(
"graph-methods",
"accepted",
review_notes="looks good",
) is True
reviewed = store.get_topic("graph-methods")
assert reviewed is not None
assert reviewed["suggested_phrase"] == "graph networks biology"
assert reviewed["expansion_phrase"] == "graph networks biology"
assert reviewed["phrase_review_status"] == "accepted"
assert reviewed["phrase_review_notes"] == "looks good"
finally:
store.close()
def test_store_can_filter_topics_by_phrase_review_status():
store = BibliographyStore()
try:
store.ensure_topic("graph-methods", "Graph Methods")
store.ensure_topic("abiogenesis", "Abiogenesis")
store.stage_topic_phrase_suggestion("graph-methods", "graph networks biology")
store.stage_topic_phrase_suggestion("abiogenesis", "abiogenesis life origin")
store.review_topic_phrase_suggestion("abiogenesis", "accepted")
pending_topics = store.list_topics(phrase_review_status="pending")
accepted_topics = store.list_topics(phrase_review_status="accepted")
assert [topic["slug"] for topic in pending_topics] == ["graph-methods"]
assert [topic["slug"] for topic in accepted_topics] == ["abiogenesis"]
finally:
store.close()
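# search_text also accepts a topic_slug filter that restricts full-text matches
# to entries assigned to that topic.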
def test_store_search_text_can_filter_by_topic():
store = BibliographyStore()
try:
store.ingest_bibtex(
"""
@article{seed2024,
author = {Seed, Alice},
title = {Graph Methods for Biology},
year = {2024},
abstract = {A graph methods paper.}
}
@article{other2023,
author = {Other, Bob},
title = {Graph Methods for Chemistry},
year = {2023},
abstract = {Another graph methods paper.}
}
"""
)
store.add_entry_topic(
"seed2024",
topic_slug="biology",
topic_name="Biology",
source_type="talkorigins",
source_url="https://example.org/topics/biology",
source_label="topic-seed",
)
store.add_entry_topic(
"other2023",
topic_slug="chemistry",
topic_name="Chemistry",
source_type="talkorigins",
source_url="https://example.org/topics/chemistry",
source_label="topic-seed",
)
store.connection.commit()
results = store.search_text("graph", topic_slug="biology")
assert [row["citation_key"] for row in results] == ["seed2024"]
finally:
store.close()

1024
tests/test_talkorigins.py Normal file

File diff suppressed because it is too large

242
tests/test_topic_expand.py Normal file

@ -0,0 +1,242 @@
from citegeist.bibtex import BibEntry
from citegeist.expand import (
ExpansionResult,
TopicExpander,
_meets_topic_assignment_threshold,
_topic_relevance_score,
)
from citegeist.storage import BibliographyStore
class FakeOpenAlexExpander:
def __init__(self, results: list[ExpansionResult] | dict[str, list[ExpansionResult]]) -> None:
self.results = results
def expand_entry(self, store, citation_key, relation_type="cites", limit=25):
if isinstance(self.results, dict):
return list(self.results.get(citation_key, []))
return list(self.results)
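# The fake expander scripts discovery results (optionally per seed key) without
# touching OpenAlex; relevance scoring and topic assignment stay real.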
def test_topic_expander_assigns_relevant_discoveries_back_to_topic():
store = BibliographyStore()
try:
store.ingest_bibtex(
"""
@article{seed2024,
author = {Seed, Alice},
title = {Abiogenesis Seed Paper},
year = {2024}
}
"""
)
store.add_entry_topic(
"seed2024",
topic_slug="abiogenesis",
topic_name="Abiogenesis",
source_type="talkorigins",
source_url="https://example.org/topics/abiogenesis",
source_label="seed",
)
store.upsert_entry(
BibEntry(
entry_type="article",
citation_key="discovered1",
fields={
"title": "Abiogenesis and origin chemistry",
"abstract": "A study of abiogenesis pathways.",
"year": "2025",
},
),
source_type="graph_expand",
source_label="test",
review_status="draft",
)
store.upsert_entry(
BibEntry(
entry_type="article",
citation_key="discovered2",
fields={
"title": "Galaxy formation dynamics",
"abstract": "Nothing about the topic.",
"year": "2025",
},
),
source_type="graph_expand",
source_label="test",
review_status="draft",
)
store.connection.commit()
expander = TopicExpander(
openalex_expander=FakeOpenAlexExpander(
[
ExpansionResult("seed2024", "discovered1", False, "cites", "openalex:cites:seed2024"),
ExpansionResult("seed2024", "discovered2", False, "cites", "openalex:cites:seed2024"),
]
)
)
results = expander.expand_topic(
store,
"abiogenesis",
topic_phrase="abiogenesis origin chemistry",
min_relevance=0.34,
)
assert len(results) == 2
assigned = {item.discovered_citation_key: item.assigned_to_topic for item in results}
assert assigned["discovered1"] is True
assert assigned["discovered2"] is False
topics = store.get_entry_topics("discovered1")
assert topics[0]["slug"] == "abiogenesis"
assert store.get_entry_topics("discovered2") == []
finally:
store.close()
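# seed_keys restricts which topic members get expanded; discoveries reachable
# only from excluded seeds must not be created or assigned.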
def test_topic_expander_can_restrict_to_allowed_seed_keys():
store = BibliographyStore()
try:
store.ingest_bibtex(
"""
@article{seed2024,
author = {Seed, Alice},
title = {Abiogenesis Seed Paper},
year = {2024}
}
@article{seed2023,
author = {Seed, Bob},
title = {Abiogenesis Historical Seed},
year = {2023}
}
"""
)
for citation_key in ("seed2024", "seed2023"):
store.add_entry_topic(
citation_key,
topic_slug="abiogenesis",
topic_name="Abiogenesis",
source_type="talkorigins",
source_url="https://example.org/topics/abiogenesis",
source_label="seed",
)
store.upsert_entry(
BibEntry(
entry_type="article",
citation_key="discovered1",
fields={
"title": "Abiogenesis origin chemistry",
"abstract": "A study of abiogenesis chemistry.",
"year": "2025",
},
),
source_type="graph_expand",
source_label="test",
review_status="draft",
)
store.connection.commit()
expander = TopicExpander(
openalex_expander=FakeOpenAlexExpander(
{"seed2023": [ExpansionResult("seed2023", "discovered1", False, "cites", "openalex:cites:seed2023")]}
)
)
results = expander.expand_topic(
store,
"abiogenesis",
topic_phrase="abiogenesis origin chemistry",
seed_keys=["seed2024"],
)
assert results == []
assert store.get_entry_topics("discovered1") == []
finally:
store.close()
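# preview_only should report what would be discovered without persisting
# entries or topic assignments; _preview_discoveries is stubbed here as the
# discovery seam.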
def test_topic_expander_preview_discovers_without_writing():
store = BibliographyStore()
try:
store.ingest_bibtex(
"""
@article{seed2024,
author = {Seed, Alice},
title = {Abiogenesis Seed Paper},
year = {2024}
}
"""
)
store.add_entry_topic(
"seed2024",
topic_slug="abiogenesis",
topic_name="Abiogenesis",
source_type="talkorigins",
source_url="https://example.org/topics/abiogenesis",
source_label="seed",
)
store.connection.commit()
expander = TopicExpander()
expander._preview_discoveries = lambda *_args, **_kwargs: [ # type: ignore[method-assign]
(
ExpansionResult(
"seed2024",
"preview1",
True,
"cites",
"openalex:cites:seed2024",
),
{
"title": "Abiogenesis origin chemistry",
"abstract": "A study of abiogenesis chemistry.",
"year": "2025",
},
)
]
results = expander.expand_topic(
store,
"abiogenesis",
topic_phrase="abiogenesis origin chemistry",
min_relevance=0.3,
preview_only=True,
)
assert len(results) == 1
assert results[0].discovered_citation_key == "preview1"
assert results[0].meets_relevance_threshold is True
assert results[0].assigned_to_topic is False
assert results[0].created_entry is True
assert store.get_entry("preview1") is None
assert store.get_entry_topics("preview1") == []
finally:
store.close()
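# The scoring helpers are heuristic: _topic_relevance_score evidently expands
# "human evolution" into related terms (hominid, fossil, primate), while the
# assignment threshold additionally requires an anchor term in the title.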
def test_topic_relevance_score_expands_human_evolution_terms():
score = _topic_relevance_score(
"human evolution",
{
"title": "Body size and proportions in early hominids",
"abstract": "A fossil and paleolithic perspective on primate ancestry.",
"journal": "Science",
},
)
assert score >= 0.15
def test_topic_assignment_requires_title_anchor():
entry = {
"title": "Phylogenies and the Comparative Method",
"abstract": "A comparative framework for primate and hominid evolution.",
"journal": "Systematic Zoology",
}
score = _topic_relevance_score("human evolution", entry)
assert score >= 0.15
assert _meets_topic_assignment_threshold("human evolution", entry, min_relevance=0.15, relevance_score=score) is False