Compare commits
No commits in common. "0c6380562b38316170a77821b9d472fba02b194d" and "b74582b72f09f36b63e459c26e3cc7ea3d0696c2" have entirely different histories.
0c6380562b
...
b74582b72f
|
|
@ -1,229 +1,6 @@
|
|||
# ---> Python
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
|
||||
# C extensions
|
||||
*.so
|
||||
|
||||
# Distribution / packaging
|
||||
.Python
|
||||
build/
|
||||
develop-eggs/
|
||||
dist/
|
||||
downloads/
|
||||
eggs/
|
||||
.eggs/
|
||||
lib/
|
||||
lib64/
|
||||
parts/
|
||||
sdist/
|
||||
var/
|
||||
wheels/
|
||||
share/python-wheels/
|
||||
*.egg-info/
|
||||
.installed.cfg
|
||||
*.egg
|
||||
MANIFEST
|
||||
|
||||
# PyInstaller
|
||||
# Usually these files are written by a python script from a template
|
||||
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
||||
*.manifest
|
||||
*.spec
|
||||
|
||||
# Installer logs
|
||||
pip-log.txt
|
||||
pip-delete-this-directory.txt
|
||||
|
||||
# Unit test / coverage reports
|
||||
htmlcov/
|
||||
.tox/
|
||||
.nox/
|
||||
.coverage
|
||||
.coverage.*
|
||||
.cache
|
||||
nosetests.xml
|
||||
coverage.xml
|
||||
*.cover
|
||||
*.py,cover
|
||||
.hypothesis/
|
||||
.pytest_cache/
|
||||
cover/
|
||||
|
||||
# Translations
|
||||
*.mo
|
||||
*.pot
|
||||
|
||||
# Django stuff:
|
||||
*.log
|
||||
local_settings.py
|
||||
db.sqlite3
|
||||
db.sqlite3-journal
|
||||
|
||||
# Flask stuff:
|
||||
instance/
|
||||
.webassets-cache
|
||||
|
||||
# Scrapy stuff:
|
||||
.scrapy
|
||||
|
||||
# Sphinx documentation
|
||||
docs/_build/
|
||||
|
||||
# PyBuilder
|
||||
.pybuilder/
|
||||
target/
|
||||
|
||||
# Jupyter Notebook
|
||||
.ipynb_checkpoints
|
||||
|
||||
# IPython
|
||||
profile_default/
|
||||
ipython_config.py
|
||||
|
||||
# pyenv
|
||||
# For a library or package, you might want to ignore these files since the code is
|
||||
# intended to run in multiple environments; otherwise, check them in:
|
||||
# .python-version
|
||||
|
||||
# pipenv
|
||||
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
||||
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
||||
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
||||
# install all needed dependencies.
|
||||
#Pipfile.lock
|
||||
|
||||
# poetry
|
||||
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
||||
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
||||
# commonly ignored for libraries.
|
||||
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
||||
#poetry.lock
|
||||
|
||||
# pdm
|
||||
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
||||
#pdm.lock
|
||||
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
||||
# in version control.
|
||||
# https://pdm.fming.dev/#use-with-ide
|
||||
.pdm.toml
|
||||
|
||||
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
||||
__pypackages__/
|
||||
|
||||
# Celery stuff
|
||||
celerybeat-schedule
|
||||
celerybeat.pid
|
||||
|
||||
# SageMath parsed files
|
||||
*.sage.py
|
||||
|
||||
# Environments
|
||||
.env
|
||||
.venv
|
||||
env/
|
||||
venv/
|
||||
ENV/
|
||||
env.bak/
|
||||
venv.bak/
|
||||
|
||||
# Spyder project settings
|
||||
.spyderproject
|
||||
.spyproject
|
||||
|
||||
# Rope project settings
|
||||
.ropeproject
|
||||
|
||||
# mkdocs documentation
|
||||
/site
|
||||
|
||||
# mypy
|
||||
.mypy_cache/
|
||||
.dmypy.json
|
||||
dmypy.json
|
||||
|
||||
# Pyre type checker
|
||||
.pyre/
|
||||
|
||||
# pytype static type analyzer
|
||||
.pytype/
|
||||
|
||||
# Cython debug symbols
|
||||
cython_debug/
|
||||
|
||||
# PyCharm
|
||||
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
||||
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
||||
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
||||
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
||||
#.idea/
|
||||
|
||||
# ---> Emacs
|
||||
# -*- mode: gitignore; -*-
|
||||
*~
|
||||
\#*\#
|
||||
/.emacs.desktop
|
||||
/.emacs.desktop.lock
|
||||
*.elc
|
||||
auto-save-list
|
||||
tramp
|
||||
.\#*
|
||||
|
||||
# Org-mode
|
||||
.org-id-locations
|
||||
*_archive
|
||||
|
||||
# flymake-mode
|
||||
*_flymake.*
|
||||
|
||||
# eshell files
|
||||
/eshell/history
|
||||
/eshell/lastdir
|
||||
|
||||
# elpa packages
|
||||
/elpa/
|
||||
|
||||
# reftex files
|
||||
*.rel
|
||||
|
||||
# AUCTeX auto folder
|
||||
/auto/
|
||||
|
||||
# cask packages
|
||||
.cask/
|
||||
dist/
|
||||
|
||||
# Flycheck
|
||||
flycheck_*.el
|
||||
|
||||
# server auth directory
|
||||
/server/
|
||||
|
||||
# Projectile files
|
||||
.projectile
|
||||
|
||||
# directory configuration
|
||||
.dir-locals.el
|
||||
|
||||
# network security
|
||||
/network-security.data
|
||||
|
||||
|
||||
# ---> Rust
|
||||
# Generated by Cargo
|
||||
# will have compiled files and executables
|
||||
debug/
|
||||
target/
|
||||
|
||||
# Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
|
||||
# More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
|
||||
Cargo.lock
|
||||
|
||||
# These are backup files generated by rustfmt
|
||||
**/*.rs.bk
|
||||
|
||||
# MSVC Windows builds of rustc generate these, which store debugging information
|
||||
*.pdb
|
||||
|
||||
.venv/
|
||||
.cache/
|
||||
*.pyc
|
||||
library.sqlite3
|
||||
|
|
|
|||
9
LICENSE
9
LICENSE
|
|
@ -1,9 +0,0 @@
|
|||
MIT License
|
||||
|
||||
Copyright (c) 2026 welsberr
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
|
@ -0,0 +1,16 @@
|
|||
PYTHONPATH_SRC=PYTHONPATH=src
|
||||
VENV_PYTHON=.venv/bin/python
|
||||
|
||||
.PHONY: test test-live live-smoke validate-talkorigins
|
||||
|
||||
test:
|
||||
$(PYTHONPATH_SRC) $(VENV_PYTHON) -m pytest -q
|
||||
|
||||
test-live:
|
||||
CITEGEIST_LIVE_TESTS=1 CITEGEIST_SOURCE_CACHE=.cache/citegeist $(PYTHONPATH_SRC) $(VENV_PYTHON) -m pytest -m live -q
|
||||
|
||||
live-smoke:
|
||||
CITEGEIST_SOURCE_CACHE=.cache/citegeist $(PYTHONPATH_SRC) $(VENV_PYTHON) scripts/live_smoke.py
|
||||
|
||||
validate-talkorigins:
|
||||
$(PYTHONPATH_SRC) $(VENV_PYTHON) -m citegeist validate-talkorigins talkorigins-out/talkorigins_manifest.json
|
||||
254
README.md
254
README.md
|
|
@ -1,3 +1,253 @@
|
|||
# CiteGeist
|
||||
# citegeist
|
||||
|
||||
A bibliography workbench based on Bibtex and local SQLite databases, aimed at several common bibliography tasks: ingestion of plain-text references, augmentation of Bibtex entries with metadata, graph representations of citations, graph expansion from a citation set, and more.
|
||||
`citegeist` is a research-oriented bibliography workbench for building, expanding, and auditing BibTeX libraries.
|
||||
|
||||
The aim is not just to store citations. The aim is to help with the harder problem: finding, improving, connecting, and checking the literature around a topic while keeping BibTeX as a first-class output format.
|
||||
|
||||
## Repo Description
|
||||
|
||||
`citegeist` is a BibTeX-native research tool for citation extraction, metadata enrichment, citation-graph expansion, and semantic search over scholarly sources.
|
||||
|
||||
## Scope
|
||||
|
||||
The project is intended to support a workflow like this:
|
||||
|
||||
1. Start from rough references extracted from papers, notes, syllabi, or dissertations.
|
||||
2. Convert them into draft BibTeX entries.
|
||||
3. Enrich and correct those entries using external scholarly metadata sources.
|
||||
4. Persist entries, identifiers, abstracts, and citation edges in a local database.
|
||||
5. Traverse the citation graph outward to discover additional relevant works.
|
||||
6. Search the local corpus semantically using abstracts and extracted full text.
|
||||
7. Export verified results back into BibTeX for LaTeX use.
|
||||
|
||||
## Why A New Codebase
|
||||
|
||||
This repository starts cleanly rather than extending the older `bib/` toolkit directly.
|
||||
|
||||
The older toolkit is useful as prior art:
|
||||
|
||||
- it demonstrates identifier-driven metadata augmentation;
|
||||
- it caches PDFs and extracted plaintext;
|
||||
- it shows one workable model for bibliography growth.
|
||||
|
||||
But it is not the right long-term base:
|
||||
|
||||
- it is Python 2-era code;
|
||||
- it is shell-script centric;
|
||||
- it does not provide a normalized database for graph workflows;
|
||||
- it is not structured as a reusable Python 3 library.
|
||||
|
||||
`citegeist` keeps the useful ideas and rebuilds the foundation around a cleaner Python 3 package boundary.
|
||||
|
||||
## Current Status
|
||||
|
||||
The initial repo includes:
|
||||
|
||||
- `pybtex`-backed BibTeX parsing and export in a repo-local virtual environment;
|
||||
- a SQLite-backed bibliography store;
|
||||
- a small CLI for ingest, search, inspection, and export;
|
||||
- review-state tracking on entries, per-field ingest provenance, and field-level conflict review;
|
||||
- plaintext reference extraction into draft BibTeX for numbered, APA-like, wrapped-line, and simple book-style references;
|
||||
- identifier-first metadata resolution for DOI, OpenAlex, DBLP, arXiv, and DataCite-backed entries, with OpenAlex/DataCite title-search fallback;
|
||||
- local citation-graph traversal over stored `cites`, `cited_by`, and `crossref` edges;
|
||||
- Crossref- and OpenAlex-backed graph expansion that materializes draft related works and edge provenance;
|
||||
- a dedicated source-client layer with fixture/cache support for live-source development;
|
||||
- OAI-PMH Dublin Core harvesting for institutional repositories and thesis/dissertation sources;
|
||||
- OAI-PMH repository discovery via `Identify`, `ListSets`, and `ListMetadataFormats` to target harvests more precisely;
|
||||
- bibliography bootstrap workflows that can start from a seed `.bib`, a topic phrase, or both;
|
||||
- batch bootstrap orchestration from JSON job files containing seed BibTeX paths, topic phrases, or both;
|
||||
- a TalkOrigins scraper that fixes repeated-author plaintext references, emits per-topic seed BibTeX files, and writes a batch JSON specification;
|
||||
- normalized tables for entries, creators, identifiers, and citation relations;
|
||||
- full-text-search-ready indexing over title, abstract, and fulltext when SQLite FTS5 is available;
|
||||
- tests covering parsing, ingestion, relation storage, and search.
|
||||
|
||||
The prioritized execution plan lives in [ROADMAP.md](./ROADMAP.md).
|
||||
|
||||
## Layout
|
||||
|
||||
```text
|
||||
citegeist/
|
||||
src/citegeist/
|
||||
bibtex.py
|
||||
storage.py
|
||||
tests/
|
||||
test_storage.py
|
||||
pyproject.toml
|
||||
```
|
||||
|
||||
## Quick Start
|
||||
|
||||
```bash
|
||||
cd citegeist
|
||||
python3 -m virtualenv --always-copy .venv
|
||||
.venv/bin/pip install -e .
|
||||
.venv/bin/pip install pytest
|
||||
mkdir -p .cache/citegeist
|
||||
PYTHONPATH=src .venv/bin/python - <<'PY'
|
||||
from citegeist import BibliographyStore
|
||||
|
||||
bib = """
|
||||
@article{smith2024graphs,
|
||||
author = {Smith, Jane and Doe, Alex},
|
||||
title = {Graph-first bibliography augmentation},
|
||||
year = {2024},
|
||||
abstract = {We study citation graphs for literature discovery.},
|
||||
references = {miller2023search}
|
||||
}
|
||||
|
||||
@inproceedings{miller2023search,
|
||||
author = {Miller, Sam},
|
||||
title = {Semantic search for research corpora},
|
||||
year = {2023},
|
||||
abstract = {Dense retrieval improves recall for academic search.}
|
||||
}
|
||||
"""
|
||||
|
||||
store = BibliographyStore("library.sqlite3")
|
||||
store.ingest_bibtex(bib)
|
||||
print(store.get_relations("smith2024graphs"))
|
||||
print(store.search_text("semantic"))
|
||||
store.close()
|
||||
PY
|
||||
.venv/bin/python -m pytest -q
|
||||
```
|
||||
|
||||
Or use the CLI directly:
|
||||
|
||||
```bash
|
||||
cd citegeist
|
||||
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 ingest references.bib
|
||||
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 search "semantic search"
|
||||
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 search "origin" --topic abiogenesis
|
||||
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 show --provenance --conflicts smith2024graphs
|
||||
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 set-status smith2024graphs reviewed
|
||||
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 resolve-conflicts smith2024graphs title accepted
|
||||
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 apply-conflict smith2024graphs title
|
||||
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 bootstrap --seed-bib seed.bib --topic "bayesian nonparametrics"
|
||||
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 bootstrap --topic "bayesian nonparametrics" --preview --topic-commit-limit 5
|
||||
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 scrape-talkorigins talkorigins-out --limit-topics 5 --limit-entries-per-topic 20
|
||||
PYTHONPATH=src .venv/bin/python -m citegeist extract references.txt --output draft.bib
|
||||
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 resolve smith2024graphs
|
||||
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 topics
|
||||
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 topic-entries abiogenesis
|
||||
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 export-topic abiogenesis --output abiogenesis.bib
|
||||
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 graph smith2024graphs --relation cites --depth 2 --missing-only
|
||||
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 expand smith2024graphs --source crossref
|
||||
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 expand smith2024graphs --source openalex --relation cited_by --limit 10
|
||||
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 expand-topic abiogenesis --topic-phrase "abiogenesis origin chemistry" --source openalex --relation cites --seed-key seed2024 --min-relevance 0.3 --preview
|
||||
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 set-topic-phrase abiogenesis "abiogenesis origin chemistry prebiotic"
|
||||
PYTHONPATH=src .venv/bin/python -m citegeist discover-oai https://example.edu/oai
|
||||
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 harvest-oai https://example.edu/oai --metadata-prefix mods --from 2024-01-01 --until 2024-12-31 --limit 10
|
||||
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 export --output reviewed.bib
|
||||
```
|
||||
|
||||
For live-source development, prefer fixture-backed or cache-backed source clients so resolver and expansion work can be exercised repeatedly without re-hitting upstream APIs on every run.
|
||||
|
||||
For large legacy plaintext corpora such as the TalkOrigins bibliography, prefer a two-step workflow:
|
||||
|
||||
1. `scrape-talkorigins` to generate cleaned per-topic `seed_bib` files plus a `talkorigins_jobs.json` batch spec.
|
||||
2. `bootstrap-batch` on that JSON file when you want to ingest, resolve, and expand from the generated seeds.
|
||||
|
||||
The TalkOrigins scrape output now includes:
|
||||
|
||||
- `seeds/*.bib` per-topic seed BibTeX files for `bootstrap-batch`
|
||||
- `plaintext/*.txt` per-topic cleaned GSA-style plaintext with repeated authors expanded
|
||||
- `site/topics/*.html` reconstructed topic pages with hide/show BibTeX blocks
|
||||
- `talkorigins_full.txt` and `talkorigins_full.bib` aggregate downloads
|
||||
- `snapshots/*.json` cached topic payloads so reruns can resume without re-fetching already scraped topics
|
||||
|
||||
After a full scrape, run:
|
||||
|
||||
```bash
|
||||
PYTHONPATH=src .venv/bin/python -m citegeist validate-talkorigins talkorigins-out/talkorigins_manifest.json
|
||||
PYTHONPATH=src .venv/bin/python -m citegeist duplicates-talkorigins talkorigins-out/talkorigins_manifest.json --limit 20
|
||||
PYTHONPATH=src .venv/bin/python -m citegeist duplicates-talkorigins talkorigins-out/talkorigins_manifest.json --limit 20 --preview --weak-only
|
||||
PYTHONPATH=src .venv/bin/python -m citegeist suggest-talkorigins-phrases talkorigins-out/talkorigins_manifest.json --output topic-phrases.json
|
||||
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 stage-topic-phrases topic-phrases.json
|
||||
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 review-topic-phrase abiogenesis accepted --notes "curated from local corpus"
|
||||
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 apply-topic-phrases topic-phrases.json
|
||||
PYTHONPATH=src .venv/bin/python -m citegeist --db talkorigins.sqlite3 enrich-talkorigins talkorigins-out/talkorigins_manifest.json --limit 20
|
||||
PYTHONPATH=src .venv/bin/python -m citegeist --db talkorigins-copy.sqlite3 enrich-talkorigins talkorigins-out/talkorigins_manifest.json --limit 5 --apply --allow-unsafe-search-matches
|
||||
PYTHONPATH=src .venv/bin/python -m citegeist --db talkorigins.sqlite3 review-talkorigins talkorigins-out/talkorigins_manifest.json --output talkorigins-review.json
|
||||
PYTHONPATH=src .venv/bin/python -m citegeist --db talkorigins.sqlite3 apply-talkorigins-corrections talkorigins-out/talkorigins_manifest.json talkorigins-corrections.json
|
||||
```
|
||||
|
||||
That report summarizes parse coverage and flags suspicious entry-type / venue combinations for manual cleanup.
|
||||
It also reports duplicate clusters across topic seed files so you can gauge how much deduplication pressure to expect before ingestion.
|
||||
Use `duplicates-talkorigins` when you want to inspect specific clusters, filter by text, restrict the audit to one topic slug, or preview only weak canonicalization outcomes before importing.
|
||||
|
||||
Use `suggest-talkorigins-phrases` to derive candidate stored expansion phrases from the existing TalkOrigins topic corpus itself. The output is deterministic JSON keyed by topic slug, with a suggested phrase plus the extracted keywords that drove it. This is a useful first pass before setting topic phrases in the database or editing generated batch jobs.
|
||||
|
||||
Use `stage-topic-phrases` to load those suggestions into the database as review items. Staging stores the candidate in `suggested_phrase` and marks the topic `pending` without changing the active `expansion_phrase`.
|
||||
Use `review-topic-phrase` to accept or reject one staged suggestion in place. Accepting a suggestion copies it into `expansion_phrase`; rejecting it preserves the review state without changing the live phrase.
|
||||
Use `apply-topic-phrases` when you want a direct patch path instead of the staged review flow. It accepts either the raw suggestion list or an object with a `topics` list, and will apply `suggested_phrase` or `phrase` to matching topic slugs immediately.
|
||||
Use `enrich-talkorigins` when you want to target those weak canonical entries for resolver-based metadata upgrades before retrying graph expansion on imported topic slices.
|
||||
Use `review-talkorigins` when you want one JSON review artifact that combines weak canonical clusters with dry-run enrichment outcomes for manual cleanup.
|
||||
Use `expand-topic` when you already have both a topic phrase and a curated topic seed set in the database: it expands outward from the topic’s existing entries, then only assigns discovered works back to that topic if they clear a topic-relevance threshold. Write-enabled assignment is stricter than preview ranking: a candidate must clear the score threshold and show a non-generic title anchor to the topic phrase, so broad methods papers do not get attached just because their abstracts or related terms overlap. On large noisy topics, prefer `--seed-key` to restrict the run to just the trusted seed entries you want to expand from, and use `--preview` first to inspect discovered candidates and relevance scores before writing anything.
|
||||
|
||||
Use `set-topic-phrase` to store a curated expansion phrase on the topic itself. When a stored phrase exists, `expand-topic` will use it automatically if you do not pass `--topic-phrase`. Batch bootstrap jobs can also set `topic_slug`, `topic_name`, and `topic_phrase` so curated topic metadata is created as part of the run.
|
||||
Use `topics --phrase-review-status pending` when you want to audit only topics whose staged phrase suggestions still need review.
|
||||
`--allow-unsafe-search-matches` exists only for bounded experiments on copied databases when you explicitly want to relax trust to exercise downstream expansion behavior.
|
||||
|
||||
Correction files are simple JSON:
|
||||
|
||||
```json
|
||||
{
|
||||
"corrections": [
|
||||
{
|
||||
"key": "smith jane|1999|weak duplicate",
|
||||
"entry_type": "article",
|
||||
"review_status": "reviewed",
|
||||
"fields": {
|
||||
"journal": "Journal of Better Metadata",
|
||||
"doi": "10.1000/weak",
|
||||
"note": null
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
`fields` values overwrite the canonical entry for that duplicate-cluster key. Set a field to `null` to remove it.
|
||||
|
||||
To import the reconstructed corpus into SQLite while collapsing duplicate works across topics into canonical entries:
|
||||
|
||||
```bash
|
||||
PYTHONPATH=src .venv/bin/python -m citegeist --db talkorigins.sqlite3 ingest-talkorigins talkorigins-out/talkorigins_manifest.json
|
||||
```
|
||||
|
||||
That import preserves many-to-many topic membership through the `topics` and `entry_topics` tables.
|
||||
After import, use `topics`, `topic-entries`, `search --topic`, and `export-topic` to inspect or export topic slices from the consolidated database.
|
||||
|
||||
Live-source workflow:
|
||||
|
||||
```bash
|
||||
cd citegeist
|
||||
export CITEGEIST_SOURCE_CACHE=.cache/citegeist
|
||||
export CITEGEIST_LIVE_TESTS=1
|
||||
PYTHONPATH=src .venv/bin/python -m pytest -m live -q
|
||||
PYTHONPATH=src .venv/bin/python scripts/live_smoke.py
|
||||
```
|
||||
|
||||
By default, live tests are skipped. They only run when `CITEGEIST_LIVE_TESTS=1` is set.
|
||||
|
||||
Convenience targets:
|
||||
|
||||
```bash
|
||||
make test
|
||||
make test-live
|
||||
make live-smoke
|
||||
```
|
||||
|
||||
## Near-Term Priorities
|
||||
|
||||
- source adapters beyond OAI-PMH for additional non-DOI scholarly ecosystems.
|
||||
|
||||
See [ROADMAP.md](./ROADMAP.md) for the prioritized phase plan and rationale.
|
||||
|
||||
## Naming
|
||||
|
||||
The name is intended to be short, distinct, and memorable:
|
||||
|
||||
- `cite` for citation work;
|
||||
- `geist` for the organizing intelligence around the literature.
|
||||
|
|
|
|||
|
|
@ -0,0 +1,187 @@
|
|||
# Roadmap
|
||||
|
||||
This roadmap prioritizes a usable local research workflow over breadth of integrations.
|
||||
|
||||
The first objective is not to support every metadata source. The first objective is to make one end-to-end path work reliably:
|
||||
|
||||
1. ingest draft references,
|
||||
2. normalize and store them,
|
||||
3. enrich them,
|
||||
4. traverse citation links,
|
||||
5. export reviewed BibTeX.
|
||||
|
||||
## Prioritization Principles
|
||||
|
||||
- prioritize steps that make the system usable by a single researcher on a local machine;
|
||||
- prioritize deterministic infrastructure before network integrations;
|
||||
- keep every stage inspectable and auditable;
|
||||
- treat verification and provenance as core features, not cleanup work;
|
||||
- defer heavy semantic infrastructure until the local corpus model is stable.
|
||||
|
||||
## Current Baseline
|
||||
|
||||
Completed:
|
||||
|
||||
- lightweight BibTeX parsing;
|
||||
- SQLite storage for entries, creators, identifiers, and relations;
|
||||
- local text search using SQLite FTS5 when available;
|
||||
- tests for ingest, relation storage, and search.
|
||||
|
||||
## Phase 1: Core Ingestion And Export
|
||||
|
||||
Priority: P0
|
||||
|
||||
Goal:
|
||||
Make `citegeist` useful as a local BibTeX workbench even before online enrichment is added.
|
||||
|
||||
Tasks:
|
||||
|
||||
- add BibTeX export from the normalized database back into stable, readable BibTeX;
|
||||
- add a small CLI for `ingest`, `show`, `search`, and `export`;
|
||||
- store field provenance metadata alongside imported and edited fields;
|
||||
- add schema support for entry status such as `draft`, `enriched`, `reviewed`, and `exported`;
|
||||
- add fixture-driven tests for round-tripping BibTeX through ingest and export.
|
||||
|
||||
Why this comes first:
|
||||
|
||||
- without export, the project is not yet useful in a LaTeX workflow;
|
||||
- without a CLI, the package is a library demo rather than a tool;
|
||||
- without provenance and state, later enrichment work becomes hard to audit.
|
||||
|
||||
Exit criteria:
|
||||
|
||||
- a user can ingest a `.bib` file, inspect entries, search locally, and export a reviewed `.bib`;
|
||||
- round-trip tests show no unexpected field loss for supported entry types.
|
||||
|
||||
## Phase 2: Reference Extraction
|
||||
|
||||
Priority: P0
|
||||
|
||||
Goal:
|
||||
Turn raw reference text into draft entries that can enter the main pipeline.
|
||||
|
||||
Tasks:
|
||||
|
||||
- add parsers for bibliography-section lines and plain-text reference lists;
|
||||
- define a draft-entry schema for incomplete references with confidence markers;
|
||||
- support ingestion of OCR- or PDF-derived plaintext bibliography sections;
|
||||
- add normalization for author names, years, title casing, and page ranges;
|
||||
- build gold-test fixtures from real, messy reference examples.
|
||||
|
||||
Why this is next:
|
||||
|
||||
- this addresses the project’s first unique bottleneck: getting rough references into structured form;
|
||||
- enrichment is much more effective once draft references are normalized.
|
||||
|
||||
Exit criteria:
|
||||
|
||||
- a user can pass a plaintext bibliography section and receive draft BibTeX entries with unresolved fields clearly marked;
|
||||
- tests cover common article, book, chapter, and proceedings references.
|
||||
|
||||
## Phase 3: Metadata Enrichment
|
||||
|
||||
Priority: P1
|
||||
|
||||
Goal:
|
||||
Resolve draft or partial entries against external scholarly sources and merge improved metadata safely.
|
||||
|
||||
Tasks:
|
||||
|
||||
- define a resolver interface with deterministic merge rules;
|
||||
- implement first-party resolvers for DOI/Crossref, DBLP, and arXiv;
|
||||
- add identifier-first resolution, then title/author/year fallback search;
|
||||
- store merge provenance per field and resolution attempt logs;
|
||||
- flag conflicts rather than silently overwriting disputed values.
|
||||
|
||||
Why this is P1 rather than the first phase:
|
||||
|
||||
- enrichment quality depends on the ingestion and provenance model being correct first;
|
||||
- it is easier to test deterministic merge behavior once local workflows already exist.
|
||||
|
||||
Exit criteria:
|
||||
|
||||
- an incomplete entry can be enriched from at least one authoritative source;
|
||||
- conflicting fields remain visible for review instead of being lost.
|
||||
|
||||
## Phase 4: Citation Graph Expansion
|
||||
|
||||
Priority: P1
|
||||
|
||||
Goal:
|
||||
Use citation edges as a discovery engine rather than just metadata storage.
|
||||
|
||||
Tasks:
|
||||
|
||||
- support explicit `cites` and `cited_by` edge ingestion with source provenance;
|
||||
- add graph expansion commands starting from one or more seed entries;
|
||||
- track edge discovery source, timestamp, and confidence;
|
||||
- add filters for depth, source type, year range, and reviewed status;
|
||||
- expose unresolved nodes so the user can decide what to enrich next.
|
||||
|
||||
Why this matters:
|
||||
|
||||
- this is central to literature discovery rather than mere bibliography cleanup;
|
||||
- it turns the database into a research navigation tool.
|
||||
|
||||
Exit criteria:
|
||||
|
||||
- starting from one or more seed entries, a user can expand outward through citation edges and persist newly discovered nodes;
|
||||
- graph traversal results can be exported as BibTeX candidates for review.
|
||||
|
||||
## Phase 5: Search And Ranking
|
||||
|
||||
Priority: P2
|
||||
|
||||
Goal:
|
||||
Improve discovery quality inside the local corpus.
|
||||
|
||||
Tasks:
|
||||
|
||||
- refine FTS ranking across title, abstract, keywords, and fulltext;
|
||||
- add saved search queries and result filters;
|
||||
- add optional embedding-backed semantic search behind a pluggable interface;
|
||||
- support hybrid ranking that combines lexical matching, identifiers, and citation proximity;
|
||||
- add benchmarking fixtures for retrieval quality on a few research topics.
|
||||
|
||||
Why this is later:
|
||||
|
||||
- FTS is already enough to support early workflows;
|
||||
- embedding infrastructure is expensive and should wait until the corpus schema stabilizes.
|
||||
|
||||
Exit criteria:
|
||||
|
||||
- local search is useful on realistic corpora without requiring external services;
|
||||
- semantic indexing is optional and does not displace the simpler local search path.
|
||||
|
||||
## Phase 6: Corpus Acquisition Pipelines
|
||||
|
||||
Priority: P2
|
||||
|
||||
Goal:
|
||||
Broaden source acquisition without mixing that complexity into the core model.
|
||||
|
||||
Tasks:
|
||||
|
||||
- add source adapters for open-access theses and dissertation repositories;
|
||||
- add support for harvesting publisher citation pages and preprint metadata pages;
|
||||
- define per-source import provenance and rate-limit behavior;
|
||||
- separate source-specific scraping logic from normalized entry storage;
|
||||
- add regression fixtures for representative public sources.
|
||||
|
||||
Why this is later:
|
||||
|
||||
- acquisition breadth is useful, but only after the core ingest/enrich/review loop is solid;
|
||||
- source adapters are brittle and should sit on top of a stable model.
|
||||
|
||||
Exit criteria:
|
||||
|
||||
- new public corpora can be imported through adapters without changing the storage core;
|
||||
- imported entries retain their source provenance and can be reviewed like any other entry.
|
||||
|
||||
## Suggested Next Three Tasks
|
||||
|
||||
1. Add a CLI module with `ingest`, `search`, `show`, and `export`.
|
||||
2. Implement BibTeX export from the normalized store.
|
||||
3. Add provenance tables and entry review status fields.
|
||||
|
||||
These three tasks complete the first usable local workflow and should be treated as the immediate sprint.
|
||||
|
|
@ -0,0 +1,20 @@
|
|||
[build-system]
|
||||
requires = ["setuptools>=68"]
|
||||
build-backend = "setuptools.build_meta"
|
||||
|
||||
[project]
|
||||
name = "citegeist"
|
||||
version = "0.1.0"
|
||||
description = "BibTeX-native tooling for bibliography augmentation, citation graphs, and search"
|
||||
requires-python = ">=3.10"
|
||||
dependencies = ["pybtex==0.25.1"]
|
||||
|
||||
[project.scripts]
|
||||
citegeist = "citegeist.cli:main"
|
||||
|
||||
[tool.pytest.ini_options]
|
||||
pythonpath = ["src"]
|
||||
testpaths = ["tests"]
|
||||
markers = [
|
||||
"live: tests that call live external scholarly APIs and are skipped unless explicitly enabled",
|
||||
]
|
||||
|
|
@ -0,0 +1,58 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
|
||||
from citegeist import MetadataResolver, SourceClient
|
||||
|
||||
|
||||
def build_parser() -> argparse.ArgumentParser:
    """Construct the argument parser for the live smoke-check script.

    Defaults for both options come from environment variables so CI can
    configure a run without passing command-line flags.
    """
    arg_parser = argparse.ArgumentParser(
        description="Run live smoke checks against scholarly metadata sources"
    )
    arg_parser.add_argument(
        "--cache-dir",
        help="Directory for cached live-source responses",
        default=os.environ.get("CITEGEIST_SOURCE_CACHE", ".cache/citegeist"),
    )
    arg_parser.add_argument(
        "--fixtures-dir",
        help="Optional fixture directory to read before live network calls",
        default=os.environ.get("CITEGEIST_SOURCE_FIXTURES"),
    )
    return arg_parser
|
||||
|
||||
|
||||
def main() -> int:
    """Run a handful of live resolver checks and print the results as JSON.

    Each check resolves one well-known work through a different source
    (Crossref DOI, arXiv id, OpenAlex search); a check that returns nothing
    is reported as null in the output. Always returns exit status 0.
    """
    options = build_parser().parse_args()
    source_client = SourceClient(cache_dir=options.cache_dir, fixtures_dir=options.fixtures_dir)
    resolver = MetadataResolver(source_client=source_client)

    checks = {
        "crossref_doi": resolver.resolve_doi("10.1038/nphys1170"),
        "arxiv_id": resolver.resolve_arxiv("1706.03762"),
        "openalex_search": resolver.search_openalex_best_match(
            title="Attention Is All You Need",
            author_text="Ashish Vaswani",
            year="2017",
        ),
    }

    def summarize(resolution):
        # Flatten a resolution into the handful of fields worth eyeballing.
        fields = resolution.entry.fields
        return {
            "source_label": resolution.source_label,
            "title": fields.get("title"),
            "year": fields.get("year"),
            "doi": fields.get("doi"),
            "openalex": fields.get("openalex"),
            "arxiv": fields.get("arxiv"),
        }

    payload = {
        name: (summarize(resolution) if resolution is not None else None)
        for name, resolution in checks.items()
    }

    print(json.dumps(payload, indent=2, sort_keys=True))
    return 0
|
||||
|
||||
|
||||
# Script entry point: propagate main()'s integer status as the process exit code.
if __name__ == "__main__":
    raise SystemExit(main())
|
||||
|
|
@ -0,0 +1,52 @@
|
|||
from .batch import BatchBootstrapRunner, BatchJobResult, load_batch_jobs
|
||||
from .bibtex import BibEntry, parse_bibtex
|
||||
from .bootstrap import BootstrapResult, Bootstrapper
|
||||
from .expand import CrossrefExpander, OpenAlexExpander
|
||||
from .extract import extract_references
|
||||
from .harvest import OaiMetadataFormat, OaiPmhHarvester, OaiSet
|
||||
from .resolve import MetadataResolver, merge_entries, merge_entries_with_conflicts
|
||||
from .sources import SourceClient
|
||||
from .storage import BibliographyStore
|
||||
from .talkorigins import (
|
||||
TalkOriginsBatchExport,
|
||||
TalkOriginsDuplicateCluster,
|
||||
TalkOriginsEnrichmentResult,
|
||||
TalkOriginsIngestReport,
|
||||
TalkOriginsReviewExport,
|
||||
TalkOriginsScraper,
|
||||
TalkOriginsSeedSet,
|
||||
TalkOriginsTopicPhraseSuggestion,
|
||||
TalkOriginsTopic,
|
||||
TalkOriginsValidationReport,
|
||||
)
|
||||
|
||||
# Explicit public API of the citegeist package; names not listed here are
# considered internal even when importable.
__all__ = [
    "BibEntry",
    "BatchBootstrapRunner",
    "BatchJobResult",
    "BibliographyStore",
    "BootstrapResult",
    "Bootstrapper",
    "CrossrefExpander",
    "MetadataResolver",
    "OpenAlexExpander",
    "OaiPmhHarvester",
    "OaiMetadataFormat",
    "OaiSet",
    "SourceClient",
    "TalkOriginsBatchExport",
    "TalkOriginsDuplicateCluster",
    "TalkOriginsEnrichmentResult",
    "TalkOriginsIngestReport",
    "TalkOriginsReviewExport",
    "TalkOriginsScraper",
    "TalkOriginsSeedSet",
    "TalkOriginsTopicPhraseSuggestion",
    "TalkOriginsTopic",
    "TalkOriginsValidationReport",
    "extract_references",
    "load_batch_jobs",
    "merge_entries",
    "merge_entries_with_conflicts",
    "parse_bibtex",
]
|
||||
|
|
@ -0,0 +1,4 @@
|
|||
from .cli import main

# Guard the entry point: without this, merely importing citegeist.__main__
# (e.g. from tooling or documentation generators) would execute the CLI as a
# side effect. `python -m citegeist` still runs main() because runpy sets
# __name__ to "__main__" for this module.
if __name__ == "__main__":
    raise SystemExit(main())
|
||||
|
|
@ -0,0 +1,78 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
from .bootstrap import BootstrapResult, Bootstrapper
|
||||
from .storage import BibliographyStore
|
||||
|
||||
|
||||
@dataclass(slots=True)
class BatchJobResult:
    """Outcome of one batch bootstrap job: its name plus the results it produced."""

    # Name given in the batch file, or a generated "job_N" fallback.
    job_name: str
    # Number of BootstrapResult rows the job produced (== len(results)).
    result_count: int
    # Individual bootstrap results, in the order they were produced.
    results: list[BootstrapResult]
|
||||
|
||||
|
||||
def load_batch_jobs(path: str | Path) -> list[dict]:
|
||||
path = Path(path)
|
||||
payload = json.loads(path.read_text(encoding="utf-8"))
|
||||
if isinstance(payload, dict):
|
||||
jobs = payload.get("jobs", [])
|
||||
else:
|
||||
jobs = payload
|
||||
if not isinstance(jobs, list):
|
||||
raise ValueError("Batch JSON must be a list of jobs or an object with a 'jobs' list")
|
||||
normalized_jobs: list[dict] = []
|
||||
for job in jobs:
|
||||
if not isinstance(job, dict):
|
||||
raise ValueError("Each batch job must be an object")
|
||||
normalized = dict(job)
|
||||
seed_bib = normalized.get("seed_bib")
|
||||
if isinstance(seed_bib, str) and seed_bib:
|
||||
seed_path = Path(seed_bib)
|
||||
if not seed_path.is_absolute():
|
||||
normalized["seed_bib"] = str((path.parent / seed_path).resolve())
|
||||
normalized_jobs.append(normalized)
|
||||
return normalized_jobs
|
||||
|
||||
|
||||
class BatchBootstrapRunner:
    """Execute a sequence of bootstrap jobs (as loaded by load_batch_jobs) against one store."""

    def __init__(self, bootstrapper: Bootstrapper | None = None) -> None:
        # A default Bootstrapper (with its default resolver/expanders) is used unless injected.
        self.bootstrapper = bootstrapper or Bootstrapper()

    def run(self, store: BibliographyStore, jobs: list[dict]) -> list[BatchJobResult]:
        """Run each job dict in order and collect one BatchJobResult per job.

        Job keys mirror Bootstrapper.bootstrap() parameters and fall back to
        the same defaults when missing; unnamed jobs get a positional
        "job_N" name (1-based).
        """
        results: list[BatchJobResult] = []
        for index, job in enumerate(jobs, start=1):
            seed_bib = job.get("seed_bib")
            topic = job.get("topic")
            topic_limit = int(job.get("topic_limit", 5))
            topic_commit_limit = job.get("topic_commit_limit")
            expand = bool(job.get("expand", True))
            review_status = str(job.get("status", "draft"))
            preview = bool(job.get("preview", False))
            name = str(job.get("name") or f"job_{index}")
            topic_slug = job.get("topic_slug")
            topic_name = job.get("topic_name")
            topic_phrase = job.get("topic_phrase")

            # Seed BibTeX is only read for jobs that actually reference a file.
            seed_bibtex = None
            if seed_bib:
                seed_bibtex = Path(seed_bib).read_text(encoding="utf-8")

            job_results = self.bootstrapper.bootstrap(
                store,
                seed_bibtex=seed_bibtex,
                topic=topic,
                topic_limit=topic_limit,
                topic_commit_limit=int(topic_commit_limit) if topic_commit_limit is not None else None,
                expand=expand,
                review_status=review_status,
                preview_only=preview,
                topic_slug=str(topic_slug) if topic_slug else None,
                topic_name=str(topic_name) if topic_name else None,
                topic_phrase=str(topic_phrase) if topic_phrase else None,
            )
            results.append(BatchJobResult(name, len(job_results), job_results))
        return results
|
||||
|
|
@ -0,0 +1,116 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from io import StringIO
|
||||
|
||||
try:
|
||||
from pybtex.database import BibliographyData, Entry, Person, parse_string
|
||||
from pybtex.bibtex.exceptions import BibTeXError
|
||||
from pybtex.database.output.bibtex import Writer
|
||||
except ImportError: # pragma: no cover - exercised only outside the configured venv
|
||||
BibTeXError = None
|
||||
BibliographyData = Entry = Person = Writer = None
|
||||
parse_string = None
|
||||
|
||||
|
||||
@dataclass(slots=True)
class BibEntry:
    """A single normalized BibTeX record.

    Person roles (author/editor) are stored inside `fields` as plain
    " and "-joined strings, mirroring BibTeX source syntax.
    """

    # BibTeX entry type, e.g. "article" or "misc".
    entry_type: str
    # The entry's unique citation key.
    citation_key: str
    # Field name -> value mapping (title, year, doi, ...).
    fields: dict[str, str]
|
||||
|
||||
|
||||
def parse_bibtex(text: str) -> list[BibEntry]:
    """Parse BibTeX source text into a list of BibEntry records.

    Person roles (author/editor) are folded back into plain string fields
    joined with " and ", matching how BibTeX files spell multiple names.

    Raises RuntimeError when pybtex is not installed.
    """
    _require_pybtex()
    database = parse_string(text, bib_format="bibtex")
    parsed: list[BibEntry] = []
    for key, record in database.entries.items():
        field_map = dict(record.fields.items())
        # pybtex stores names separately from fields; re-serialize per role.
        for role, people in record.persons.items():
            field_map[role] = " and ".join(str(person) for person in people)
        parsed.append(
            BibEntry(
                entry_type=record.type,
                citation_key=key,
                fields=field_map,
            )
        )
    return parsed
|
||||
|
||||
|
||||
def render_bibtex(entries: list[BibEntry]) -> str:
    """Render BibEntry records back to BibTeX source text.

    Field values are first sanitized so stray braces are balanced; if pybtex
    still rejects the data, a conservative second pass flattens every brace
    to a parenthesis and retries.

    Raises RuntimeError when pybtex is not installed.
    """
    _require_pybtex()
    try:
        return _write_bibtex(entries, _sanitize_bibtex_value)
    except BibTeXError:
        # Sanitizing preserves legitimate braces; flattening removes them all.
        return _write_bibtex(entries, _flatten_bibtex_braces)


def _write_bibtex(entries: list[BibEntry], clean_value) -> str:
    """Serialize entries with pybtex, cleaning each field via *clean_value*.

    The previous implementation duplicated this entire loop for the primary
    and fallback rendering paths; parameterizing on the cleaning function
    keeps one copy of the logic.
    """
    bibliography_entries = {}
    for entry in entries:
        # Person roles are rebuilt separately below, so drop them from fields.
        fields = {
            key: clean_value(value)
            for key, value in entry.fields.items()
            if key not in {"author", "editor"}
        }
        persons = {}
        for role in ("author", "editor"):
            raw_names = entry.fields.get(role)
            if raw_names:
                persons[role] = [Person(name.strip()) for name in raw_names.split(" and ") if name.strip()]
        bibliography_entries[entry.citation_key] = Entry(entry.entry_type, fields=fields, persons=persons)

    buffer = StringIO()
    Writer().write_stream(BibliographyData(entries=bibliography_entries), buffer)
    return buffer.getvalue().strip()
|
||||
|
||||
|
||||
def _require_pybtex() -> None:
    """Fail fast with a helpful message when pybtex could not be imported."""
    pybtex_missing = parse_string is None or Writer is None
    if pybtex_missing:
        raise RuntimeError(
            "pybtex is required. Use the repo-local virtual environment under .venv/ for citegeist commands."
        )
|
||||
|
||||
|
||||
def _sanitize_bibtex_value(value: str) -> str:
|
||||
depth = 0
|
||||
parts: list[str] = []
|
||||
for char in value:
|
||||
if char == "{":
|
||||
depth += 1
|
||||
parts.append(char)
|
||||
continue
|
||||
if char == "}":
|
||||
if depth == 0:
|
||||
parts.append(")")
|
||||
else:
|
||||
depth -= 1
|
||||
parts.append(char)
|
||||
continue
|
||||
parts.append(char)
|
||||
if depth > 0:
|
||||
open_count = depth
|
||||
normalized = []
|
||||
for char in parts:
|
||||
if char == "{" and open_count > 0:
|
||||
normalized.append("(")
|
||||
open_count -= 1
|
||||
else:
|
||||
normalized.append(char)
|
||||
return "".join(normalized)
|
||||
return "".join(parts)
|
||||
|
||||
|
||||
def _flatten_bibtex_braces(value: str) -> str:
|
||||
return value.replace("{", "(").replace("}", ")")
|
||||
|
|
@ -0,0 +1,145 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
import re
|
||||
|
||||
from .bibtex import BibEntry, parse_bibtex
|
||||
from .expand import CrossrefExpander, OpenAlexExpander
|
||||
from .resolve import MetadataResolver
|
||||
from .storage import BibliographyStore
|
||||
|
||||
|
||||
@dataclass(slots=True)
class BootstrapResult:
    """One entry touched during bootstrap: provenance plus whether it was new."""

    # Citation key of the entry that was created or refreshed.
    citation_key: str
    # Provenance label: "seed_bibtex", "topic", "crossref_expand", or "openalex_expand".
    origin: str
    # True when the entry did not yet exist in the store before this run.
    created: bool
    # Relevance score; only set for topic-sourced results (defaults to 0.0).
    score: float = 0.0
||||
|
||||
|
||||
class Bootstrapper:
    """Seed a bibliography store from BibTeX text and/or a topic search, then
    optionally expand the seeded entries through citation graphs."""

    def __init__(
        self,
        resolver: MetadataResolver | None = None,
        crossref_expander: CrossrefExpander | None = None,
        openalex_expander: OpenAlexExpander | None = None,
    ) -> None:
        # Default expanders share the (possibly injected) resolver so all
        # network access goes through one source client.
        self.resolver = resolver or MetadataResolver()
        self.crossref_expander = crossref_expander or CrossrefExpander(self.resolver)
        self.openalex_expander = openalex_expander or OpenAlexExpander(self.resolver)

    def bootstrap(
        self,
        store: BibliographyStore,
        seed_bibtex: str | None = None,
        topic: str | None = None,
        topic_limit: int = 5,
        topic_commit_limit: int | None = None,
        expand: bool = True,
        review_status: str = "draft",
        preview_only: bool = False,
        topic_slug: str | None = None,
        topic_name: str | None = None,
        topic_phrase: str | None = None,
    ) -> list[BootstrapResult]:
        """Run the bootstrap pipeline and return one result per touched entry.

        Phases (each optional): (1) upsert entries parsed from *seed_bibtex*;
        (2) search external sources for *topic* and upsert the top-ranked
        candidates; (3) expand all seeded keys through Crossref references
        and OpenAlex citations. With preview_only=True nothing is written,
        but the would-be results are still returned. Commits once at the end.
        """
        results: list[BootstrapResult] = []
        seed_keys: list[str] = []

        if seed_bibtex:
            for entry in parse_bibtex(seed_bibtex):
                # "created" reflects store state BEFORE this run's upsert.
                created = store.get_entry(entry.citation_key) is None
                if not preview_only:
                    store.upsert_entry(
                        entry,
                        raw_bibtex=None,
                        source_type="bootstrap",
                        source_label="seed_bibtex",
                        review_status=review_status,
                    )
                seed_keys.append(entry.citation_key)
                results.append(BootstrapResult(entry.citation_key, "seed_bibtex", created))

        if topic:
            # Register the topic record only when the caller supplied any
            # explicit topic metadata.
            if not preview_only and (topic_slug or topic_name or topic_phrase):
                store.ensure_topic(
                    slug=topic_slug or _slugify(topic),
                    name=topic_name or topic,
                    source_type="bootstrap",
                    expansion_phrase=topic_phrase or topic,
                )
            ranked_candidates = self._topic_candidates(topic, seed_keys, topic_limit)
            if topic_commit_limit is not None:
                ranked_candidates = ranked_candidates[:topic_commit_limit]

            for entry, score in ranked_candidates:
                created = store.get_entry(entry.citation_key) is None
                if not preview_only:
                    store.upsert_entry(
                        entry,
                        raw_bibtex=None,
                        source_type="bootstrap",
                        source_label=f"topic:{topic}",
                        review_status=review_status,
                    )
                # Topic hits also become seeds for the expansion phase below.
                seed_keys.append(entry.citation_key)
                results.append(BootstrapResult(entry.citation_key, "topic", created, score=score))

        if expand and not preview_only:
            # dict.fromkeys() de-duplicates while preserving insertion order.
            expanded_keys = list(dict.fromkeys(seed_keys))
            for citation_key in expanded_keys:
                for item in self.crossref_expander.expand_entry_references(store, citation_key):
                    results.append(BootstrapResult(item.discovered_citation_key, "crossref_expand", item.created_entry))
                for item in self.openalex_expander.expand_entry(store, citation_key, relation_type="cites", limit=topic_limit):
                    results.append(BootstrapResult(item.discovered_citation_key, "openalex_expand", item.created_entry))

        store.connection.commit()
        return results

    def _topic_candidates(self, topic: str, seed_keys: list[str], limit: int) -> list[tuple[BibEntry, float]]:
        """Search three sources for *topic* and return the top *limit* scored entries.

        Each source carries a base score (OpenAlex 3.0 > Crossref 2.0 >
        DataCite 1.5) plus lexical relevance and seed-overlap bonuses; for a
        duplicate citation key the highest score wins. Ties break on key.
        """
        scored: dict[str, tuple[BibEntry, float]] = {}

        for source_name, base_score, entries in (
            ("openalex", 3.0, self.resolver.search_openalex(topic, limit=limit)),
            ("crossref", 2.0, self.resolver.search_crossref(topic, limit=limit)),
            ("datacite", 1.5, self.resolver.search_datacite(topic, limit=limit)),
        ):
            for entry in entries:
                score = base_score + _topic_relevance_score(entry, topic) + _seed_overlap_score(entry, seed_keys)
                existing = scored.get(entry.citation_key)
                if existing is None or score > existing[1]:
                    scored[entry.citation_key] = (entry, score)

        ranked = sorted(
            scored.values(),
            key=lambda item: (-item[1], item[0].citation_key),
        )
        return ranked[:limit]
|
||||
|
||||
|
||||
def _topic_relevance_score(entry: BibEntry, topic: str) -> float:
    """Count how many topic tokens appear in the entry's title or abstract."""
    entry_terms = _tokenize(entry.fields.get("title", "")) | _tokenize(entry.fields.get("abstract", ""))
    shared = _tokenize(topic) & entry_terms
    return float(len(shared))
|
||||
|
||||
|
||||
def _seed_overlap_score(entry: BibEntry, seed_keys: list[str]) -> float:
    """Award 0.25 for every seed key sharing a token with the entry's title."""
    if not seed_keys:
        return 0.0
    title_terms = _tokenize(entry.fields.get("title", ""))
    return sum(
        (0.25 for seed_key in seed_keys if _tokenize(seed_key) & title_terms),
        0.0,
    )
|
||||
|
||||
|
||||
def _tokenize(value: str) -> set[str]:
|
||||
return {token for token in re.split(r"\W+", value.lower()) if token}
|
||||
|
||||
|
||||
def _slugify(value: str) -> str:
|
||||
slug = re.sub(r"[^a-z0-9]+", "-", value.lower()).strip("-")
|
||||
return slug or "topic"
|
||||
File diff suppressed because it is too large
Load Diff
|
|
@ -0,0 +1,600 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from urllib.parse import quote, urlencode
|
||||
|
||||
from .bibtex import BibEntry, parse_bibtex
|
||||
from .resolve import MetadataResolver
|
||||
from .storage import BibliographyStore
|
||||
|
||||
|
||||
@dataclass(slots=True)
class ExpansionResult:
    """One discovered citation edge produced by a graph expansion."""

    # Citation key on the citing side of the recorded relation.
    source_citation_key: str
    # Citation key of the work discovered through the expansion.
    discovered_citation_key: str
    # True when the discovered entry was newly inserted into the store.
    created_entry: bool
    # Relation recorded between the two entries (e.g. "cites").
    relation_type: str
    # Provenance label describing which API/seed produced this edge.
    source_label: str
|
||||
|
||||
@dataclass(slots=True)
class TopicExpansionResult:
    """One discovery made while expanding a topic, with relevance verdicts."""

    # Slug of the topic being expanded.
    topic_slug: str
    # Seed entry the discovery originated from.
    source_citation_key: str
    # Citation key of the discovered work.
    discovered_citation_key: str
    # Title of the discovered work ("" when unavailable).
    discovered_title: str
    # True when the discovered entry was newly inserted into the store.
    created_entry: bool
    # Relation type used for the expansion (e.g. "cites").
    relation_type: str
    # Provenance label describing the API/seed that produced this row.
    source_label: str
    # Computed topic relevance in [0.0, 1.0].
    relevance_score: float
    # Whether the score (plus title anchoring) cleared the assignment bar.
    meets_relevance_threshold: bool
    # Whether the entry was actually linked to the topic in the store.
    assigned_to_topic: bool
||||
|
||||
|
||||
class CrossrefExpander:
    """Expand an entry's outgoing citations using Crossref's reference list."""

    def __init__(self, resolver: MetadataResolver | None = None) -> None:
        self.resolver = resolver or MetadataResolver()

    def expand_entry_references(
        self,
        store: BibliographyStore,
        citation_key: str,
    ) -> list[ExpansionResult]:
        """Fetch the entry's Crossref reference list and materialize each reference.

        Requires the stored entry to have a DOI; returns [] when the entry
        is missing or has none. New discovered entries are inserted (and
        committed immediately); a "cites" relation is recorded for every
        reference regardless of whether the entry already existed.
        """
        entry = store.get_entry(citation_key)
        if entry is None:
            return []

        doi = entry.get("doi")
        if not doi:
            return []

        # NOTE(review): contact address is hardcoded for Crossref's "polite
        # pool"; consider making it configurable.
        payload = self.resolver.source_client.get_json(
            f"https://api.crossref.org/works/{doi}?mailto=welsberr@gmail.com"
        )
        references = payload.get("message", {}).get("reference", [])
        results: list[ExpansionResult] = []
        for index, reference in enumerate(references, start=1):
            discovered = _crossref_reference_to_entry(reference, citation_key, index)
            created = False
            if store.get_entry(discovered.citation_key) is None:
                store.upsert_entry(
                    discovered,
                    raw_bibtex=None,
                    source_type="graph_expand",
                    source_label=f"crossref:references:{doi}",
                    review_status="draft",
                )
                # Commit per new entry so partial progress survives failures.
                store.connection.commit()
                created = True

            store.add_relation(
                citation_key,
                discovered.citation_key,
                "cites",
                source_type="graph_expand",
                source_label=f"crossref:references:{doi}",
                # References that carry a DOI are trusted more than
                # unstructured ones.
                confidence=1.0 if reference.get("DOI") else 0.6,
            )
            results.append(
                ExpansionResult(
                    source_citation_key=citation_key,
                    discovered_citation_key=discovered.citation_key,
                    created_entry=created,
                    relation_type="cites",
                    source_label=f"crossref:references:{doi}",
                )
            )
        return results
|
||||
|
||||
|
||||
class OpenAlexExpander:
    """Expand an entry's citation neighborhood using the OpenAlex works API."""

    def __init__(self, resolver: MetadataResolver | None = None) -> None:
        self.resolver = resolver or MetadataResolver()

    def expand_entry(
        self,
        store: BibliographyStore,
        citation_key: str,
        relation_type: str = "cites",
        limit: int = 25,
    ) -> list[ExpansionResult]:
        """Discover works related to the entry via OpenAlex and store them.

        relation_type="cites" fetches works the entry cites (OpenAlex
        "cited_by" filter on the entry's id); any other value fetches works
        citing the entry. Returns [] when the entry is missing or no
        OpenAlex id can be determined. As a side effect, a freshly looked-up
        OpenAlex id is written back onto the stored entry.
        """
        entry = store.get_entry(citation_key)
        if entry is None:
            return []

        openalex_id = entry.get("openalex") or self._lookup_openalex_id(entry)
        if not openalex_id:
            return []
        if not entry.get("openalex"):
            # Persist the resolved id so future expansions skip the lookup.
            bibtex = store.get_entry_bibtex(citation_key)
            if bibtex:
                seed_entry = parse_bibtex(bibtex)[0]
                seed_entry.fields["openalex"] = openalex_id
                store.replace_entry(
                    citation_key,
                    seed_entry,
                    source_type="resolver",
                    source_label=f"openalex:id:{openalex_id}",
                    review_status=str(entry.get("review_status") or "draft"),
                )

        # The API filter direction is inverted relative to our relation name.
        filter_name = "cited_by" if relation_type == "cites" else "cites"
        query = urlencode({"filter": f"{filter_name}:{openalex_id}", "per-page": limit})
        payload = self.resolver.source_client.get_json(f"https://api.openalex.org/works?{query}")
        works = payload.get("results", [])

        results: list[ExpansionResult] = []
        for work in works:
            discovered = _openalex_work_to_entry(work)
            created = False
            if store.get_entry(discovered.citation_key) is None:
                store.upsert_entry(
                    discovered,
                    raw_bibtex=None,
                    source_type="graph_expand",
                    source_label=f"openalex:{relation_type}:{openalex_id}",
                    review_status="draft",
                )
                # Commit per new entry so partial progress survives failures.
                store.connection.commit()
                created = True

            # Orient the stored "cites" edge to match the requested relation.
            if relation_type == "cites":
                source_key = citation_key
                target_key = discovered.citation_key
            else:
                source_key = discovered.citation_key
                target_key = citation_key

            store.add_relation(
                source_key,
                target_key,
                "cites",
                source_type="graph_expand",
                source_label=f"openalex:{relation_type}:{openalex_id}",
                confidence=0.9,
            )
            results.append(
                ExpansionResult(
                    source_citation_key=source_key,
                    discovered_citation_key=discovered.citation_key,
                    created_entry=created,
                    relation_type=relation_type,
                    source_label=f"openalex:{relation_type}:{openalex_id}",
                )
            )
        return results

    def _lookup_openalex_id(self, entry: dict[str, object]) -> str | None:
        """Resolve an entry's OpenAlex id via its DOI; None when unavailable."""
        doi = entry.get("doi")
        if not doi:
            return None
        query = urlencode({"filter": f"doi:https://doi.org/{doi}"})
        payload = self.resolver.source_client.get_json(f"https://api.openalex.org/works?{query}")
        results = payload.get("results", [])
        if not results:
            return None
        return _normalize_openalex_id(results[0].get("id", ""))
|
||||
|
||||
|
||||
class TopicExpander:
    """Grow a topic's entry set by expanding each seed entry's citation graph
    and assigning sufficiently relevant discoveries back to the topic."""

    def __init__(
        self,
        crossref_expander: CrossrefExpander | None = None,
        openalex_expander: OpenAlexExpander | None = None,
    ) -> None:
        self.crossref_expander = crossref_expander or CrossrefExpander()
        self.openalex_expander = openalex_expander or OpenAlexExpander()

    def expand_topic(
        self,
        store: BibliographyStore,
        topic_slug: str,
        topic_phrase: str | None = None,
        source: str = "openalex",
        relation_type: str = "cites",
        seed_limit: int = 25,
        per_seed_limit: int = 25,
        min_relevance: float = 0.2,
        seed_keys: list[str] | None = None,
        preview_only: bool = False,
    ) -> list[TopicExpansionResult]:
        """Expand each seed of the topic and score discoveries for relevance.

        With preview_only=True nothing is written; discoveries are fetched
        and scored only. Otherwise discoveries are materialized through the
        configured expander and those clearing the relevance threshold are
        assigned to the topic. Returns one result row per discovery.
        """
        topic = store.get_topic(topic_slug)
        if topic is None:
            return []

        # The scoring phrase defaults to the topic's display name.
        phrase = (topic_phrase or str(topic.get("name") or topic_slug)).strip()
        seeds = store.list_topic_entries(topic_slug, limit=seed_limit)
        if seed_keys:
            # Restrict expansion to an explicit subset of seeds when given.
            allowed = set(seed_keys)
            seeds = [seed for seed in seeds if str(seed["citation_key"]) in allowed]
        results: list[TopicExpansionResult] = []

        for seed in seeds:
            seed_key = str(seed["citation_key"])
            if preview_only:
                discovered_rows = self._preview_discoveries(
                    store,
                    seed_key,
                    source=source,
                    relation_type=relation_type,
                    limit=per_seed_limit,
                )
            else:
                discovered_rows = self._materialized_discoveries(
                    store,
                    seed_key,
                    source=source,
                    relation_type=relation_type,
                    limit=per_seed_limit,
                )

            for row, target_entry in discovered_rows:
                score = _topic_relevance_score(phrase, target_entry)
                meets_threshold = _meets_topic_assignment_threshold(
                    phrase,
                    target_entry,
                    min_relevance=min_relevance,
                    relevance_score=score,
                )
                assigned = False
                if not preview_only and meets_threshold and target_entry is not None:
                    assigned = store.add_entry_topic(
                        row.discovered_citation_key,
                        topic_slug=topic_slug,
                        topic_name=str(topic.get("name") or topic_slug),
                        source_type="topic_expand",
                        source_url=str(topic.get("source_url") or ""),
                        source_label=f"{source}:{relation_type}:{seed_key}",
                        confidence=score,
                    )
                # NOTE(review): target_entry can be None on the materialized
                # path (store.get_entry miss); this line would then raise —
                # confirm get_entry cannot miss right after an upsert.
                results.append(
                    TopicExpansionResult(
                        topic_slug=topic_slug,
                        source_citation_key=row.source_citation_key,
                        discovered_citation_key=row.discovered_citation_key,
                        discovered_title=str(target_entry.get("title") or ""),
                        created_entry=row.created_entry,
                        relation_type=row.relation_type,
                        source_label=row.source_label,
                        relevance_score=score,
                        meets_relevance_threshold=meets_threshold,
                        assigned_to_topic=assigned,
                    )
                )
        store.connection.commit()
        return results

    def _materialized_discoveries(
        self,
        store: BibliographyStore,
        citation_key: str,
        source: str,
        relation_type: str,
        limit: int,
    ) -> list[tuple[ExpansionResult, dict[str, object] | None]]:
        """Run the real expander for the seed and pair each row with its stored entry."""
        if source == "crossref":
            expansion_rows = self.crossref_expander.expand_entry_references(store, citation_key)
        else:
            expansion_rows = self.openalex_expander.expand_entry(
                store,
                citation_key,
                relation_type=relation_type,
                limit=limit,
            )
        return [(row, store.get_entry(row.discovered_citation_key)) for row in expansion_rows]

    def _preview_discoveries(
        self,
        store: BibliographyStore,
        citation_key: str,
        source: str,
        relation_type: str,
        limit: int,
    ) -> list[tuple[ExpansionResult, dict[str, object]]]:
        """Dispatch to the source-specific read-only preview helper."""
        if source == "crossref":
            return self._preview_crossref_discoveries(store, citation_key, limit)
        return self._preview_openalex_discoveries(store, citation_key, relation_type, limit)

    def _preview_crossref_discoveries(
        self,
        store: BibliographyStore,
        citation_key: str,
        limit: int,
    ) -> list[tuple[ExpansionResult, dict[str, object]]]:
        """Fetch Crossref references for the seed WITHOUT writing to the store."""
        entry = store.get_entry(citation_key)
        if entry is None or not entry.get("doi"):
            return []
        doi = str(entry["doi"])
        # NOTE(review): contact address is hardcoded for Crossref's "polite
        # pool"; consider making it configurable.
        payload = self.crossref_expander.resolver.source_client.get_json(
            f"https://api.crossref.org/works/{doi}?mailto=welsberr@gmail.com"
        )
        references = payload.get("message", {}).get("reference", [])[:limit]
        rows: list[tuple[ExpansionResult, dict[str, object]]] = []
        for index, reference in enumerate(references, start=1):
            discovered = _crossref_reference_to_entry(reference, citation_key, index)
            rows.append(
                (
                    ExpansionResult(
                        source_citation_key=citation_key,
                        discovered_citation_key=discovered.citation_key,
                        # "would be created": key not yet present in the store.
                        created_entry=store.get_entry(discovered.citation_key) is None,
                        relation_type="cites",
                        source_label=f"crossref:references:{doi}",
                    ),
                    dict(discovered.fields),
                )
            )
        return rows

    def _preview_openalex_discoveries(
        self,
        store: BibliographyStore,
        citation_key: str,
        relation_type: str,
        limit: int,
    ) -> list[tuple[ExpansionResult, dict[str, object]]]:
        """Fetch OpenAlex discoveries for the seed WITHOUT writing to the store."""
        entry = store.get_entry(citation_key)
        if entry is None:
            return []
        openalex_id = entry.get("openalex") or self.openalex_expander._lookup_openalex_id(entry)
        if not openalex_id:
            return []
        # The API filter direction is inverted relative to our relation name.
        filter_name = "cited_by" if relation_type == "cites" else "cites"
        query = urlencode({"filter": f"{filter_name}:{openalex_id}", "per-page": limit})
        payload = self.openalex_expander.resolver.source_client.get_json(f"https://api.openalex.org/works?{query}")
        works = payload.get("results", [])
        rows: list[tuple[ExpansionResult, dict[str, object]]] = []
        for work in works:
            discovered = _openalex_work_to_entry(work)
            source_key = citation_key if relation_type == "cites" else discovered.citation_key
            rows.append(
                (
                    ExpansionResult(
                        source_citation_key=source_key,
                        discovered_citation_key=discovered.citation_key,
                        created_entry=store.get_entry(discovered.citation_key) is None,
                        relation_type=relation_type,
                        source_label=f"openalex:{relation_type}:{openalex_id}",
                    ),
                    dict(discovered.fields),
                )
            )
        return rows
|
||||
|
||||
|
||||
def _crossref_reference_to_entry(reference: dict, source_citation_key: str, ordinal: int) -> BibEntry:
    """Convert a raw Crossref reference blob into a minimal BibEntry.

    Only fields Crossref actually supplied are emitted; a provenance note
    always records which entry the reference was discovered from.
    """
    best_title = (
        reference.get("article-title")
        or reference.get("volume-title")
        or reference.get("journal-title")
        or reference.get("unstructured")
        or f"Referenced work {ordinal}"
    )
    publication_year = str(reference.get("year") or "")
    author_text = reference.get("author") or ""
    reference_doi = reference.get("DOI") or ""
    venue = reference.get("journal-title") or ""

    fields: dict[str, str] = {}
    fields["title"] = _normalize_text(best_title)
    fields["note"] = f"discovered_from = {{{source_citation_key}}}"
    if publication_year:
        fields["year"] = publication_year
    if author_text:
        fields["author"] = _normalize_text(author_text)
    if reference_doi:
        fields["doi"] = reference_doi
        fields["url"] = f"https://doi.org/{reference_doi}"
    if venue:
        fields["journal"] = _normalize_text(venue)

    return BibEntry(
        entry_type="article" if venue else "misc",
        citation_key=_reference_citation_key(reference, best_title, publication_year, ordinal),
        fields=fields,
    )
|
||||
|
||||
|
||||
def _reference_citation_key(reference: dict, title: str, year: str, ordinal: int) -> str:
|
||||
if doi := reference.get("DOI"):
|
||||
suffix = re.sub(r"[^A-Za-z0-9]+", "", doi).lower()
|
||||
return f"doi{suffix}"
|
||||
|
||||
author = reference.get("author") or "ref"
|
||||
family = author.split(",")[0].split()[-1]
|
||||
family = re.sub(r"[^A-Za-z0-9]+", "", family).lower() or "ref"
|
||||
first_word = re.sub(r"[^A-Za-z0-9]+", "", title.split()[0]).lower() if title.split() else "untitled"
|
||||
return f"{family}{year or 'nd'}{first_word}{ordinal}"
|
||||
|
||||
|
||||
def _normalize_text(value: str) -> str:
|
||||
return " ".join(value.split())
|
||||
|
||||
|
||||
def _topic_relevance_score(topic_phrase: str, entry: dict[str, object] | None) -> float:
    """Score (capped to 1.0) how relevant an entry looks for a topic phrase.

    Combines weighted keyword overlap across title (0.6), abstract (0.25),
    keywords (0.1), and venue fields (0.05). A verbatim (case/whitespace
    insensitive) phrase match in the title raises the score to at least 0.75.
    Returns 0.0 for a missing entry or an empty phrase.
    """
    if entry is None:
        return 0.0
    topic_terms = _expanded_keyword_terms(topic_phrase)
    if not topic_terms:
        return 0.0
    title_terms = _expanded_keyword_terms(str(entry.get("title") or ""))
    abstract_terms = _expanded_keyword_terms(str(entry.get("abstract") or ""))
    keyword_terms = _expanded_keyword_terms(str(entry.get("keywords") or ""))
    venue_terms = _expanded_keyword_terms(" ".join(str(entry.get(field) or "") for field in ("journal", "booktitle")))

    # Title overlap dominates; abstract/keyword/venue overlap contribute less.
    score = 0.0
    score += 0.6 * _term_overlap_ratio(topic_terms, title_terms)
    score += 0.25 * _term_overlap_ratio(topic_terms, abstract_terms)
    score += 0.1 * _term_overlap_ratio(topic_terms, keyword_terms)
    score += 0.05 * _term_overlap_ratio(topic_terms, venue_terms)

    # A literal phrase hit in the title is strong evidence on its own.
    phrase = _normalize_text(topic_phrase.casefold())
    title = _normalize_text(str(entry.get("title") or "").casefold())
    if phrase and title and phrase in title:
        score = max(score, 0.75)

    return min(score, 1.0)
|
||||
|
||||
|
||||
def _meets_topic_assignment_threshold(
    topic_phrase: str,
    entry: dict[str, object] | None,
    min_relevance: float,
    relevance_score: float | None = None,
) -> bool:
    """Decide whether *entry* is relevant enough to be filed under the topic.

    Requires both an overall relevance of at least *min_relevance* (computed
    here unless the caller supplies *relevance_score*) and a minimal
    anchoring (>= 0.2) of the topic's core terms in the entry title.
    """
    if entry is None:
        return False
    if relevance_score is None:
        relevance_score = _topic_relevance_score(topic_phrase, entry)
    if relevance_score < min_relevance:
        return False
    anchor = _title_topic_anchor_ratio(topic_phrase, str(entry.get("title") or ""))
    return anchor >= 0.2
|
||||
|
||||
|
||||
def _keyword_terms(text: str) -> set[str]:
    """Extract normalized keyword stems from *text*.

    Splits on non-alphanumeric boundaries, drops tokens shorter than four
    characters, and stems each survivor via _normalize_keyword.
    """
    tokens = re.findall(r"[A-Za-z0-9]+", text.casefold())
    return {_normalize_keyword(token) for token in tokens if len(token) >= 4}
|
||||
|
||||
|
||||
def _expanded_keyword_terms(text: str) -> set[str]:
    """Return the keyword terms of *text* plus their domain-related synonyms."""
    base_terms = _keyword_terms(text)
    related = {
        related_term
        for term in base_terms
        for related_term in _related_topic_terms(term)
    }
    return base_terms | related
|
||||
|
||||
|
||||
def _title_topic_anchor_ratio(topic_phrase: str, title: str) -> float:
    """Measure how strongly *topic_phrase* is anchored in *title*.

    Returns 1.0 for a verbatim (case-folded) phrase match; otherwise the
    fraction of core topic terms present in the title, floored at 0.25 when
    at least one term overlaps; 0.0 when nothing matches.
    """
    phrase = _normalize_text(topic_phrase.casefold())
    folded_title = _normalize_text(title.casefold())
    if phrase and folded_title and phrase in folded_title:
        return 1.0

    core_terms = _core_topic_terms(topic_phrase)
    title_terms = _keyword_terms(title)
    if not core_terms or not title_terms:
        return 0.0

    shared = core_terms & title_terms
    if not shared:
        return 0.0
    return max(0.25, len(shared) / len(core_terms))
|
||||
|
||||
|
||||
def _core_topic_terms(topic_phrase: str) -> set[str]:
    """Keyword terms of the topic phrase minus words too generic to anchor on."""
    too_generic = {"evolution", "origin", "origins", "science", "study", "studies"}
    return _keyword_terms(topic_phrase) - too_generic
|
||||
|
||||
|
||||
def _term_overlap_ratio(topic_terms: set[str], candidate_terms: set[str]) -> float:
|
||||
if not topic_terms or not candidate_terms:
|
||||
return 0.0
|
||||
return len(topic_terms & candidate_terms) / len(topic_terms)
|
||||
|
||||
|
||||
def _normalize_keyword(term: str) -> str:
|
||||
normalized = term.casefold()
|
||||
for suffix in ("isms", "ists", "ation", "ment", "ings", "ness", "isms", "ism", "ist", "ing", "ers", "er", "ies", "ied", "ed", "es", "s"):
|
||||
if len(normalized) > len(suffix) + 3 and normalized.endswith(suffix):
|
||||
if suffix in {"ies", "ied"}:
|
||||
return normalized[: -len(suffix)] + "y"
|
||||
return normalized[: -len(suffix)]
|
||||
return normalized
|
||||
|
||||
|
||||
def _related_topic_terms(term: str) -> set[str]:
|
||||
related_groups = (
|
||||
{"human", "hominid", "hominin", "homo"},
|
||||
{"chimpanzee", "chimp", "pan", "ape", "apes", "primate"},
|
||||
{"primate", "primate", "ape", "apes", "hominid", "hominin"},
|
||||
{"evolution", "evolutionary", "phylogeny", "phylogen", "ancestor", "ancestral"},
|
||||
{"origin", "origins", "abiogenesis", "prebiotic"},
|
||||
{"morphometry", "morphology", "cranial", "dental", "skeletal", "body"},
|
||||
{"paleolithic", "paleoanthropology", "paleoanthropological", "pleistocene", "fossil"},
|
||||
)
|
||||
for group in related_groups:
|
||||
if term in group:
|
||||
return group - {term}
|
||||
return set()
|
||||
|
||||
|
||||
def _openalex_work_to_entry(work: dict) -> BibEntry:
    """Convert an OpenAlex work payload into a BibEntry.

    Reconstructs the abstract from OpenAlex's inverted index when present and
    files the venue under ``journal`` for articles or ``booktitle`` otherwise.
    """
    title = _normalize_text(work.get("display_name", "") or "Untitled work")
    year = str(work.get("publication_year") or "")
    doi = _normalize_openalex_doi(work.get("doi"))
    openalex_id = _normalize_openalex_id(work.get("id", ""))
    authors = " and ".join(_openalex_author_name(item) for item in work.get("authorships", []))
    primary_location = work.get("primary_location") or {}
    venue = (primary_location.get("source") or {}).get("display_name", "")
    work_type = work.get("type", "")

    fields: dict[str, str] = {"title": title}
    if year:
        fields["year"] = year
    if authors:
        fields["author"] = authors
    if doi:
        fields["doi"] = doi
        fields["url"] = f"https://doi.org/{doi}"
    if openalex_id:
        fields["openalex"] = openalex_id
    inverted_abstract = work.get("abstract_inverted_index")
    if inverted_abstract:
        fields["abstract"] = _openalex_abstract_text(inverted_abstract)
    if venue:
        venue_field = "journal" if work_type == "article" else "booktitle"
        fields[venue_field] = venue

    return BibEntry(
        entry_type=_openalex_type_to_bibtype(work_type),
        citation_key=_openalex_citation_key(openalex_id, authors, year, title),
        fields=fields,
    )
|
||||
|
||||
|
||||
def _openalex_author_name(authorship: dict) -> str:
    """Extract the whitespace-normalized display name from an authorship dict."""
    author_payload = authorship.get("author") or {}
    return _normalize_text(author_payload.get("display_name", ""))
|
||||
|
||||
|
||||
def _openalex_abstract_text(inverted_index: dict) -> str:
|
||||
positions: dict[int, str] = {}
|
||||
for word, indexes in inverted_index.items():
|
||||
for index in indexes:
|
||||
positions[int(index)] = word
|
||||
return " ".join(word for _, word in sorted(positions.items()))
|
||||
|
||||
|
||||
def _openalex_type_to_bibtype(work_type: str) -> str:
|
||||
mapping = {
|
||||
"article": "article",
|
||||
"book": "book",
|
||||
"book-chapter": "incollection",
|
||||
"dissertation": "phdthesis",
|
||||
"proceedings-article": "inproceedings",
|
||||
}
|
||||
return mapping.get(work_type, "misc")
|
||||
|
||||
|
||||
def _openalex_citation_key(openalex_id: str, authors: str, year: str, title: str) -> str:
|
||||
if openalex_id:
|
||||
return f"openalex{re.sub(r'[^A-Za-z0-9]+', '', openalex_id).lower()}"
|
||||
author = authors.split(" and ")[0] if authors else "ref"
|
||||
family = re.sub(r"[^A-Za-z0-9]+", "", author.split()[-1]).lower() or "ref"
|
||||
first_word = re.sub(r"[^A-Za-z0-9]+", "", title.split()[0]).lower() if title.split() else "untitled"
|
||||
return f"{family}{year or 'nd'}{first_word}"
|
||||
|
||||
|
||||
def _normalize_openalex_id(value: str) -> str:
|
||||
if not value:
|
||||
return ""
|
||||
return value.rsplit("/", 1)[-1]
|
||||
|
||||
|
||||
def _normalize_openalex_doi(value: str | None) -> str:
|
||||
if not value:
|
||||
return ""
|
||||
if value.startswith("https://doi.org/"):
|
||||
return value[len("https://doi.org/") :]
|
||||
return value
|
||||
|
|
@ -0,0 +1,201 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
|
||||
from .bibtex import BibEntry
|
||||
|
||||
YEAR_PATTERN = re.compile(r"\b(19|20)\d{2}\b")
|
||||
YEAR_PAREN_PATTERN = re.compile(r"\((19|20)\d{2}\)")
|
||||
REF_START_PATTERN = re.compile(r"^(?:\[\d+\]|\d+\.|\(\d+\))\s*")
|
||||
|
||||
|
||||
def extract_references(text: str) -> list[BibEntry]:
    """Parse free-text bibliography *text* into BibEntry objects.

    Splits the text into candidate reference blocks, then keeps every block
    that one of the style-specific parsers can interpret.
    """
    blocks = _iter_reference_blocks(text)
    candidates = (
        _parse_reference_line(block, ordinal)
        for ordinal, block in enumerate(blocks, start=1)
    )
    return [entry for entry in candidates if entry is not None]
|
||||
|
||||
|
||||
def render_extracted_bibtex(text: str) -> str:
    """Extract references from *text* and render them as a BibTeX string."""
    # Imported lazily, mirroring the original, to avoid a circular import.
    from .bibtex import render_bibtex

    entries = extract_references(text)
    return render_bibtex(entries)
|
||||
|
||||
|
||||
def _iter_reference_blocks(text: str) -> list[str]:
    """Group the lines of *text* into whole-reference strings.

    A blank line or a new "[1]" / "1." / "(1)" marker ends the current
    reference; continuation lines are joined with spaces. Lines shorter than
    20 characters after normalization are treated as noise and dropped.
    """
    blocks: list[str] = []
    pending: list[str] = []

    def flush() -> None:
        if pending:
            blocks.append(" ".join(pending))
            pending.clear()

    for raw_line in text.splitlines():
        stripped = raw_line.strip()
        if not stripped:
            flush()
            continue
        has_marker = REF_START_PATTERN.match(stripped) is not None
        normalized = " ".join(REF_START_PATTERN.sub("", stripped).split())
        if len(normalized) < 20:
            continue
        if has_marker:
            flush()
        pending.append(normalized)
    flush()
    return blocks
|
||||
|
||||
|
||||
def _parse_reference_line(line: str, ordinal: int) -> BibEntry | None:
    """Try each reference-style parser in turn; return the first success or None."""
    parsers = (
        _parse_apa_style_reference,
        _parse_publisher_style_reference,
        _parse_plain_year_reference,
    )
    for parser in parsers:
        entry = parser(line, ordinal)
        if entry is not None:
            return entry
    return None
|
||||
|
||||
|
||||
def _parse_apa_style_reference(line: str, ordinal: int) -> BibEntry | None:
    """Parse an APA-style reference: "Author(s). (YYYY). Title. Venue."."""
    match = YEAR_PAREN_PATTERN.search(line)
    if match is None:
        return None

    year = match.group(0).strip("()")
    before_year = line[: match.start()].strip(" .")
    after_year = line[match.end() :].strip(" .")
    if not (before_year and after_year):
        return None

    segments = _segments_after_year(after_year)
    if not segments:
        return None

    venue = segments[1] if len(segments) > 1 else ""
    return _build_entry(
        line,
        ordinal,
        _normalize_authors(before_year),
        year,
        _clean_title(segments[0]),
        venue,
    )
|
||||
|
||||
|
||||
def _parse_publisher_style_reference(line: str, ordinal: int) -> BibEntry | None:
    """Parse a publisher-style book reference: "Author. Title. Publisher, YYYY"."""
    match = YEAR_PATTERN.search(line)
    if match is None:
        return None
    year = match.group(0)

    prefix = line[: match.start()].strip(" .,;")
    if "." not in prefix:
        return None
    head, _, raw_publisher = prefix.rpartition(".")
    if "." not in head:
        return None
    author_part, _, raw_title = head.partition(".")

    authors = _normalize_authors(author_part)
    title = _clean_title(raw_title)
    publisher = raw_publisher.strip(" .,;")
    if not (authors and title and publisher):
        return None

    return BibEntry(
        entry_type="book",
        citation_key=_make_citation_key(authors, year, title, ordinal),
        fields={
            "author": authors,
            "year": year,
            "title": title,
            "publisher": publisher,
            # The raw line is preserved so the extraction can be audited later.
            "note": f"extracted_reference = {{true}}; raw_reference = {{{line}}}",
        },
    )
|
||||
|
||||
|
||||
def _parse_plain_year_reference(line: str, ordinal: int) -> BibEntry | None:
    """Parse "Author(s). YYYY. Title. Venue." references (year not parenthesized)."""
    match = YEAR_PATTERN.search(line)
    if match is None:
        return None

    before_year = line[: match.start()].strip(" .")
    after_year = line[match.end() :].strip(" .")
    if not before_year or not after_year:
        return None

    segments = _segments_after_year(after_year)
    if not segments:
        return None

    venue = segments[1] if len(segments) > 1 else ""
    return _build_entry(
        line,
        ordinal,
        _normalize_authors(before_year),
        match.group(0),
        _clean_title(segments[0]),
        venue,
    )
|
||||
|
||||
|
||||
def _normalize_authors(author_part: str) -> str:
|
||||
normalized = author_part.replace(" & ", " and ")
|
||||
normalized = re.sub(r"\bet al\.?$", "and others", normalized)
|
||||
normalized = re.sub(r"\s+and\s+", " and ", normalized)
|
||||
normalized = re.sub(r"\s*,\s*", ", ", normalized)
|
||||
return normalized.strip(" .")
|
||||
|
||||
|
||||
def _segments_after_year(remainder: str) -> list[str]:
|
||||
return [segment.strip(" .") for segment in remainder.split(". ") if segment.strip(" .")]
|
||||
|
||||
|
||||
def _clean_title(title: str) -> str:
|
||||
cleaned = title.strip(" .\"'")
|
||||
cleaned = re.sub(r"\s+", " ", cleaned)
|
||||
return cleaned
|
||||
|
||||
|
||||
def _build_entry(
    raw_line: str,
    ordinal: int,
    authors: str,
    year: str,
    title: str,
    venue: str,
) -> BibEntry:
    """Assemble a BibEntry from parsed reference parts.

    The venue is filed under "journal", "booktitle", or "howpublished"
    depending on the guessed entry type; the raw reference line is kept in
    the note field for auditability.
    """
    entry_type = _guess_entry_type(venue)
    fields: dict[str, str] = {
        "author": authors,
        "year": year,
        "title": title,
        "note": f"extracted_reference = {{true}}; raw_reference = {{{raw_line}}}",
    }
    if venue:
        venue_field_by_type = {"article": "journal", "inproceedings": "booktitle"}
        fields[venue_field_by_type.get(entry_type, "howpublished")] = venue

    return BibEntry(
        entry_type=entry_type,
        citation_key=_make_citation_key(authors, year, title, ordinal),
        fields=fields,
    )
|
||||
|
||||
|
||||
def _make_citation_key(authors: str, year: str, title: str, ordinal: int) -> str:
|
||||
first_author = authors.split(" and ")[0]
|
||||
family_name = first_author.split(",")[0] if "," in first_author else first_author.split()[-1]
|
||||
family_name = re.sub(r"[^A-Za-z0-9]+", "", family_name).lower() or "ref"
|
||||
|
||||
first_word = re.sub(r"[^A-Za-z0-9]+", "", title.split()[0]).lower() if title.split() else "untitled"
|
||||
if not first_word:
|
||||
first_word = "untitled"
|
||||
return f"{family_name}{year}{first_word}{ordinal}"
|
||||
|
||||
|
||||
def _guess_entry_type(venue: str) -> str:
|
||||
lowered = venue.lower()
|
||||
if any(token in lowered for token in ("journal", "transactions", "review", "letters")):
|
||||
return "article"
|
||||
if any(token in lowered for token in ("proceedings", "conference", "workshop", "symposium")):
|
||||
return "inproceedings"
|
||||
if any(token in lowered for token in ("press", "publisher", "university")):
|
||||
return "book"
|
||||
return "misc"
|
||||
|
|
@ -0,0 +1,317 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from urllib.parse import urlencode
|
||||
import xml.etree.ElementTree as ET
|
||||
|
||||
from .bibtex import BibEntry
|
||||
from .sources import SourceClient
|
||||
|
||||
# XML namespace map for OAI-PMH responses: the protocol envelope (oai),
# Dublin Core payloads (oai_dc / dc), and MODS payloads (mods).
NS = {
    "oai": "http://www.openarchives.org/OAI/2.0/",
    "oai_dc": "http://www.openarchives.org/OAI/2.0/oai_dc/",
    "dc": "http://purl.org/dc/elements/1.1/",
    "mods": "http://www.loc.gov/mods/v3",
}
|
||||
|
||||
|
||||
@dataclass(slots=True)
class HarvestResult:
    """One harvested OAI-PMH record converted to a bibliography entry."""

    base_url: str  # repository endpoint the record came from
    identifier: str  # OAI record identifier within that repository
    entry: BibEntry  # the converted bibliographic entry
|
||||
|
||||
|
||||
@dataclass(slots=True)
class OaiSet:
    """A set advertised by an OAI-PMH repository's ListSets response."""

    set_spec: str  # machine identifier used as the `set` request parameter
    set_name: str  # human-readable set name
    set_description: str = ""  # flattened <setDescription> text, may be empty
|
||||
|
||||
|
||||
@dataclass(slots=True)
class OaiMetadataFormat:
    """A metadata format advertised by a ListMetadataFormats response."""

    metadata_prefix: str  # value to pass as `metadataPrefix` in requests
    schema: str  # XML schema location for the format
    metadata_namespace: str  # XML namespace URI of the format's payload
|
||||
|
||||
|
||||
class OaiPmhHarvester:
    """Client for the OAI-PMH protocol verbs, converting records to BibEntry.

    All HTTP/XML access goes through the injected SourceClient; responses
    are parsed with the namespace map ``NS``. Record conversion is delegated
    to the module-level ``_metadata_node_to_entry`` dispatcher.
    """

    def __init__(self, source_client: SourceClient | None = None) -> None:
        """Use *source_client* for HTTP, or construct a default SourceClient."""
        self.source_client = source_client or SourceClient()

    def identify(self, base_url: str) -> dict[str, str]:
        """Fetch the repository's Identify response as a flat field dict.

        Returns {} when the response carries no <Identify> element. Missing
        sub-fields are present with "" values.
        """
        root = self.source_client.get_xml(f"{base_url}?{urlencode({'verb': 'Identify'})}")
        identify = root.find(".//oai:Identify", NS)
        if identify is None:
            return {}
        payload: dict[str, str] = {}
        for field_name in (
            "repositoryName",
            "baseURL",
            "protocolVersion",
            "adminEmail",
            "earliestDatestamp",
            "deletedRecord",
            "granularity",
        ):
            payload[field_name] = _node_text(identify.find(f"oai:{field_name}", NS))
        return payload

    def list_sets(self, base_url: str) -> list[OaiSet]:
        """Fetch the repository's sets via the ListSets verb."""
        root = self.source_client.get_xml(f"{base_url}?{urlencode({'verb': 'ListSets'})}")
        sets = root.findall(".//oai:set", NS)
        results: list[OaiSet] = []
        for node in sets:
            results.append(
                OaiSet(
                    set_spec=_node_text(node.find("oai:setSpec", NS)),
                    set_name=_node_text(node.find("oai:setName", NS)),
                    set_description=_flatten_set_description(node.find("oai:setDescription", NS)),
                )
            )
        return results

    def list_metadata_formats(self, base_url: str, identifier: str | None = None) -> list[OaiMetadataFormat]:
        """Fetch supported metadata formats, optionally for one record only."""
        params = {"verb": "ListMetadataFormats"}
        if identifier:
            params["identifier"] = identifier
        root = self.source_client.get_xml(f"{base_url}?{urlencode(params)}")
        formats = root.findall(".//oai:metadataFormat", NS)
        results: list[OaiMetadataFormat] = []
        for node in formats:
            results.append(
                OaiMetadataFormat(
                    metadata_prefix=_node_text(node.find("oai:metadataPrefix", NS)),
                    schema=_node_text(node.find("oai:schema", NS)),
                    metadata_namespace=_node_text(node.find("oai:metadataNamespace", NS)),
                )
            )
        return results

    def list_records(
        self,
        base_url: str,
        metadata_prefix: str = "oai_dc",
        set_spec: str | None = None,
        date_from: str | None = None,
        date_until: str | None = None,
        limit: int | None = None,
    ) -> list[HarvestResult]:
        """Harvest records via ListRecords, following resumption tokens.

        Optional *set_spec*, *date_from*, and *date_until* narrow the
        selective harvest; *limit* caps how many successfully converted
        records are returned (pagination stops early once reached).
        """
        results: list[HarvestResult] = []
        params = {"verb": "ListRecords", "metadataPrefix": metadata_prefix}
        if set_spec:
            params["set"] = set_spec
        if date_from:
            params["from"] = date_from
        if date_until:
            params["until"] = date_until

        # The ordinal counts every record seen (even unconvertible ones) so
        # generated citation keys stay stable across pages.
        ordinal = 1
        next_url = f"{base_url}?{urlencode(params)}"
        while next_url:
            root = self.source_client.get_xml(next_url)
            records = root.findall(".//oai:record", NS)
            for record in records:
                parsed = self._record_to_result(base_url, record, ordinal)
                ordinal += 1
                if parsed is not None:
                    results.append(parsed)
                    if limit is not None and len(results) >= limit:
                        return results
            next_url = self._resumption_url(base_url, root)
        return results

    def get_record(
        self,
        base_url: str,
        identifier: str,
        metadata_prefix: str = "oai_dc",
    ) -> HarvestResult | None:
        """Fetch one record by identifier via GetRecord; None when absent."""
        params = {
            "verb": "GetRecord",
            "metadataPrefix": metadata_prefix,
            "identifier": identifier,
        }
        root = self.source_client.get_xml(f"{base_url}?{urlencode(params)}")
        record = root.find(".//oai:record", NS)
        if record is None:
            return None
        return self._record_to_result(base_url, record, 1)

    def _record_to_result(self, base_url: str, record: ET.Element, ordinal: int) -> HarvestResult | None:
        """Convert one <record> element; None when header or metadata is missing."""
        identifier = _node_text(record.find("./oai:header/oai:identifier", NS))
        # Deleted records carry a header but no <metadata> payload.
        metadata_node = record.find("./oai:metadata/*", NS)
        if metadata_node is None or not identifier:
            return None

        entry = _metadata_node_to_entry(base_url, identifier, metadata_node, ordinal)
        return HarvestResult(base_url=base_url, identifier=identifier, entry=entry)

    def _resumption_url(self, base_url: str, root: ET.Element) -> str | None:
        """Build the next-page URL from a resumptionToken; None when exhausted."""
        token = _node_text(root.find(".//oai:resumptionToken", NS))
        if not token:
            return None
        return f"{base_url}?{urlencode({'verb': 'ListRecords', 'resumptionToken': token})}"
|
||||
|
||||
|
||||
def _oai_dc_to_entry(base_url: str, identifier: str, metadata: ET.Element, ordinal: int) -> BibEntry:
    """Convert an oai_dc (Dublin Core) metadata node into a BibEntry."""
    def dc_values(element_name: str) -> list[str]:
        return _all_text(metadata.findall(f"dc:{element_name}", NS))

    titles = dc_values("title")
    creators = dc_values("creator")
    dates = dc_values("date")
    descriptions = dc_values("description")
    identifiers = dc_values("identifier")
    publishers = dc_values("publisher")
    type_values = [value.lower() for value in dc_values("type")]

    title = titles[0] if titles else "Untitled record"
    year = _first_year(dates)

    fields: dict[str, str] = {
        "title": title,
        "oai": identifier,
        # Prefer a real URL from dc:identifier; otherwise link to the record itself.
        "url": _best_identifier_url(identifiers) or f"{base_url}?verb=GetRecord&identifier={identifier}&metadataPrefix=oai_dc",
        "note": "harvested_from = {oai_pmh}",
    }
    if creators:
        fields["author"] = " and ".join(creators)
    if year:
        fields["year"] = year
    if descriptions:
        fields["abstract"] = descriptions[0]
    if publishers:
        fields["publisher"] = publishers[0]

    return BibEntry(
        entry_type=_guess_oai_entry_type(type_values),
        citation_key=_oai_citation_key(creators, year, title, ordinal),
        fields=fields,
    )
|
||||
|
||||
|
||||
def _mods_to_entry(base_url: str, identifier: str, metadata: ET.Element, ordinal: int) -> BibEntry:
    """Convert a MODS metadata node into a BibEntry.

    Theses/dissertations (by genre) become @phdthesis; otherwise a related
    item title (e.g. a host journal) marks the record as an @article in that
    venue; everything else is @misc.
    """
    title = _node_text(metadata.find(".//mods:titleInfo/mods:title", NS)) or "Untitled record"
    sub_title = _node_text(metadata.find(".//mods:titleInfo/mods:subTitle", NS))
    if sub_title:
        title = f"{title}: {sub_title}"

    creators: list[str] = []
    for name in metadata.findall(".//mods:name", NS):
        role_terms = [term.text or "" for term in name.findall(".//mods:roleTerm", NS)]
        # Keep names with no role information; otherwise require an "author" role.
        if role_terms and not any(term.lower() == "author" for term in role_terms):
            continue
        parts = [_node_text(part) for part in name.findall("./mods:namePart", NS)]
        parts = [part for part in parts if part]
        if parts:
            # Exactly two parts are assumed to be family/given and joined
            # "Family, Given" — TODO confirm against real repository data.
            creators.append(", ".join(parts) if len(parts) == 2 else " ".join(parts))

    year = ""
    for date_node in metadata.findall(".//mods:originInfo/mods:dateIssued", NS):
        text = _node_text(date_node)
        if len(text) >= 4 and text[:4].isdigit():
            year = text[:4]
            break

    publisher = _node_text(metadata.find(".//mods:originInfo/mods:publisher", NS))
    abstract = _node_text(metadata.find(".//mods:abstract", NS))
    genre = _node_text(metadata.find(".//mods:genre", NS)).lower()
    related_title = _node_text(metadata.find(".//mods:relatedItem/mods:titleInfo/mods:title", NS))
    url = _node_text(metadata.find(".//mods:location/mods:url", NS))

    # Replaces the original's `if not entry_type == "phdthesis":` plus a
    # nested `if` with an equivalent, readable if/elif/else chain.
    if "thesis" in genre or "dissertation" in genre:
        entry_type = "phdthesis"
    elif related_title:
        entry_type = "article"
    else:
        entry_type = "misc"

    fields: dict[str, str] = {
        "title": title,
        "oai": identifier,
        "url": url or f"{base_url}?verb=GetRecord&identifier={identifier}&metadataPrefix=mods",
        "note": "harvested_from = {oai_pmh_mods}",
    }
    if creators:
        fields["author"] = " and ".join(creators)
    if year:
        fields["year"] = year
    if publisher:
        fields["publisher"] = publisher
    if abstract:
        fields["abstract"] = abstract
    if related_title:
        fields["journal"] = related_title

    citation_key = _oai_citation_key(creators, year, title, ordinal)
    return BibEntry(entry_type=entry_type, citation_key=citation_key, fields=fields)
|
||||
|
||||
|
||||
def _metadata_node_to_entry(base_url: str, identifier: str, metadata: ET.Element, ordinal: int) -> BibEntry:
    """Dispatch a metadata payload node to the matching converter.

    Recognizes Dublin Core and MODS payloads by tag suffix ("...dc" /
    "...mods"); any other payload becomes a minimal @misc entry that records
    the unsupported tag in its note field.
    """
    tag = metadata.tag
    if tag.endswith("dc"):
        return _oai_dc_to_entry(base_url, identifier, metadata, ordinal)
    if tag.endswith("mods"):
        return _mods_to_entry(base_url, identifier, metadata, ordinal)

    fallback_fields = {
        "title": identifier,
        "oai": identifier,
        "url": f"{base_url}?verb=GetRecord&identifier={identifier}",
        "note": f"unsupported_oai_metadata = {{{tag}}}",
    }
    return BibEntry(
        entry_type="misc",
        citation_key=_oai_citation_key([], "", identifier, ordinal),
        fields=fallback_fields,
    )
|
||||
|
||||
|
||||
def _node_text(node: ET.Element | None) -> str:
|
||||
if node is None or node.text is None:
|
||||
return ""
|
||||
return " ".join(node.text.split())
|
||||
|
||||
|
||||
def _all_text(nodes: list[ET.Element]) -> list[str]:
    """Normalized text of each node, skipping nodes with no usable text."""
    texts = (_node_text(node) for node in nodes)
    return [text for text in texts if text]
|
||||
|
||||
|
||||
def _first_year(dates: list[str]) -> str:
|
||||
for date in dates:
|
||||
if len(date) >= 4 and date[:4].isdigit():
|
||||
return date[:4]
|
||||
return ""
|
||||
|
||||
|
||||
def _guess_oai_entry_type(types: list[str]) -> str:
|
||||
joined = " ".join(types)
|
||||
if "thesis" in joined or "dissertation" in joined:
|
||||
return "phdthesis"
|
||||
if "article" in joined:
|
||||
return "article"
|
||||
if "book" in joined:
|
||||
return "book"
|
||||
return "misc"
|
||||
|
||||
|
||||
def _best_identifier_url(identifiers: list[str]) -> str:
|
||||
for identifier in identifiers:
|
||||
if identifier.startswith("http://") or identifier.startswith("https://"):
|
||||
return identifier
|
||||
return ""
|
||||
|
||||
|
||||
def _oai_citation_key(creators: list[str], year: str, title: str, ordinal: int) -> str:
|
||||
author = creators[0] if creators else "oai"
|
||||
family = author.split(",")[0] if "," in author else author.split()[-1]
|
||||
family = "".join(ch for ch in family.lower() if ch.isalnum()) or "oai"
|
||||
first_word = "".join(ch for ch in title.split()[0].lower() if ch.isalnum()) if title.split() else "untitled"
|
||||
return f"{family}{year or 'nd'}{first_word}{ordinal}"
|
||||
|
||||
|
||||
def _flatten_set_description(node: ET.Element | None) -> str:
|
||||
if node is None:
|
||||
return ""
|
||||
parts = []
|
||||
for child in node.iter():
|
||||
if child.text and child.text.strip():
|
||||
parts.append(" ".join(child.text.split()))
|
||||
return " ".join(parts)
|
||||
|
|
@ -0,0 +1,567 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
import urllib.parse
|
||||
import xml.etree.ElementTree as ET
|
||||
from dataclasses import dataclass
|
||||
|
||||
from .bibtex import BibEntry, parse_bibtex
|
||||
from .sources import SourceClient
|
||||
|
||||
|
||||
@dataclass(slots=True)
class Resolution:
    """A bibliographic entry resolved from an external metadata source."""

    entry: BibEntry  # the resolved entry
    source_type: str  # category of source, e.g. "resolver"
    source_label: str  # provenance label, e.g. "crossref:doi:<doi>"
|
||||
|
||||
|
||||
class MetadataResolver:
|
||||
def __init__(
|
||||
self,
|
||||
user_agent: str = "citegeist/0.1 (local research tool)",
|
||||
source_client: SourceClient | None = None,
|
||||
) -> None:
|
||||
self.user_agent = user_agent
|
||||
self.source_client = source_client or SourceClient(user_agent=user_agent)
|
||||
|
||||
def resolve_entry(self, entry: BibEntry) -> Resolution | None:
|
||||
if doi := entry.fields.get("doi"):
|
||||
resolved = self.resolve_doi(doi)
|
||||
if resolved is not None:
|
||||
return resolved
|
||||
resolved = self.resolve_datacite_doi(doi)
|
||||
if resolved is not None:
|
||||
return resolved
|
||||
|
||||
if openalex_id := entry.fields.get("openalex"):
|
||||
resolved = self.resolve_openalex(openalex_id)
|
||||
if resolved is not None:
|
||||
return resolved
|
||||
|
||||
if dblp_key := entry.fields.get("dblp"):
|
||||
resolved = self.resolve_dblp(dblp_key)
|
||||
if resolved is not None:
|
||||
return resolved
|
||||
|
||||
if arxiv_id := entry.fields.get("arxiv"):
|
||||
resolved = self.resolve_arxiv(arxiv_id)
|
||||
if resolved is not None:
|
||||
return resolved
|
||||
|
||||
if title := entry.fields.get("title"):
|
||||
resolved = self.search_crossref_best_match(
|
||||
title=title,
|
||||
author_text=entry.fields.get("author", ""),
|
||||
year=entry.fields.get("year", ""),
|
||||
)
|
||||
if resolved is not None:
|
||||
return resolved
|
||||
resolved = self.search_datacite_best_match(
|
||||
title=title,
|
||||
author_text=entry.fields.get("author", ""),
|
||||
year=entry.fields.get("year", ""),
|
||||
)
|
||||
if resolved is not None:
|
||||
return resolved
|
||||
resolved = self.search_openalex_best_match(
|
||||
title=title,
|
||||
author_text=entry.fields.get("author", ""),
|
||||
year=entry.fields.get("year", ""),
|
||||
)
|
||||
if resolved is not None:
|
||||
return resolved
|
||||
|
||||
return None
|
||||
|
||||
def resolve_doi(self, doi: str) -> Resolution | None:
|
||||
encoded = urllib.parse.quote(doi, safe="")
|
||||
payload = self.source_client.get_json(f"https://api.crossref.org/works/{encoded}")
|
||||
message = payload.get("message", {})
|
||||
if not message:
|
||||
return None
|
||||
return Resolution(
|
||||
entry=_crossref_message_to_entry(message),
|
||||
source_type="resolver",
|
||||
source_label=f"crossref:doi:{doi}",
|
||||
)
|
||||
|
||||
def search_crossref(self, title: str, limit: int = 5) -> list[BibEntry]:
|
||||
query = urllib.parse.urlencode({"query.title": title, "rows": limit})
|
||||
payload = self.source_client.get_json(f"https://api.crossref.org/works?{query}")
|
||||
items = payload.get("message", {}).get("items", [])
|
||||
return [_crossref_message_to_entry(item) for item in items]
|
||||
|
||||
def search_crossref_best_match(
|
||||
self,
|
||||
title: str,
|
||||
author_text: str = "",
|
||||
year: str = "",
|
||||
) -> Resolution | None:
|
||||
candidate = _select_best_title_match(
|
||||
self.search_crossref(title, limit=5),
|
||||
title=title,
|
||||
author_text=author_text,
|
||||
year=year,
|
||||
)
|
||||
if candidate is None:
|
||||
return None
|
||||
return Resolution(
|
||||
entry=candidate,
|
||||
source_type="resolver",
|
||||
source_label=f"crossref:search:{title}",
|
||||
)
|
||||
|
||||
def resolve_dblp(self, dblp_key: str) -> Resolution | None:
|
||||
encoded_key = urllib.parse.quote(dblp_key, safe="/:")
|
||||
text = self.source_client.get_text(f"https://dblp.org/rec/{encoded_key}.bib")
|
||||
entries = parse_bibtex(text)
|
||||
if not entries:
|
||||
return None
|
||||
return Resolution(
|
||||
entry=entries[0],
|
||||
source_type="resolver",
|
||||
source_label=f"dblp:key:{dblp_key}",
|
||||
)
|
||||
|
||||
def search_dblp(self, query_text: str, limit: int = 5) -> list[BibEntry]:
|
||||
query = urllib.parse.urlencode({"q": query_text, "format": "json", "h": limit})
|
||||
payload = self.source_client.get_json(f"https://dblp.org/search/publ/api?{query}")
|
||||
hits = payload.get("result", {}).get("hits", {}).get("hit", [])
|
||||
if isinstance(hits, dict):
|
||||
hits = [hits]
|
||||
|
||||
results: list[BibEntry] = []
|
||||
for hit in hits:
|
||||
info = hit.get("info", {})
|
||||
dblp_key = info.get("key")
|
||||
if dblp_key:
|
||||
resolved = self.resolve_dblp(dblp_key)
|
||||
if resolved is not None:
|
||||
results.append(resolved.entry)
|
||||
return results
|
||||
|
||||
def resolve_arxiv(self, arxiv_id: str) -> Resolution | None:
|
||||
query = urllib.parse.urlencode({"id_list": arxiv_id})
|
||||
root = self.source_client.get_xml(f"https://export.arxiv.org/api/query?{query}")
|
||||
namespace = {"atom": "http://www.w3.org/2005/Atom"}
|
||||
entry = root.find("atom:entry", namespace)
|
||||
if entry is None:
|
||||
return None
|
||||
return Resolution(
|
||||
entry=_arxiv_atom_entry_to_bib(entry, arxiv_id),
|
||||
source_type="resolver",
|
||||
source_label=f"arxiv:id:{arxiv_id}",
|
||||
)
|
||||
|
||||
def resolve_openalex(self, openalex_id: str) -> Resolution | None:
|
||||
normalized_id = _normalize_openalex_id(openalex_id)
|
||||
payload = self.source_client.get_json(f"https://api.openalex.org/works/{normalized_id}")
|
||||
if not payload:
|
||||
return None
|
||||
return Resolution(
|
||||
entry=_openalex_work_to_entry(payload),
|
||||
source_type="resolver",
|
||||
source_label=f"openalex:id:{normalized_id}",
|
||||
)
|
||||
|
||||
def resolve_datacite_doi(self, doi: str) -> Resolution | None:
|
||||
encoded = urllib.parse.quote(doi, safe="")
|
||||
payload = self.source_client.get_json(f"https://api.datacite.org/dois/{encoded}")
|
||||
data = payload.get("data", {})
|
||||
if not data:
|
||||
return None
|
||||
return Resolution(
|
||||
entry=_datacite_work_to_entry(data),
|
||||
source_type="resolver",
|
||||
source_label=f"datacite:doi:{doi}",
|
||||
)
|
||||
|
||||
def search_datacite(self, title: str, limit: int = 5) -> list[BibEntry]:
|
||||
query = urllib.parse.urlencode({"query": title, "page[size]": limit})
|
||||
payload = self.source_client.get_json(f"https://api.datacite.org/dois?{query}")
|
||||
return [_datacite_work_to_entry(item) for item in payload.get("data", [])]
|
||||
|
||||
def search_datacite_best_match(
    self,
    title: str,
    author_text: str = "",
    year: str = "",
) -> Resolution | None:
    """Return the DataCite hit whose title/author/year match the query, if any."""
    match = _select_best_title_match(
        self.search_datacite(title, limit=5),
        title=title,
        author_text=author_text,
        year=year,
    )
    if match is None:
        return None
    return Resolution(
        entry=match,
        source_type="resolver",
        source_label=f"datacite:search:{title}",
    )
|
||||
|
||||
def search_openalex(self, title: str, limit: int = 5) -> list[BibEntry]:
    """Title search against OpenAlex; returns up to ``limit`` candidates."""
    params = urllib.parse.urlencode({"search": title, "per-page": limit})
    response = self.source_client.get_json(f"https://api.openalex.org/works?{params}")
    return [_openalex_work_to_entry(work) for work in response.get("results", [])]
|
||||
|
||||
def search_openalex_best_match(
    self,
    title: str,
    author_text: str = "",
    year: str = "",
) -> Resolution | None:
    """Return the OpenAlex hit whose title/author/year match the query, if any."""
    match = _select_best_title_match(
        self.search_openalex(title, limit=5),
        title=title,
        author_text=author_text,
        year=year,
    )
    if match is None:
        return None
    return Resolution(
        entry=match,
        source_type="resolver",
        source_label=f"openalex:search:{title}",
    )
|
||||
|
||||
def merge_entries(base: BibEntry, resolved: BibEntry) -> BibEntry:
    """Merge ``resolved`` into ``base``, discarding conflict details."""
    combined, _conflicts = merge_entries_with_conflicts(base, resolved)
    return combined
|
||||
|
||||
|
||||
def merge_entries_with_conflicts(base: BibEntry, resolved: BibEntry) -> tuple[BibEntry, list[dict[str, str]]]:
    """Overlay non-empty resolved fields onto ``base``, recording conflicts.

    A resolved field only fills a hole: when both sides hold different
    non-empty values the base value wins and a conflict record is appended
    instead. Returns the merged entry plus the list of conflict records.
    """
    fields = dict(base.fields)
    conflicts: list[dict[str, str]] = []
    for name, proposed in resolved.fields.items():
        if not proposed:
            continue
        existing = fields.get(name, "")
        if existing and existing != proposed:
            conflicts.append(
                {
                    "field_name": name,
                    "current_value": existing,
                    "proposed_value": proposed,
                }
            )
        elif not existing:
            fields[name] = proposed
    merged = BibEntry(
        entry_type=base.entry_type or resolved.entry_type,
        citation_key=base.citation_key,
        fields=fields,
    )
    return merged, conflicts
|
||||
|
||||
|
||||
def _crossref_message_to_entry(message: dict) -> BibEntry:
    """Translate a Crossref work ``message`` payload into a draft BibEntry."""
    entry_type = _crossref_type_to_bibtype(message.get("type", "article"))
    titles = message.get("title", [])
    title = titles[0] if titles else ""
    year = _extract_crossref_year(message)
    author_text = " and ".join(_crossref_person_to_name(p) for p in message.get("author", []))
    containers = message.get("container-title", [])
    venue = containers[0] if containers else ""

    fields: dict[str, str] = {}
    for field_name, value in (
        ("author", author_text),
        ("title", title),
        ("year", year),
        ("doi", message.get("DOI")),
        ("url", message.get("URL")),
        ("abstract", message.get("abstract")),
    ):
        if value:
            fields[field_name] = value
    if venue:
        # Articles carry their venue as "journal"; everything else as "booktitle".
        fields["journal" if entry_type == "article" else "booktitle"] = venue
    for source_key, field_name in (("volume", "volume"), ("issue", "number"), ("page", "pages")):
        raw = message.get(source_key)
        if raw:
            fields[field_name] = str(raw)

    citation_key = _make_resolution_key(fields.get("author", "crossref"), year or "n.d.", title or "untitled")
    return BibEntry(entry_type=entry_type, citation_key=citation_key, fields=fields)
|
||||
|
||||
|
||||
def _arxiv_atom_entry_to_bib(node: ET.Element, arxiv_id: str) -> BibEntry:
    """Build a BibEntry from one Atom ``<entry>`` of the arXiv API feed."""
    ns = {
        "atom": "http://www.w3.org/2005/Atom",
        "arxiv": "http://arxiv.org/schemas/atom",
    }
    authors = " and ".join(
        _node_text(person.find("atom:name", ns)) for person in node.findall("atom:author", ns)
    )
    published = _node_text(node.find("atom:published", ns))
    fields: dict[str, str] = {
        "title": _node_text(node.find("atom:title", ns)),
        "author": authors,
        # The Atom timestamp starts with the four-digit year.
        "year": published[:4] if published else "",
        "arxiv": arxiv_id,
        "url": f"https://arxiv.org/abs/{arxiv_id}",
        "pdf": f"https://arxiv.org/pdf/{arxiv_id}.pdf",
    }
    abstract = _node_text(node.find("atom:summary", ns))
    if abstract:
        fields["abstract"] = abstract
    doi = _node_text(node.find("arxiv:doi", ns))
    if doi:
        fields["doi"] = doi
    citation_key = f"arxiv{arxiv_id.replace('.', '').replace('/', '')}"
    return BibEntry(entry_type="article", citation_key=citation_key, fields=fields)
|
||||
|
||||
|
||||
def _crossref_type_to_bibtype(crossref_type: str) -> str:
|
||||
mapping = {
|
||||
"journal-article": "article",
|
||||
"proceedings-article": "inproceedings",
|
||||
"book-chapter": "incollection",
|
||||
"book": "book",
|
||||
"proceedings": "proceedings",
|
||||
}
|
||||
return mapping.get(crossref_type, "misc")
|
||||
|
||||
|
||||
def _extract_crossref_year(message: dict) -> str:
|
||||
for field_name in ("published-print", "published-online", "issued", "created"):
|
||||
date_parts = message.get(field_name, {}).get("date-parts", [])
|
||||
if date_parts and date_parts[0]:
|
||||
return str(date_parts[0][0])
|
||||
return ""
|
||||
|
||||
|
||||
def _crossref_person_to_name(person: dict) -> str:
|
||||
family = person.get("family", "")
|
||||
given = person.get("given", "")
|
||||
if family and given:
|
||||
return f"{family}, {given}"
|
||||
return family or given
|
||||
|
||||
|
||||
def _node_text(node: ET.Element | None) -> str:
|
||||
if node is None or node.text is None:
|
||||
return ""
|
||||
return " ".join(node.text.split())
|
||||
|
||||
|
||||
def _make_resolution_key(author_text: str, year: str, title: str) -> str:
|
||||
first_author = author_text.split(" and ")[0]
|
||||
family_name = first_author.split(",")[0] if "," in first_author else first_author.split()[-1]
|
||||
family_name = "".join(ch for ch in family_name.lower() if ch.isalnum()) or "ref"
|
||||
first_word = "".join(ch for ch in title.split()[0].lower() if ch.isalnum()) if title.split() else "untitled"
|
||||
return f"{family_name}{year}{first_word}"
|
||||
|
||||
|
||||
def _openalex_work_to_entry(work: dict) -> BibEntry:
    """Translate an OpenAlex work payload into a draft BibEntry."""
    title = work.get("display_name", "") or "Untitled work"
    year = str(work.get("publication_year") or "")
    doi = _normalize_openalex_doi(work.get("doi"))
    openalex_id = _normalize_openalex_id(work.get("id", ""))
    authors = " and ".join(_openalex_author_name(a) for a in work.get("authorships", []))
    primary_location = work.get("primary_location") or {}
    venue = (primary_location.get("source") or {}).get("display_name", "")
    work_type = work.get("type", "")

    fields: dict[str, str] = {}
    if authors:
        fields["author"] = authors
    if title:
        fields["title"] = title
    if year:
        fields["year"] = year
    if doi:
        fields["doi"] = doi
        fields["url"] = f"https://doi.org/{doi}"
    if openalex_id:
        fields["openalex"] = openalex_id
        # Keep the DOI URL when one was set above; otherwise link to OpenAlex.
        fields.setdefault("url", f"https://openalex.org/{openalex_id}")
    inverted = work.get("abstract_inverted_index")
    if inverted:
        fields["abstract"] = _openalex_abstract_text(inverted)
    if venue:
        fields["journal" if work_type == "article" else "booktitle"] = venue

    if openalex_id:
        citation_key = f"openalex{re.sub(r'[^A-Za-z0-9]+', '', openalex_id).lower()}"
    else:
        citation_key = _make_resolution_key(authors or "openalex", year or "n.d.", title or "untitled")
    return BibEntry(entry_type=_openalex_type_to_bibtype(work_type), citation_key=citation_key, fields=fields)
|
||||
|
||||
|
||||
def _openalex_author_name(authorship: dict) -> str:
|
||||
author = authorship.get("author") or {}
|
||||
return " ".join(str(author.get("display_name", "")).split())
|
||||
|
||||
|
||||
def _openalex_abstract_text(inverted_index: dict) -> str:
|
||||
positions: dict[int, str] = {}
|
||||
for word, indexes in inverted_index.items():
|
||||
for index in indexes:
|
||||
positions[int(index)] = word
|
||||
return " ".join(word for _, word in sorted(positions.items()))
|
||||
|
||||
|
||||
def _openalex_type_to_bibtype(work_type: str) -> str:
|
||||
mapping = {
|
||||
"article": "article",
|
||||
"book": "book",
|
||||
"book-chapter": "incollection",
|
||||
"dissertation": "phdthesis",
|
||||
"proceedings-article": "inproceedings",
|
||||
}
|
||||
return mapping.get(work_type, "misc")
|
||||
|
||||
|
||||
def _normalize_openalex_id(value: str) -> str:
|
||||
if not value:
|
||||
return ""
|
||||
return value.rsplit("/", 1)[-1]
|
||||
|
||||
|
||||
def _normalize_openalex_doi(value: str | None) -> str:
|
||||
if not value:
|
||||
return ""
|
||||
if value.startswith("https://doi.org/"):
|
||||
return value[len("https://doi.org/") :]
|
||||
return value
|
||||
|
||||
|
||||
def _normalize_match_text(value: str) -> str:
|
||||
lowered = value.lower()
|
||||
lowered = re.sub(r"\W+", " ", lowered)
|
||||
return " ".join(lowered.split())
|
||||
|
||||
|
||||
def _select_best_title_match(
    candidates: list[BibEntry],
    title: str,
    author_text: str = "",
    year: str = "",
) -> BibEntry | None:
    """Return the first candidate whose normalized title equals the query's.

    When the query supplies a year and/or author text, candidates that carry
    a different year or share no author tokens are skipped. None when
    nothing qualifies.
    """
    if not candidates:
        return None

    wanted_title = _normalize_match_text(title)
    wanted_tokens = _author_match_tokens(author_text)
    wanted_year = str(year or "").strip()

    for entry in candidates:
        if _normalize_match_text(entry.fields.get("title", "")) != wanted_title:
            continue
        entry_year = str(entry.fields.get("year", "") or "").strip()
        if wanted_year and entry_year and wanted_year != entry_year:
            continue
        if wanted_tokens and not _candidate_matches_author_tokens(entry, wanted_tokens):
            continue
        return entry
    return None
|
||||
|
||||
|
||||
def _author_match_tokens(author_text: str) -> set[str]:
    """Tokenize an author string for fuzzy matching, dropping connector words."""
    normalized = _normalize_match_text(author_text)
    if not normalized:
        return set()
    stopwords = {"and", "et", "al"}
    return {
        token
        for token in re.findall(r"[a-z0-9]+", normalized)
        if len(token) >= 2 and token not in stopwords
    }
|
||||
|
||||
|
||||
def _candidate_matches_author_tokens(candidate: BibEntry, author_tokens: set[str]) -> bool:
    """True when the candidate's author field shares any token with the query."""
    author_field = _normalize_match_text(candidate.fields.get("author", ""))
    if not author_field:
        return False
    return not author_tokens.isdisjoint(re.findall(r"[a-z0-9]+", author_field))
|
||||
|
||||
|
||||
def _datacite_work_to_entry(data: dict) -> BibEntry:
    """Translate one DataCite DOI record into a draft BibEntry.

    ``data`` is one item of the API's ``data`` payload; all metadata lives
    under its ``attributes`` mapping. Missing attributes degrade to empty
    strings, and a DOI-derived URL is synthesized when no URL is present.
    """
    attributes = data.get("attributes", {})
    doi = str(attributes.get("doi") or "")
    titles = attributes.get("titles") or []
    creators = attributes.get("creators") or []
    descriptions = attributes.get("descriptions") or []
    publisher = str(attributes.get("publisher") or "")
    year = str(attributes.get("publicationYear") or "")
    url = str(attributes.get("url") or "")
    types = attributes.get("types") or {}

    title = titles[0].get("title", "") if titles else ""
    # Evaluate each creator name once (previously computed twice per creator).
    creator_names = [_datacite_creator_name(creator) for creator in creators]
    author_names = " and ".join(name for name in creator_names if name)
    abstract = _datacite_abstract(descriptions)
    entry_type = _datacite_type_to_bibtype(str(types.get("resourceTypeGeneral") or ""))

    fields: dict[str, str] = {}
    if title:
        fields["title"] = title
    if author_names:
        fields["author"] = author_names
    if year:
        fields["year"] = year
    if doi:
        fields["doi"] = doi
    if url:
        fields["url"] = url
    elif doi:
        # Fall back to the canonical DOI resolver URL.
        fields["url"] = f"https://doi.org/{doi}"
    if publisher:
        fields["publisher"] = publisher
    if abstract:
        fields["abstract"] = abstract

    citation_key = _make_resolution_key(author_names or "datacite", year or "n.d.", title or doi or "untitled")
    return BibEntry(entry_type=entry_type, citation_key=citation_key, fields=fields)
|
||||
|
||||
|
||||
def _datacite_creator_name(creator: dict) -> str:
|
||||
family = str(creator.get("familyName") or "")
|
||||
given = str(creator.get("givenName") or "")
|
||||
if family and given:
|
||||
return f"{family}, {given}"
|
||||
return str(creator.get("name") or family or given)
|
||||
|
||||
|
||||
def _datacite_abstract(descriptions: list[dict]) -> str:
|
||||
for description in descriptions:
|
||||
if str(description.get("descriptionType") or "").lower() == "abstract":
|
||||
return str(description.get("description") or "")
|
||||
return ""
|
||||
|
||||
|
||||
def _datacite_type_to_bibtype(resource_type: str) -> str:
|
||||
lowered = resource_type.lower()
|
||||
mapping = {
|
||||
"audiovisual": "misc",
|
||||
"book": "book",
|
||||
"bookchapter": "incollection",
|
||||
"collection": "misc",
|
||||
"computationalnotebook": "misc",
|
||||
"conferencepaper": "inproceedings",
|
||||
"dataset": "misc",
|
||||
"dissertation": "phdthesis",
|
||||
"image": "misc",
|
||||
"journalarticle": "article",
|
||||
"model": "misc",
|
||||
"report": "techreport",
|
||||
"software": "misc",
|
||||
"text": "misc",
|
||||
}
|
||||
return mapping.get(lowered, "misc")
|
||||
|
|
@ -0,0 +1,86 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
import urllib.request
|
||||
import xml.etree.ElementTree as ET
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
class SourceClient:
|
||||
def __init__(
|
||||
self,
|
||||
user_agent: str = "citegeist/0.1 (local research tool)",
|
||||
cache_dir: str | Path | None = None,
|
||||
fixtures_dir: str | Path | None = None,
|
||||
) -> None:
|
||||
self.user_agent = user_agent
|
||||
self.cache_dir = Path(cache_dir) if cache_dir else None
|
||||
self.fixtures_dir = Path(fixtures_dir) if fixtures_dir else None
|
||||
|
||||
def get_json(self, url: str) -> dict:
|
||||
cached = self._read_cached(url, "json")
|
||||
if cached is not None:
|
||||
return json.loads(cached)
|
||||
|
||||
payload = self._fetch_bytes(url)
|
||||
self._write_cache(url, "json", payload)
|
||||
return json.loads(payload.decode("utf-8"))
|
||||
|
||||
def get_text(self, url: str) -> str:
|
||||
cached = self._read_cached(url, "txt")
|
||||
if cached is not None:
|
||||
return self._decode_text(cached)
|
||||
|
||||
payload = self._fetch_bytes(url)
|
||||
self._write_cache(url, "txt", payload)
|
||||
return self._decode_text(payload)
|
||||
|
||||
def get_xml(self, url: str) -> ET.Element:
|
||||
cached = self._read_cached(url, "xml")
|
||||
if cached is not None:
|
||||
return ET.fromstring(cached)
|
||||
|
||||
payload = self._fetch_bytes(url)
|
||||
self._write_cache(url, "xml", payload)
|
||||
return ET.fromstring(payload)
|
||||
|
||||
def _fetch_bytes(self, url: str) -> bytes:
|
||||
with urllib.request.urlopen(self._request(url)) as response:
|
||||
return response.read()
|
||||
|
||||
def _request(self, url: str) -> urllib.request.Request:
|
||||
return urllib.request.Request(
|
||||
url,
|
||||
headers={
|
||||
"User-Agent": self.user_agent,
|
||||
},
|
||||
)
|
||||
|
||||
def _cache_key(self, url: str, suffix: str) -> str:
|
||||
digest = hashlib.sha1(url.encode("utf-8")).hexdigest()
|
||||
return f"{digest}.{suffix}"
|
||||
|
||||
def _read_cached(self, url: str, suffix: str) -> bytes | None:
|
||||
for root in (self.fixtures_dir, self.cache_dir):
|
||||
if root is None:
|
||||
continue
|
||||
path = root / self._cache_key(url, suffix)
|
||||
if path.exists():
|
||||
return path.read_bytes()
|
||||
return None
|
||||
|
||||
def _write_cache(self, url: str, suffix: str, payload: bytes) -> None:
|
||||
if self.cache_dir is None:
|
||||
return
|
||||
self.cache_dir.mkdir(parents=True, exist_ok=True)
|
||||
path = self.cache_dir / self._cache_key(url, suffix)
|
||||
path.write_bytes(payload)
|
||||
|
||||
def _decode_text(self, payload: bytes) -> str:
|
||||
for encoding in ("utf-8", "utf-8-sig", "iso-8859-1", "latin-1"):
|
||||
try:
|
||||
return payload.decode(encoding)
|
||||
except UnicodeDecodeError:
|
||||
continue
|
||||
return payload.decode("utf-8", errors="replace")
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
|
@ -0,0 +1,15 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
def pytest_collection_modifyitems(config: pytest.Config, items: list[pytest.Item]) -> None:
    """Skip tests marked "live" unless CITEGEIST_LIVE_TESTS=1 is exported."""
    if os.environ.get("CITEGEIST_LIVE_TESTS") == "1":
        return

    live_skip = pytest.mark.skip(reason="set CITEGEIST_LIVE_TESTS=1 to run live-source tests")
    for item in items:
        if "live" in item.keywords:
            item.add_marker(live_skip)
|
||||
|
|
@ -0,0 +1,129 @@
|
|||
from pathlib import Path
|
||||
|
||||
from citegeist.batch import BatchBootstrapRunner, load_batch_jobs
|
||||
from citegeist.cli import main
|
||||
from citegeist.storage import BibliographyStore
|
||||
|
||||
|
||||
def test_load_batch_jobs_accepts_object_with_jobs(tmp_path: Path):
    """load_batch_jobs accepts a {"jobs": [...]} wrapper object."""
    path = tmp_path / "jobs.json"
    path.write_text(
        """
{
"jobs": [
{"name": "topic-only", "topic": "graph topic"},
{"name": "seed-only", "seed_bib": "seed.bib"}
]
}
""",
        encoding="utf-8",
    )

    jobs = load_batch_jobs(path)

    assert jobs[0]["name"] == "topic-only"
    # Relative seed_bib paths resolve against the jobs file's directory.
    assert jobs[1]["seed_bib"] == str((tmp_path / "seed.bib").resolve())
|
||||
|
||||
|
||||
def test_batch_runner_executes_multiple_jobs(tmp_path: Path):
    """Runner executes a seed job and a preview topic job in order."""
    seed_bib = tmp_path / "seed.bib"
    seed_bib.write_text(
        """
@article{seed2024,
author = {Seed, Alice},
title = {Seed Paper},
year = {2024}
}
""",
        encoding="utf-8",
    )
    jobs = [
        {"name": "seed-job", "seed_bib": str(seed_bib), "expand": False},
        {"name": "topic-job", "topic": "graph topic", "expand": False, "preview": True},
    ]

    runner = BatchBootstrapRunner()
    from citegeist import BibEntry

    # Stub the network-backed topic searches so the test stays offline.
    runner.bootstrapper.resolver.search_openalex = lambda topic, limit=5: [  # type: ignore[method-assign]
        BibEntry(entry_type="article", citation_key="topic2024graph", fields={"title": "Graph Topic Result"})
    ]
    runner.bootstrapper.resolver.search_crossref = lambda topic, limit=5: []  # type: ignore[method-assign]
    runner.bootstrapper.resolver.search_datacite = lambda topic, limit=5: []  # type: ignore[method-assign]

    store = BibliographyStore()
    try:
        results = runner.run(store, jobs)
        assert [job.job_name for job in results] == ["seed-job", "topic-job"]
        assert results[0].result_count == 1
        assert results[1].results[0].citation_key == "topic2024graph"
        assert store.get_entry("seed2024") is not None
        # The topic job ran in preview mode, so nothing was persisted for it.
        assert store.get_entry("topic2024graph") is None
    finally:
        store.close()
|
||||
|
||||
|
||||
def test_batch_runner_can_store_topic_phrase_metadata():
    """Topic jobs may carry slug/name/phrase metadata that lands in the store."""
    jobs = [
        {
            "name": "topic-job",
            "topic": "graph topic",
            "topic_slug": "graph-methods",
            "topic_name": "Graph Methods",
            "topic_phrase": "graph networks biology",
            "expand": False,
            "preview": False,
        }
    ]

    runner = BatchBootstrapRunner()
    from citegeist import BibEntry

    # Offline stubs for the three topic-search backends.
    runner.bootstrapper.resolver.search_openalex = lambda topic, limit=5: [  # type: ignore[method-assign]
        BibEntry(entry_type="article", citation_key="topic2024graph", fields={"title": "Graph Topic Result"})
    ]
    runner.bootstrapper.resolver.search_crossref = lambda topic, limit=5: []  # type: ignore[method-assign]
    runner.bootstrapper.resolver.search_datacite = lambda topic, limit=5: []  # type: ignore[method-assign]

    store = BibliographyStore()
    try:
        runner.run(store, jobs)
        topic = store.get_topic("graph-methods")
        assert topic is not None
        assert topic["name"] == "Graph Methods"
        assert topic["expansion_phrase"] == "graph networks biology"
    finally:
        store.close()
|
||||
|
||||
|
||||
def test_bootstrap_batch_cli_runs_json_jobs(tmp_path: Path):
    """The bootstrap-batch CLI command dispatches to BatchBootstrapRunner.run."""
    seed_bib = tmp_path / "seed.bib"
    seed_bib.write_text(
        """
@article{seed2024,
author = {Seed, Alice},
title = {Seed Paper},
year = {2024}
}
""",
        encoding="utf-8",
    )
    batch_json = tmp_path / "jobs.json"
    batch_json.write_text(
        f"""
[
{{"name": "seed-job", "seed_bib": "{seed_bib}", "expand": false}},
{{"name": "topic-job", "topic": "graph topic", "expand": false, "preview": true}}
]
""",
        encoding="utf-8",
    )

    from unittest.mock import patch

    database = tmp_path / "library.sqlite3"
    # Patch the runner so the CLI path is exercised without real work.
    with patch("citegeist.cli.BatchBootstrapRunner.run") as mocked_run:
        mocked_run.return_value = []
        exit_code = main(["--db", str(database), "bootstrap-batch", str(batch_json)])

    assert exit_code == 0
|
||||
|
|
@ -0,0 +1,175 @@
|
|||
from citegeist import BibliographyStore
|
||||
from citegeist.bootstrap import Bootstrapper
|
||||
from citegeist.cli import main
|
||||
|
||||
|
||||
def test_bootstrap_from_seed_bib_only():
    """Bootstrapping from seed BibTeX alone ingests the seed entries."""
    store = BibliographyStore()
    try:
        bootstrapper = Bootstrapper()
        # Disable graph expansion so no network code paths run.
        bootstrapper.crossref_expander.expand_entry_references = lambda _store, _key: []  # type: ignore[method-assign]
        bootstrapper.openalex_expander.expand_entry = lambda _store, _key, relation_type="cites", limit=5: []  # type: ignore[method-assign]

        results = bootstrapper.bootstrap(
            store,
            seed_bibtex="""
@article{seed2024,
author = {Seed, Alice},
title = {Seed Paper},
year = {2024}
}
""",
            expand=False,
        )

        assert [item.citation_key for item in results] == ["seed2024"]
        assert store.get_entry("seed2024") is not None
    finally:
        store.close()
|
||||
|
||||
|
||||
def test_bootstrap_from_topic_only():
    """A topic-only bootstrap persists candidates found by the search backends."""
    store = BibliographyStore()
    try:
        bootstrapper = Bootstrapper()
        bootstrapper.resolver.search_openalex = lambda topic, limit=5: []  # type: ignore[method-assign]
        bootstrapper.resolver.search_crossref = lambda topic, limit=5: []  # type: ignore[method-assign]
        bootstrapper.resolver.search_datacite = lambda topic, limit=5: [  # type: ignore[method-assign]
            __import__("citegeist").BibEntry(
                entry_type="article",
                citation_key="topic2024graph",
                fields={"title": "Graph Topic Result", "year": "2024"},
            )
        ]
        bootstrapper.crossref_expander.expand_entry_references = lambda _store, _key: []  # type: ignore[method-assign]
        bootstrapper.openalex_expander.expand_entry = lambda _store, _key, relation_type="cites", limit=5: []  # type: ignore[method-assign]

        results = bootstrapper.bootstrap(store, topic="graph topic", expand=False)

        assert [item.citation_key for item in results] == ["topic2024graph"]
        assert store.get_entry("topic2024graph") is not None
        assert results[0].score > 0
    finally:
        store.close()
|
||||
|
||||
|
||||
def test_bootstrap_cli_accepts_seed_and_topic(tmp_path):
    """The bootstrap CLI accepts --seed-bib and --topic together."""
    seed_bib = tmp_path / "seed.bib"
    seed_bib.write_text(
        """
@article{seed2024,
author = {Seed, Alice},
title = {Seed Paper},
year = {2024}
}
""",
        encoding="utf-8",
    )

    from unittest.mock import patch

    database = tmp_path / "library.sqlite3"
    # Patch the bootstrapper so only the CLI argument path is exercised.
    with patch("citegeist.cli.Bootstrapper.bootstrap") as mocked_bootstrap:
        mocked_bootstrap.return_value = []
        exit_code = main(
            [
                "--db",
                str(database),
                "bootstrap",
                "--seed-bib",
                str(seed_bib),
                "--topic",
                "graph topic",
                "--no-expand",
            ]
        )

    assert exit_code == 0
|
||||
|
||||
|
||||
def test_bootstrap_ranks_and_deduplicates_topic_candidates():
    """Duplicate topic hits are merged and results are ordered by score."""
    store = BibliographyStore()
    try:
        bootstrapper = Bootstrapper()
        from citegeist import BibEntry

        # The same citation key appears in both backends to exercise dedup.
        bootstrapper.resolver.search_openalex = lambda topic, limit=5: [  # type: ignore[method-assign]
            BibEntry(
                entry_type="article",
                citation_key="shared2024graph",
                fields={"title": "Graph Topic Ranking", "abstract": "graph topic graph"},
            )
        ]
        bootstrapper.resolver.search_crossref = lambda topic, limit=5: [  # type: ignore[method-assign]
            BibEntry(
                entry_type="article",
                citation_key="shared2024graph",
                fields={"title": "Graph Topic Ranking", "abstract": "graph"},
            ),
            BibEntry(
                entry_type="article",
                citation_key="crossref2024other",
                fields={"title": "Less relevant paper"},
            ),
        ]
        bootstrapper.resolver.search_datacite = lambda topic, limit=5: []  # type: ignore[method-assign]
        bootstrapper.crossref_expander.expand_entry_references = lambda _store, _key: []  # type: ignore[method-assign]
        bootstrapper.openalex_expander.expand_entry = lambda _store, _key, relation_type="cites", limit=5: []  # type: ignore[method-assign]

        results = bootstrapper.bootstrap(store, topic="graph topic", expand=False, topic_limit=5)

        topic_results = [item for item in results if item.origin == "topic"]
        assert [item.citation_key for item in topic_results] == ["shared2024graph", "crossref2024other"]
        assert topic_results[0].score > topic_results[1].score
    finally:
        store.close()
|
||||
|
||||
|
||||
def test_bootstrap_preview_does_not_write_to_database():
    """preview_only=True returns candidates without persisting them."""
    store = BibliographyStore()
    try:
        bootstrapper = Bootstrapper()
        from citegeist import BibEntry

        bootstrapper.resolver.search_openalex = lambda topic, limit=5: [  # type: ignore[method-assign]
            BibEntry(entry_type="article", citation_key="preview2024graph", fields={"title": "Preview Graph Topic"})
        ]
        bootstrapper.resolver.search_crossref = lambda topic, limit=5: []  # type: ignore[method-assign]
        bootstrapper.resolver.search_datacite = lambda topic, limit=5: []  # type: ignore[method-assign]

        results = bootstrapper.bootstrap(store, topic="graph topic", expand=False, preview_only=True)

        assert [item.citation_key for item in results] == ["preview2024graph"]
        assert store.get_entry("preview2024graph") is None
    finally:
        store.close()
|
||||
|
||||
|
||||
def test_bootstrap_topic_commit_limit_restricts_persisted_candidates():
    """topic_commit_limit caps how many ranked candidates are persisted."""
    store = BibliographyStore()
    try:
        bootstrapper = Bootstrapper()
        from citegeist import BibEntry

        bootstrapper.resolver.search_openalex = lambda topic, limit=5: [  # type: ignore[method-assign]
            BibEntry(entry_type="article", citation_key="rank1", fields={"title": "Graph Topic One"}),
            BibEntry(entry_type="article", citation_key="rank2", fields={"title": "Graph Topic Two"}),
        ]
        bootstrapper.resolver.search_crossref = lambda topic, limit=5: []  # type: ignore[method-assign]
        bootstrapper.resolver.search_datacite = lambda topic, limit=5: []  # type: ignore[method-assign]
        bootstrapper.crossref_expander.expand_entry_references = lambda _store, _key: []  # type: ignore[method-assign]
        bootstrapper.openalex_expander.expand_entry = lambda _store, _key, relation_type="cites", limit=5: []  # type: ignore[method-assign]

        results = bootstrapper.bootstrap(
            store,
            topic="graph topic",
            expand=False,
            topic_limit=5,
            topic_commit_limit=1,
        )

        assert [item.citation_key for item in results if item.origin == "topic"] == ["rank1"]
        assert store.get_entry("rank1") is not None
        assert store.get_entry("rank2") is None
    finally:
        store.close()
|
||||
File diff suppressed because it is too large
Load Diff
|
|
@ -0,0 +1,69 @@
|
|||
from citegeist.expand import CrossrefExpander, _crossref_reference_to_entry
|
||||
from citegeist.storage import BibliographyStore
|
||||
|
||||
|
||||
def test_crossref_reference_to_entry_prefers_doi_key():
    """A reference carrying a DOI gets a DOI-derived citation key."""
    entry = _crossref_reference_to_entry(
        {
            "DOI": "10.1000/example-ref",
            "article-title": "Discovered Reference",
            "author": "Doe, Alex",
            "year": "2022",
            "journal-title": "Journal of Discovery",
        },
        "seed2024",
        1,
    )

    assert entry.citation_key == "doi101000exampleref"
    assert entry.fields["doi"] == "10.1000/example-ref"
    assert entry.fields["journal"] == "Journal of Discovery"
|
||||
|
||||
|
||||
def test_crossref_expander_creates_draft_nodes_and_relations():
    """Expansion stores discovered references as drafts with relation provenance."""
    store = BibliographyStore()
    try:
        store.ingest_bibtex(
            """
@article{seed2024,
author = {Seed, Alice},
title = {Seed Paper},
year = {2024},
doi = {10.1000/seed-doi}
}
"""
        )

        expander = CrossrefExpander()
        # Canned Crossref payload: one structured and one unstructured reference.
        expander.resolver.source_client.get_json = lambda _url: {  # type: ignore[method-assign]
            "message": {
                "reference": [
                    {
                        "DOI": "10.1000/example-ref",
                        "article-title": "Discovered Reference",
                        "author": "Doe, Alex",
                        "year": "2022",
                        "journal-title": "Journal of Discovery",
                    },
                    {
                        "unstructured": "Unstructured reference string",
                        "year": "2021",
                    },
                ]
            }
        }

        results = expander.expand_entry_references(store, "seed2024")

        assert [result.discovered_citation_key for result in results] == [
            "doi101000exampleref",
            "ref2021unstructured2",
        ]
        discovered = store.get_entry("doi101000exampleref")
        assert discovered is not None
        assert discovered["review_status"] == "draft"
        assert store.get_relations("seed2024") == ["doi101000exampleref", "ref2021unstructured2"]
        relation_provenance = store.get_relation_provenance("seed2024")
        assert relation_provenance[0]["source_type"] == "graph_expand"
    finally:
        store.close()
|
||||
|
|
@ -0,0 +1,65 @@
|
|||
from citegeist import extract_references, parse_bibtex
|
||||
from citegeist.cli import main
|
||||
|
||||
|
||||
# Fixture text in numbered-reference style (one journal + one proceedings entry).
SAMPLE_REFERENCES = """
[1] Smith, Jane and Doe, Alex. 2024. Graph-first bibliography augmentation. Journal of Research Systems.
[2] Miller, Sam. 2023. Semantic search for research corpora. Proceedings of the Retrieval Workshop.
"""

# Fixture text mixing an APA journal citation with a book citation.
APA_AND_BOOK_REFERENCES = """
Brown, T., & Green, P. (2021). Retrieval methods for scholarly corpora. Journal of Information Retrieval.

Nguyen, An. Research Design for Literature Mapping. Example University Press, 2020.
"""

# Fixture with one reference wrapped across two physical lines.
WRAPPED_REFERENCES = """
[1] Taylor, Ann. 2022. Multi-line reference extraction
for bibliography pipelines. Journal of Parsing Systems.
[2] Chen, Bo. 2021. Another entry. Proceedings of the Mining Workshop.
"""
|
||||
|
||||
|
||||
def test_extract_references_builds_draft_entries():
|
||||
entries = extract_references(SAMPLE_REFERENCES)
|
||||
|
||||
assert [entry.citation_key for entry in entries] == [
|
||||
"smith2024graphfirst1",
|
||||
"miller2023semantic2",
|
||||
]
|
||||
assert entries[0].entry_type == "article"
|
||||
assert entries[0].fields["journal"] == "Journal of Research Systems"
|
||||
assert entries[1].entry_type == "inproceedings"
|
||||
assert entries[1].fields["booktitle"] == "Proceedings of the Retrieval Workshop"
|
||||
|
||||
|
||||
def test_extract_cli_writes_bibtex(tmp_path):
|
||||
input_path = tmp_path / "references.txt"
|
||||
output_path = tmp_path / "draft.bib"
|
||||
input_path.write_text(SAMPLE_REFERENCES, encoding="utf-8")
|
||||
|
||||
exit_code = main(["extract", str(input_path), "--output", str(output_path)])
|
||||
assert exit_code == 0
|
||||
|
||||
exported = output_path.read_text(encoding="utf-8")
|
||||
parsed = {entry.citation_key: entry for entry in parse_bibtex(exported)}
|
||||
assert parsed["smith2024graphfirst1"].fields["journal"] == "Journal of Research Systems"
|
||||
assert parsed["miller2023semantic2"].fields["booktitle"] == "Proceedings of the Retrieval Workshop"
|
||||
|
||||
|
||||
def test_extract_references_supports_apa_and_book_styles():
|
||||
entries = extract_references(APA_AND_BOOK_REFERENCES)
|
||||
|
||||
assert [entry.entry_type for entry in entries] == ["article", "book"]
|
||||
assert entries[0].fields["journal"] == "Journal of Information Retrieval"
|
||||
assert entries[0].fields["author"] == "Brown, T., and Green, P"
|
||||
assert entries[1].fields["publisher"] == "Example University Press"
|
||||
assert entries[1].fields["title"] == "Research Design for Literature Mapping"
|
||||
|
||||
|
||||
def test_extract_references_joins_wrapped_reference_lines():
|
||||
entries = extract_references(WRAPPED_REFERENCES)
|
||||
|
||||
assert len(entries) == 2
|
||||
assert entries[0].fields["title"] == "Multi-line reference extraction for bibliography pipelines"
|
||||
assert entries[0].fields["journal"] == "Journal of Parsing Systems"
|
||||
|
|
@ -0,0 +1,293 @@
|
|||
from citegeist import OaiPmhHarvester, parse_bibtex
|
||||
from citegeist.cli import main
|
||||
|
||||
|
||||
OAI_XML = """<?xml version="1.0" encoding="UTF-8"?>
|
||||
<OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/"
|
||||
xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/"
|
||||
xmlns:dc="http://purl.org/dc/elements/1.1/">
|
||||
<ListRecords>
|
||||
<record>
|
||||
<header>
|
||||
<identifier>oai:example.edu:123</identifier>
|
||||
</header>
|
||||
<metadata>
|
||||
<oai_dc:dc>
|
||||
<dc:title>Thesis Metadata Harvesting</dc:title>
|
||||
<dc:creator>Doe, Jane</dc:creator>
|
||||
<dc:date>2023-05-01</dc:date>
|
||||
<dc:description>A dissertation about repository harvesting.</dc:description>
|
||||
<dc:identifier>https://example.edu/items/123</dc:identifier>
|
||||
<dc:publisher>Example University</dc:publisher>
|
||||
<dc:type>Text</dc:type>
|
||||
<dc:type>Dissertation</dc:type>
|
||||
</oai_dc:dc>
|
||||
</metadata>
|
||||
</record>
|
||||
</ListRecords>
|
||||
</OAI-PMH>
|
||||
"""
|
||||
|
||||
OAI_XML_PAGE_1 = """<?xml version="1.0" encoding="UTF-8"?>
|
||||
<OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/"
|
||||
xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/"
|
||||
xmlns:dc="http://purl.org/dc/elements/1.1/">
|
||||
<ListRecords>
|
||||
<record>
|
||||
<header>
|
||||
<identifier>oai:example.edu:123</identifier>
|
||||
</header>
|
||||
<metadata>
|
||||
<oai_dc:dc>
|
||||
<dc:title>First Harvested Thesis</dc:title>
|
||||
<dc:creator>Doe, Jane</dc:creator>
|
||||
<dc:date>2023-05-01</dc:date>
|
||||
<dc:type>Dissertation</dc:type>
|
||||
</oai_dc:dc>
|
||||
</metadata>
|
||||
</record>
|
||||
<resumptionToken>TOKEN123</resumptionToken>
|
||||
</ListRecords>
|
||||
</OAI-PMH>
|
||||
"""
|
||||
|
||||
OAI_XML_PAGE_2 = """<?xml version="1.0" encoding="UTF-8"?>
|
||||
<OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/"
|
||||
xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/"
|
||||
xmlns:dc="http://purl.org/dc/elements/1.1/">
|
||||
<ListRecords>
|
||||
<record>
|
||||
<header>
|
||||
<identifier>oai:example.edu:456</identifier>
|
||||
</header>
|
||||
<metadata>
|
||||
<oai_dc:dc>
|
||||
<dc:title>Second Harvested Thesis</dc:title>
|
||||
<dc:creator>Smith, John</dc:creator>
|
||||
<dc:date>2022-05-01</dc:date>
|
||||
<dc:type>Dissertation</dc:type>
|
||||
</oai_dc:dc>
|
||||
</metadata>
|
||||
</record>
|
||||
</ListRecords>
|
||||
</OAI-PMH>
|
||||
"""
|
||||
|
||||
OAI_IDENTIFY_XML = """<?xml version="1.0" encoding="UTF-8"?>
|
||||
<OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/">
|
||||
<Identify>
|
||||
<repositoryName>Example Repository</repositoryName>
|
||||
<baseURL>https://example.edu/oai</baseURL>
|
||||
<protocolVersion>2.0</protocolVersion>
|
||||
<adminEmail>repo@example.edu</adminEmail>
|
||||
<earliestDatestamp>2001-01-01</earliestDatestamp>
|
||||
<deletedRecord>persistent</deletedRecord>
|
||||
<granularity>YYYY-MM-DD</granularity>
|
||||
</Identify>
|
||||
</OAI-PMH>
|
||||
"""
|
||||
|
||||
OAI_LISTSETS_XML = """<?xml version="1.0" encoding="UTF-8"?>
|
||||
<OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/">
|
||||
<ListSets>
|
||||
<set>
|
||||
<setSpec>theses</setSpec>
|
||||
<setName>Theses and Dissertations</setName>
|
||||
<setDescription>
|
||||
<description>This set contains graduate theses.</description>
|
||||
</setDescription>
|
||||
</set>
|
||||
</ListSets>
|
||||
</OAI-PMH>
|
||||
"""
|
||||
|
||||
OAI_METADATA_FORMATS_XML = """<?xml version="1.0" encoding="UTF-8"?>
|
||||
<OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/">
|
||||
<ListMetadataFormats>
|
||||
<metadataFormat>
|
||||
<metadataPrefix>oai_dc</metadataPrefix>
|
||||
<schema>http://www.openarchives.org/OAI/2.0/oai_dc.xsd</schema>
|
||||
<metadataNamespace>http://www.openarchives.org/OAI/2.0/oai_dc/</metadataNamespace>
|
||||
</metadataFormat>
|
||||
<metadataFormat>
|
||||
<metadataPrefix>mods</metadataPrefix>
|
||||
<schema>http://www.loc.gov/standards/mods/v3/mods-3-7.xsd</schema>
|
||||
<metadataNamespace>http://www.loc.gov/mods/v3</metadataNamespace>
|
||||
</metadataFormat>
|
||||
</ListMetadataFormats>
|
||||
</OAI-PMH>
|
||||
"""
|
||||
|
||||
OAI_MODS_XML = """<?xml version="1.0" encoding="UTF-8"?>
|
||||
<OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/"
|
||||
xmlns:mods="http://www.loc.gov/mods/v3">
|
||||
<ListRecords>
|
||||
<record>
|
||||
<header>
|
||||
<identifier>oai:example.edu:mods123</identifier>
|
||||
</header>
|
||||
<metadata>
|
||||
<mods:mods>
|
||||
<mods:titleInfo>
|
||||
<mods:title>MODS Thesis Title</mods:title>
|
||||
</mods:titleInfo>
|
||||
<mods:name>
|
||||
<mods:namePart>Doe</mods:namePart>
|
||||
<mods:namePart>Jane</mods:namePart>
|
||||
<mods:role>
|
||||
<mods:roleTerm>author</mods:roleTerm>
|
||||
</mods:role>
|
||||
</mods:name>
|
||||
<mods:originInfo>
|
||||
<mods:publisher>Example University</mods:publisher>
|
||||
<mods:dateIssued>2022</mods:dateIssued>
|
||||
</mods:originInfo>
|
||||
<mods:genre>dissertation</mods:genre>
|
||||
<mods:abstract>MODS abstract text.</mods:abstract>
|
||||
<mods:location>
|
||||
<mods:url>https://example.edu/mods123</mods:url>
|
||||
</mods:location>
|
||||
</mods:mods>
|
||||
</metadata>
|
||||
</record>
|
||||
</ListRecords>
|
||||
</OAI-PMH>
|
||||
"""
|
||||
|
||||
|
||||
def test_oai_harvester_maps_dublin_core_to_bibentry():
|
||||
harvester = OaiPmhHarvester()
|
||||
harvester.source_client.get_xml = lambda _url: __import__("xml.etree.ElementTree").etree.ElementTree.fromstring(OAI_XML) # type: ignore[method-assign]
|
||||
|
||||
results = harvester.list_records("https://example.edu/oai")
|
||||
|
||||
assert len(results) == 1
|
||||
entry = results[0].entry
|
||||
assert entry.entry_type == "phdthesis"
|
||||
assert entry.fields["title"] == "Thesis Metadata Harvesting"
|
||||
assert entry.fields["author"] == "Doe, Jane"
|
||||
assert entry.fields["oai"] == "oai:example.edu:123"
|
||||
|
||||
|
||||
def test_oai_harvester_follows_resumption_tokens():
|
||||
harvester = OaiPmhHarvester()
|
||||
from xml.etree import ElementTree as ET
|
||||
|
||||
payloads = iter([ET.fromstring(OAI_XML_PAGE_1), ET.fromstring(OAI_XML_PAGE_2)])
|
||||
harvester.source_client.get_xml = lambda _url: next(payloads) # type: ignore[method-assign]
|
||||
|
||||
results = harvester.list_records("https://example.edu/oai")
|
||||
|
||||
assert [result.identifier for result in results] == [
|
||||
"oai:example.edu:123",
|
||||
"oai:example.edu:456",
|
||||
]
|
||||
assert [result.entry.citation_key for result in results] == [
|
||||
"doe2023first1",
|
||||
"smith2022second2",
|
||||
]
|
||||
|
||||
|
||||
def test_oai_harvester_passes_date_filters():
|
||||
harvester = OaiPmhHarvester()
|
||||
seen_urls: list[str] = []
|
||||
from xml.etree import ElementTree as ET
|
||||
|
||||
def fake_get_xml(url: str):
|
||||
seen_urls.append(url)
|
||||
return ET.fromstring(OAI_XML)
|
||||
|
||||
harvester.source_client.get_xml = fake_get_xml # type: ignore[method-assign]
|
||||
|
||||
harvester.list_records(
|
||||
"https://example.edu/oai",
|
||||
date_from="2023-01-01",
|
||||
date_until="2023-12-31",
|
||||
limit=1,
|
||||
)
|
||||
|
||||
assert "from=2023-01-01" in seen_urls[0]
|
||||
assert "until=2023-12-31" in seen_urls[0]
|
||||
|
||||
|
||||
def test_oai_harvester_maps_mods_records():
|
||||
harvester = OaiPmhHarvester()
|
||||
from xml.etree import ElementTree as ET
|
||||
|
||||
harvester.source_client.get_xml = lambda _url: ET.fromstring(OAI_MODS_XML) # type: ignore[method-assign]
|
||||
|
||||
results = harvester.list_records("https://example.edu/oai", metadata_prefix="mods")
|
||||
|
||||
assert len(results) == 1
|
||||
entry = results[0].entry
|
||||
assert entry.entry_type == "phdthesis"
|
||||
assert entry.fields["title"] == "MODS Thesis Title"
|
||||
assert entry.fields["author"] == "Doe, Jane"
|
||||
assert entry.fields["publisher"] == "Example University"
|
||||
assert entry.fields["abstract"] == "MODS abstract text."
|
||||
|
||||
|
||||
def test_oai_harvester_can_identify_repository_and_list_sets():
|
||||
harvester = OaiPmhHarvester()
|
||||
from xml.etree import ElementTree as ET
|
||||
|
||||
payloads = iter(
|
||||
[ET.fromstring(OAI_IDENTIFY_XML), ET.fromstring(OAI_LISTSETS_XML), ET.fromstring(OAI_METADATA_FORMATS_XML)]
|
||||
)
|
||||
harvester.source_client.get_xml = lambda _url: next(payloads) # type: ignore[method-assign]
|
||||
|
||||
identify = harvester.identify("https://example.edu/oai")
|
||||
sets = harvester.list_sets("https://example.edu/oai")
|
||||
formats = harvester.list_metadata_formats("https://example.edu/oai")
|
||||
|
||||
assert identify["repositoryName"] == "Example Repository"
|
||||
assert identify["granularity"] == "YYYY-MM-DD"
|
||||
assert sets[0].set_spec == "theses"
|
||||
assert sets[0].set_name == "Theses and Dissertations"
|
||||
assert "graduate theses" in sets[0].set_description
|
||||
assert [item.metadata_prefix for item in formats] == ["oai_dc", "mods"]
|
||||
|
||||
|
||||
def test_harvest_oai_cli_ingests_records(tmp_path):
|
||||
from unittest.mock import patch
|
||||
|
||||
database = tmp_path / "library.sqlite3"
|
||||
harvester = OaiPmhHarvester()
|
||||
from xml.etree import ElementTree as ET
|
||||
|
||||
harvester.source_client.get_xml = lambda _url: ET.fromstring(OAI_XML) # type: ignore[method-assign]
|
||||
harvested = harvester.list_records("https://example.edu/oai")
|
||||
|
||||
with patch("citegeist.cli.OaiPmhHarvester.list_records") as mocked_list:
|
||||
mocked_list.return_value = harvested
|
||||
|
||||
exit_code = main(
|
||||
[
|
||||
"--db",
|
||||
str(database),
|
||||
"harvest-oai",
|
||||
"https://example.edu/oai",
|
||||
"--metadata-prefix",
|
||||
"oai_dc",
|
||||
"--from",
|
||||
"2023-01-01",
|
||||
"--until",
|
||||
"2023-12-31",
|
||||
"--limit",
|
||||
"5",
|
||||
]
|
||||
)
|
||||
|
||||
assert exit_code == 0
|
||||
|
||||
from citegeist.storage import BibliographyStore
|
||||
|
||||
store = BibliographyStore(database)
|
||||
try:
|
||||
entry = store.list_entries(limit=10)[0]
|
||||
assert entry["citation_key"] == "doe2023thesis1"
|
||||
bibtex = store.get_entry_bibtex("doe2023thesis1")
|
||||
parsed = parse_bibtex(bibtex or "")
|
||||
assert parsed[0].fields["oai"] == "oai:example.edu:123"
|
||||
finally:
|
||||
store.close()
|
||||
|
|
@ -0,0 +1,52 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
|
||||
import pytest
|
||||
|
||||
from citegeist import MetadataResolver, SourceClient
|
||||
|
||||
|
||||
pytestmark = pytest.mark.live
|
||||
|
||||
|
||||
def _live_client() -> SourceClient:
|
||||
cache_dir = os.environ.get("CITEGEIST_SOURCE_CACHE", ".cache/citegeist")
|
||||
return SourceClient(
|
||||
cache_dir=cache_dir,
|
||||
fixtures_dir=os.environ.get("CITEGEIST_SOURCE_FIXTURES"),
|
||||
)
|
||||
|
||||
|
||||
def test_live_crossref_doi_resolution():
|
||||
resolver = MetadataResolver(source_client=_live_client())
|
||||
|
||||
resolution = resolver.resolve_doi("10.1038/nphys1170")
|
||||
|
||||
assert resolution is not None
|
||||
assert resolution.entry.fields.get("doi") == "10.1038/nphys1170"
|
||||
assert resolution.entry.fields.get("title")
|
||||
|
||||
|
||||
def test_live_arxiv_resolution():
|
||||
resolver = MetadataResolver(source_client=_live_client())
|
||||
|
||||
resolution = resolver.resolve_arxiv("1706.03762")
|
||||
|
||||
assert resolution is not None
|
||||
assert resolution.entry.fields.get("arxiv") == "1706.03762"
|
||||
assert resolution.entry.fields.get("title")
|
||||
|
||||
|
||||
def test_live_openalex_title_search():
|
||||
resolver = MetadataResolver(source_client=_live_client())
|
||||
|
||||
resolution = resolver.search_openalex_best_match(
|
||||
title="Attention Is All You Need",
|
||||
author_text="Ashish Vaswani",
|
||||
year="2017",
|
||||
)
|
||||
|
||||
assert resolution is not None
|
||||
assert resolution.entry.fields.get("title")
|
||||
assert resolution.entry.fields.get("openalex")
|
||||
|
|
@ -0,0 +1,84 @@
|
|||
from citegeist.expand import OpenAlexExpander, _openalex_work_to_entry
|
||||
from citegeist.storage import BibliographyStore
|
||||
|
||||
|
||||
def test_openalex_work_to_entry_maps_basic_fields():
|
||||
entry = _openalex_work_to_entry(
|
||||
{
|
||||
"id": "https://openalex.org/W12345",
|
||||
"doi": "https://doi.org/10.1000/example-openalex",
|
||||
"display_name": "OpenAlex Discovered Work",
|
||||
"publication_year": 2022,
|
||||
"type": "article",
|
||||
"authorships": [{"author": {"display_name": "Jane Smith"}}],
|
||||
"primary_location": {"source": {"display_name": "Journal of Graph Discovery"}},
|
||||
"abstract_inverted_index": {"Graph": [0], "discovery": [1]},
|
||||
}
|
||||
)
|
||||
|
||||
assert entry.citation_key == "openalexw12345"
|
||||
assert entry.fields["openalex"] == "W12345"
|
||||
assert entry.fields["doi"] == "10.1000/example-openalex"
|
||||
assert entry.fields["journal"] == "Journal of Graph Discovery"
|
||||
assert entry.fields["abstract"] == "Graph discovery"
|
||||
|
||||
|
||||
def test_openalex_expander_adds_outgoing_and_incoming_edges():
|
||||
store = BibliographyStore()
|
||||
try:
|
||||
store.ingest_bibtex(
|
||||
"""
|
||||
@article{seed2024,
|
||||
author = {Seed, Alice},
|
||||
title = {Seed Paper},
|
||||
year = {2024},
|
||||
doi = {10.1000/seed-doi}
|
||||
}
|
||||
"""
|
||||
)
|
||||
expander = OpenAlexExpander()
|
||||
payloads = iter(
|
||||
[
|
||||
{
|
||||
"results": [
|
||||
{
|
||||
"id": "https://openalex.org/WSEED",
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"results": [
|
||||
{
|
||||
"id": "https://openalex.org/WDISCOVERED",
|
||||
"display_name": "Referenced OpenAlex Work",
|
||||
"publication_year": 2021,
|
||||
"type": "article",
|
||||
"authorships": [{"author": {"display_name": "Bob Known"}}],
|
||||
"primary_location": {"source": {"display_name": "OpenAlex Journal"}},
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"results": [
|
||||
{
|
||||
"id": "https://openalex.org/WCITING",
|
||||
"display_name": "Citing OpenAlex Work",
|
||||
"publication_year": 2025,
|
||||
"type": "article",
|
||||
"authorships": [{"author": {"display_name": "Carol Citing"}}],
|
||||
}
|
||||
]
|
||||
},
|
||||
]
|
||||
)
|
||||
expander.resolver.source_client.get_json = lambda _url: next(payloads) # type: ignore[method-assign]
|
||||
|
||||
outgoing = expander.expand_entry(store, "seed2024", relation_type="cites", limit=5)
|
||||
incoming = expander.expand_entry(store, "seed2024", relation_type="cited_by", limit=5)
|
||||
|
||||
assert outgoing[0].discovered_citation_key == "openalexwdiscovered"
|
||||
assert incoming[0].source_citation_key == "openalexwciting"
|
||||
assert "openalexwdiscovered" in store.get_relations("seed2024", "cites")
|
||||
assert "seed2024" in store.get_relations("openalexwciting", "cites")
|
||||
finally:
|
||||
store.close()
|
||||
|
|
@ -0,0 +1,403 @@
|
|||
from xml.etree import ElementTree as ET
|
||||
|
||||
from citegeist.bibtex import BibEntry, render_bibtex
|
||||
from citegeist.resolve import (
|
||||
MetadataResolver,
|
||||
_arxiv_atom_entry_to_bib,
|
||||
_crossref_message_to_entry,
|
||||
_datacite_work_to_entry,
|
||||
_openalex_work_to_entry,
|
||||
merge_entries_with_conflicts,
|
||||
merge_entries,
|
||||
)
|
||||
|
||||
|
||||
def test_crossref_message_to_entry_maps_basic_fields():
|
||||
entry = _crossref_message_to_entry(
|
||||
{
|
||||
"type": "journal-article",
|
||||
"title": ["Graph-first bibliography augmentation"],
|
||||
"DOI": "10.1000/example-doi",
|
||||
"URL": "https://doi.org/10.1000/example-doi",
|
||||
"container-title": ["Journal of Graph Studies"],
|
||||
"author": [{"family": "Smith", "given": "Jane"}],
|
||||
"issued": {"date-parts": [[2024, 5, 1]]},
|
||||
}
|
||||
)
|
||||
|
||||
assert entry.entry_type == "article"
|
||||
assert entry.fields["author"] == "Smith, Jane"
|
||||
assert entry.fields["journal"] == "Journal of Graph Studies"
|
||||
assert entry.fields["year"] == "2024"
|
||||
|
||||
|
||||
def test_arxiv_atom_entry_to_bib_maps_basic_fields():
|
||||
xml = ET.fromstring(
|
||||
"""
|
||||
<entry xmlns="http://www.w3.org/2005/Atom" xmlns:arxiv="http://arxiv.org/schemas/atom">
|
||||
<title>Semantic search for research corpora</title>
|
||||
<summary>Dense retrieval improves recall.</summary>
|
||||
<published>2023-01-15T00:00:00Z</published>
|
||||
<author><name>Miller, Sam</name></author>
|
||||
<arxiv:doi>10.1000/arxiv-example</arxiv:doi>
|
||||
</entry>
|
||||
"""
|
||||
)
|
||||
entry = _arxiv_atom_entry_to_bib(xml, "2301.12345")
|
||||
assert entry.fields["author"] == "Miller, Sam"
|
||||
assert entry.fields["arxiv"] == "2301.12345"
|
||||
assert entry.fields["doi"] == "10.1000/arxiv-example"
|
||||
|
||||
|
||||
def test_merge_entries_prefers_existing_values_and_adds_missing_fields():
|
||||
base = BibEntry(
|
||||
entry_type="article",
|
||||
citation_key="smith2024graphs",
|
||||
fields={"title": "Graph-first bibliography augmentation", "doi": "10.1000/example-doi"},
|
||||
)
|
||||
resolved = BibEntry(
|
||||
entry_type="article",
|
||||
citation_key="otherkey",
|
||||
fields={"title": "Different title", "journal": "Journal of Graph Studies"},
|
||||
)
|
||||
|
||||
merged = merge_entries(base, resolved)
|
||||
|
||||
assert merged.fields["title"] == "Graph-first bibliography augmentation"
|
||||
assert merged.fields["journal"] == "Journal of Graph Studies"
|
||||
|
||||
|
||||
def test_merge_entries_with_conflicts_records_disagreements():
|
||||
base = BibEntry(
|
||||
entry_type="article",
|
||||
citation_key="smith2024graphs",
|
||||
fields={"title": "Existing Title", "journal": "Current Journal"},
|
||||
)
|
||||
resolved = BibEntry(
|
||||
entry_type="article",
|
||||
citation_key="resolved",
|
||||
fields={"title": "Resolved Title", "journal": "Current Journal", "year": "2024"},
|
||||
)
|
||||
|
||||
merged, conflicts = merge_entries_with_conflicts(base, resolved)
|
||||
|
||||
assert merged.fields["title"] == "Existing Title"
|
||||
assert merged.fields["year"] == "2024"
|
||||
assert conflicts == [
|
||||
{
|
||||
"field_name": "title",
|
||||
"current_value": "Existing Title",
|
||||
"proposed_value": "Resolved Title",
|
||||
}
|
||||
]
|
||||
|
||||
|
||||
def test_resolver_tries_doi_before_dblp():
|
||||
resolver = MetadataResolver()
|
||||
calls: list[tuple[str, str]] = []
|
||||
|
||||
def fake_doi(value: str):
|
||||
calls.append(("doi", value))
|
||||
return None
|
||||
|
||||
def fake_dblp(value: str):
|
||||
calls.append(("dblp", value))
|
||||
return None
|
||||
|
||||
def fake_datacite(value: str):
|
||||
calls.append(("datacite", value))
|
||||
return None
|
||||
|
||||
resolver.resolve_doi = fake_doi # type: ignore[method-assign]
|
||||
resolver.resolve_datacite_doi = fake_datacite # type: ignore[method-assign]
|
||||
resolver.resolve_dblp = fake_dblp # type: ignore[method-assign]
|
||||
|
||||
resolver.resolve_entry(
|
||||
BibEntry(
|
||||
entry_type="article",
|
||||
citation_key="smith2024graphs",
|
||||
fields={"doi": "10.1000/example-doi", "dblp": "conf/test/Smith24"},
|
||||
)
|
||||
)
|
||||
|
||||
assert calls == [
|
||||
("doi", "10.1000/example-doi"),
|
||||
("datacite", "10.1000/example-doi"),
|
||||
("dblp", "conf/test/Smith24"),
|
||||
]
|
||||
|
||||
|
||||
def test_openalex_work_to_entry_maps_basic_fields():
|
||||
entry = _openalex_work_to_entry(
|
||||
{
|
||||
"id": "https://openalex.org/W12345",
|
||||
"doi": "https://doi.org/10.1000/example-openalex",
|
||||
"display_name": "OpenAlex Resolved Work",
|
||||
"publication_year": 2022,
|
||||
"type": "article",
|
||||
"authorships": [{"author": {"display_name": "Jane Smith"}}],
|
||||
"primary_location": {"source": {"display_name": "Journal of Open Graphs"}},
|
||||
"abstract_inverted_index": {"OpenAlex": [0], "resolved": [1]},
|
||||
}
|
||||
)
|
||||
|
||||
assert entry.citation_key == "openalexw12345"
|
||||
assert entry.fields["openalex"] == "W12345"
|
||||
assert entry.fields["doi"] == "10.1000/example-openalex"
|
||||
assert entry.fields["journal"] == "Journal of Open Graphs"
|
||||
assert entry.fields["abstract"] == "OpenAlex resolved"
|
||||
|
||||
|
||||
def test_resolver_can_resolve_openalex_id():
|
||||
resolver = MetadataResolver()
|
||||
resolver.source_client.get_json = lambda _url: { # type: ignore[method-assign]
|
||||
"id": "https://openalex.org/W12345",
|
||||
"display_name": "OpenAlex Resolved Work",
|
||||
"publication_year": 2022,
|
||||
"type": "article",
|
||||
"authorships": [{"author": {"display_name": "Jane Smith"}}],
|
||||
}
|
||||
|
||||
resolution = resolver.resolve_openalex("W12345")
|
||||
|
||||
assert resolution is not None
|
||||
assert resolution.source_label == "openalex:id:W12345"
|
||||
assert resolution.entry.fields["openalex"] == "W12345"
|
||||
|
||||
|
||||
def test_resolver_falls_back_to_openalex_title_search():
|
||||
resolver = MetadataResolver()
|
||||
resolver.search_datacite = lambda title, limit=5: [] # type: ignore[method-assign]
|
||||
resolver.search_crossref = lambda title, limit=5: [] # type: ignore[method-assign]
|
||||
resolver.search_openalex = lambda title, limit=5: [ # type: ignore[method-assign]
|
||||
_openalex_work_to_entry(
|
||||
{
|
||||
"id": "https://openalex.org/W12345",
|
||||
"display_name": title,
|
||||
"publication_year": 2022,
|
||||
"type": "article",
|
||||
"authorships": [{"author": {"display_name": "Jane Smith"}}],
|
||||
}
|
||||
)
|
||||
]
|
||||
|
||||
resolution = resolver.resolve_entry(
|
||||
BibEntry(
|
||||
entry_type="article",
|
||||
citation_key="smith2022openalex",
|
||||
fields={"title": "OpenAlex Resolved Work", "author": "Jane Smith", "year": "2022"},
|
||||
)
|
||||
)
|
||||
|
||||
assert resolution is not None
|
||||
assert resolution.source_label == "openalex:search:OpenAlex Resolved Work"
|
||||
assert resolution.entry.fields["openalex"] == "W12345"
|
||||
|
||||
|
||||
def test_resolver_prefers_exact_crossref_title_match_before_datacite():
|
||||
resolver = MetadataResolver()
|
||||
resolver.search_crossref = lambda title, limit=5: [ # type: ignore[method-assign]
|
||||
_crossref_message_to_entry(
|
||||
{
|
||||
"type": "journal-article",
|
||||
"title": [title],
|
||||
"DOI": "10.1126/science.1090005",
|
||||
"container-title": ["Science"],
|
||||
"author": [
|
||||
{"family": "King", "given": "Mary-Claire"},
|
||||
{"family": "Wilson", "given": "A. C."},
|
||||
],
|
||||
"issued": {"date-parts": [[1975, 4, 11]]},
|
||||
}
|
||||
)
|
||||
]
|
||||
resolver.search_datacite = lambda title, limit=5: [ # type: ignore[method-assign]
|
||||
_datacite_work_to_entry(
|
||||
{
|
||||
"attributes": {
|
||||
"doi": "10.5061/dryad.v6wwpzh17",
|
||||
"titles": [
|
||||
{
|
||||
"title": "Conserved patterns and locomotor-related evolutionary constraints in the hominoid vertebral column"
|
||||
}
|
||||
],
|
||||
"creators": [
|
||||
{"familyName": "Villamil", "givenName": "Catalina I."},
|
||||
{"familyName": "Middleton", "givenName": "Emily R."},
|
||||
],
|
||||
"publicationYear": 2024,
|
||||
"types": {"resourceTypeGeneral": "Dataset"},
|
||||
}
|
||||
}
|
||||
)
|
||||
]
|
||||
|
||||
resolution = resolver.resolve_entry(
|
||||
BibEntry(
|
||||
entry_type="article",
|
||||
citation_key="king1975evolution2",
|
||||
fields={
|
||||
"title": "Evolution at two levels in humans and chimpanzees",
|
||||
"author": "King, M. C. and Wilson, A. C.",
|
||||
"year": "1975",
|
||||
},
|
||||
)
|
||||
)
|
||||
|
||||
assert resolution is not None
|
||||
assert resolution.source_label == "crossref:search:Evolution at two levels in humans and chimpanzees"
|
||||
assert resolution.entry.fields["doi"] == "10.1126/science.1090005"
|
||||
|
||||
|
||||
def test_resolver_rejects_mismatched_title_search_candidates():
|
||||
resolver = MetadataResolver()
|
||||
resolver.search_crossref = lambda title, limit=5: [] # type: ignore[method-assign]
|
||||
resolver.search_datacite = lambda title, limit=5: [ # type: ignore[method-assign]
|
||||
_datacite_work_to_entry(
|
||||
{
|
||||
"attributes": {
|
||||
"doi": "10.5061/dryad.v6wwpzh17",
|
||||
"titles": [
|
||||
{
|
||||
"title": "Conserved patterns and locomotor-related evolutionary constraints in the hominoid vertebral column"
|
||||
}
|
||||
],
|
||||
"creators": [
|
||||
{"familyName": "Villamil", "givenName": "Catalina I."},
|
||||
],
|
||||
"publicationYear": 2024,
|
||||
"types": {"resourceTypeGeneral": "Dataset"},
|
||||
}
|
||||
}
|
||||
)
|
||||
]
|
||||
resolver.search_openalex = lambda title, limit=5: [ # type: ignore[method-assign]
|
||||
_openalex_work_to_entry(
|
||||
{
|
||||
"id": "https://openalex.org/W2033360601",
|
||||
"display_name": "Immunological relatedness of glucose 6-phosphate dehydrogenases from vertebrate and invertebrate species.",
|
||||
"publication_year": 1978,
|
||||
"type": "article",
|
||||
"authorships": [
|
||||
{"author": {"display_name": "Yoshikazu Sado"}},
|
||||
{"author": {"display_name": "Samuel H. Hori"}},
|
||||
],
|
||||
"doi": "https://doi.org/10.1266/jjg.53.91",
|
||||
}
|
||||
)
|
||||
]
|
||||
|
||||
resolution = resolver.resolve_entry(
|
||||
BibEntry(
|
||||
entry_type="article",
|
||||
citation_key="sarich1967immunological1",
|
||||
fields={
|
||||
"title": "Immunological Time Scale for Homonid Evolution",
|
||||
"author": "Sarich, V. and Wilson, A.",
|
||||
"year": "1967",
|
||||
},
|
||||
)
|
||||
)
|
||||
|
||||
assert resolution is None
|
||||
|
||||
|
||||
def test_datacite_work_to_entry_maps_basic_fields():
|
||||
entry = _datacite_work_to_entry(
|
||||
{
|
||||
"attributes": {
|
||||
"doi": "10.1000/datacite-example",
|
||||
"titles": [{"title": "Repository Dissertation Record"}],
|
||||
"creators": [{"familyName": "Doe", "givenName": "Jane"}],
|
||||
"publicationYear": 2021,
|
||||
"publisher": "Example University",
|
||||
"url": "https://example.edu/record/123",
|
||||
"types": {"resourceTypeGeneral": "Dissertation"},
|
||||
"descriptions": [
|
||||
{
|
||||
"descriptionType": "Abstract",
|
||||
"description": "An abstract from DataCite.",
|
||||
}
|
||||
],
|
||||
}
|
||||
}
|
||||
)
|
||||
|
||||
assert entry.entry_type == "phdthesis"
|
||||
assert entry.fields["doi"] == "10.1000/datacite-example"
|
||||
assert entry.fields["author"] == "Doe, Jane"
|
||||
assert entry.fields["publisher"] == "Example University"
|
||||
assert entry.fields["abstract"] == "An abstract from DataCite."
|
||||
|
||||
|
||||
def test_resolver_can_resolve_datacite_doi():
|
||||
resolver = MetadataResolver()
|
||||
resolver.source_client.get_json = lambda _url: { # type: ignore[method-assign]
|
||||
"data": {
|
||||
"attributes": {
|
||||
"doi": "10.1000/datacite-example",
|
||||
"titles": [{"title": "Repository Dissertation Record"}],
|
||||
"creators": [{"familyName": "Doe", "givenName": "Jane"}],
|
||||
"publicationYear": 2021,
|
||||
"types": {"resourceTypeGeneral": "Dissertation"},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
resolution = resolver.resolve_datacite_doi("10.1000/datacite-example")
|
||||
|
||||
assert resolution is not None
|
||||
assert resolution.source_label == "datacite:doi:10.1000/datacite-example"
|
||||
assert resolution.entry.entry_type == "phdthesis"
|
||||
|
||||
|
||||
def test_resolver_can_fall_back_to_datacite_title_search():
|
||||
resolver = MetadataResolver()
|
||||
resolver.search_crossref = lambda title, limit=5: [] # type: ignore[method-assign]
|
||||
resolver.search_datacite = lambda title, limit=5: [ # type: ignore[method-assign]
|
||||
_datacite_work_to_entry(
|
||||
{
|
||||
"attributes": {
|
||||
"doi": "10.1000/datacite-example",
|
||||
"titles": [{"title": title}],
|
||||
"creators": [{"familyName": "Doe", "givenName": "Jane"}],
|
||||
"publicationYear": 2021,
|
||||
"types": {"resourceTypeGeneral": "Dissertation"},
|
||||
}
|
||||
}
|
||||
)
|
||||
]
|
||||
resolver.search_openalex = lambda title, limit=5: [] # type: ignore[method-assign]
|
||||
|
||||
resolution = resolver.resolve_entry(
|
||||
BibEntry(
|
||||
entry_type="misc",
|
||||
citation_key="draft1",
|
||||
fields={"title": "Repository Dissertation Record", "author": "Doe, Jane", "year": "2021"},
|
||||
)
|
||||
)
|
||||
|
||||
assert resolution is not None
|
||||
assert resolution.source_label == "datacite:search:Repository Dissertation Record"
|
||||
assert resolution.entry.fields["doi"] == "10.1000/datacite-example"
|
||||
|
||||
|
||||
def test_render_bibtex_tolerates_unmatched_braces_in_field_values():
|
||||
rendered = render_bibtex(
|
||||
[
|
||||
BibEntry(
|
||||
entry_type="misc",
|
||||
citation_key="broken2026",
|
||||
fields={
|
||||
"author": "Broken, Example",
|
||||
"title": "Unmatched { braces } example } tail",
|
||||
"year": "2026",
|
||||
"note": "Open { brace only",
|
||||
},
|
||||
)
|
||||
]
|
||||
)
|
||||
|
||||
assert "@misc{broken2026," in rendered
|
||||
assert "Unmatched { braces } example ) tail" in rendered
|
||||
assert "Open ( brace only" in rendered
|
||||
|
|
@ -0,0 +1,41 @@
|
|||
from pathlib import Path
|
||||
|
||||
from citegeist.sources import SourceClient
|
||||
|
||||
|
||||
def test_source_client_reads_fixture_before_network(tmp_path: Path):
|
||||
fixtures_dir = tmp_path / "fixtures"
|
||||
fixtures_dir.mkdir()
|
||||
|
||||
client = SourceClient(cache_dir=tmp_path / "cache", fixtures_dir=fixtures_dir)
|
||||
url = "https://api.crossref.org/works/10.1000/example"
|
||||
fixture_path = fixtures_dir / client._cache_key(url, "json") # noqa: SLF001
|
||||
fixture_path.write_text('{"message": {"DOI": "10.1000/example"}}', encoding="utf-8")
|
||||
|
||||
payload = client.get_json(url)
|
||||
|
||||
assert payload["message"]["DOI"] == "10.1000/example"
|
||||
|
||||
|
||||
def test_source_client_writes_cache_after_fetch(tmp_path: Path):
    """A successful fetch leaves a cached copy on disk."""
    cache_dir = tmp_path / "cache"
    client = SourceClient(cache_dir=cache_dir)
    url = "https://example.org/test"

    # Stub the network layer so the test stays offline.
    client._fetch_bytes = lambda _url: b'{"ok": true}'  # type: ignore[method-assign]

    payload = client.get_json(url)

    assert payload["ok"] is True
    assert any(cache_dir.iterdir())
|
||||
|
||||
|
||||
def test_source_client_falls_back_to_latin1_for_text(tmp_path: Path):
    """Bytes that are not valid UTF-8 are decoded as Latin-1 instead."""
    client = SourceClient(cache_dir=tmp_path / "cache")
    url = "https://example.org/latin1"

    client._fetch_bytes = lambda _url: "café".encode("iso-8859-1")  # type: ignore[method-assign]

    text = client.get_text(url)

    assert text == "café"
|
||||
|
|
@ -0,0 +1,379 @@
|
|||
from citegeist import BibliographyStore, parse_bibtex
|
||||
|
||||
|
||||
SAMPLE_BIB = """
|
||||
@article{smith2024graphs,
|
||||
author = {Smith, Jane and Doe, Alex},
|
||||
title = {Graph-first bibliography augmentation},
|
||||
year = {2024},
|
||||
doi = {10.1000/graph.2024.1},
|
||||
abstract = {We study citation graphs for literature discovery.},
|
||||
references = {miller2023search}
|
||||
}
|
||||
|
||||
@inproceedings{miller2023search,
|
||||
author = {Miller, Sam},
|
||||
title = {Semantic search for research corpora},
|
||||
year = {2023},
|
||||
abstract = {Dense retrieval improves recall for academic search.}
|
||||
}
|
||||
"""
|
||||
|
||||
|
||||
def test_parse_bibtex_extracts_entries_and_fields():
    """Parsing keeps entry order and exposes raw field values."""
    parsed = parse_bibtex(SAMPLE_BIB)

    keys = [entry.citation_key for entry in parsed]
    assert keys == ["smith2024graphs", "miller2023search"]
    assert parsed[0].fields["title"] == "Graph-first bibliography augmentation"
    assert parsed[0].fields["references"] == "miller2023search"
|
||||
|
||||
|
||||
def test_store_ingests_entries_relations_and_search_text():
    """Ingest populates entries, citation relations, and the search index."""
    db = BibliographyStore()
    try:
        db.ingest_bibtex(
            SAMPLE_BIB,
            fulltext_by_key={
                "smith2024graphs": "This paper links citation graphs with semantic search over abstracts."
            },
        )

        row = db.get_entry("smith2024graphs")
        assert row is not None
        assert row["doi"] == "10.1000/graph.2024.1"

        # The references field became a graph relation.
        assert db.get_relations("smith2024graphs") == ["miller2023search"]

        hits = db.search_text("semantic")
        assert [hit["citation_key"] for hit in hits][:2] == [
            "miller2023search",
            "smith2024graphs",
        ]
    finally:
        db.close()
|
||||
|
||||
|
||||
def test_store_exports_bibtex_from_normalized_rows():
    """Export round-trips the ingested entries back to parseable BibTeX."""
    db = BibliographyStore()
    try:
        db.ingest_bibtex(SAMPLE_BIB)

        exported = db.export_bibtex()
        by_key = {entry.citation_key: entry for entry in parse_bibtex(exported)}

        assert "@article{smith2024graphs," in exported
        assert "@inproceedings{miller2023search," in exported
        assert by_key["smith2024graphs"].fields["author"] == "Smith, Jane and Doe, Alex"
        assert by_key["smith2024graphs"].fields["references"] == "miller2023search"
    finally:
        db.close()
|
||||
|
||||
|
||||
def test_store_records_provenance_and_review_status():
    """Ingest stamps review status and per-field provenance; status is mutable."""
    db = BibliographyStore()
    try:
        db.ingest_bibtex(SAMPLE_BIB, source_label="fixtures/sample.bib", review_status="draft")

        row = db.get_entry("smith2024graphs")
        assert row is not None
        assert row["review_status"] == "draft"

        provenance = db.get_field_provenance("smith2024graphs")
        assert provenance
        assert provenance[0]["source_type"] == "bibtex"
        assert provenance[0]["source_label"] == "fixtures/sample.bib"

        # The review status can be flipped after the fact.
        assert db.set_entry_status("smith2024graphs", "reviewed") is True
        refreshed = db.get_entry("smith2024graphs")
        assert refreshed is not None
        assert refreshed["review_status"] == "reviewed"
    finally:
        db.close()
|
||||
|
||||
|
||||
def test_store_traverses_graph_and_surfaces_missing_targets():
    """Traversal walks 'cites' edges to max_depth and flags unknown targets."""
    db = BibliographyStore()
    try:
        db.ingest_bibtex(
            """
@article{seed2024,
  author = {Seed, Alice},
  title = {Seed Paper},
  year = {2024},
  references = {known2023, missing2022}
}

@article{known2023,
  author = {Known, Bob},
  title = {Known Paper},
  year = {2023},
  references = {leaf2021}
}

@article{leaf2021,
  author = {Leaf, Carol},
  title = {Leaf Paper},
  year = {2021}
}
""",
            review_status="reviewed",
        )

        rows = db.traverse_graph(["seed2024"], relation_types=["cites"], max_depth=2)

        assert [row["target_citation_key"] for row in rows] == [
            "known2023",
            "missing2022",
            "leaf2021",
        ]
        # missing2022 is referenced but was never ingested.
        assert rows[1]["target_exists"] is False
        # leaf2021 is only reachable through known2023, i.e. at depth 2.
        assert rows[2]["depth"] == 2
    finally:
        db.close()
|
||||
|
||||
|
||||
def test_store_records_and_updates_field_conflicts():
    """Recorded conflicts start open and can be moved to another status."""
    db = BibliographyStore()
    try:
        db.ingest_bibtex(
            """
@article{seed2024,
  author = {Seed, Alice},
  title = {Seed Paper},
  year = {2024}
}
"""
        )
        recorded = db.record_conflicts(
            "seed2024",
            [
                {
                    "field_name": "title",
                    "current_value": "Seed Paper",
                    "proposed_value": "Resolved Seed Paper",
                }
            ],
            source_type="resolver",
            source_label="crossref:doi:10.1000/seed",
        )
        assert recorded is True

        conflicts = db.get_field_conflicts("seed2024")
        assert conflicts[0]["field_name"] == "title"
        assert conflicts[0]["status"] == "open"

        # Exactly one conflict row is updated by the status change.
        assert db.set_conflict_status("seed2024", "title", "accepted") == 1
        accepted = db.get_field_conflicts("seed2024", status="accepted")
        assert len(accepted) == 1
    finally:
        db.close()
|
||||
|
||||
|
||||
def test_store_can_apply_latest_conflict_value():
    """Applying a conflict writes the proposed value and accepts the conflict."""
    db = BibliographyStore()
    try:
        db.ingest_bibtex(
            """
@article{seed2024,
  author = {Seed, Alice},
  title = {Seed Paper},
  year = {2024}
}
"""
        )
        db.record_conflicts(
            "seed2024",
            [
                {
                    "field_name": "title",
                    "current_value": "Seed Paper",
                    "proposed_value": "Resolved Seed Paper",
                }
            ],
            source_type="resolver",
            source_label="crossref:doi:10.1000/seed",
        )

        assert db.apply_conflict_value("seed2024", "title") is True
        row = db.get_entry("seed2024")
        assert row is not None
        assert row["title"] == "Resolved Seed Paper"
        # Applying the value also marks the conflict as accepted.
        accepted = db.get_field_conflicts("seed2024", status="accepted")
        assert len(accepted) == 1
    finally:
        db.close()
|
||||
|
||||
|
||||
def test_store_supports_entry_topic_membership():
    """Entries can join multiple topics; topic listings reflect membership."""
    db = BibliographyStore()
    try:
        db.ingest_bibtex(
            """
@article{seed2024,
  author = {Seed, Alice},
  title = {Seed Paper},
  year = {2024}
}
"""
        )

        for slug, name, url in (
            ("graph-methods", "Graph Methods", "https://example.org/topics/graph-methods"),
            ("semantic-search", "Semantic Search", "https://example.org/topics/semantic-search"),
        ):
            assert db.add_entry_topic(
                "seed2024",
                topic_slug=slug,
                topic_name=name,
                source_type="talkorigins",
                source_url=url,
                source_label="topic-seed",
            ) is True

        row = db.get_entry("seed2024")
        assert row is not None
        assert [topic["slug"] for topic in row["topics"]] == ["graph-methods", "semantic-search"]

        topics = db.list_topics()
        assert [topic["slug"] for topic in topics] == ["graph-methods", "semantic-search"]
        assert topics[0]["entry_count"] == 1
        topic = db.get_topic("graph-methods")
        assert topic is not None
        assert topic["name"] == "Graph Methods"
        # No expansion phrase has been set yet.
        assert topic["expansion_phrase"] is None
        members = db.list_topic_entries("graph-methods")
        assert members[0]["citation_key"] == "seed2024"
    finally:
        db.close()
|
||||
|
||||
|
||||
def test_store_can_set_topic_expansion_phrase():
    """Setting an expansion phrase stores it with an 'unreviewed' status."""
    db = BibliographyStore()
    try:
        db.ingest_bibtex(
            """
@article{seed2024,
  author = {Seed, Alice},
  title = {Seed Paper},
  year = {2024}
}
"""
        )
        db.add_entry_topic(
            "seed2024",
            topic_slug="graph-methods",
            topic_name="Graph Methods",
            source_type="talkorigins",
            source_url="https://example.org/topics/graph-methods",
            source_label="topic-seed",
        )
        assert db.set_topic_expansion_phrase("graph-methods", "graph networks biology") is True

        topic = db.get_topic("graph-methods")
        assert topic is not None
        assert topic["expansion_phrase"] == "graph networks biology"
        assert topic["phrase_review_status"] == "unreviewed"
        # The phrase is also visible through the listing API.
        topics = db.list_topics()
        assert topics[0]["expansion_phrase"] == "graph networks biology"
    finally:
        db.close()
|
||||
|
||||
|
||||
def test_store_can_stage_and_review_topic_phrase_suggestion():
    """A staged phrase stays pending until review promotes it to expansion."""
    db = BibliographyStore()
    try:
        db.ensure_topic("graph-methods", "Graph Methods")

        staged_ok = db.stage_topic_phrase_suggestion(
            "graph-methods",
            "graph networks biology",
            review_notes="generated from local titles",
        )
        assert staged_ok is True

        staged = db.get_topic("graph-methods")
        assert staged is not None
        assert staged["suggested_phrase"] == "graph networks biology"
        # Staging never touches the live expansion phrase.
        assert staged["expansion_phrase"] is None
        assert staged["phrase_review_status"] == "pending"
        assert staged["phrase_review_notes"] == "generated from local titles"

        reviewed_ok = db.review_topic_phrase_suggestion(
            "graph-methods",
            "accepted",
            review_notes="looks good",
        )
        assert reviewed_ok is True

        reviewed = db.get_topic("graph-methods")
        assert reviewed is not None
        assert reviewed["suggested_phrase"] == "graph networks biology"
        # Acceptance copies the suggestion into the live expansion phrase.
        assert reviewed["expansion_phrase"] == "graph networks biology"
        assert reviewed["phrase_review_status"] == "accepted"
        assert reviewed["phrase_review_notes"] == "looks good"
    finally:
        db.close()
|
||||
|
||||
|
||||
def test_store_can_filter_topics_by_phrase_review_status():
    """list_topics can be narrowed to a single phrase review status."""
    db = BibliographyStore()
    try:
        db.ensure_topic("graph-methods", "Graph Methods")
        db.ensure_topic("abiogenesis", "Abiogenesis")
        db.stage_topic_phrase_suggestion("graph-methods", "graph networks biology")
        db.stage_topic_phrase_suggestion("abiogenesis", "abiogenesis life origin")
        db.review_topic_phrase_suggestion("abiogenesis", "accepted")

        pending = db.list_topics(phrase_review_status="pending")
        accepted = db.list_topics(phrase_review_status="accepted")

        assert [topic["slug"] for topic in pending] == ["graph-methods"]
        assert [topic["slug"] for topic in accepted] == ["abiogenesis"]
    finally:
        db.close()
|
||||
|
||||
|
||||
def test_store_search_text_can_filter_by_topic():
    """Full-text search restricted to a topic only returns its members."""
    db = BibliographyStore()
    try:
        db.ingest_bibtex(
            """
@article{seed2024,
  author = {Seed, Alice},
  title = {Graph Methods for Biology},
  year = {2024},
  abstract = {A graph methods paper.}
}

@article{other2023,
  author = {Other, Bob},
  title = {Graph Methods for Chemistry},
  year = {2023},
  abstract = {Another graph methods paper.}
}
"""
        )

        for key, slug, name, url in (
            ("seed2024", "biology", "Biology", "https://example.org/topics/biology"),
            ("other2023", "chemistry", "Chemistry", "https://example.org/topics/chemistry"),
        ):
            db.add_entry_topic(
                key,
                topic_slug=slug,
                topic_name=name,
                source_type="talkorigins",
                source_url=url,
                source_label="topic-seed",
            )
        db.connection.commit()

        # Both entries match "graph"; the topic filter keeps only biology.
        hits = db.search_text("graph", topic_slug="biology")

        assert [hit["citation_key"] for hit in hits] == ["seed2024"]
    finally:
        db.close()
|
||||
File diff suppressed because it is too large
Load Diff
|
|
@ -0,0 +1,242 @@
|
|||
from citegeist.bibtex import BibEntry
|
||||
from citegeist.expand import (
|
||||
ExpansionResult,
|
||||
TopicExpander,
|
||||
_meets_topic_assignment_threshold,
|
||||
_topic_relevance_score,
|
||||
)
|
||||
from citegeist.storage import BibliographyStore
|
||||
|
||||
|
||||
class FakeOpenAlexExpander:
    """Test double that returns canned expansion results without any network."""

    def __init__(self, results: list[ExpansionResult] | dict[str, list[ExpansionResult]]) -> None:
        # Either one shared result list, or a per-citation-key mapping.
        self.results = results

    def expand_entry(self, store, citation_key, relation_type="cites", limit=25):
        """Return a fresh copy of the canned results for *citation_key*."""
        canned = self.results
        if isinstance(canned, dict):
            canned = canned.get(citation_key, [])
        return list(canned)
|
||||
|
||||
|
||||
def test_topic_expander_assigns_relevant_discoveries_back_to_topic():
    """Discoveries above the relevance floor join the seed's topic; others don't."""
    db = BibliographyStore()
    try:
        db.ingest_bibtex(
            """
@article{seed2024,
  author = {Seed, Alice},
  title = {Abiogenesis Seed Paper},
  year = {2024}
}
"""
        )
        db.add_entry_topic(
            "seed2024",
            topic_slug="abiogenesis",
            topic_name="Abiogenesis",
            source_type="talkorigins",
            source_url="https://example.org/topics/abiogenesis",
            source_label="seed",
        )
        # One on-topic and one off-topic discovered entry.
        for key, title, abstract in (
            ("discovered1", "Abiogenesis and origin chemistry", "A study of abiogenesis pathways."),
            ("discovered2", "Galaxy formation dynamics", "Nothing about the topic."),
        ):
            db.upsert_entry(
                BibEntry(
                    entry_type="article",
                    citation_key=key,
                    fields={"title": title, "abstract": abstract, "year": "2025"},
                ),
                source_type="graph_expand",
                source_label="test",
                review_status="draft",
            )
        db.connection.commit()

        expander = TopicExpander(
            openalex_expander=FakeOpenAlexExpander(
                [
                    ExpansionResult("seed2024", "discovered1", False, "cites", "openalex:cites:seed2024"),
                    ExpansionResult("seed2024", "discovered2", False, "cites", "openalex:cites:seed2024"),
                ]
            )
        )

        results = expander.expand_topic(
            db,
            "abiogenesis",
            topic_phrase="abiogenesis origin chemistry",
            min_relevance=0.34,
        )

        assert len(results) == 2
        assigned = {item.discovered_citation_key: item.assigned_to_topic for item in results}
        assert assigned["discovered1"] is True
        assert assigned["discovered2"] is False
        # Only the relevant discovery acquired the topic link.
        assert db.get_entry_topics("discovered1")[0]["slug"] == "abiogenesis"
        assert db.get_entry_topics("discovered2") == []
    finally:
        db.close()
|
||||
|
||||
|
||||
def test_topic_expander_can_restrict_to_allowed_seed_keys():
    """Seeds outside seed_keys are skipped, so their discoveries never land."""
    db = BibliographyStore()
    try:
        db.ingest_bibtex(
            """
@article{seed2024,
  author = {Seed, Alice},
  title = {Abiogenesis Seed Paper},
  year = {2024}
}

@article{seed2023,
  author = {Seed, Bob},
  title = {Abiogenesis Historical Seed},
  year = {2023}
}
"""
        )
        for key in ("seed2024", "seed2023"):
            db.add_entry_topic(
                key,
                topic_slug="abiogenesis",
                topic_name="Abiogenesis",
                source_type="talkorigins",
                source_url="https://example.org/topics/abiogenesis",
                source_label="seed",
            )
        db.upsert_entry(
            BibEntry(
                entry_type="article",
                citation_key="discovered1",
                fields={
                    "title": "Abiogenesis origin chemistry",
                    "abstract": "A study of abiogenesis chemistry.",
                    "year": "2025",
                },
            ),
            source_type="graph_expand",
            source_label="test",
            review_status="draft",
        )
        db.connection.commit()

        # Canned results exist only for seed2023, which is NOT in seed_keys.
        expander = TopicExpander(
            openalex_expander=FakeOpenAlexExpander(
                {"seed2023": [ExpansionResult("seed2023", "discovered1", False, "cites", "openalex:cites:seed2023")]}
            )
        )

        results = expander.expand_topic(
            db,
            "abiogenesis",
            topic_phrase="abiogenesis origin chemistry",
            seed_keys=["seed2024"],
        )

        assert results == []
        assert db.get_entry_topics("discovered1") == []
    finally:
        db.close()
|
||||
|
||||
|
||||
def test_topic_expander_preview_discovers_without_writing():
    """preview_only reports would-be discoveries but persists nothing."""
    db = BibliographyStore()
    try:
        db.ingest_bibtex(
            """
@article{seed2024,
  author = {Seed, Alice},
  title = {Abiogenesis Seed Paper},
  year = {2024}
}
"""
        )
        db.add_entry_topic(
            "seed2024",
            topic_slug="abiogenesis",
            topic_name="Abiogenesis",
            source_type="talkorigins",
            source_url="https://example.org/topics/abiogenesis",
            source_label="seed",
        )
        db.connection.commit()

        expander = TopicExpander()
        # Stub the discovery step with one canned (result, fields) pair.
        preview_result = ExpansionResult(
            "seed2024",
            "preview1",
            True,
            "cites",
            "openalex:cites:seed2024",
        )
        preview_fields = {
            "title": "Abiogenesis origin chemistry",
            "abstract": "A study of abiogenesis chemistry.",
            "year": "2025",
        }
        expander._preview_discoveries = lambda *_args, **_kwargs: [  # type: ignore[method-assign]
            (preview_result, preview_fields)
        ]

        results = expander.expand_topic(
            db,
            "abiogenesis",
            topic_phrase="abiogenesis origin chemistry",
            min_relevance=0.3,
            preview_only=True,
        )

        assert len(results) == 1
        assert results[0].discovered_citation_key == "preview1"
        assert results[0].meets_relevance_threshold is True
        assert results[0].assigned_to_topic is False
        assert results[0].created_entry is True
        # Preview mode must not create the entry or any topic link.
        assert db.get_entry("preview1") is None
        assert db.get_entry_topics("preview1") == []
    finally:
        db.close()
|
||||
|
||||
|
||||
def test_topic_relevance_score_expands_human_evolution_terms():
    """Hominid/fossil vocabulary counts toward a 'human evolution' phrase."""
    entry_fields = {
        "title": "Body size and proportions in early hominids",
        "abstract": "A fossil and paleolithic perspective on primate ancestry.",
        "journal": "Science",
    }

    score = _topic_relevance_score("human evolution", entry_fields)

    assert score >= 0.15
|
||||
|
||||
def test_topic_assignment_requires_title_anchor():
    """A relevant abstract alone is not enough: the title must anchor the topic."""
    entry = {
        "title": "Phylogenies and the Comparative Method",
        "abstract": "A comparative framework for primate and hominid evolution.",
        "journal": "Systematic Zoology",
    }

    score = _topic_relevance_score("human evolution", entry)

    # The score clears the floor, yet assignment is still refused because the
    # topic vocabulary never appears in the title itself.
    assert score >= 0.15
    assert (
        _meets_topic_assignment_threshold(
            "human evolution", entry, min_relevance=0.15, relevance_score=score
        )
        is False
    )
|
||||
Loading…
Reference in New Issue