Add live source test and smoke scaffolding

This commit is contained in:
welsberr 2026-03-19 21:22:30 -04:00
parent d4d31b371f
commit cabe12719b
9 changed files with 367 additions and 6 deletions

View File

@ -47,10 +47,10 @@ The initial repo includes:
- a SQLite-backed bibliography store; - a SQLite-backed bibliography store;
- a small CLI for ingest, search, inspection, and export; - a small CLI for ingest, search, inspection, and export;
- review-state tracking on entries and per-field ingest provenance; - review-state tracking on entries and per-field ingest provenance;
- first-pass plaintext reference extraction into draft BibTeX; - plaintext reference extraction into draft BibTeX for numbered, APA-like, wrapped-line, and simple book-style references;
- identifier-first metadata resolution for DOI, DBLP, and arXiv-backed entries; - identifier-first metadata resolution for DOI, OpenAlex, DBLP, and arXiv-backed entries, with OpenAlex title-search fallback;
- local citation-graph traversal over stored `cites`, `cited_by`, and `crossref` edges; - local citation-graph traversal over stored `cites`, `cited_by`, and `crossref` edges;
- Crossref-backed graph expansion that materializes draft referenced works and edge provenance; - Crossref- and OpenAlex-backed graph expansion that materializes draft related works and edge provenance;
- a dedicated source-client layer with fixture/cache support for live-source development; - a dedicated source-client layer with fixture/cache support for live-source development;
- normalized tables for entries, creators, identifiers, and citation relations; - normalized tables for entries, creators, identifiers, and citation relations;
- full-text-search-ready indexing over title, abstract, and fulltext when SQLite FTS5 is available; - full-text-search-ready indexing over title, abstract, and fulltext when SQLite FTS5 is available;
@ -119,15 +119,27 @@ PYTHONPATH=src .venv/bin/python -m citegeist extract references.txt --output dra
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 resolve smith2024graphs PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 resolve smith2024graphs
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 graph smith2024graphs --relation cites --depth 2 --missing-only PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 graph smith2024graphs --relation cites --depth 2 --missing-only
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 expand smith2024graphs --source crossref PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 expand smith2024graphs --source crossref
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 expand smith2024graphs --source openalex --relation cited_by --limit 10
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 export --output reviewed.bib PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 export --output reviewed.bib
``` ```
For live-source development, prefer fixture-backed or cache-backed source clients so resolver and expansion work can be exercised repeatedly without re-hitting upstream APIs on every run. For live-source development, prefer fixture-backed or cache-backed source clients so resolver and expansion work can be exercised repeatedly without re-hitting upstream APIs on every run.
Live-source workflow:
```bash
cd citegeist
export CITEGEIST_SOURCE_CACHE=.cache/citegeist
export CITEGEIST_LIVE_TESTS=1
PYTHONPATH=src .venv/bin/python -m pytest -m live -q
PYTHONPATH=src .venv/bin/python scripts/live_smoke.py
```
By default, live tests are skipped. They only run when `CITEGEIST_LIVE_TESTS=1` is set.
## Near-Term Priorities ## Near-Term Priorities
- stronger plaintext extraction coverage for more citation styles; - additional resolvers and expansion paths for non-DOI scholarly ecosystems.
- richer graph expansion from additional external citation sources.
See [ROADMAP.md](./ROADMAP.md) for the prioritized phase plan and rationale. See [ROADMAP.md](./ROADMAP.md) for the prioritized phase plan and rationale.

View File

@ -15,3 +15,6 @@ citegeist = "citegeist.cli:main"
[tool.pytest.ini_options] [tool.pytest.ini_options]
pythonpath = ["src"] pythonpath = ["src"]
testpaths = ["tests"] testpaths = ["tests"]
markers = [
"live: tests that call live external scholarly APIs and are skipped unless explicitly enabled",
]

58
scripts/live_smoke.py Normal file
View File

@ -0,0 +1,58 @@
from __future__ import annotations
import argparse
import json
import os
from citegeist import MetadataResolver, SourceClient
def build_parser() -> argparse.ArgumentParser:
    """Create the argument parser for the live smoke script.

    Both options take their defaults from the environment:
    ``CITEGEIST_SOURCE_CACHE`` (falling back to ``.cache/citegeist``) for
    ``--cache-dir`` and ``CITEGEIST_SOURCE_FIXTURES`` (no fallback, so the
    default may be ``None``) for ``--fixtures-dir``.
    """
    default_cache_dir = os.environ.get("CITEGEIST_SOURCE_CACHE", ".cache/citegeist")
    default_fixtures_dir = os.environ.get("CITEGEIST_SOURCE_FIXTURES")

    smoke_parser = argparse.ArgumentParser(
        description="Run live smoke checks against scholarly metadata sources"
    )
    smoke_parser.add_argument(
        "--cache-dir",
        default=default_cache_dir,
        help="Directory for cached live-source responses",
    )
    smoke_parser.add_argument(
        "--fixtures-dir",
        default=default_fixtures_dir,
        help="Optional fixture directory to read before live network calls",
    )
    return smoke_parser
def main() -> int:
    """Run one lookup per live source and print a JSON summary of the results.

    Three checks are executed through a single ``MetadataResolver``: a DOI
    lookup, an arXiv-id lookup, and an OpenAlex title search. Each check is
    reported under its name; a check that produced no resolution is reported
    as ``null``.

    Returns:
        ``0`` when every check produced a resolution, ``1`` when at least one
        check came back empty, so shell callers and CI can detect a failed
        smoke run from the exit status instead of having to parse the JSON.
    """
    args = build_parser().parse_args()
    client = SourceClient(cache_dir=args.cache_dir, fixtures_dir=args.fixtures_dir)
    resolver = MetadataResolver(source_client=client)

    checks = {
        "crossref_doi": resolver.resolve_doi("10.1038/nphys1170"),
        "arxiv_id": resolver.resolve_arxiv("1706.03762"),
        "openalex_search": resolver.search_openalex_best_match(
            title="Attention Is All You Need",
            author_text="Ashish Vaswani",
            year="2017",
        ),
    }

    # Entry fields copied verbatim into each check's summary.
    reported_fields = ("title", "year", "doi", "openalex", "arxiv")

    payload = {}
    for name, resolution in checks.items():
        if resolution is None:
            payload[name] = None
            continue
        entry_fields = resolution.entry.fields
        summary = {"source_label": resolution.source_label}
        for field_name in reported_fields:
            summary[field_name] = entry_fields.get(field_name)
        payload[name] = summary

    print(json.dumps(payload, indent=2, sort_keys=True))

    # A smoke check that cannot fail is not a check: signal empty results.
    any_missing = any(summary is None for summary in payload.values())
    return 1 if any_missing else 0
# Script entry point: propagate main()'s return value as the process exit status.
if __name__ == "__main__":
    exit_status = main()
    raise SystemExit(exit_status)

15
tests/conftest.py Normal file
View File

@ -0,0 +1,15 @@
from __future__ import annotations
import os
import pytest
def pytest_collection_modifyitems(config: pytest.Config, items: list[pytest.Item]) -> None:
    """Skip tests carrying the ``live`` keyword unless live testing is enabled.

    When the ``CITEGEIST_LIVE_TESTS`` environment variable equals ``"1"`` the
    collected items are left untouched. Otherwise every item whose keywords
    contain ``live`` receives a skip marker explaining how to enable it.
    """
    live_enabled = os.environ.get("CITEGEIST_LIVE_TESTS") == "1"
    if not live_enabled:
        skip_marker = pytest.mark.skip(reason="set CITEGEIST_LIVE_TESTS=1 to run live-source tests")
        live_items = [candidate for candidate in items if "live" in candidate.keywords]
        for live_item in live_items:
            live_item.add_marker(skip_marker)

View File

@ -203,3 +203,39 @@ def test_cli_expand_with_mocked_crossref(tmp_path: Path):
exit_code = main(["--db", str(database), "expand", "seed2024"]) exit_code = main(["--db", str(database), "expand", "seed2024"])
assert exit_code == 0 assert exit_code == 0
def test_cli_expand_with_mocked_openalex(tmp_path: Path):
    """``expand --source openalex`` must reach OpenAlexExpander and exit with 0."""
    bib_path = tmp_path / "expand-openalex.bib"
    bib_path.write_text(
        """
@article{seed2024,
author = {Seed, Alice},
title = {Seed Paper},
year = {2024},
doi = {10.1000/seed-doi}
}
""",
        encoding="utf-8",
    )

    # NOTE(review): run_cli is assumed to ingest into tmp_path / "library.sqlite3",
    # the same database passed to main() below - confirm against the helper.
    ingest = run_cli(tmp_path, "ingest", str(bib_path))
    assert ingest.returncode == 0

    from citegeist.expand import ExpansionResult

    with patch("citegeist.cli.OpenAlexExpander.expand_entry") as mocked_expand:
        mocked_expand.return_value = [
            ExpansionResult(
                source_citation_key="seed2024",
                discovered_citation_key="openalexw12345",
                created_entry=True,
                relation_type="cites",
                source_label="openalex:cites:WSEED",
            )
        ]

        database = tmp_path / "library.sqlite3"
        exit_code = main(
            ["--db", str(database), "expand", "seed2024", "--source", "openalex", "--relation", "cites"]
        )

    assert exit_code == 0
    # The exit code alone cannot tell whether the OpenAlex path actually ran;
    # require that the patched expander was invoked.
    assert mocked_expand.called

View File

@ -7,6 +7,18 @@ SAMPLE_REFERENCES = """
[2] Miller, Sam. 2023. Semantic search for research corpora. Proceedings of the Retrieval Workshop. [2] Miller, Sam. 2023. Semantic search for research corpora. Proceedings of the Retrieval Workshop.
""" """
# Unnumbered fixture text: an APA-like journal reference followed by a
# book-style reference (title, publisher, year).
APA_AND_BOOK_REFERENCES = """
Brown, T., & Green, P. (2021). Retrieval methods for scholarly corpora. Journal of Information Retrieval.
Nguyen, An. Research Design for Literature Mapping. Example University Press, 2020.
"""
# Numbered fixture text in which reference [1] wraps onto a second physical
# line while reference [2] fits on a single line.
WRAPPED_REFERENCES = """
[1] Taylor, Ann. 2022. Multi-line reference extraction
for bibliography pipelines. Journal of Parsing Systems.
[2] Chen, Bo. 2021. Another entry. Proceedings of the Mining Workshop.
"""
def test_extract_references_builds_draft_entries(): def test_extract_references_builds_draft_entries():
entries = extract_references(SAMPLE_REFERENCES) entries = extract_references(SAMPLE_REFERENCES)
@ -33,3 +45,21 @@ def test_extract_cli_writes_bibtex(tmp_path):
parsed = {entry.citation_key: entry for entry in parse_bibtex(exported)} parsed = {entry.citation_key: entry for entry in parse_bibtex(exported)}
assert parsed["smith2024graphfirst1"].fields["journal"] == "Journal of Research Systems" assert parsed["smith2024graphfirst1"].fields["journal"] == "Journal of Research Systems"
assert parsed["miller2023semantic2"].fields["booktitle"] == "Proceedings of the Retrieval Workshop" assert parsed["miller2023semantic2"].fields["booktitle"] == "Proceedings of the Retrieval Workshop"
def test_extract_references_supports_apa_and_book_styles():
    """An APA-like reference becomes an article and a book-style one becomes a book."""
    parsed = extract_references(APA_AND_BOOK_REFERENCES)

    entry_types = [item.entry_type for item in parsed]
    assert entry_types == ["article", "book"]

    article_fields = parsed[0].fields
    assert article_fields["journal"] == "Journal of Information Retrieval"
    assert article_fields["author"] == "Brown, T., and Green, P"

    book_fields = parsed[1].fields
    assert book_fields["publisher"] == "Example University Press"
    assert book_fields["title"] == "Research Design for Literature Mapping"
def test_extract_references_joins_wrapped_reference_lines():
    """A reference split across two physical lines is parsed as one entry."""
    parsed = extract_references(WRAPPED_REFERENCES)
    assert len(parsed) == 2

    first_fields = parsed[0].fields
    assert first_fields["title"] == "Multi-line reference extraction for bibliography pipelines"
    assert first_fields["journal"] == "Journal of Parsing Systems"

View File

@ -0,0 +1,52 @@
from __future__ import annotations
import os
import pytest
from citegeist import MetadataResolver, SourceClient
# Module-level marker: applies the `live` mark to every test in this module.
pytestmark = pytest.mark.live
def _live_client() -> SourceClient:
    """Build a SourceClient configured from the environment.

    The cache directory comes from ``CITEGEIST_SOURCE_CACHE`` (default
    ``.cache/citegeist``); the fixtures directory comes from
    ``CITEGEIST_SOURCE_FIXTURES`` and is ``None`` when that is unset.
    """
    environment = os.environ
    return SourceClient(
        cache_dir=environment.get("CITEGEIST_SOURCE_CACHE", ".cache/citegeist"),
        fixtures_dir=environment.get("CITEGEIST_SOURCE_FIXTURES"),
    )
def test_live_crossref_doi_resolution():
    """Resolving a known DOI yields an entry carrying that DOI and a title."""
    doi = "10.1038/nphys1170"
    resolution = MetadataResolver(source_client=_live_client()).resolve_doi(doi)

    assert resolution is not None
    resolved_fields = resolution.entry.fields
    assert resolved_fields.get("doi") == doi
    assert resolved_fields.get("title")
def test_live_arxiv_resolution():
    """Resolving a known arXiv id yields an entry carrying that id and a title."""
    arxiv_id = "1706.03762"
    resolution = MetadataResolver(source_client=_live_client()).resolve_arxiv(arxiv_id)

    assert resolution is not None
    resolved_fields = resolution.entry.fields
    assert resolved_fields.get("arxiv") == arxiv_id
    assert resolved_fields.get("title")
def test_live_openalex_title_search():
    """An OpenAlex title search for a known paper returns a titled entry with an OpenAlex id."""
    query = {
        "title": "Attention Is All You Need",
        "author_text": "Ashish Vaswani",
        "year": "2017",
    }
    resolver = MetadataResolver(source_client=_live_client())
    resolution = resolver.search_openalex_best_match(**query)

    assert resolution is not None
    resolved_fields = resolution.entry.fields
    assert resolved_fields.get("title")
    assert resolved_fields.get("openalex")

View File

@ -0,0 +1,84 @@
from citegeist.expand import OpenAlexExpander, _openalex_work_to_entry
from citegeist.storage import BibliographyStore
def test_openalex_work_to_entry_maps_basic_fields():
    """An OpenAlex work payload is mapped onto the expected key and fields."""
    work = {
        "id": "https://openalex.org/W12345",
        "doi": "https://doi.org/10.1000/example-openalex",
        "display_name": "OpenAlex Discovered Work",
        "publication_year": 2022,
        "type": "article",
        "authorships": [{"author": {"display_name": "Jane Smith"}}],
        "primary_location": {"source": {"display_name": "Journal of Graph Discovery"}},
        "abstract_inverted_index": {"Graph": [0], "discovery": [1]},
    }

    entry = _openalex_work_to_entry(work)

    assert entry.citation_key == "openalexw12345"
    mapped = entry.fields
    # URL prefixes are expected to be stripped from both identifiers.
    assert mapped["openalex"] == "W12345"
    assert mapped["doi"] == "10.1000/example-openalex"
    assert mapped["journal"] == "Journal of Graph Discovery"
    # The inverted index is expected to be rebuilt into plain text.
    assert mapped["abstract"] == "Graph discovery"
def test_openalex_expander_adds_outgoing_and_incoming_edges():
    """Expanding in both directions stores ``cites`` edges out of and into the seed."""
    # NOTE(review): the first payload is presumably the seed-work lookup - confirm.
    seed_lookup = {
        "results": [
            {
                "id": "https://openalex.org/WSEED",
            }
        ]
    }
    referenced_works = {
        "results": [
            {
                "id": "https://openalex.org/WDISCOVERED",
                "display_name": "Referenced OpenAlex Work",
                "publication_year": 2021,
                "type": "article",
                "authorships": [{"author": {"display_name": "Bob Known"}}],
                "primary_location": {"source": {"display_name": "OpenAlex Journal"}},
            }
        ]
    }
    citing_works = {
        "results": [
            {
                "id": "https://openalex.org/WCITING",
                "display_name": "Citing OpenAlex Work",
                "publication_year": 2025,
                "type": "article",
                "authorships": [{"author": {"display_name": "Carol Citing"}}],
            }
        ]
    }
    # Canned responses are handed out strictly in call order, whatever the URL.
    queued_responses = iter([seed_lookup, referenced_works, citing_works])

    def fake_get_json(_url):
        return next(queued_responses)

    store = BibliographyStore()
    try:
        store.ingest_bibtex(
            """
@article{seed2024,
author = {Seed, Alice},
title = {Seed Paper},
year = {2024},
doi = {10.1000/seed-doi}
}
"""
        )
        expander = OpenAlexExpander()
        expander.resolver.source_client.get_json = fake_get_json  # type: ignore[method-assign]

        outgoing = expander.expand_entry(store, "seed2024", relation_type="cites", limit=5)
        incoming = expander.expand_entry(store, "seed2024", relation_type="cited_by", limit=5)

        assert outgoing[0].discovered_citation_key == "openalexwdiscovered"
        assert incoming[0].source_citation_key == "openalexwciting"

        seed_cites = store.get_relations("seed2024", "cites")
        assert "openalexwdiscovered" in seed_cites
        # The incoming edge is checked as a `cites` relation from the citing work.
        citing_cites = store.get_relations("openalexwciting", "cites")
        assert "seed2024" in citing_cites
    finally:
        store.close()

View File

@ -1,7 +1,13 @@
from xml.etree import ElementTree as ET from xml.etree import ElementTree as ET
from citegeist.bibtex import BibEntry from citegeist.bibtex import BibEntry
from citegeist.resolve import MetadataResolver, _arxiv_atom_entry_to_bib, _crossref_message_to_entry, merge_entries from citegeist.resolve import (
MetadataResolver,
_arxiv_atom_entry_to_bib,
_crossref_message_to_entry,
_openalex_work_to_entry,
merge_entries,
)
def test_crossref_message_to_entry_maps_basic_fields(): def test_crossref_message_to_entry_maps_basic_fields():
@ -83,3 +89,68 @@ def test_resolver_tries_doi_before_dblp():
) )
assert calls == [("doi", "10.1000/example-doi"), ("dblp", "conf/test/Smith24")] assert calls == [("doi", "10.1000/example-doi"), ("dblp", "conf/test/Smith24")]
def test_openalex_work_to_entry_maps_basic_fields():
    """An OpenAlex work payload is mapped onto the expected key and fields."""
    work = {
        "id": "https://openalex.org/W12345",
        "doi": "https://doi.org/10.1000/example-openalex",
        "display_name": "OpenAlex Resolved Work",
        "publication_year": 2022,
        "type": "article",
        "authorships": [{"author": {"display_name": "Jane Smith"}}],
        "primary_location": {"source": {"display_name": "Journal of Open Graphs"}},
        "abstract_inverted_index": {"OpenAlex": [0], "resolved": [1]},
    }

    entry = _openalex_work_to_entry(work)

    assert entry.citation_key == "openalexw12345"
    mapped = entry.fields
    # URL prefixes are expected to be stripped from both identifiers.
    assert mapped["openalex"] == "W12345"
    assert mapped["doi"] == "10.1000/example-openalex"
    assert mapped["journal"] == "Journal of Open Graphs"
    # The inverted index is expected to be rebuilt into plain text.
    assert mapped["abstract"] == "OpenAlex resolved"
def test_resolver_can_resolve_openalex_id():
    """Resolving an OpenAlex id labels the result with that id and stores it on the entry."""

    def fake_get_json(_url):
        # Stand-in for the HTTP layer: returns the same work for any URL.
        return {
            "id": "https://openalex.org/W12345",
            "display_name": "OpenAlex Resolved Work",
            "publication_year": 2022,
            "type": "article",
            "authorships": [{"author": {"display_name": "Jane Smith"}}],
        }

    resolver = MetadataResolver()
    resolver.source_client.get_json = fake_get_json  # type: ignore[method-assign]

    resolution = resolver.resolve_openalex("W12345")

    assert resolution is not None
    assert resolution.source_label == "openalex:id:W12345"
    assert resolution.entry.fields["openalex"] == "W12345"
def test_resolver_falls_back_to_openalex_title_search():
    """An entry with only title, author and year is resolved via OpenAlex title search."""

    def fake_search(title, limit=5):
        # Echo the queried title back as the single search hit.
        hit = _openalex_work_to_entry(
            {
                "id": "https://openalex.org/W12345",
                "display_name": title,
                "publication_year": 2022,
                "type": "article",
                "authorships": [{"author": {"display_name": "Jane Smith"}}],
            }
        )
        return [hit]

    resolver = MetadataResolver()
    resolver.search_openalex = fake_search  # type: ignore[method-assign]

    # The draft carries no DOI, DBLP, arXiv or OpenAlex identifier.
    draft = BibEntry(
        entry_type="article",
        citation_key="smith2022openalex",
        fields={"title": "OpenAlex Resolved Work", "author": "Jane Smith", "year": "2022"},
    )
    resolution = resolver.resolve_entry(draft)

    assert resolution is not None
    assert resolution.source_label == "openalex:search:OpenAlex Resolved Work"
    assert resolution.entry.fields["openalex"] == "W12345"