From cabe12719b8c5050eb6a239d1639339d2323d022 Mon Sep 17 00:00:00 2001 From: welsberr Date: Thu, 19 Mar 2026 21:22:30 -0400 Subject: [PATCH] Add live source test and smoke scaffolding --- README.md | 22 ++++++--- pyproject.toml | 3 ++ scripts/live_smoke.py | 58 ++++++++++++++++++++++++ tests/conftest.py | 15 +++++++ tests/test_cli.py | 36 +++++++++++++++ tests/test_extract.py | 30 +++++++++++++ tests/test_live_sources.py | 52 ++++++++++++++++++++++ tests/test_openalex_expand.py | 84 +++++++++++++++++++++++++++++++++++ tests/test_resolve.py | 73 +++++++++++++++++++++++++++++- 9 files changed, 367 insertions(+), 6 deletions(-) create mode 100644 scripts/live_smoke.py create mode 100644 tests/conftest.py create mode 100644 tests/test_live_sources.py create mode 100644 tests/test_openalex_expand.py diff --git a/README.md b/README.md index 7bcecbd..d00168f 100644 --- a/README.md +++ b/README.md @@ -47,10 +47,10 @@ The initial repo includes: - a SQLite-backed bibliography store; - a small CLI for ingest, search, inspection, and export; - review-state tracking on entries and per-field ingest provenance; -- first-pass plaintext reference extraction into draft BibTeX; -- identifier-first metadata resolution for DOI, DBLP, and arXiv-backed entries; +- plaintext reference extraction into draft BibTeX for numbered, APA-like, wrapped-line, and simple book-style references; +- identifier-first metadata resolution for DOI, OpenAlex, DBLP, and arXiv-backed entries, with OpenAlex title-search fallback; - local citation-graph traversal over stored `cites`, `cited_by`, and `crossref` edges; -- Crossref-backed graph expansion that materializes draft referenced works and edge provenance; +- Crossref- and OpenAlex-backed graph expansion that materializes draft related works and edge provenance; - a dedicated source-client layer with fixture/cache support for live-source development; - normalized tables for entries, creators, identifiers, and citation relations; - full-text-search-ready indexing over title, abstract, and fulltext when SQLite FTS5 is available; @@ -119,15 +119,27 @@ PYTHONPATH=src .venv/bin/python -m citegeist extract references.txt --output dra PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 resolve smith2024graphs PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 graph smith2024graphs --relation cites --depth 2 --missing-only PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 expand smith2024graphs --source crossref +PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 expand smith2024graphs --source openalex --relation cited_by --limit 10 PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 export --output reviewed.bib ``` For live-source development, prefer fixture-backed or cache-backed source clients so resolver and expansion work can be exercised repeatedly without re-hitting upstream APIs on every run. +Live-source workflow: + +```bash +cd citegeist +export CITEGEIST_SOURCE_CACHE=.cache/citegeist +export CITEGEIST_LIVE_TESTS=1 +PYTHONPATH=src .venv/bin/python -m pytest -m live -q +PYTHONPATH=src .venv/bin/python scripts/live_smoke.py +``` + +By default, live tests are skipped. They only run when `CITEGEIST_LIVE_TESTS=1` is set. + ## Near-Term Priorities -- stronger plaintext extraction coverage for more citation styles; -- richer graph expansion from additional external citation sources. +- additional resolvers and expansion paths for non-DOI scholarly ecosystems. See [ROADMAP.md](./ROADMAP.md) for the prioritized phase plan and rationale. diff --git a/pyproject.toml b/pyproject.toml index 5357fce..880b039 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,3 +15,6 @@ citegeist = "citegeist.cli:main" [tool.pytest.ini_options] pythonpath = ["src"] testpaths = ["tests"] +markers = [ + "live: tests that call live external scholarly APIs and are skipped unless explicitly enabled", +] diff --git a/scripts/live_smoke.py b/scripts/live_smoke.py new file mode 100644 index 0000000..5d84b09 --- /dev/null +++ b/scripts/live_smoke.py @@ -0,0 +1,58 @@ +from __future__ import annotations + +import argparse +import json +import os + +from citegeist import MetadataResolver, SourceClient + + +def build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser(description="Run live smoke checks against scholarly metadata sources") + parser.add_argument( + "--cache-dir", + default=os.environ.get("CITEGEIST_SOURCE_CACHE", ".cache/citegeist"), + help="Directory for cached live-source responses", + ) + parser.add_argument( + "--fixtures-dir", + default=os.environ.get("CITEGEIST_SOURCE_FIXTURES"), + help="Optional fixture directory to read before live network calls", + ) + return parser + + +def main() -> int: + args = build_parser().parse_args() + client = SourceClient(cache_dir=args.cache_dir, fixtures_dir=args.fixtures_dir) + resolver = MetadataResolver(source_client=client) + + checks = { + "crossref_doi": resolver.resolve_doi("10.1038/nphys1170"), + "arxiv_id": resolver.resolve_arxiv("1706.03762"), + "openalex_search": resolver.search_openalex_best_match( + title="Attention Is All You Need", + author_text="Ashish Vaswani", + year="2017", + ), + } + + payload = {} + for name, resolution in checks.items(): + payload[name] = None + if resolution is not None: + payload[name] = { + "source_label": resolution.source_label, + "title": resolution.entry.fields.get("title"), + "year": resolution.entry.fields.get("year"), + "doi": resolution.entry.fields.get("doi"), + "openalex": resolution.entry.fields.get("openalex"), + "arxiv": resolution.entry.fields.get("arxiv"), + } + + print(json.dumps(payload, indent=2, sort_keys=True)) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..2100be7 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,15 @@ +from __future__ import annotations + +import os + +import pytest + + +def pytest_collection_modifyitems(config: pytest.Config, items: list[pytest.Item]) -> None: + if os.environ.get("CITEGEIST_LIVE_TESTS") == "1": + return + + skip_live = pytest.mark.skip(reason="set CITEGEIST_LIVE_TESTS=1 to run live-source tests") + for item in items: + if "live" in item.keywords: + item.add_marker(skip_live) diff --git a/tests/test_cli.py b/tests/test_cli.py index bfc1734..4fed32c 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -203,3 +203,39 @@ def test_cli_expand_with_mocked_crossref(tmp_path: Path): exit_code = main(["--db", str(database), "expand", "seed2024"]) assert exit_code == 0 + + +def test_cli_expand_with_mocked_openalex(tmp_path: Path): + bib_path = tmp_path / "expand-openalex.bib" + bib_path.write_text( + """ +@article{seed2024, + author = {Seed, Alice}, + title = {Seed Paper}, + year = {2024}, + doi = {10.1000/seed-doi} +} +""", + encoding="utf-8", + ) + ingest = run_cli(tmp_path, "ingest", str(bib_path)) + assert ingest.returncode == 0 + + from citegeist.expand import ExpansionResult + + with patch("citegeist.cli.OpenAlexExpander.expand_entry") as mocked_expand: + mocked_expand.return_value = [ + ExpansionResult( + source_citation_key="seed2024", + discovered_citation_key="openalexw12345", + created_entry=True, + relation_type="cites", + source_label="openalex:cites:WSEED", + ) + ] + database = tmp_path / "library.sqlite3" + exit_code = main( + ["--db", str(database), "expand", "seed2024", "--source", "openalex", "--relation", "cites"] + ) + + assert exit_code == 0 diff --git a/tests/test_extract.py b/tests/test_extract.py index 18b0283..e29987d 100644 --- a/tests/test_extract.py +++ b/tests/test_extract.py @@ -7,6 +7,18 @@ SAMPLE_REFERENCES = """ [2] Miller, Sam. 2023. Semantic search for research corpora. Proceedings of the Retrieval Workshop. """ +APA_AND_BOOK_REFERENCES = """ +Brown, T., & Green, P. (2021). Retrieval methods for scholarly corpora. Journal of Information Retrieval. + +Nguyen, An. Research Design for Literature Mapping. Example University Press, 2020. +""" + +WRAPPED_REFERENCES = """ +[1] Taylor, Ann. 2022. Multi-line reference extraction +for bibliography pipelines. Journal of Parsing Systems. +[2] Chen, Bo. 2021. Another entry. Proceedings of the Mining Workshop. +""" + def test_extract_references_builds_draft_entries(): entries = extract_references(SAMPLE_REFERENCES) @@ -33,3 +45,21 @@ def test_extract_cli_writes_bibtex(tmp_path): parsed = {entry.citation_key: entry for entry in parse_bibtex(exported)} assert parsed["smith2024graphfirst1"].fields["journal"] == "Journal of Research Systems" assert parsed["miller2023semantic2"].fields["booktitle"] == "Proceedings of the Retrieval Workshop" + + +def test_extract_references_supports_apa_and_book_styles(): + entries = extract_references(APA_AND_BOOK_REFERENCES) + + assert [entry.entry_type for entry in entries] == ["article", "book"] + assert entries[0].fields["journal"] == "Journal of Information Retrieval" + assert entries[0].fields["author"] == "Brown, T., and Green, P" + assert entries[1].fields["publisher"] == "Example University Press" + assert entries[1].fields["title"] == "Research Design for Literature Mapping" + + +def test_extract_references_joins_wrapped_reference_lines(): + entries = extract_references(WRAPPED_REFERENCES) + + assert len(entries) == 2 + assert entries[0].fields["title"] == "Multi-line reference extraction for bibliography pipelines" + assert entries[0].fields["journal"] == "Journal of Parsing Systems" diff --git a/tests/test_live_sources.py b/tests/test_live_sources.py new file mode 100644 index 0000000..c0a5ce6 --- /dev/null +++ b/tests/test_live_sources.py @@ -0,0 +1,52 @@ +from __future__ import annotations + +import os + +import pytest + +from citegeist import MetadataResolver, SourceClient + + +pytestmark = pytest.mark.live + + +def _live_client() -> SourceClient: + cache_dir = os.environ.get("CITEGEIST_SOURCE_CACHE", ".cache/citegeist") + return SourceClient( + cache_dir=cache_dir, + fixtures_dir=os.environ.get("CITEGEIST_SOURCE_FIXTURES"), + ) + + +def test_live_crossref_doi_resolution(): + resolver = MetadataResolver(source_client=_live_client()) + + resolution = resolver.resolve_doi("10.1038/nphys1170") + + assert resolution is not None + assert resolution.entry.fields.get("doi") == "10.1038/nphys1170" + assert resolution.entry.fields.get("title") + + +def test_live_arxiv_resolution(): + resolver = MetadataResolver(source_client=_live_client()) + + resolution = resolver.resolve_arxiv("1706.03762") + + assert resolution is not None + assert resolution.entry.fields.get("arxiv") == "1706.03762" + assert resolution.entry.fields.get("title") + + +def test_live_openalex_title_search(): + resolver = MetadataResolver(source_client=_live_client()) + + resolution = resolver.search_openalex_best_match( + title="Attention Is All You Need", + author_text="Ashish Vaswani", + year="2017", + ) + + assert resolution is not None + assert resolution.entry.fields.get("title") + assert resolution.entry.fields.get("openalex") diff --git a/tests/test_openalex_expand.py b/tests/test_openalex_expand.py new file mode 100644 index 0000000..2c46af7 --- /dev/null +++ b/tests/test_openalex_expand.py @@ -0,0 +1,84 @@ +from citegeist.expand import OpenAlexExpander, _openalex_work_to_entry +from citegeist.storage import BibliographyStore + + +def test_openalex_work_to_entry_maps_basic_fields(): + entry = _openalex_work_to_entry( + { + "id": "https://openalex.org/W12345", + "doi": "https://doi.org/10.1000/example-openalex", + "display_name": "OpenAlex Discovered Work", + "publication_year": 2022, + "type": "article", + "authorships": [{"author": {"display_name": "Jane Smith"}}], + "primary_location": {"source": {"display_name": "Journal of Graph Discovery"}}, + "abstract_inverted_index": {"Graph": [0], "discovery": [1]}, + } + ) + + assert entry.citation_key == "openalexw12345" + assert entry.fields["openalex"] == "W12345" + assert entry.fields["doi"] == "10.1000/example-openalex" + assert entry.fields["journal"] == "Journal of Graph Discovery" + assert entry.fields["abstract"] == "Graph discovery" + + +def test_openalex_expander_adds_outgoing_and_incoming_edges(): + store = BibliographyStore() + try: + store.ingest_bibtex( + """ +@article{seed2024, + author = {Seed, Alice}, + title = {Seed Paper}, + year = {2024}, + doi = {10.1000/seed-doi} +} +""" + ) + expander = OpenAlexExpander() + payloads = iter( + [ + { + "results": [ + { + "id": "https://openalex.org/WSEED", + } + ] + }, + { + "results": [ + { + "id": "https://openalex.org/WDISCOVERED", + "display_name": "Referenced OpenAlex Work", + "publication_year": 2021, + "type": "article", + "authorships": [{"author": {"display_name": "Bob Known"}}], + "primary_location": {"source": {"display_name": "OpenAlex Journal"}}, + } + ] + }, + { + "results": [ + { + "id": "https://openalex.org/WCITING", + "display_name": "Citing OpenAlex Work", + "publication_year": 2025, + "type": "article", + "authorships": [{"author": {"display_name": "Carol Citing"}}], + } + ] + }, + ] + ) + expander.resolver.source_client.get_json = lambda _url: next(payloads) # type: ignore[method-assign] + + outgoing = expander.expand_entry(store, "seed2024", relation_type="cites", limit=5) + incoming = expander.expand_entry(store, "seed2024", relation_type="cited_by", limit=5) + + assert outgoing[0].discovered_citation_key == "openalexwdiscovered" + assert incoming[0].source_citation_key == "openalexwciting" + assert "openalexwdiscovered" in store.get_relations("seed2024", "cites") + assert "seed2024" in store.get_relations("openalexwciting", "cites") + finally: + store.close() diff --git a/tests/test_resolve.py b/tests/test_resolve.py index f66220d..f4c22b0 100644 --- a/tests/test_resolve.py +++ b/tests/test_resolve.py @@ -1,7 +1,13 @@ from xml.etree import ElementTree as ET from citegeist.bibtex import BibEntry -from citegeist.resolve import MetadataResolver, _arxiv_atom_entry_to_bib, _crossref_message_to_entry, merge_entries +from citegeist.resolve import ( + MetadataResolver, + _arxiv_atom_entry_to_bib, + _crossref_message_to_entry, + _openalex_work_to_entry, + merge_entries, +) def test_crossref_message_to_entry_maps_basic_fields(): @@ -83,3 +89,68 @@ def test_resolver_tries_doi_before_dblp(): ) assert calls == [("doi", "10.1000/example-doi"), ("dblp", "conf/test/Smith24")] + + +def test_openalex_work_to_entry_maps_basic_fields(): + entry = _openalex_work_to_entry( + { + "id": "https://openalex.org/W12345", + "doi": "https://doi.org/10.1000/example-openalex", + "display_name": "OpenAlex Resolved Work", + "publication_year": 2022, + "type": "article", + "authorships": [{"author": {"display_name": "Jane Smith"}}], + "primary_location": {"source": {"display_name": "Journal of Open Graphs"}}, + "abstract_inverted_index": {"OpenAlex": [0], "resolved": [1]}, + } + ) + + assert entry.citation_key == "openalexw12345" + assert entry.fields["openalex"] == "W12345" + assert entry.fields["doi"] == "10.1000/example-openalex" + assert entry.fields["journal"] == "Journal of Open Graphs" + assert entry.fields["abstract"] == "OpenAlex resolved" + + +def test_resolver_can_resolve_openalex_id(): + resolver = MetadataResolver() + resolver.source_client.get_json = lambda _url: { # type: ignore[method-assign] + "id": "https://openalex.org/W12345", + "display_name": "OpenAlex Resolved Work", + "publication_year": 2022, + "type": "article", + "authorships": [{"author": {"display_name": "Jane Smith"}}], + } + + resolution = resolver.resolve_openalex("W12345") + + assert resolution is not None + assert resolution.source_label == "openalex:id:W12345" + assert resolution.entry.fields["openalex"] == "W12345" + + +def test_resolver_falls_back_to_openalex_title_search(): + resolver = MetadataResolver() + resolver.search_openalex = lambda title, limit=5: [ # type: ignore[method-assign] + _openalex_work_to_entry( + { + "id": "https://openalex.org/W12345", + "display_name": title, + "publication_year": 2022, + "type": "article", + "authorships": [{"author": {"display_name": "Jane Smith"}}], + } + ) + ] + + resolution = resolver.resolve_entry( + BibEntry( + entry_type="article", + citation_key="smith2022openalex", + fields={"title": "OpenAlex Resolved Work", "author": "Jane Smith", "year": "2022"}, + ) + ) + + assert resolution is not None + assert resolution.source_label == "openalex:search:OpenAlex Resolved Work" + assert resolution.entry.fields["openalex"] == "W12345"