Add live source test and smoke scaffolding
This commit is contained in:
parent
d4d31b371f
commit
cabe12719b
22
README.md
22
README.md
|
|
@ -47,10 +47,10 @@ The initial repo includes:
|
|||
- a SQLite-backed bibliography store;
|
||||
- a small CLI for ingest, search, inspection, and export;
|
||||
- review-state tracking on entries and per-field ingest provenance;
|
||||
- first-pass plaintext reference extraction into draft BibTeX;
|
||||
- identifier-first metadata resolution for DOI, DBLP, and arXiv-backed entries;
|
||||
- plaintext reference extraction into draft BibTeX for numbered, APA-like, wrapped-line, and simple book-style references;
|
||||
- identifier-first metadata resolution for DOI, OpenAlex, DBLP, and arXiv-backed entries, with OpenAlex title-search fallback;
|
||||
- local citation-graph traversal over stored `cites`, `cited_by`, and `crossref` edges;
|
||||
- Crossref-backed graph expansion that materializes draft referenced works and edge provenance;
|
||||
- Crossref- and OpenAlex-backed graph expansion that materializes draft related works and edge provenance;
|
||||
- a dedicated source-client layer with fixture/cache support for live-source development;
|
||||
- normalized tables for entries, creators, identifiers, and citation relations;
|
||||
- full-text-search-ready indexing over title, abstract, and fulltext when SQLite FTS5 is available;
|
||||
|
|
@ -119,15 +119,27 @@ PYTHONPATH=src .venv/bin/python -m citegeist extract references.txt --output dra
|
|||
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 resolve smith2024graphs
|
||||
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 graph smith2024graphs --relation cites --depth 2 --missing-only
|
||||
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 expand smith2024graphs --source crossref
|
||||
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 expand smith2024graphs --source openalex --relation cited_by --limit 10
|
||||
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 export --output reviewed.bib
|
||||
```
|
||||
|
||||
For live-source development, prefer fixture-backed or cache-backed source clients so resolver and expansion work can be exercised repeatedly without re-hitting upstream APIs on every run.
|
||||
|
||||
Live-source workflow:
|
||||
|
||||
```bash
|
||||
cd citegeist
|
||||
export CITEGEIST_SOURCE_CACHE=.cache/citegeist
|
||||
export CITEGEIST_LIVE_TESTS=1
|
||||
PYTHONPATH=src .venv/bin/python -m pytest -m live -q
|
||||
PYTHONPATH=src .venv/bin/python scripts/live_smoke.py
|
||||
```
|
||||
|
||||
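By default, live tests are skipped; they run only when `CITEGEIST_LIVE_TESTS=1` is set. Optionally set `CITEGEIST_SOURCE_FIXTURES` to a fixture directory that the source client reads before making live network calls.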
|
||||
|
||||
## Near-Term Priorities
|
||||
|
||||
- stronger plaintext extraction coverage for more citation styles;
|
||||
- richer graph expansion from additional external citation sources.
|
||||
- additional resolvers and expansion paths for non-DOI scholarly ecosystems.
|
||||
|
||||
See [ROADMAP.md](./ROADMAP.md) for the prioritized phase plan and rationale.
|
||||
|
||||
|
|
|
|||
|
|
@ -15,3 +15,6 @@ citegeist = "citegeist.cli:main"
|
|||
[tool.pytest.ini_options]
|
||||
pythonpath = ["src"]
|
||||
testpaths = ["tests"]
|
||||
markers = [
|
||||
"live: tests that call live external scholarly APIs and are skipped unless explicitly enabled",
|
||||
]
|
||||
|
|
|
|||
|
|
@ -0,0 +1,58 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
|
||||
from citegeist import MetadataResolver, SourceClient
|
||||
|
||||
|
||||
def build_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the live smoke script.

    Both options take their defaults from the environment:

    * ``--cache-dir`` falls back to ``CITEGEIST_SOURCE_CACHE`` and then to
      ``.cache/citegeist``.
    * ``--fixtures-dir`` falls back to ``CITEGEIST_SOURCE_FIXTURES`` and is
      ``None`` when that variable is unset.
    """
    # Environment lookups happen at parser-construction time, not parse time.
    cache_default = os.environ.get("CITEGEIST_SOURCE_CACHE", ".cache/citegeist")
    fixtures_default = os.environ.get("CITEGEIST_SOURCE_FIXTURES")

    smoke_parser = argparse.ArgumentParser(
        description="Run live smoke checks against scholarly metadata sources"
    )
    smoke_parser.add_argument(
        "--cache-dir",
        default=cache_default,
        help="Directory for cached live-source responses",
    )
    smoke_parser.add_argument(
        "--fixtures-dir",
        default=fixtures_default,
        help="Optional fixture directory to read before live network calls",
    )
    return smoke_parser
|
||||
|
||||
|
||||
def main() -> int:
    """Run every live smoke check and print a JSON summary to stdout.

    Each check runs independently: an exception raised by one source is
    recorded under that check's name as ``{"error": "<Type>: <message>"}``
    and the remaining checks still run. A check that returns no resolution
    is reported as ``null``.

    Returns:
        ``0`` when every check produced a resolution, ``1`` when any check
        raised or returned ``None``, so the script can gate automation.
    """
    args = build_parser().parse_args()
    client = SourceClient(cache_dir=args.cache_dir, fixtures_dir=args.fixtures_dir)
    resolver = MetadataResolver(source_client=client)

    # Deferred callables: building the dict must not trigger any network
    # call, otherwise the first failing source would abort the whole run
    # before anything is printed.
    checks = {
        "crossref_doi": lambda: resolver.resolve_doi("10.1038/nphys1170"),
        "arxiv_id": lambda: resolver.resolve_arxiv("1706.03762"),
        "openalex_search": lambda: resolver.search_openalex_best_match(
            title="Attention Is All You Need",
            author_text="Ashish Vaswani",
            year="2017",
        ),
    }
    reported_fields = ("title", "year", "doi", "openalex", "arxiv")

    payload = {}
    failed = False
    for name, run_check in checks.items():
        try:
            resolution = run_check()
        except Exception as exc:  # script boundary: report the failure, keep going
            payload[name] = {"error": f"{type(exc).__name__}: {exc}"}
            failed = True
            continue

        if resolution is None:
            payload[name] = None
            failed = True
            continue

        fields = resolution.entry.fields
        payload[name] = {
            "source_label": resolution.source_label,
            **{key: fields.get(key) for key in reported_fields},
        }

    print(json.dumps(payload, indent=2, sort_keys=True))
    return 1 if failed else 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
raise SystemExit(main())
|
||||
|
|
@ -0,0 +1,15 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
def pytest_collection_modifyitems(config: pytest.Config, items: list[pytest.Item]) -> None:
    """Skip every collected test carrying the ``live`` keyword unless opted in.

    Live tests are left untouched only when the ``CITEGEIST_LIVE_TESTS``
    environment variable is exactly ``"1"``; otherwise each one receives a
    ``skip`` marker explaining how to enable it.
    """
    live_enabled = os.environ.get("CITEGEIST_LIVE_TESTS") == "1"
    if not live_enabled:
        skip_marker = pytest.mark.skip(reason="set CITEGEIST_LIVE_TESTS=1 to run live-source tests")
        for collected in items:
            if "live" not in collected.keywords:
                continue
            collected.add_marker(skip_marker)
|
||||
|
|
@ -203,3 +203,39 @@ def test_cli_expand_with_mocked_crossref(tmp_path: Path):
|
|||
exit_code = main(["--db", str(database), "expand", "seed2024"])
|
||||
|
||||
assert exit_code == 0
|
||||
|
||||
|
||||
def test_cli_expand_with_mocked_openalex(tmp_path: Path):
    """``expand --source openalex`` exits with 0 when the OpenAlex expander is mocked."""
    from citegeist.expand import ExpansionResult

    # Seed the library with one entry through the real CLI ingest path.
    seed_file = tmp_path / "expand-openalex.bib"
    seed_file.write_text(
        """
        @article{seed2024,
          author = {Seed, Alice},
          title = {Seed Paper},
          year = {2024},
          doi = {10.1000/seed-doi}
        }
        """,
        encoding="utf-8",
    )
    ingest_result = run_cli(tmp_path, "ingest", str(seed_file))
    assert ingest_result.returncode == 0

    canned_result = ExpansionResult(
        source_citation_key="seed2024",
        discovered_citation_key="openalexw12345",
        created_entry=True,
        relation_type="cites",
        source_label="openalex:cites:WSEED",
    )
    cli_args = [
        "--db",
        str(tmp_path / "library.sqlite3"),
        "expand",
        "seed2024",
        "--source",
        "openalex",
        "--relation",
        "cites",
    ]

    # Replace the expander method so no request reaches OpenAlex.
    with patch("citegeist.cli.OpenAlexExpander.expand_entry", return_value=[canned_result]):
        exit_code = main(cli_args)

    assert exit_code == 0
|
||||
|
|
|
|||
|
|
@ -7,6 +7,18 @@ SAMPLE_REFERENCES = """
|
|||
[2] Miller, Sam. 2023. Semantic search for research corpora. Proceedings of the Retrieval Workshop.
|
||||
"""
|
||||
|
||||
APA_AND_BOOK_REFERENCES = """
|
||||
Brown, T., & Green, P. (2021). Retrieval methods for scholarly corpora. Journal of Information Retrieval.
|
||||
|
||||
Nguyen, An. Research Design for Literature Mapping. Example University Press, 2020.
|
||||
"""
|
||||
|
||||
WRAPPED_REFERENCES = """
|
||||
[1] Taylor, Ann. 2022. Multi-line reference extraction
|
||||
for bibliography pipelines. Journal of Parsing Systems.
|
||||
[2] Chen, Bo. 2021. Another entry. Proceedings of the Mining Workshop.
|
||||
"""
|
||||
|
||||
|
||||
def test_extract_references_builds_draft_entries():
|
||||
entries = extract_references(SAMPLE_REFERENCES)
|
||||
|
|
@ -33,3 +45,21 @@ def test_extract_cli_writes_bibtex(tmp_path):
|
|||
parsed = {entry.citation_key: entry for entry in parse_bibtex(exported)}
|
||||
assert parsed["smith2024graphfirst1"].fields["journal"] == "Journal of Research Systems"
|
||||
assert parsed["miller2023semantic2"].fields["booktitle"] == "Proceedings of the Retrieval Workshop"
|
||||
|
||||
|
||||
def test_extract_references_supports_apa_and_book_styles():
    """An APA-like reference and a simple book reference become an article and a book draft."""
    drafts = extract_references(APA_AND_BOOK_REFERENCES)

    entry_types = [draft.entry_type for draft in drafts]
    assert entry_types == ["article", "book"]

    article_fields = drafts[0].fields
    assert article_fields["journal"] == "Journal of Information Retrieval"
    # NOTE(review): the expected author string pins the extractor's current
    # normalization ("&" -> "and", no period after the final initial) —
    # confirm this is the intended output format.
    assert article_fields["author"] == "Brown, T., and Green, P"

    book_fields = drafts[1].fields
    assert book_fields["publisher"] == "Example University Press"
    assert book_fields["title"] == "Research Design for Literature Mapping"
|
||||
|
||||
|
||||
def test_extract_references_joins_wrapped_reference_lines():
    """A reference wrapped across two lines is extracted as a single entry."""
    drafts = extract_references(WRAPPED_REFERENCES)

    assert len(drafts) == 2

    # The title of the first reference spans the line break in the input.
    wrapped_fields = drafts[0].fields
    assert wrapped_fields["title"] == "Multi-line reference extraction for bibliography pipelines"
    assert wrapped_fields["journal"] == "Journal of Parsing Systems"
|
||||
|
|
|
|||
|
|
@ -0,0 +1,52 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
|
||||
import pytest
|
||||
|
||||
from citegeist import MetadataResolver, SourceClient
|
||||
|
||||
|
||||
# Apply the "live" marker to every test in this module.
pytestmark = pytest.mark.live
|
||||
|
||||
|
||||
def _live_client() -> SourceClient:
    """Build a ``SourceClient`` configured from the environment.

    The cache directory comes from ``CITEGEIST_SOURCE_CACHE`` (default
    ``.cache/citegeist``); the fixtures directory comes from
    ``CITEGEIST_SOURCE_FIXTURES`` and is ``None`` when unset.
    """
    env = os.environ
    return SourceClient(
        cache_dir=env.get("CITEGEIST_SOURCE_CACHE", ".cache/citegeist"),
        fixtures_dir=env.get("CITEGEIST_SOURCE_FIXTURES"),
    )
|
||||
|
||||
|
||||
def test_live_crossref_doi_resolution():
    """Resolving a known DOI yields an entry carrying that DOI and a non-empty title."""
    known_doi = "10.1038/nphys1170"

    resolution = MetadataResolver(source_client=_live_client()).resolve_doi(known_doi)

    assert resolution is not None
    resolved_fields = resolution.entry.fields
    assert resolved_fields.get("doi") == known_doi
    assert resolved_fields.get("title")
|
||||
|
||||
|
||||
def test_live_arxiv_resolution():
    """Resolving a known arXiv identifier yields an entry with that id and a non-empty title."""
    known_arxiv_id = "1706.03762"

    resolution = MetadataResolver(source_client=_live_client()).resolve_arxiv(known_arxiv_id)

    assert resolution is not None
    resolved_fields = resolution.entry.fields
    assert resolved_fields.get("arxiv") == known_arxiv_id
    assert resolved_fields.get("title")
|
||||
|
||||
|
||||
def test_live_openalex_title_search():
    """A title/author/year search returns a match with a title and an OpenAlex id."""
    query = {
        "title": "Attention Is All You Need",
        "author_text": "Ashish Vaswani",
        "year": "2017",
    }

    resolution = MetadataResolver(source_client=_live_client()).search_openalex_best_match(**query)

    assert resolution is not None
    resolved_fields = resolution.entry.fields
    assert resolved_fields.get("title")
    assert resolved_fields.get("openalex")
|
||||
|
|
@ -0,0 +1,84 @@
|
|||
from citegeist.expand import OpenAlexExpander, _openalex_work_to_entry
|
||||
from citegeist.storage import BibliographyStore
|
||||
|
||||
|
||||
def test_openalex_work_to_entry_maps_basic_fields():
    """An OpenAlex work payload maps onto citation key, identifiers, journal and abstract."""
    work = {
        "id": "https://openalex.org/W12345",
        "doi": "https://doi.org/10.1000/example-openalex",
        "display_name": "OpenAlex Discovered Work",
        "publication_year": 2022,
        "type": "article",
        "authorships": [{"author": {"display_name": "Jane Smith"}}],
        "primary_location": {"source": {"display_name": "Journal of Graph Discovery"}},
        "abstract_inverted_index": {"Graph": [0], "discovery": [1]},
    }

    entry = _openalex_work_to_entry(work)

    assert entry.citation_key == "openalexw12345"
    mapped = entry.fields
    assert mapped["openalex"] == "W12345"
    assert mapped["doi"] == "10.1000/example-openalex"
    assert mapped["journal"] == "Journal of Graph Discovery"
    assert mapped["abstract"] == "Graph discovery"
|
||||
|
||||
|
||||
def test_openalex_expander_adds_outgoing_and_incoming_edges():
    """Expanding a seed via OpenAlex stores ``cites`` edges for both directions.

    The source client's ``get_json`` is replaced by a stub that hands out
    three canned payloads, one per call, in the order listed below.
    """
    seed_payload = {"results": [{"id": "https://openalex.org/WSEED"}]}
    referenced_payload = {
        "results": [
            {
                "id": "https://openalex.org/WDISCOVERED",
                "display_name": "Referenced OpenAlex Work",
                "publication_year": 2021,
                "type": "article",
                "authorships": [{"author": {"display_name": "Bob Known"}}],
                "primary_location": {"source": {"display_name": "OpenAlex Journal"}},
            }
        ]
    }
    citing_payload = {
        "results": [
            {
                "id": "https://openalex.org/WCITING",
                "display_name": "Citing OpenAlex Work",
                "publication_year": 2025,
                "type": "article",
                "authorships": [{"author": {"display_name": "Carol Citing"}}],
            }
        ]
    }
    # Presumably consumed as: seed lookup, then references, then citing works —
    # the order is fixed by the expander's request sequence; verify there.
    canned = iter([seed_payload, referenced_payload, citing_payload])

    store = BibliographyStore()
    try:
        store.ingest_bibtex(
            """
            @article{seed2024,
              author = {Seed, Alice},
              title = {Seed Paper},
              year = {2024},
              doi = {10.1000/seed-doi}
            }
            """
        )
        expander = OpenAlexExpander()
        expander.resolver.source_client.get_json = lambda _url: next(canned)  # type: ignore[method-assign]

        cites_results = expander.expand_entry(store, "seed2024", relation_type="cites", limit=5)
        cited_by_results = expander.expand_entry(store, "seed2024", relation_type="cited_by", limit=5)

        assert cites_results[0].discovered_citation_key == "openalexwdiscovered"
        assert cited_by_results[0].source_citation_key == "openalexwciting"
        assert "openalexwdiscovered" in store.get_relations("seed2024", "cites")
        assert "seed2024" in store.get_relations("openalexwciting", "cites")
    finally:
        store.close()
|
||||
|
|
@ -1,7 +1,13 @@
|
|||
from xml.etree import ElementTree as ET
|
||||
|
||||
from citegeist.bibtex import BibEntry
|
||||
from citegeist.resolve import MetadataResolver, _arxiv_atom_entry_to_bib, _crossref_message_to_entry, merge_entries
|
||||
from citegeist.resolve import (
|
||||
MetadataResolver,
|
||||
_arxiv_atom_entry_to_bib,
|
||||
_crossref_message_to_entry,
|
||||
_openalex_work_to_entry,
|
||||
merge_entries,
|
||||
)
|
||||
|
||||
|
||||
def test_crossref_message_to_entry_maps_basic_fields():
|
||||
|
|
@ -83,3 +89,68 @@ def test_resolver_tries_doi_before_dblp():
|
|||
)
|
||||
|
||||
assert calls == [("doi", "10.1000/example-doi"), ("dblp", "conf/test/Smith24")]
|
||||
|
||||
|
||||
def test_openalex_work_to_entry_maps_basic_fields():
    """An OpenAlex work payload maps onto citation key, identifiers, journal and abstract."""
    work = {
        "id": "https://openalex.org/W12345",
        "doi": "https://doi.org/10.1000/example-openalex",
        "display_name": "OpenAlex Resolved Work",
        "publication_year": 2022,
        "type": "article",
        "authorships": [{"author": {"display_name": "Jane Smith"}}],
        "primary_location": {"source": {"display_name": "Journal of Open Graphs"}},
        "abstract_inverted_index": {"OpenAlex": [0], "resolved": [1]},
    }

    entry = _openalex_work_to_entry(work)

    assert entry.citation_key == "openalexw12345"
    mapped = entry.fields
    assert mapped["openalex"] == "W12345"
    assert mapped["doi"] == "10.1000/example-openalex"
    assert mapped["journal"] == "Journal of Open Graphs"
    assert mapped["abstract"] == "OpenAlex resolved"
|
||||
|
||||
|
||||
def test_resolver_can_resolve_openalex_id():
    """``resolve_openalex`` labels the result by id and keeps the OpenAlex identifier."""
    canned_work = {
        "id": "https://openalex.org/W12345",
        "display_name": "OpenAlex Resolved Work",
        "publication_year": 2022,
        "type": "article",
        "authorships": [{"author": {"display_name": "Jane Smith"}}],
    }
    resolver = MetadataResolver()
    # Stub the HTTP layer so the resolver sees the canned work for any URL.
    resolver.source_client.get_json = lambda _url: canned_work  # type: ignore[method-assign]

    resolution = resolver.resolve_openalex("W12345")

    assert resolution is not None
    assert resolution.source_label == "openalex:id:W12345"
    assert resolution.entry.fields["openalex"] == "W12345"
|
||||
|
||||
|
||||
def test_resolver_falls_back_to_openalex_title_search():
    """An entry with no identifiers is resolved through the OpenAlex title search."""

    def fake_search(title, limit=5):
        # Echo the queried title back as the single search hit.
        work = {
            "id": "https://openalex.org/W12345",
            "display_name": title,
            "publication_year": 2022,
            "type": "article",
            "authorships": [{"author": {"display_name": "Jane Smith"}}],
        }
        return [_openalex_work_to_entry(work)]

    resolver = MetadataResolver()
    resolver.search_openalex = fake_search  # type: ignore[method-assign]

    draft = BibEntry(
        entry_type="article",
        citation_key="smith2022openalex",
        fields={"title": "OpenAlex Resolved Work", "author": "Jane Smith", "year": "2022"},
    )
    resolution = resolver.resolve_entry(draft)

    assert resolution is not None
    assert resolution.source_label == "openalex:search:OpenAlex Resolved Work"
    assert resolution.entry.fields["openalex"] == "W12345"
|
||||
|
|
|
|||
Loading…
Reference in New Issue