1245 lines
38 KiB
Python
1245 lines
38 KiB
Python
from __future__ import annotations
|
|
|
|
import json
|
|
import subprocess
|
|
import sys
|
|
from pathlib import Path
|
|
from unittest.mock import patch
|
|
|
|
from citegeist.cli import main
|
|
|
|
|
|
SAMPLE_BIB = """
|
|
@article{smith2024graphs,
|
|
author = {Smith, Jane and Doe, Alex},
|
|
title = {Graph-first bibliography augmentation},
|
|
year = {2024},
|
|
abstract = {We study citation graphs for literature discovery.},
|
|
references = {miller2023search}
|
|
}
|
|
|
|
@inproceedings{miller2023search,
|
|
author = {Miller, Sam},
|
|
title = {Semantic search for research corpora},
|
|
year = {2023},
|
|
abstract = {Dense retrieval improves recall for academic search.}
|
|
}
|
|
"""
|
|
|
|
|
|
def run_cli(tmp_path: Path, *args: str) -> subprocess.CompletedProcess[str]:
    """Run ``python -m citegeist`` in a subprocess against a per-test database.

    Args:
        tmp_path: Directory containing (or receiving) ``library.sqlite3``.
        *args: CLI arguments appended after ``--db <database>``.

    Returns:
        The completed process with stdout/stderr captured as text. A non-zero
        exit status does not raise (``check=False``); callers assert on
        ``returncode`` themselves.
    """
    import os

    database = tmp_path / "library.sqlite3"
    # Extend the parent environment rather than replacing it: a child started
    # with only PYTHONPATH loses PATH, SystemRoot, locale settings, etc., which
    # can prevent the interpreter from starting on some platforms.
    env = {**os.environ, "PYTHONPATH": "src"}
    command = [sys.executable, "-m", "citegeist", "--db", str(database), *args]
    return subprocess.run(
        command,
        # "src" above is resolved relative to this working directory.
        cwd=Path(__file__).resolve().parents[1],
        env=env,
        capture_output=True,
        text=True,
        check=False,
    )
|
|
|
|
|
|
def test_cli_ingest_show_search_and_export(tmp_path: Path):
    """Drive ingest, show, search and export over the sample bibliography."""
    source_bib = tmp_path / "input.bib"
    source_bib.write_text(SAMPLE_BIB, encoding="utf-8")

    ingest_proc = run_cli(tmp_path, "ingest", str(source_bib))
    assert ingest_proc.returncode == 0
    assert "smith2024graphs" in ingest_proc.stdout

    # `show` prints JSON for a single entry.
    show_proc = run_cli(tmp_path, "show", "smith2024graphs")
    assert show_proc.returncode == 0
    shown_entry = json.loads(show_proc.stdout)
    assert shown_entry["citation_key"] == "smith2024graphs"

    search_proc = run_cli(tmp_path, "search", "semantic")
    assert search_proc.returncode == 0
    assert "miller2023search" in search_proc.stdout

    # Export writes BibTeX to the requested file.
    target_bib = tmp_path / "exported.bib"
    export_proc = run_cli(tmp_path, "export", "--output", str(target_bib))
    assert export_proc.returncode == 0
    assert "@article{smith2024graphs," in target_bib.read_text(encoding="utf-8")
|
|
|
|
|
|
def test_cli_provenance_and_status_updates(tmp_path: Path):
    """Ingest with a status and source label, read them back, then set a new status."""
    source_bib = tmp_path / "input.bib"
    source_bib.write_text(SAMPLE_BIB, encoding="utf-8")

    ingest_args = [
        "ingest",
        "--status",
        "draft",
        "--source-label",
        "tests/input.bib",
        str(source_bib),
    ]
    ingest_proc = run_cli(tmp_path, *ingest_args)
    assert ingest_proc.returncode == 0

    show_proc = run_cli(tmp_path, "show", "--provenance", "smith2024graphs")
    assert show_proc.returncode == 0
    record = json.loads(show_proc.stdout)
    assert record["review_status"] == "draft"
    first_provenance = record["field_provenance"][0]
    assert first_provenance["source_label"] == "tests/input.bib"

    status_proc = run_cli(tmp_path, "set-status", "smith2024graphs", "reviewed")
    assert status_proc.returncode == 0
    assert "reviewed" in status_proc.stdout
|
|
|
|
|
|
def test_cli_resolve_updates_entry(tmp_path: Path):
    """Run ``resolve`` with a patched resolver and expect a ``title`` conflict first."""
    from citegeist.bibtex import BibEntry
    from citegeist.resolve import Resolution

    bibtex_source = """
@article{smith2024graphs,
author = {Smith, Jane},
title = {Graph-first bibliography augmentation},
year = {2024},
doi = {10.1000/example-doi}
}
"""
    source_bib = tmp_path / "input.bib"
    source_bib.write_text(bibtex_source, encoding="utf-8")

    ingest_proc = run_cli(tmp_path, "ingest", str(source_bib))
    assert ingest_proc.returncode == 0

    # Canned resolver answer: same DOI, different title, plus a journal field.
    resolved_entry = BibEntry(
        entry_type="article",
        citation_key="resolvedkey",
        fields={
            "author": "Smith, Jane",
            "title": "Resolved Graph-first bibliography augmentation",
            "year": "2024",
            "doi": "10.1000/example-doi",
            "journal": "Journal of Graph Studies",
        },
    )
    canned_resolution = Resolution(
        entry=resolved_entry,
        source_type="resolver",
        source_label="crossref:doi:10.1000/example-doi",
    )
    db_path = tmp_path / "library.sqlite3"
    argv = ["--db", str(db_path), "resolve", "smith2024graphs"]

    # `main` runs in-process so the patch applies to the CLI's resolver.
    with patch("citegeist.cli.MetadataResolver.resolve_entry") as mocked_resolve:
        mocked_resolve.return_value = canned_resolution
        exit_code = main(argv)

    assert exit_code == 0
    show_proc = run_cli(tmp_path, "show", "--conflicts", "smith2024graphs")
    assert show_proc.returncode == 0
    record = json.loads(show_proc.stdout)
    assert record["field_conflicts"][0]["field_name"] == "title"
|
|
|
|
|
|
def test_cli_resolve_conflicts_updates_status(tmp_path: Path):
    """Record a ``title`` conflict in the store, then mark it ``accepted`` via the CLI."""
    from citegeist.storage import BibliographyStore

    bibtex_source = """
@article{smith2024graphs,
author = {Smith, Jane},
title = {Graph-first bibliography augmentation},
year = {2024}
}
"""
    source_bib = tmp_path / "input.bib"
    source_bib.write_text(bibtex_source, encoding="utf-8")
    ingest_proc = run_cli(tmp_path, "ingest", str(source_bib))
    assert ingest_proc.returncode == 0

    title_conflict = {
        "field_name": "title",
        "current_value": "Graph-first bibliography augmentation",
        "proposed_value": "Resolved title",
    }
    library = BibliographyStore(tmp_path / "library.sqlite3")
    try:
        library.record_conflicts(
            "smith2024graphs",
            [title_conflict],
            source_type="resolver",
            source_label="openalex:search:Graph-first bibliography augmentation",
        )
    finally:
        # Close the store before the CLI subprocess opens the same database.
        library.close()

    outcome = run_cli(tmp_path, "resolve-conflicts", "smith2024graphs", "title", "accepted")
    assert outcome.returncode == 0
    assert "accepted" in outcome.stdout
|
|
|
|
|
|
def test_cli_apply_conflict_updates_entry_value(tmp_path: Path):
    """Apply a recorded ``title`` conflict and confirm ``show`` reports the proposed value.

    The conflict is written straight into the store; ``apply-conflict`` and
    ``show`` are then run as subprocesses against the same database.
    """
    bib_path = tmp_path / "input.bib"
    bib_path.write_text(
        """
@article{smith2024graphs,
author = {Smith, Jane},
title = {Graph-first bibliography augmentation},
year = {2024}
}
""",
        encoding="utf-8",
    )
    ingest = run_cli(tmp_path, "ingest", str(bib_path))
    assert ingest.returncode == 0

    from citegeist.storage import BibliographyStore

    database = tmp_path / "library.sqlite3"
    store = BibliographyStore(database)
    try:
        store.record_conflicts(
            "smith2024graphs",
            [
                {
                    "field_name": "title",
                    "current_value": "Graph-first bibliography augmentation",
                    "proposed_value": "Resolved Graph-first bibliography augmentation",
                }
            ],
            source_type="resolver",
            source_label="openalex:search:Graph-first bibliography augmentation",
        )
    finally:
        store.close()

    result = run_cli(tmp_path, "apply-conflict", "smith2024graphs", "title")
    assert result.returncode == 0
    assert "applied" in result.stdout

    show = run_cli(tmp_path, "show", "smith2024graphs")
    # Check the exit status first so a failing `show` is reported as such
    # rather than as a JSON decode error on empty stdout.
    assert show.returncode == 0
    payload = json.loads(show.stdout)
    assert payload["title"] == "Resolved Graph-first bibliography augmentation"
|
|
|
|
|
|
def test_cli_discover_oai_outputs_identity_and_sets():
    """``discover-oai`` exits 0 when the three harvester calls are patched out."""
    from citegeist.harvest import OaiMetadataFormat, OaiSet

    identity = {
        "repositoryName": "Example Repository",
        "granularity": "YYYY-MM-DD",
    }
    dublin_core = OaiMetadataFormat(
        metadata_prefix="oai_dc",
        schema="http://www.openarchives.org/OAI/2.0/oai_dc.xsd",
        metadata_namespace="http://www.openarchives.org/OAI/2.0/oai_dc/",
    )
    theses = OaiSet(set_spec="theses", set_name="Theses", set_description="Graduate theses")

    # Patches are entered in the same order as before: identify, sets, formats.
    with patch("citegeist.cli.OaiPmhHarvester.identify") as mocked_identify:
        with patch("citegeist.cli.OaiPmhHarvester.list_sets") as mocked_sets:
            with patch("citegeist.cli.OaiPmhHarvester.list_metadata_formats") as mocked_formats:
                mocked_identify.return_value = identity
                mocked_formats.return_value = [dublin_core]
                mocked_sets.return_value = [theses]
                exit_code = main(["discover-oai", "https://example.edu/oai"])

    assert exit_code == 0
|
|
|
|
|
|
def test_cli_bootstrap_preview_mode(tmp_path):
    """``--preview`` and ``--topic-commit-limit`` arrive at ``bootstrap`` as keyword arguments."""
    db_path = tmp_path / "library.sqlite3"
    argv = [
        "--db",
        str(db_path),
        "bootstrap",
        "--topic",
        "graph topic",
        "--preview",
        "--topic-commit-limit",
        "2",
    ]

    with patch("citegeist.cli.Bootstrapper.bootstrap") as mocked_bootstrap:
        mocked_bootstrap.return_value = []
        exit_code = main(argv)

    assert exit_code == 0
    # call_args is an (args, kwargs) pair; only the keywords are inspected.
    _positional, call_kwargs = mocked_bootstrap.call_args
    assert call_kwargs["preview_only"] is True
    assert call_kwargs["topic_commit_limit"] == 2
|
|
|
|
|
|
def test_cli_bootstrap_accepts_stored_topic_metadata(tmp_path):
    """Topic slug, name and stored phrase options are forwarded to ``bootstrap``."""
    db_path = tmp_path / "library.sqlite3"
    argv = [
        "--db",
        str(db_path),
        "bootstrap",
        "--topic",
        "graph topic",
        "--topic-slug",
        "graph-methods",
        "--topic-name",
        "Graph Methods",
        "--store-topic-phrase",
        "graph networks biology",
    ]

    with patch("citegeist.cli.Bootstrapper.bootstrap") as mocked_bootstrap:
        mocked_bootstrap.return_value = []
        exit_code = main(argv)

    assert exit_code == 0
    _positional, call_kwargs = mocked_bootstrap.call_args
    assert call_kwargs["topic_slug"] == "graph-methods"
    assert call_kwargs["topic_name"] == "Graph Methods"
    # --store-topic-phrase is expected under the `topic_phrase` keyword.
    assert call_kwargs["topic_phrase"] == "graph networks biology"
|
|
|
|
|
|
def test_cli_scrape_talkorigins_accepts_output_dir(tmp_path: Path):
    """``scrape-talkorigins`` accepts an output directory plus limit/no-resume/no-expand flags.

    ``TalkOriginsScraper.scrape_to_directory`` is patched to return a canned
    batch export, so only argument handling and the exit code are exercised.
    """
    # A regular import replaces the former `__import__("citegeist")` lookup;
    # `patch` comes from the module-level import.
    from citegeist import TalkOriginsBatchExport

    database = tmp_path / "library.sqlite3"
    canned_export = TalkOriginsBatchExport(
        base_url="https://www.talkorigins.org/origins/biblio/",
        output_dir=str(tmp_path),
        topic_count=1,
        entry_count=2,
        jobs_path=str(tmp_path / "jobs.json"),
        manifest_path=str(tmp_path / "manifest.json"),
        seed_sets=[],
    )
    with patch("citegeist.cli.TalkOriginsScraper.scrape_to_directory") as mocked_scrape:
        mocked_scrape.return_value = canned_export
        exit_code = main(
            [
                "--db",
                str(database),
                "scrape-talkorigins",
                str(tmp_path / "talkorigins-out"),
                "--limit-topics",
                "3",
                "--limit-entries-per-topic",
                "10",
                "--no-resume",
                "--no-expand",
            ]
        )

    assert exit_code == 0
|
|
|
|
|
|
def test_cli_validate_talkorigins_accepts_manifest(tmp_path: Path):
    """``validate-talkorigins <manifest>`` exits 0 when validation is patched out.

    ``TalkOriginsScraper.validate_export`` returns a canned report, so the
    manifest file only needs to exist (it holds ``{}``).
    """
    # A regular import replaces the former `__import__("citegeist")` lookup.
    from citegeist import TalkOriginsValidationReport

    manifest = tmp_path / "talkorigins_manifest.json"
    manifest.write_text("{}", encoding="utf-8")
    canned_report = TalkOriginsValidationReport(
        manifest_path=str(manifest),
        topic_count=1,
        entry_count=2,
        parsed_ratio=1.0,
        missing_author_count=0,
        missing_title_count=0,
        missing_year_count=0,
        suspicious_entry_type_count=0,
        suspicious_examples=[],
        duplicate_cluster_count=0,
        duplicate_entry_count=0,
        duplicate_examples=[],
    )
    with patch("citegeist.cli.TalkOriginsScraper.validate_export") as mocked_validate:
        mocked_validate.return_value = canned_report
        exit_code = main(["validate-talkorigins", str(manifest)])

    assert exit_code == 0
|
|
|
|
|
|
def test_cli_suggest_talkorigins_phrases_writes_output(tmp_path: Path):
    """``suggest-talkorigins-phrases --output`` writes the suggestions as JSON.

    ``TalkOriginsScraper.suggest_topic_phrases`` is patched to return one
    suggestion; the test then reads the output file and checks its slug.
    """
    # A regular import replaces the former `__import__(..., fromlist=...)` lookup.
    from citegeist import TalkOriginsTopicPhraseSuggestion

    manifest = tmp_path / "talkorigins_manifest.json"
    manifest.write_text("{}", encoding="utf-8")
    output = tmp_path / "phrases.json"
    canned_suggestion = TalkOriginsTopicPhraseSuggestion(
        slug="abiogenesis",
        topic="Abiogenesis",
        entry_count=2,
        suggested_phrase="Abiogenesis prebiotic chemistry ribozyme",
        keywords=["prebiotic", "chemistry", "ribozyme"],
        review_required=True,
        review_reasons=["small_topic"],
    )
    with patch("citegeist.cli.TalkOriginsScraper.suggest_topic_phrases") as mocked_suggest:
        mocked_suggest.return_value = [canned_suggestion]
        exit_code = main(
            [
                "suggest-talkorigins-phrases",
                str(manifest),
                "--topic",
                "abiogenesis",
                "--output",
                str(output),
            ]
        )

    assert exit_code == 0
    payload = json.loads(output.read_text(encoding="utf-8"))
    assert payload[0]["slug"] == "abiogenesis"
|
|
|
|
|
|
def test_cli_duplicates_talkorigins_accepts_manifest(tmp_path: Path):
    """``duplicates-talkorigins`` accepts a manifest with topic/match/preview/weak-only options.

    ``TalkOriginsScraper.inspect_duplicate_clusters`` is patched to return a
    single canned cluster; only the exit code is asserted.
    """
    # A regular import replaces the former `__import__(..., fromlist=...)` lookup.
    from citegeist.talkorigins import TalkOriginsDuplicateCluster

    manifest = tmp_path / "talkorigins_manifest.json"
    manifest.write_text("{}", encoding="utf-8")
    canned_cluster = TalkOriginsDuplicateCluster(
        key="smith|1999|duplicate paper",
        count=2,
        items=[
            {
                "citation_key": "dup1",
                "title": "Duplicate Paper",
                "author": "Smith, Jane",
                "year": "1999",
                "seed_bib": "a.bib",
                "topic": "Abiogenesis",
                "topic_slug": "abiogenesis",
            }
        ],
        canonical={
            "citation_key": "dup1",
            "entry_type": "article",
            "field_count": 3,
            "fields": {"title": "Duplicate Paper"},
            "weak_reasons": [],
        },
    )
    with patch("citegeist.cli.TalkOriginsScraper.inspect_duplicate_clusters") as mocked_duplicates:
        mocked_duplicates.return_value = [canned_cluster]
        exit_code = main(
            [
                "duplicates-talkorigins",
                str(manifest),
                "--topic",
                "abiogenesis",
                "--match",
                "duplicate",
                "--preview",
                "--weak-only",
            ]
        )

    assert exit_code == 0
|
|
|
|
|
|
def test_cli_ingest_talkorigins_accepts_manifest(tmp_path: Path):
    """``ingest-talkorigins <manifest>`` exits 0 when the ingest itself is patched out.

    ``TalkOriginsScraper.ingest_export`` returns a canned ingest report.
    """
    # A regular import replaces the former `__import__("citegeist")` lookup.
    from citegeist import TalkOriginsIngestReport

    database = tmp_path / "library.sqlite3"
    manifest = tmp_path / "talkorigins_manifest.json"
    manifest.write_text("{}", encoding="utf-8")
    canned_report = TalkOriginsIngestReport(
        manifest_path=str(manifest),
        topic_count=1,
        raw_entry_count=2,
        stored_entry_count=1,
        duplicate_cluster_count=1,
        duplicate_entry_count=2,
        canonicalized_count=1,
    )
    with patch("citegeist.cli.TalkOriginsScraper.ingest_export") as mocked_ingest:
        mocked_ingest.return_value = canned_report
        exit_code = main(["--db", str(database), "ingest-talkorigins", str(manifest)])

    assert exit_code == 0
|
|
|
|
|
|
def test_cli_enrich_talkorigins_accepts_manifest(tmp_path: Path):
    """``enrich-talkorigins`` accepts ``--limit``, ``--apply`` and ``--allow-unsafe-search-matches``.

    ``TalkOriginsScraper.enrich_weak_canonicals`` is patched to return one
    canned result; only the exit code is asserted.
    """
    # A regular import replaces the former `__import__(..., fromlist=...)` lookup.
    from citegeist.talkorigins import TalkOriginsEnrichmentResult

    database = tmp_path / "library.sqlite3"
    manifest = tmp_path / "talkorigins_manifest.json"
    manifest.write_text("{}", encoding="utf-8")
    canned_result = TalkOriginsEnrichmentResult(
        key="smith|1999|duplicate paper",
        citation_key="dup1",
        weak_reasons_before=["missing:doi"],
        resolved=True,
        applied=False,
        source_label="crossref:search:Duplicate Paper",
        weak_reasons_after=[],
        conflicts=[],
        error="",
    )
    with patch("citegeist.cli.TalkOriginsScraper.enrich_weak_canonicals") as mocked_enrich:
        mocked_enrich.return_value = [canned_result]
        exit_code = main(
            [
                "--db",
                str(database),
                "enrich-talkorigins",
                str(manifest),
                "--limit",
                "5",
                "--apply",
                "--allow-unsafe-search-matches",
            ]
        )

    assert exit_code == 0
|
|
|
|
|
|
def test_cli_review_talkorigins_writes_output(tmp_path: Path):
    """``review-talkorigins --output`` creates the requested output file.

    ``TalkOriginsScraper.build_review_export`` is patched to return a canned
    export with one item.
    """
    # A regular import replaces the former `__import__(..., fromlist=...)` lookup.
    from citegeist.talkorigins import TalkOriginsReviewExport

    database = tmp_path / "library.sqlite3"
    manifest = tmp_path / "talkorigins_manifest.json"
    manifest.write_text("{}", encoding="utf-8")
    output = tmp_path / "review.json"
    canned_export = TalkOriginsReviewExport(
        manifest_path=str(manifest),
        item_count=1,
        items=[{"key": "smith|1999|duplicate paper", "canonical": {}, "enrichment": {}}],
    )
    with patch("citegeist.cli.TalkOriginsScraper.build_review_export") as mocked_review:
        mocked_review.return_value = canned_export
        exit_code = main(
            [
                "--db",
                str(database),
                "review-talkorigins",
                str(manifest),
                "--output",
                str(output),
            ]
        )

    assert exit_code == 0
    assert output.exists()
|
|
|
|
|
|
def test_cli_apply_talkorigins_corrections_accepts_files(tmp_path: Path):
    """``apply-talkorigins-corrections <manifest> <corrections>`` exits 0.

    ``TalkOriginsScraper.apply_review_corrections`` is patched to return one
    canned result, so both input files only need to exist.
    """
    # A regular import replaces the former `__import__(..., fromlist=...)` lookup.
    from citegeist.talkorigins import TalkOriginsCorrectionResult

    database = tmp_path / "library.sqlite3"
    manifest = tmp_path / "talkorigins_manifest.json"
    corrections = tmp_path / "corrections.json"
    manifest.write_text("{}", encoding="utf-8")
    corrections.write_text('{"corrections": []}', encoding="utf-8")
    canned_result = TalkOriginsCorrectionResult(
        key="smith|1999|duplicate paper",
        citation_key="dup1",
        applied=True,
        error="",
    )
    with patch("citegeist.cli.TalkOriginsScraper.apply_review_corrections") as mocked_apply:
        mocked_apply.return_value = [canned_result]
        exit_code = main(
            [
                "--db",
                str(database),
                "apply-talkorigins-corrections",
                str(manifest),
                str(corrections),
            ]
        )

    assert exit_code == 0
|
|
|
|
|
|
def test_cli_topics_and_topic_entries(tmp_path: Path):
    """A topic attached through the store is listed by ``topics`` and ``topic-entries``."""
    from citegeist.storage import BibliographyStore

    bibtex_source = """
@article{seed2024,
author = {Seed, Alice},
title = {Seed Paper},
year = {2024}
}
"""
    source_bib = tmp_path / "input.bib"
    source_bib.write_text(bibtex_source, encoding="utf-8")
    ingest_proc = run_cli(tmp_path, "ingest", str(source_bib))
    assert ingest_proc.returncode == 0

    library = BibliographyStore(tmp_path / "library.sqlite3")
    try:
        library.add_entry_topic(
            "seed2024",
            topic_slug="graph-methods",
            topic_name="Graph Methods",
            source_type="talkorigins",
            source_url="https://example.org/topics/graph-methods",
            source_label="topic-seed",
        )
        library.connection.commit()
    finally:
        library.close()

    topics_proc = run_cli(tmp_path, "topics")
    assert topics_proc.returncode == 0
    listed_topics = json.loads(topics_proc.stdout)
    assert listed_topics[0]["slug"] == "graph-methods"

    entries_proc = run_cli(tmp_path, "topic-entries", "graph-methods")
    assert entries_proc.returncode == 0
    topic_detail = json.loads(entries_proc.stdout)
    assert topic_detail["topic"]["slug"] == "graph-methods"
    assert topic_detail["entries"][0]["citation_key"] == "seed2024"
|
|
|
|
|
|
def test_cli_can_set_topic_phrase(tmp_path: Path):
    """``set-topic-phrase`` echoes the new expansion phrase in its JSON output."""
    from citegeist.storage import BibliographyStore

    bibtex_source = """
@article{seed2024,
author = {Seed, Alice},
title = {Seed Paper},
year = {2024}
}
"""
    source_bib = tmp_path / "input.bib"
    source_bib.write_text(bibtex_source, encoding="utf-8")
    ingest_proc = run_cli(tmp_path, "ingest", str(source_bib))
    assert ingest_proc.returncode == 0

    # Attach the entry to a topic directly in the store.
    library = BibliographyStore(tmp_path / "library.sqlite3")
    try:
        library.add_entry_topic(
            "seed2024",
            topic_slug="graph-methods",
            topic_name="Graph Methods",
            source_type="talkorigins",
            source_url="https://example.org/topics/graph-methods",
            source_label="topic-seed",
        )
        library.connection.commit()
    finally:
        library.close()

    phrase_proc = run_cli(tmp_path, "set-topic-phrase", "graph-methods", "graph networks biology")
    assert phrase_proc.returncode == 0
    topic_record = json.loads(phrase_proc.stdout)
    assert topic_record["expansion_phrase"] == "graph networks biology"
|
|
|
|
|
|
def test_cli_can_apply_topic_phrases_from_json(tmp_path: Path):
    """``apply-topic-phrases`` applies a JSON phrase list and ``topics`` reflects it.

    The topic is created by attaching an entry in the store; the phrase file
    holds one ``{"slug", "suggested_phrase"}`` object.
    """
    bib_path = tmp_path / "input.bib"
    bib_path.write_text(
        """
@article{seed2024,
author = {Seed, Alice},
title = {Seed Paper},
year = {2024}
}
""",
        encoding="utf-8",
    )
    ingest = run_cli(tmp_path, "ingest", str(bib_path))
    assert ingest.returncode == 0

    from citegeist.storage import BibliographyStore

    database = tmp_path / "library.sqlite3"
    store = BibliographyStore(database)
    try:
        store.add_entry_topic(
            "seed2024",
            topic_slug="graph-methods",
            topic_name="Graph Methods",
            source_type="talkorigins",
            source_url="https://example.org/topics/graph-methods",
            source_label="topic-seed",
        )
        store.connection.commit()
    finally:
        store.close()

    phrases_path = tmp_path / "phrases.json"
    phrases_path.write_text(
        json.dumps(
            [
                {
                    "slug": "graph-methods",
                    "suggested_phrase": "graph networks biology",
                }
            ]
        ),
        encoding="utf-8",
    )

    result = run_cli(tmp_path, "apply-topic-phrases", str(phrases_path))
    assert result.returncode == 0
    payload = json.loads(result.stdout)
    assert payload[0]["applied"] is True

    check = run_cli(tmp_path, "topics")
    # Check the exit status first so a failing `topics` call is reported as
    # such rather than as a JSON decode error on empty stdout.
    assert check.returncode == 0
    topics_payload = json.loads(check.stdout)
    assert topics_payload[0]["expansion_phrase"] == "graph networks biology"
|
|
|
|
|
|
def test_cli_can_stage_topic_phrases_from_json(tmp_path: Path):
    """``stage-topic-phrases`` stores a pending suggestion without setting the expansion phrase.

    After staging, ``topics`` must show the suggested phrase, a ``None``
    expansion phrase and a ``pending`` review status.
    """
    bib_path = tmp_path / "input.bib"
    bib_path.write_text(
        """
@article{seed2024,
author = {Seed, Alice},
title = {Seed Paper},
year = {2024}
}
""",
        encoding="utf-8",
    )
    ingest = run_cli(tmp_path, "ingest", str(bib_path))
    assert ingest.returncode == 0

    from citegeist.storage import BibliographyStore

    database = tmp_path / "library.sqlite3"
    store = BibliographyStore(database)
    try:
        store.add_entry_topic(
            "seed2024",
            topic_slug="graph-methods",
            topic_name="Graph Methods",
            source_type="talkorigins",
            source_url="https://example.org/topics/graph-methods",
            source_label="topic-seed",
        )
        store.connection.commit()
    finally:
        store.close()

    phrases_path = tmp_path / "phrases.json"
    phrases_path.write_text(
        json.dumps(
            [
                {
                    "slug": "graph-methods",
                    "suggested_phrase": "graph networks biology",
                }
            ]
        ),
        encoding="utf-8",
    )

    result = run_cli(tmp_path, "stage-topic-phrases", str(phrases_path))
    assert result.returncode == 0
    payload = json.loads(result.stdout)
    assert payload[0]["staged"] is True
    assert payload[0]["phrase_review_status"] == "pending"

    check = run_cli(tmp_path, "topics")
    # Check the exit status first so a failing `topics` call is reported as
    # such rather than as a JSON decode error on empty stdout.
    assert check.returncode == 0
    topics_payload = json.loads(check.stdout)
    assert topics_payload[0]["suggested_phrase"] == "graph networks biology"
    assert topics_payload[0]["expansion_phrase"] is None
    assert topics_payload[0]["phrase_review_status"] == "pending"
|
|
|
|
|
|
def test_cli_can_review_topic_phrase(tmp_path: Path):
    """Accepting a staged phrase moves it from ``suggested_phrase`` to ``expansion_phrase``."""
    from citegeist.storage import BibliographyStore

    bibtex_source = """
@article{seed2024,
author = {Seed, Alice},
title = {Seed Paper},
year = {2024}
}
"""
    source_bib = tmp_path / "input.bib"
    source_bib.write_text(bibtex_source, encoding="utf-8")
    ingest_proc = run_cli(tmp_path, "ingest", str(source_bib))
    assert ingest_proc.returncode == 0

    library = BibliographyStore(tmp_path / "library.sqlite3")
    try:
        library.add_entry_topic(
            "seed2024",
            topic_slug="graph-methods",
            topic_name="Graph Methods",
            source_type="talkorigins",
            source_url="https://example.org/topics/graph-methods",
            source_label="topic-seed",
        )
        library.stage_topic_phrase_suggestion("graph-methods", "graph networks biology")
    finally:
        library.close()

    review_args = [
        "review-topic-phrase",
        "graph-methods",
        "accepted",
        "--notes",
        "curated and approved",
    ]
    review_proc = run_cli(tmp_path, *review_args)
    assert review_proc.returncode == 0
    topic_record = json.loads(review_proc.stdout)
    assert topic_record["suggested_phrase"] is None
    assert topic_record["expansion_phrase"] == "graph networks biology"
    assert topic_record["phrase_review_status"] == "accepted"
    assert topic_record["phrase_review_notes"] == "curated and approved"
|
|
|
|
|
|
def test_cli_topics_can_filter_by_phrase_review_status(tmp_path: Path):
    """``topics --phrase-review-status pending`` lists only the still-pending topic."""
    from citegeist.storage import BibliographyStore

    bibtex_source = """
@article{seed2024,
author = {Seed, Alice},
title = {Seed Paper},
year = {2024}
}
"""
    source_bib = tmp_path / "input.bib"
    source_bib.write_text(bibtex_source, encoding="utf-8")
    ingest_proc = run_cli(tmp_path, "ingest", str(source_bib))
    assert ingest_proc.returncode == 0

    # Two topics get staged phrases; only `abiogenesis` is then reviewed.
    library = BibliographyStore(tmp_path / "library.sqlite3")
    try:
        library.add_entry_topic(
            "seed2024",
            topic_slug="graph-methods",
            topic_name="Graph Methods",
            source_type="talkorigins",
            source_url="https://example.org/topics/graph-methods",
            source_label="topic-seed",
        )
        library.ensure_topic("abiogenesis", "Abiogenesis")
        library.stage_topic_phrase_suggestion("graph-methods", "graph networks biology")
        library.stage_topic_phrase_suggestion("abiogenesis", "abiogenesis life origin")
        library.review_topic_phrase_suggestion("abiogenesis", "accepted")
    finally:
        library.close()

    filtered_proc = run_cli(tmp_path, "topics", "--phrase-review-status", "pending")
    assert filtered_proc.returncode == 0
    pending_slugs = [topic["slug"] for topic in json.loads(filtered_proc.stdout)]
    assert pending_slugs == ["graph-methods"]
|
|
|
|
|
|
def test_cli_can_list_topic_phrase_reviews(tmp_path: Path):
    """``topic-phrase-reviews --phrase-review-status pending`` returns only the pending review."""
    from citegeist.storage import BibliographyStore

    bibtex_source = """
@article{seed2024,
author = {Seed, Alice},
title = {Seed Paper},
year = {2024}
}
"""
    source_bib = tmp_path / "input.bib"
    source_bib.write_text(bibtex_source, encoding="utf-8")
    ingest_proc = run_cli(tmp_path, "ingest", str(source_bib))
    assert ingest_proc.returncode == 0

    # Two topics get staged phrases; only `abiogenesis` is then reviewed.
    library = BibliographyStore(tmp_path / "library.sqlite3")
    try:
        library.add_entry_topic(
            "seed2024",
            topic_slug="graph-methods",
            topic_name="Graph Methods",
            source_type="talkorigins",
            source_url="https://example.org/topics/graph-methods",
            source_label="topic-seed",
        )
        library.ensure_topic("abiogenesis", "Abiogenesis")
        library.stage_topic_phrase_suggestion("graph-methods", "graph networks biology")
        library.stage_topic_phrase_suggestion("abiogenesis", "abiogenesis life origin")
        library.review_topic_phrase_suggestion("abiogenesis", "accepted")
    finally:
        library.close()

    reviews_proc = run_cli(tmp_path, "topic-phrase-reviews", "--phrase-review-status", "pending")
    assert reviews_proc.returncode == 0
    reviews = json.loads(reviews_proc.stdout)
    assert [review["slug"] for review in reviews] == ["graph-methods"]
    first_review = reviews[0]
    assert first_review["suggested_phrase"] == "graph networks biology"
    assert first_review["phrase_review_status"] == "pending"
|
|
|
|
|
|
def test_cli_can_review_topic_phrases_in_bulk(tmp_path: Path):
    """``review-topic-phrases`` applies a JSON list of accept/reject decisions."""
    from citegeist.storage import BibliographyStore

    bibtex_source = """
@article{seed2024,
author = {Seed, Alice},
title = {Seed Paper},
year = {2024}
}
"""
    source_bib = tmp_path / "input.bib"
    source_bib.write_text(bibtex_source, encoding="utf-8")
    ingest_proc = run_cli(tmp_path, "ingest", str(source_bib))
    assert ingest_proc.returncode == 0

    # Stage a pending phrase on each of two topics.
    library = BibliographyStore(tmp_path / "library.sqlite3")
    try:
        library.add_entry_topic(
            "seed2024",
            topic_slug="graph-methods",
            topic_name="Graph Methods",
            source_type="talkorigins",
            source_url="https://example.org/topics/graph-methods",
            source_label="topic-seed",
        )
        library.ensure_topic("abiogenesis", "Abiogenesis")
        library.stage_topic_phrase_suggestion("graph-methods", "graph networks biology")
        library.stage_topic_phrase_suggestion("abiogenesis", "abiogenesis life origin")
    finally:
        library.close()

    decisions = [
        {
            "slug": "graph-methods",
            "status": "accepted",
            "review_notes": "good phrase",
        },
        {
            "slug": "abiogenesis",
            "status": "rejected",
            "review_notes": "too sparse",
        },
    ]
    decisions_file = tmp_path / "phrase-review.json"
    decisions_file.write_text(json.dumps(decisions), encoding="utf-8")

    bulk_proc = run_cli(tmp_path, "review-topic-phrases", str(decisions_file))
    assert bulk_proc.returncode == 0
    outcomes = json.loads(bulk_proc.stdout)
    assert outcomes[0]["reviewed"] is True
    assert outcomes[1]["reviewed"] is True

    # Nothing is left pending once both decisions are applied.
    pending_proc = run_cli(tmp_path, "topic-phrase-reviews", "--phrase-review-status", "pending")
    assert pending_proc.returncode == 0
    assert json.loads(pending_proc.stdout) == []

    rejected_proc = run_cli(tmp_path, "topic-phrase-reviews", "--phrase-review-status", "rejected")
    assert rejected_proc.returncode == 0
    rejected_slugs = [review["slug"] for review in json.loads(rejected_proc.stdout)]
    assert rejected_slugs == ["abiogenesis"]

    accepted_proc = run_cli(tmp_path, "topics", "--phrase-review-status", "accepted")
    assert accepted_proc.returncode == 0
    accepted_slugs = [topic["slug"] for topic in json.loads(accepted_proc.stdout)]
    assert accepted_slugs == ["graph-methods"]
|
|
|
|
|
|
def test_cli_can_export_topic_phrase_review_template(tmp_path: Path):
    """``export-topic-phrase-reviews --output`` writes one template row for the staged phrase."""
    from citegeist.storage import BibliographyStore

    bibtex_source = """
@article{seed2024,
author = {Seed, Alice},
title = {Seed Paper},
year = {2024}
}
"""
    source_bib = tmp_path / "input.bib"
    source_bib.write_text(bibtex_source, encoding="utf-8")
    ingest_proc = run_cli(tmp_path, "ingest", str(source_bib))
    assert ingest_proc.returncode == 0

    library = BibliographyStore(tmp_path / "library.sqlite3")
    try:
        library.add_entry_topic(
            "seed2024",
            topic_slug="graph-methods",
            topic_name="Graph Methods",
            source_type="talkorigins",
            source_url="https://example.org/topics/graph-methods",
            source_label="topic-seed",
        )
        library.stage_topic_phrase_suggestion("graph-methods", "graph networks biology")
    finally:
        library.close()

    template_file = tmp_path / "topic-phrase-review.json"
    export_args = ["export-topic-phrase-reviews", "--output", str(template_file)]
    export_proc = run_cli(tmp_path, *export_args)
    assert export_proc.returncode == 0

    rows = json.loads(template_file.read_text(encoding="utf-8"))
    assert [item["slug"] for item in rows] == ["graph-methods"]
    row = rows[0]
    assert row["current_expansion_phrase"] is None
    assert row["suggested_phrase"] == "graph networks biology"
    assert row["current_status"] == "pending"
    # `status` is exported blank while `phrase` carries the suggestion.
    assert row["status"] == ""
    assert row["phrase"] == "graph networks biology"
|
|
|
|
|
|
def test_cli_export_topic(tmp_path: Path):
    """``export-topic <slug> --output`` writes the topic's entry as BibTeX."""
    from citegeist.storage import BibliographyStore

    bibtex_source = """
@article{seed2024,
author = {Seed, Alice},
title = {Seed Paper},
year = {2024}
}
"""
    source_bib = tmp_path / "input.bib"
    source_bib.write_text(bibtex_source, encoding="utf-8")
    ingest_proc = run_cli(tmp_path, "ingest", str(source_bib))
    assert ingest_proc.returncode == 0

    library = BibliographyStore(tmp_path / "library.sqlite3")
    try:
        library.add_entry_topic(
            "seed2024",
            topic_slug="graph-methods",
            topic_name="Graph Methods",
            source_type="talkorigins",
            source_url="https://example.org/topics/graph-methods",
            source_label="topic-seed",
        )
        library.connection.commit()
    finally:
        library.close()

    target_bib = tmp_path / "graph-methods.bib"
    export_proc = run_cli(tmp_path, "export-topic", "graph-methods", "--output", str(target_bib))
    assert export_proc.returncode == 0
    assert "@article{seed2024," in target_bib.read_text(encoding="utf-8")
|
|
|
|
|
|
def test_cli_search_can_filter_by_topic(tmp_path: Path):
    """``search graph --topic biology`` returns the biology entry but not the chemistry one."""
    from citegeist.storage import BibliographyStore

    bibtex_source = """
@article{seed2024,
author = {Seed, Alice},
title = {Graph Methods for Biology},
year = {2024},
abstract = {A graph methods paper.}
}

@article{other2023,
author = {Other, Bob},
title = {Graph Methods for Chemistry},
year = {2023},
abstract = {Another graph methods paper.}
}
"""
    source_bib = tmp_path / "input.bib"
    source_bib.write_text(bibtex_source, encoding="utf-8")
    ingest_proc = run_cli(tmp_path, "ingest", str(source_bib))
    assert ingest_proc.returncode == 0

    # (citation key, topic slug, topic name, source URL) per assignment.
    assignments = (
        ("seed2024", "biology", "Biology", "https://example.org/topics/biology"),
        ("other2023", "chemistry", "Chemistry", "https://example.org/topics/chemistry"),
    )
    library = BibliographyStore(tmp_path / "library.sqlite3")
    try:
        for citation_key, slug, name, url in assignments:
            library.add_entry_topic(
                citation_key,
                topic_slug=slug,
                topic_name=name,
                source_type="talkorigins",
                source_url=url,
                source_label="topic-seed",
            )
        library.connection.commit()
    finally:
        library.close()

    search_proc = run_cli(tmp_path, "search", "graph", "--topic", "biology")
    assert search_proc.returncode == 0
    assert "seed2024" in search_proc.stdout
    assert "other2023" not in search_proc.stdout
|
|
|
|
|
|
def test_cli_graph_outputs_missing_targets(tmp_path: Path):
    """``graph --missing-only`` reports just the referenced key that was never ingested."""
    bibtex_source = """
@article{seed2024,
author = {Seed, Alice},
title = {Seed Paper},
year = {2024},
references = {known2023, missing2022}
}

@article{known2023,
author = {Known, Bob},
title = {Known Paper},
year = {2023}
}
"""
    source_bib = tmp_path / "graph.bib"
    source_bib.write_text(bibtex_source, encoding="utf-8")

    ingest_proc = run_cli(tmp_path, "ingest", str(source_bib))
    assert ingest_proc.returncode == 0

    graph_proc = run_cli(tmp_path, "graph", "seed2024", "--missing-only")
    assert graph_proc.returncode == 0
    edges = json.loads(graph_proc.stdout)
    assert len(edges) == 1
    only_edge = edges[0]
    assert only_edge["target_citation_key"] == "missing2022"
    assert only_edge["target_exists"] is False
|
|
|
|
|
|
def test_cli_expand_with_mocked_crossref(tmp_path: Path):
    """``expand <key>`` exits 0 when the Crossref expander is patched out."""
    from citegeist.expand import ExpansionResult

    bibtex_source = """
@article{seed2024,
author = {Seed, Alice},
title = {Seed Paper},
year = {2024},
doi = {10.1000/seed-doi}
}
"""
    source_bib = tmp_path / "expand.bib"
    source_bib.write_text(bibtex_source, encoding="utf-8")
    ingest_proc = run_cli(tmp_path, "ingest", str(source_bib))
    assert ingest_proc.returncode == 0

    canned_result = ExpansionResult(
        source_citation_key="seed2024",
        discovered_citation_key="doi101000exampleref",
        created_entry=True,
        relation_type="cites",
        source_label="crossref:references:10.1000/seed-doi",
    )
    db_path = tmp_path / "library.sqlite3"

    with patch("citegeist.cli.CrossrefExpander.expand_entry_references") as mocked_expand:
        mocked_expand.return_value = [canned_result]
        exit_code = main(["--db", str(db_path), "expand", "seed2024"])

    assert exit_code == 0
|
|
|
|
|
|
def test_cli_expand_with_mocked_openalex(tmp_path: Path):
    """``expand --source openalex --relation cites`` exits 0 with the OpenAlex expander patched."""
    from citegeist.expand import ExpansionResult

    bibtex_source = """
@article{seed2024,
author = {Seed, Alice},
title = {Seed Paper},
year = {2024},
doi = {10.1000/seed-doi}
}
"""
    source_bib = tmp_path / "expand-openalex.bib"
    source_bib.write_text(bibtex_source, encoding="utf-8")
    ingest_proc = run_cli(tmp_path, "ingest", str(source_bib))
    assert ingest_proc.returncode == 0

    canned_result = ExpansionResult(
        source_citation_key="seed2024",
        discovered_citation_key="openalexw12345",
        created_entry=True,
        relation_type="cites",
        source_label="openalex:cites:WSEED",
    )
    db_path = tmp_path / "library.sqlite3"
    argv = ["--db", str(db_path), "expand", "seed2024", "--source", "openalex", "--relation", "cites"]

    with patch("citegeist.cli.OpenAlexExpander.expand_entry") as mocked_expand:
        mocked_expand.return_value = [canned_result]
        exit_code = main(argv)

    assert exit_code == 0
|
|
|
|
|
|
def test_cli_expand_topic_with_mocked_expander(tmp_path: Path):
    """``expand-topic --preview`` passes ``preview_only=True`` to ``TopicExpander.expand_topic``."""
    from citegeist.expand import TopicExpansionResult

    canned_result = TopicExpansionResult(
        topic_slug="abiogenesis",
        source_citation_key="seed2024",
        discovered_citation_key="discovered1",
        discovered_title="Abiogenesis origin chemistry",
        created_entry=True,
        relation_type="cites",
        source_label="openalex:cites:seed2024",
        relevance_score=0.67,
        meets_relevance_threshold=True,
        assigned_to_topic=True,
    )
    db_path = tmp_path / "library.sqlite3"
    argv = [
        "--db",
        str(db_path),
        "expand-topic",
        "abiogenesis",
        "--topic-phrase",
        "abiogenesis origin chemistry",
        "--seed-key",
        "seed2024",
        "--min-relevance",
        "0.3",
        "--preview",
    ]

    with patch("citegeist.cli.TopicExpander.expand_topic") as mocked_expand:
        mocked_expand.return_value = [canned_result]
        exit_code = main(argv)

    assert exit_code == 0
    # call_args is an (args, kwargs) pair; only the keywords are inspected.
    _positional, call_kwargs = mocked_expand.call_args
    assert call_kwargs["preview_only"] is True
|