# CiteGeist/tests/test_talkorigins.py

from __future__ import annotations
import json
from pathlib import Path
from citegeist.batch import load_batch_jobs
from citegeist.bibtex import BibEntry
from citegeist.examples.talkorigins import TalkOriginsScraper, normalize_topic_entries
from citegeist.storage import BibliographyStore
# Minimal fixture pages mimicking the TalkOrigins bibliography site.
# INDEX_HTML links two topic pages plus one non-topic navigation link
# (the "Browse" link) that the scraper is expected to skip.
INDEX_HTML = """
<html><body>
<a href="abiogenesis.html">Abiogenesis</a>
<a href="evolution.html">Evolution</a>
<a href="/origins/faqs.html">Browse</a>
</body></html>
"""
# Topic page with two references; the second uses the "---" author-repeat
# convention, which normalization must expand to the previous author.
ABIOGENESIS_HTML = """
<html><body><pre>
Smith, J., 1998, First paper title: Journal of Origins, v. 10, p. 1-10.
---, 2001, Second paper title: Journal of Origins, v. 12, p. 20-30.
</pre></body></html>
"""
# Topic page whose single reference is wrapped across several lines.
EVOLUTION_HTML = """
<html><body><pre>
Jones, A., and Roe, B.,
2003, Wrapped title across lines:
Proceedings of the Example Conference, p. 40-55.
</pre></body></html>
"""
class FakeSourceClient:
    """Offline stand-in for the scraper's HTTP client.

    Serves pre-registered page bodies keyed by URL so tests never touch
    the network; unknown URLs raise ``KeyError``.
    """

    def __init__(self, payloads: dict[str, str]) -> None:
        self.payloads = payloads

    def get_text(self, url: str) -> str:
        """Return the canned body registered for *url*."""
        body = self.payloads[url]
        return body
def test_normalize_topic_entries_carries_forward_repeated_authors():
    """The '---' author-repeat marker expands to the preceding author."""
    raw = """
Smith, J., 1998, First paper title: Journal of Origins.
---, 2001, Second paper title: Journal of Origins.
"""
    normalized = normalize_topic_entries(raw)
    assert normalized[1].startswith("Smith, J., 2001")
def test_talkorigins_scraper_writes_seed_bibs_and_jobs(tmp_path: Path):
    """End-to-end scrape: seed bibs, batch jobs, manifest, and site pages."""
    base_url = "https://www.talkorigins.org/origins/biblio/"
    payloads = {
        base_url: INDEX_HTML,
        f"{base_url}abiogenesis.html": ABIOGENESIS_HTML,
        f"{base_url}evolution.html": EVOLUTION_HTML,
    }
    scraper = TalkOriginsScraper(source_client=FakeSourceClient(payloads))

    export = scraper.scrape_to_directory(base_url=base_url, output_dir=tmp_path)
    assert export.topic_count == 2
    assert export.entry_count == 3

    jobs_payload = json.loads(Path(export.jobs_path).read_text(encoding="utf-8"))
    first_job = jobs_payload["jobs"][0]
    assert first_job["name"] == "talkorigins:abiogenesis"
    assert Path(first_job["seed_bib"]).exists()

    manifest = json.loads(Path(export.manifest_path).read_text(encoding="utf-8"))
    assert manifest["seed_sets"][0]["parsed_entry_count"] == 2

    # Spot-check the per-topic artifacts plus the combined site outputs.
    seed = export.seed_sets[0]
    abiogenesis_bib = Path(seed.seed_bib).read_text(encoding="utf-8")
    abiogenesis_plain = Path(seed.plaintext_path).read_text(encoding="utf-8")
    abiogenesis_page = Path(seed.page_path).read_text(encoding="utf-8")
    full_bib = Path(export.full_bib_path).read_text(encoding="utf-8")
    full_plain = Path(export.full_plaintext_path).read_text(encoding="utf-8")
    site_index = Path(export.site_index_path).read_text(encoding="utf-8")

    assert "@article{smith1998first1," in abiogenesis_bib
    assert 'author = "Smith, J"' in abiogenesis_bib
    assert "@article{smith2001second2," in abiogenesis_bib
    assert "Abiogenesis" in abiogenesis_plain
    assert "Show BibTeX" in abiogenesis_page
    assert "toggleBibtex" in abiogenesis_page
    assert "@article{smith1998first1," in full_bib
    assert "Evolution" in full_plain
    assert "Full BibTeX bibliography" in site_index
def test_talkorigins_parser_prefers_book_for_publisher_like_venues():
    """A venue that reads like 'locations, publisher' should yield @book."""
    scraper = TalkOriginsScraper(source_client=FakeSourceClient({}))
    raw = "Rutten, M. G., 1971, The Origin of Life by Natural Causes: Amsterdam, London, New York, Elsevier."
    entry = scraper.parse_reference_entry(raw, 1)
    assert entry is not None
    assert entry.entry_type == "book"
    assert entry.fields["publisher"] == "Amsterdam, London, New York, Elsevier"
def test_talkorigins_parser_promotes_edited_volume_chapter_to_incollection():
    """A chapter citation with 'in ..., eds.' must parse as @incollection."""
    scraper = TalkOriginsScraper(source_client=FakeSourceClient({}))
    raw = (
        "Carpenter, C. R., 1958, Territoriality: A Review of Concepts and Problems, "
        "in Roe, A., and Simpson, G. G., eds., Behavior and Evolution: New Haven, "
        "Yale University Press, p. 224-250."
    )
    entry = scraper.parse_reference_entry(raw, 1)
    assert entry is not None
    assert entry.entry_type == "incollection"
    assert entry.fields["title"] == "Territoriality: A Review of Concepts and Problems"
    assert entry.fields["editor"] == "Roe, A. and Simpson, G. G."
    assert entry.fields["booktitle"] == "Behavior and Evolution"
    assert "Yale University Press" in entry.fields["publisher"]
def test_talkorigins_scraper_resume_uses_saved_snapshot(tmp_path: Path):
    """A re-run over the same output dir reuses the saved topic snapshot."""
    base_url = "https://www.talkorigins.org/origins/biblio/"
    good_payloads = {
        base_url: INDEX_HTML,
        f"{base_url}abiogenesis.html": ABIOGENESIS_HTML,
        f"{base_url}evolution.html": EVOLUTION_HTML,
    }
    scraper = TalkOriginsScraper(source_client=FakeSourceClient(good_payloads))
    first_export = scraper.scrape_to_directory(base_url=base_url, output_dir=tmp_path, limit_topics=1)

    snapshot_path = Path(first_export.seed_sets[0].snapshot_path)
    snapshot = json.loads(snapshot_path.read_text(encoding="utf-8"))
    assert snapshot["raw_entries"][0].startswith("Smith, J.")

    # Second run serves a page with no parsable references; the snapshot
    # written by the first run must still supply both entries.
    broken_payloads = {
        base_url: INDEX_HTML,
        f"{base_url}abiogenesis.html": "<html><body>broken</body></html>",
    }
    scraper_with_broken_page = TalkOriginsScraper(source_client=FakeSourceClient(broken_payloads))
    resumed_export = scraper_with_broken_page.scrape_to_directory(
        base_url=base_url, output_dir=tmp_path, limit_topics=1
    )
    assert resumed_export.entry_count == 2
def test_talkorigins_validation_reports_suspicious_entries(tmp_path: Path):
    """validate_export flags entries whose entry type looks misclassified."""
    base_url = "https://www.talkorigins.org/origins/biblio/"
    scraper = TalkOriginsScraper(
        source_client=FakeSourceClient(
            {
                base_url: INDEX_HTML,
                f"{base_url}abiogenesis.html": ABIOGENESIS_HTML,
            }
        )
    )
    export = scraper.scrape_to_directory(base_url=base_url, output_dir=tmp_path, limit_topics=1)
    # Overwrite the generated seed bib with an @article whose "journal"
    # is actually a publisher name; validation should flag it.
    seed_bib_path = Path(export.seed_sets[0].seed_bib)
    seed_bib_path.write_text(
        """
@article{bad1,
author = "Example, A",
year = "1999",
title = "Bad Venue Classification",
journal = "Elsevier"
}
""",
        encoding="utf-8",
    )
    report = scraper.validate_export(export.manifest_path)
    assert report.topic_count == 1
    assert report.entry_count == 2
    assert report.suspicious_entry_type_count >= 1
    assert report.suspicious_examples[0]["citation_key"] == "bad1"
def test_talkorigins_validation_does_not_flag_legitimate_incollection(tmp_path: Path):
    """A well-formed @incollection with editor/booktitle must not be flagged."""
    base_url = "https://www.talkorigins.org/origins/biblio/"
    scraper = TalkOriginsScraper(
        source_client=FakeSourceClient(
            {
                base_url: INDEX_HTML,
                f"{base_url}abiogenesis.html": ABIOGENESIS_HTML,
            }
        )
    )
    export = scraper.scrape_to_directory(base_url=base_url, output_dir=tmp_path, limit_topics=1)
    # Replace the seed bib with a legitimate chapter entry: the
    # "location, publisher" string alone is not suspicious here.
    seed_bib_path = Path(export.seed_sets[0].seed_bib)
    seed_bib_path.write_text(
        """
@incollection{good1,
author = "Example, A",
editor = "Editor, E",
year = "1999",
title = "Good Chapter",
booktitle = "Collected Essays",
publisher = "New Haven, Yale University Press"
}
""",
        encoding="utf-8",
    )
    report = scraper.validate_export(export.manifest_path)
    assert all(item["citation_key"] != "good1" for item in report.suspicious_examples)
def test_talkorigins_validation_reports_duplicate_clusters(tmp_path: Path):
    """Same author/year/title across two topics is reported as one duplicate cluster."""
    base_url = "https://www.talkorigins.org/origins/biblio/"
    scraper = TalkOriginsScraper(
        source_client=FakeSourceClient(
            {
                base_url: INDEX_HTML,
                f"{base_url}abiogenesis.html": ABIOGENESIS_HTML,
                f"{base_url}evolution.html": EVOLUTION_HTML,
            }
        )
    )
    export = scraper.scrape_to_directory(base_url=base_url, output_dir=tmp_path)
    # dup1/dup2 differ only in journal; author/year/title match exactly.
    Path(export.seed_sets[0].seed_bib).write_text(
        """
@article{dup1,
author = "Smith, Jane",
year = "1999",
title = "Duplicate Paper",
journal = "Journal A"
}
""",
        encoding="utf-8",
    )
    Path(export.seed_sets[1].seed_bib).write_text(
        """
@article{dup2,
author = "Smith, Jane",
year = "1999",
title = "Duplicate Paper",
journal = "Journal B"
}
""",
        encoding="utf-8",
    )
    report = scraper.validate_export(export.manifest_path)
    assert report.duplicate_cluster_count >= 1
    assert report.duplicate_entry_count >= 2
    assert report.duplicate_examples[0]["items"][0]["citation_key"] in {"dup1", "dup2"}
def test_talkorigins_can_suggest_topic_phrases(tmp_path: Path):
    """suggest_topic_phrases derives keywords from seed titles and flags small topics."""
    base_url = "https://www.talkorigins.org/origins/biblio/"
    scraper = TalkOriginsScraper(
        source_client=FakeSourceClient(
            {
                base_url: INDEX_HTML,
                f"{base_url}abiogenesis.html": ABIOGENESIS_HTML,
            }
        )
    )
    export = scraper.scrape_to_directory(base_url=base_url, output_dir=tmp_path, limit_topics=1)
    # Two titles sharing "prebiotic"/"chemistry"/"ribozyme" so those terms
    # dominate the suggested keyword set.
    Path(export.seed_sets[0].seed_bib).write_text(
        """
@article{bio1,
author = "Smith, Jane",
year = "1999",
title = "Prebiotic chemistry and ribozyme catalysis",
journal = "Origins"
}
@article{bio2,
author = "Smith, Jane",
year = "2001",
title = "Ribozyme networks in prebiotic chemistry",
journal = "Origins"
}
""",
        encoding="utf-8",
    )
    suggestions = scraper.suggest_topic_phrases(export.manifest_path)
    assert len(suggestions) == 1
    assert suggestions[0].slug == "abiogenesis"
    assert suggestions[0].suggested_phrase.startswith("Abiogenesis ")
    assert "chemistry" in suggestions[0].keywords
    assert "prebiotic" in suggestions[0].keywords
    # A two-entry topic is flagged for review as small, but its keywords
    # are coherent enough not to be called noisy.
    assert suggestions[0].review_required is True
    assert "small_topic" in (suggestions[0].review_reasons or [])
    assert "noisy_keywords" not in (suggestions[0].review_reasons or [])
def test_talkorigins_duplicate_inspection_filters_by_topic_and_match(tmp_path: Path):
    """inspect_duplicate_clusters honors topic_slug and match filters."""
    base_url = "https://www.talkorigins.org/origins/biblio/"
    scraper = TalkOriginsScraper(
        source_client=FakeSourceClient(
            {
                base_url: INDEX_HTML,
                f"{base_url}abiogenesis.html": ABIOGENESIS_HTML,
                f"{base_url}evolution.html": EVOLUTION_HTML,
            }
        )
    )
    export = scraper.scrape_to_directory(base_url=base_url, output_dir=tmp_path)
    # Topic 0 holds a same-topic duplicate pair; topic 1 holds an
    # unrelated entry that the filters must exclude.
    Path(export.seed_sets[0].seed_bib).write_text(
        """
@article{dup1,
author = "Smith, Jane",
year = "1999",
title = "Duplicate Paper",
journal = "Journal A"
}
@article{dup2,
author = "Smith, Jane",
year = "1999",
title = "Duplicate Paper",
journal = "Journal B"
}
""",
        encoding="utf-8",
    )
    Path(export.seed_sets[1].seed_bib).write_text(
        """
@article{other1,
author = "Jones, Alex",
year = "2001",
title = "Other Topic Paper",
journal = "Journal C"
}
""",
        encoding="utf-8",
    )
    clusters = scraper.inspect_duplicate_clusters(
        export.manifest_path,
        topic_slug="abiogenesis",
        match="duplicate",
    )
    assert len(clusters) == 1
    assert clusters[0].count == 2
    assert all(item["topic_slug"] == "abiogenesis" for item in clusters[0].items)
def test_talkorigins_duplicate_inspection_can_preview_canonical_choice(tmp_path: Path):
    """preview_canonical selects the richer duplicate (here: the one with a DOI)."""
    base_url = "https://www.talkorigins.org/origins/biblio/"
    scraper = TalkOriginsScraper(
        source_client=FakeSourceClient(
            {
                base_url: INDEX_HTML,
                f"{base_url}abiogenesis.html": ABIOGENESIS_HTML,
                f"{base_url}evolution.html": EVOLUTION_HTML,
            }
        )
    )
    export = scraper.scrape_to_directory(base_url=base_url, output_dir=tmp_path)
    Path(export.seed_sets[0].seed_bib).write_text(
        """
@article{dup1,
author = "Smith, Jane",
year = "1999",
title = "Duplicate Paper",
journal = "Journal A"
}
""",
        encoding="utf-8",
    )
    # dup2 carries a DOI, so it should win the canonical-selection preview
    # with no weak reasons attached.
    Path(export.seed_sets[1].seed_bib).write_text(
        """
@article{dup2,
author = "Smith, Jane",
year = "1999",
title = "Duplicate Paper",
journal = "Journal B",
doi = "10.1000/dup"
}
""",
        encoding="utf-8",
    )
    clusters = scraper.inspect_duplicate_clusters(export.manifest_path, preview_canonical=True)
    assert len(clusters) == 1
    assert clusters[0].canonical is not None
    assert clusters[0].canonical["citation_key"] == "dup2"
    assert clusters[0].canonical["fields"]["doi"] == "10.1000/dup"
    assert clusters[0].canonical["weak_reasons"] == []
def test_talkorigins_duplicate_inspection_can_filter_to_weak_canonicals(tmp_path: Path):
    """weak_only=True drops clusters whose canonical already has solid metadata."""
    base_url = "https://www.talkorigins.org/origins/biblio/"
    scraper = TalkOriginsScraper(
        source_client=FakeSourceClient(
            {
                base_url: INDEX_HTML,
                f"{base_url}abiogenesis.html": ABIOGENESIS_HTML,
                f"{base_url}evolution.html": EVOLUTION_HTML,
            }
        )
    )
    export = scraper.scrape_to_directory(base_url=base_url, output_dir=tmp_path)
    # Weak pair: @misc entries with no venue and no DOI.
    Path(export.seed_sets[0].seed_bib).write_text(
        """
@misc{weak1,
author = "Smith, Jane",
year = "1999",
title = "Weak Duplicate"
}
@misc{weak2,
author = "Smith, Jane",
year = "1999",
title = "Weak Duplicate",
note = "Copied from legacy source"
}
""",
        encoding="utf-8",
    )
    # Strong pair: @article entries where the canonical candidate has a DOI,
    # so the cluster should be filtered out by weak_only.
    Path(export.seed_sets[1].seed_bib).write_text(
        """
@article{strong1,
author = "Jones, Alex",
year = "2001",
title = "Strong Duplicate",
journal = "Journal B",
doi = "10.1000/strong"
}
@article{strong2,
author = "Jones, Alex",
year = "2001",
title = "Strong Duplicate",
journal = "Journal B"
}
""",
        encoding="utf-8",
    )
    clusters = scraper.inspect_duplicate_clusters(
        export.manifest_path,
        preview_canonical=True,
        weak_only=True,
    )
    assert len(clusters) == 1
    assert clusters[0].canonical is not None
    assert clusters[0].canonical["citation_key"] == "weak2"
    assert "entry_type:misc" in clusters[0].canonical["weak_reasons"]
    assert "missing:doi" in clusters[0].canonical["weak_reasons"]
def test_talkorigins_enrich_weak_canonicals_can_preview_resolution(tmp_path: Path):
    """apply=False resolves weak canonicals without writing to the store."""
    base_url = "https://www.talkorigins.org/origins/biblio/"
    scraper = TalkOriginsScraper(
        source_client=FakeSourceClient(
            {
                base_url: INDEX_HTML,
                f"{base_url}abiogenesis.html": ABIOGENESIS_HTML,
                f"{base_url}evolution.html": EVOLUTION_HTML,
            }
        )
    )
    export = scraper.scrape_to_directory(base_url=base_url, output_dir=tmp_path)
    # Two @misc duplicates with no venue/DOI form a weak canonical cluster.
    Path(export.seed_sets[0].seed_bib).write_text(
        """
@misc{weak1,
author = "Smith, Jane",
year = "1999",
title = "Weak Duplicate"
}
@misc{weak2,
author = "Smith, Jane",
year = "1999",
title = "Weak Duplicate",
note = "Copied from legacy source"
}
""",
        encoding="utf-8",
    )
    Path(export.seed_sets[1].seed_bib).write_text("", encoding="utf-8")
    from citegeist.resolve import Resolution
    # Stub the resolver so enrichment deterministically "finds" richer
    # metadata (an @article with DOI and journal) for the weak entry.
    scraper.resolver.resolve_entry = lambda entry: Resolution(  # type: ignore[method-assign]
        entry=BibEntry(
            entry_type="article",
            citation_key="resolved",
            fields={
                "author": entry.fields["author"],
                "title": entry.fields["title"],
                "year": entry.fields["year"],
                "doi": "10.1000/weak",
                "journal": "Journal of Better Metadata",
            },
        ),
        source_type="resolver",
        source_label="crossref:search:Weak Duplicate",
    )
    store = BibliographyStore()
    try:
        results = scraper.enrich_weak_canonicals(export.manifest_path, store, apply=False)
    finally:
        store.close()
    assert len(results) == 1
    assert results[0].resolved is True
    assert results[0].applied is False
    assert results[0].weak_reasons_after == []
def test_talkorigins_enrich_weak_canonicals_can_apply_to_store(tmp_path: Path):
    """apply=True writes the resolver's enriched fields into the store."""
    base_url = "https://www.talkorigins.org/origins/biblio/"
    scraper = TalkOriginsScraper(
        source_client=FakeSourceClient(
            {
                base_url: INDEX_HTML,
                f"{base_url}abiogenesis.html": ABIOGENESIS_HTML,
                f"{base_url}evolution.html": EVOLUTION_HTML,
            }
        )
    )
    export = scraper.scrape_to_directory(base_url=base_url, output_dir=tmp_path)
    # Same weak duplicate split across two topics this time.
    Path(export.seed_sets[0].seed_bib).write_text(
        """
@misc{weak2,
author = "Smith, Jane",
year = "1999",
title = "Weak Duplicate"
}
""",
        encoding="utf-8",
    )
    Path(export.seed_sets[1].seed_bib).write_text(
        """
@misc{weak1,
author = "Smith, Jane",
year = "1999",
title = "Weak Duplicate",
note = "Copied from legacy source"
}
""",
        encoding="utf-8",
    )
    from citegeist.resolve import Resolution
    # Stub the resolver with a deterministic enriched result.
    scraper.resolver.resolve_entry = lambda entry: Resolution(  # type: ignore[method-assign]
        entry=BibEntry(
            entry_type="article",
            citation_key="resolved",
            fields={
                "author": entry.fields["author"],
                "title": entry.fields["title"],
                "year": entry.fields["year"],
                "doi": "10.1000/weak",
                "journal": "Journal of Better Metadata",
            },
        ),
        source_type="resolver",
        source_label="crossref:search:Weak Duplicate",
    )
    store = BibliographyStore()
    try:
        scraper.ingest_export(export.manifest_path, store, dedupe=False)
        results = scraper.enrich_weak_canonicals(export.manifest_path, store, apply=True)
        assert len(results) == 1
        assert results[0].applied is True
        entry = store.get_entry(results[0].citation_key)
        assert entry is not None
        assert entry["doi"] == "10.1000/weak"
        assert entry["journal"] == "Journal of Better Metadata"
        assert entry["review_status"] == "enriched"
    finally:
        store.close()
def test_talkorigins_enrich_weak_canonicals_rejects_unsafe_search_match(tmp_path: Path):
    """A resolver hit that does not match the queried work is rejected."""
    base_url = "https://www.talkorigins.org/origins/biblio/"
    scraper = TalkOriginsScraper(
        source_client=FakeSourceClient(
            {
                base_url: INDEX_HTML,
                f"{base_url}abiogenesis.html": ABIOGENESIS_HTML,
                f"{base_url}evolution.html": EVOLUTION_HTML,
            }
        )
    )
    export = scraper.scrape_to_directory(base_url=base_url, output_dir=tmp_path)
    Path(export.seed_sets[0].seed_bib).write_text(
        """
@misc{weak2,
author = "Adams, D",
year = "1987",
title = "The bigger they are, the harder they fall"
}
""",
        encoding="utf-8",
    )
    Path(export.seed_sets[1].seed_bib).write_text(
        """
@misc{weak1,
author = "Adams, D",
year = "1987",
title = "The bigger they are, the harder they fall",
note = "Copied from legacy source"
}
""",
        encoding="utf-8",
    )
    from citegeist.resolve import Resolution
    # Stub the resolver with a wildly unrelated record (different author,
    # title, and year) so the safety check must reject it.
    scraper.resolver.resolve_entry = lambda entry: Resolution(  # type: ignore[method-assign]
        entry=BibEntry(
            entry_type="misc",
            citation_key="resolved",
            fields={
                "author": "Kulik, Dean",
                "title": "The Nexus Recursive Harmonic Framework: Reality as Unbounded, Observerless Computation (SILR / RHA / CST) Ver 2",
                "year": "2026",
                "doi": "10.9999/not-a-match",
            },
        ),
        source_type="resolver",
        source_label="datacite:search:The bigger they are, the harder they fall",
    )
    store = BibliographyStore()
    try:
        scraper.ingest_export(export.manifest_path, store, dedupe=False)
        results = scraper.enrich_weak_canonicals(export.manifest_path, store, apply=True)
        assert len(results) == 1
        assert results[0].resolved is False
        assert results[0].applied is False
        assert results[0].error == "unsafe resolver match"
        # The stored entry must remain untouched (no DOI leaked in).
        entry = store.get_entry("weak2") or store.get_entry("weak1")
        assert entry is not None
        assert entry["doi"] is None
    finally:
        store.close()
def test_talkorigins_enrich_weak_canonicals_can_allow_unsafe_search_match(tmp_path: Path):
    """allow_unsafe_matches=True overrides the resolver-safety rejection."""
    base_url = "https://www.talkorigins.org/origins/biblio/"
    scraper = TalkOriginsScraper(
        source_client=FakeSourceClient(
            {
                base_url: INDEX_HTML,
                f"{base_url}abiogenesis.html": ABIOGENESIS_HTML,
                f"{base_url}evolution.html": EVOLUTION_HTML,
            }
        )
    )
    export = scraper.scrape_to_directory(base_url=base_url, output_dir=tmp_path)
    Path(export.seed_sets[0].seed_bib).write_text(
        """
@misc{weak2,
author = "Adams, D",
year = "1987",
title = "The bigger they are, the harder they fall"
}
""",
        encoding="utf-8",
    )
    Path(export.seed_sets[1].seed_bib).write_text(
        """
@misc{weak1,
author = "Adams, D",
year = "1987",
title = "The bigger they are, the harder they fall",
note = "Copied from legacy source"
}
""",
        encoding="utf-8",
    )
    from citegeist.resolve import Resolution
    # Same unrelated resolver record as the rejection test above.
    scraper.resolver.resolve_entry = lambda entry: Resolution(  # type: ignore[method-assign]
        entry=BibEntry(
            entry_type="misc",
            citation_key="resolved",
            fields={
                "author": "Kulik, Dean",
                "title": "The Nexus Recursive Harmonic Framework: Reality as Unbounded, Observerless Computation (SILR / RHA / CST) Ver 2",
                "year": "2026",
                "doi": "10.9999/not-a-match",
            },
        ),
        source_type="resolver",
        source_label="datacite:search:The bigger they are, the harder they fall",
    )
    store = BibliographyStore()
    try:
        scraper.ingest_export(export.manifest_path, store, dedupe=False)
        results = scraper.enrich_weak_canonicals(
            export.manifest_path,
            store,
            apply=True,
            allow_unsafe_matches=True,
        )
        assert len(results) == 1
        assert results[0].resolved is True
        assert results[0].applied is True
        entry = store.get_entry(results[0].citation_key)
        assert entry is not None
        assert entry["doi"] == "10.9999/not-a-match"
    finally:
        store.close()
def test_talkorigins_build_review_export_combines_cluster_and_enrichment(tmp_path: Path):
    """build_review_export pairs each weak cluster with its enrichment preview."""
    base_url = "https://www.talkorigins.org/origins/biblio/"
    scraper = TalkOriginsScraper(
        source_client=FakeSourceClient(
            {
                base_url: INDEX_HTML,
                f"{base_url}abiogenesis.html": ABIOGENESIS_HTML,
                f"{base_url}evolution.html": EVOLUTION_HTML,
            }
        )
    )
    export = scraper.scrape_to_directory(base_url=base_url, output_dir=tmp_path)
    # One weak duplicate cluster in topic 0; topic 1 intentionally empty.
    Path(export.seed_sets[0].seed_bib).write_text(
        """
@misc{weak1,
author = "Smith, Jane",
year = "1999",
title = "Weak Duplicate"
}
@misc{weak2,
author = "Smith, Jane",
year = "1999",
title = "Weak Duplicate",
note = "Copied from legacy source"
}
""",
        encoding="utf-8",
    )
    Path(export.seed_sets[1].seed_bib).write_text("", encoding="utf-8")
    from citegeist.resolve import Resolution
    # Deterministic resolver stub; the review export must show the result
    # as resolved but not applied (review is preview-only).
    scraper.resolver.resolve_entry = lambda entry: Resolution(  # type: ignore[method-assign]
        entry=BibEntry(
            entry_type="article",
            citation_key="resolved",
            fields={
                "author": entry.fields["author"],
                "title": entry.fields["title"],
                "year": entry.fields["year"],
                "doi": "10.1000/weak",
                "journal": "Journal of Better Metadata",
            },
        ),
        source_type="resolver",
        source_label="crossref:search:Weak Duplicate",
    )
    store = BibliographyStore()
    try:
        review = scraper.build_review_export(export.manifest_path, store)
    finally:
        store.close()
    assert review.item_count == 1
    assert review.items[0]["canonical"]["citation_key"] == "weak2"
    assert review.items[0]["enrichment"]["resolved"] is True
    assert review.items[0]["enrichment"]["applied"] is False
def test_talkorigins_apply_review_corrections_updates_store_entry(tmp_path: Path):
    """apply_review_corrections updates type/fields/status of the keyed entry."""
    base_url = "https://www.talkorigins.org/origins/biblio/"
    scraper = TalkOriginsScraper(
        source_client=FakeSourceClient(
            {
                base_url: INDEX_HTML,
                f"{base_url}abiogenesis.html": ABIOGENESIS_HTML,
                f"{base_url}evolution.html": EVOLUTION_HTML,
            }
        )
    )
    export = scraper.scrape_to_directory(base_url=base_url, output_dir=tmp_path)
    Path(export.seed_sets[0].seed_bib).write_text(
        """
@misc{weak1,
author = "Smith, Jane",
year = "1999",
title = "Weak Duplicate"
}
@misc{weak2,
author = "Smith, Jane",
year = "1999",
title = "Weak Duplicate",
note = "Copied from legacy source"
}
""",
        encoding="utf-8",
    )
    Path(export.seed_sets[1].seed_bib).write_text("", encoding="utf-8")
    # Corrections are keyed by "author|year|title" (normalized); a field
    # value of None requests removal of that field.
    corrections_path = tmp_path / "corrections.json"
    corrections_path.write_text(
        json.dumps(
            {
                "corrections": [
                    {
                        "key": "smith jane|1999|weak duplicate",
                        "entry_type": "article",
                        "review_status": "reviewed",
                        "fields": {
                            "journal": "Journal of Better Metadata",
                            "doi": "10.1000/weak",
                            "note": None,
                        },
                    }
                ]
            }
        ),
        encoding="utf-8",
    )
    store = BibliographyStore()
    try:
        scraper.ingest_export(export.manifest_path, store, dedupe=True)
        results = scraper.apply_review_corrections(export.manifest_path, corrections_path, store)
        assert len(results) == 1
        assert results[0].applied is True
        entry = store.get_entry(results[0].citation_key)
        assert entry is not None
        assert entry["entry_type"] == "article"
        assert entry["journal"] == "Journal of Better Metadata"
        assert entry["doi"] == "10.1000/weak"
        assert entry["review_status"] == "reviewed"
        assert entry.get("note") is None
    finally:
        store.close()
def test_talkorigins_scraper_assigns_topics_when_ingesting(tmp_path: Path):
    """Scraping with ingest_store attaches topic metadata to stored entries."""
    base_url = "https://www.talkorigins.org/origins/biblio/"
    payloads = {
        base_url: INDEX_HTML,
        f"{base_url}abiogenesis.html": ABIOGENESIS_HTML,
    }
    scraper = TalkOriginsScraper(source_client=FakeSourceClient(payloads))
    store = BibliographyStore()
    try:
        export = scraper.scrape_to_directory(
            base_url=base_url,
            output_dir=tmp_path,
            limit_topics=1,
            ingest_store=store,
        )
        assert export.entry_count == 2
        stored = store.get_entry("smith1998first1")
        assert stored is not None
        first_topic = stored["topics"][0]
        assert first_topic["slug"] == "abiogenesis"
        assert first_topic["name"] == "Abiogenesis"
        assert store.list_topics()[0]["slug"] == "abiogenesis"
    finally:
        store.close()
def test_talkorigins_ingest_export_consolidates_duplicates_into_one_entry(tmp_path: Path):
    """Deduplicating ingest stores one canonical entry carrying both topics."""
    base_url = "https://www.talkorigins.org/origins/biblio/"
    scraper = TalkOriginsScraper(
        source_client=FakeSourceClient(
            {
                base_url: INDEX_HTML,
                f"{base_url}abiogenesis.html": ABIOGENESIS_HTML,
                f"{base_url}evolution.html": EVOLUTION_HTML,
            }
        )
    )
    export = scraper.scrape_to_directory(base_url=base_url, output_dir=tmp_path)
    Path(export.seed_sets[0].seed_bib).write_text(
        """
@article{dup1,
author = "Smith, Jane",
year = "1999",
title = "Duplicate Paper",
journal = "Journal A"
}
""",
        encoding="utf-8",
    )
    # dup2 has a DOI, so it should become the stored canonical entry.
    Path(export.seed_sets[1].seed_bib).write_text(
        """
@article{dup2,
author = "Smith, Jane",
year = "1999",
title = "Duplicate Paper",
journal = "Journal B",
doi = "10.1000/dup"
}
""",
        encoding="utf-8",
    )
    store = BibliographyStore()
    try:
        report = scraper.ingest_export(export.manifest_path, store)
        assert report.duplicate_cluster_count >= 1
        assert report.stored_entry_count == 1
        assert report.canonicalized_count >= 1
        entry = store.get_entry("dup2")
        assert entry is not None
        assert entry["doi"] == "10.1000/dup"
        # The canonical entry inherits topic links from both source topics.
        assert [topic["slug"] for topic in entry["topics"]] == ["abiogenesis", "evolution"]
    finally:
        store.close()
def test_talkorigins_ingest_export_avoids_canonical_key_collisions(tmp_path: Path):
    """Two distinct works sharing a citation key must both be stored, under unique keys."""
    base_url = "https://www.talkorigins.org/origins/biblio/"
    scraper = TalkOriginsScraper(
        source_client=FakeSourceClient(
            {
                base_url: INDEX_HTML,
                f"{base_url}abiogenesis.html": ABIOGENESIS_HTML,
                f"{base_url}evolution.html": EVOLUTION_HTML,
            }
        )
    )
    export = scraper.scrape_to_directory(base_url=base_url, output_dir=tmp_path)
    # Both seed bibs deliberately reuse the key "sharedkey" for different works.
    Path(export.seed_sets[0].seed_bib).write_text(
        """
@article{sharedkey,
author = "Smith, Jane",
year = "1999",
title = "First Paper",
journal = "Journal A"
}
""",
        encoding="utf-8",
    )
    Path(export.seed_sets[1].seed_bib).write_text(
        """
@article{sharedkey,
author = "Jones, Alex",
year = "2001",
title = "Second Paper",
journal = "Journal B"
}
""",
        encoding="utf-8",
    )
    store = BibliographyStore()
    try:
        report = scraper.ingest_export(export.manifest_path, store)
        assert report.stored_entry_count == 2
        entries = store.list_entries(limit=10)
        assert len(entries) == 2
        assert len({entry["citation_key"] for entry in entries}) == 2
    finally:
        store.close()
def test_load_batch_jobs_resolves_relative_seed_paths(tmp_path: Path):
    """Relative seed_bib paths in jobs.json resolve against the file's directory."""
    seed_bib = tmp_path / "seeds" / "topic.bib"
    seed_bib.parent.mkdir(parents=True)
    seed_bib.write_text("", encoding="utf-8")

    jobs_json = tmp_path / "jobs.json"
    jobs_json.write_text(
        """
{
"jobs": [
{"name": "relative-job", "seed_bib": "seeds/topic.bib", "topic": "Abiogenesis"}
]
}
""",
        encoding="utf-8",
    )

    loaded = load_batch_jobs(jobs_json)
    assert loaded[0]["seed_bib"] == str(seed_bib.resolve())