# Tests for the citegeist TalkOrigins scraper example (offline fixtures only).
from __future__ import annotations
|
|
|
|
import json
|
|
from pathlib import Path
|
|
|
|
from citegeist.batch import load_batch_jobs
|
|
from citegeist.bibtex import BibEntry
|
|
from citegeist.examples.talkorigins import TalkOriginsScraper, normalize_topic_entries
|
|
from citegeist.storage import BibliographyStore
|
|
|
|
|
|
# Canned HTML pages served by FakeSourceClient so the tests never touch the
# network.

# Bibliography index: two topic links under the biblio directory plus one
# absolute "/origins/..." navigation link.  Tests below assert topic_count == 2,
# so presumably the scraper follows only the relative topic links — confirm
# against TalkOriginsScraper's link filtering.
INDEX_HTML = """
<html><body>
<a href="abiogenesis.html">Abiogenesis</a>
<a href="evolution.html">Evolution</a>
<a href="/origins/faqs.html">Browse</a>
</body></html>
"""

# Topic page with two references.  The second starts with "---", the
# TalkOrigins convention for "same author as the previous entry".
ABIOGENESIS_HTML = """
<html><body><pre>
Smith, J., 1998, First paper title: Journal of Origins, v. 10, p. 1-10.

---, 2001, Second paper title: Journal of Origins, v. 12, p. 20-30.
</pre></body></html>
"""

# Topic page with a single reference wrapped across physical lines; exercises
# the scraper's line re-joining.
EVOLUTION_HTML = """
<html><body><pre>
Jones, A., and Roe, B.,
2003, Wrapped title across lines:
Proceedings of the Example Conference, p. 40-55.
</pre></body></html>
"""
|
class FakeSourceClient:
    """Deterministic in-memory stand-in for the scraper's HTTP source client.

    Serves canned payloads keyed by absolute URL.  An unregistered URL raises
    ``KeyError``, so any unexpected fetch fails the test loudly.
    """

    def __init__(self, payloads: dict[str, str]) -> None:
        # Stored as-is (no copy) — tests construct a fresh dict per client.
        self.payloads = payloads

    def get_text(self, url: str) -> str:
        """Return the canned body registered for *url*."""
        body = self.payloads[url]
        return body
|
|
def test_normalize_topic_entries_carries_forward_repeated_authors():
    """A '---' author placeholder inherits the previous entry's author."""
    raw_block = """
Smith, J., 1998, First paper title: Journal of Origins.

---, 2001, Second paper title: Journal of Origins.
"""

    normalized = normalize_topic_entries(raw_block)

    assert normalized[1].startswith("Smith, J., 2001")
|
|
|
|
|
|
def test_talkorigins_scraper_writes_seed_bibs_and_jobs(tmp_path: Path):
    """Full scrape writes per-topic seed bibs, batch jobs, manifest, and site pages."""
    base_url = "https://www.talkorigins.org/origins/biblio/"
    scraper = TalkOriginsScraper(
        source_client=FakeSourceClient(
            {
                base_url: INDEX_HTML,
                f"{base_url}abiogenesis.html": ABIOGENESIS_HTML,
                f"{base_url}evolution.html": EVOLUTION_HTML,
            }
        )
    )

    export = scraper.scrape_to_directory(base_url=base_url, output_dir=tmp_path)

    # 2 topics, 3 references total across the two fixture pages.
    assert export.topic_count == 2
    assert export.entry_count == 3

    # Batch jobs file: one job per topic, named "talkorigins:<slug>".
    jobs = json.loads(Path(export.jobs_path).read_text(encoding="utf-8"))
    assert jobs["jobs"][0]["name"] == "talkorigins:abiogenesis"
    assert Path(jobs["jobs"][0]["seed_bib"]).exists()

    manifest = json.loads(Path(export.manifest_path).read_text(encoding="utf-8"))
    assert manifest["seed_sets"][0]["parsed_entry_count"] == 2

    # Spot-check the generated artifacts for the first topic and the
    # combined/full outputs.
    abiogenesis_bib = Path(export.seed_sets[0].seed_bib).read_text(encoding="utf-8")
    abiogenesis_plain = Path(export.seed_sets[0].plaintext_path).read_text(encoding="utf-8")
    abiogenesis_page = Path(export.seed_sets[0].page_path).read_text(encoding="utf-8")
    full_bib = Path(export.full_bib_path).read_text(encoding="utf-8")
    full_plain = Path(export.full_plaintext_path).read_text(encoding="utf-8")
    site_index = Path(export.site_index_path).read_text(encoding="utf-8")
    assert "@article{smith1998first1," in abiogenesis_bib
    assert 'author = "Smith, J"' in abiogenesis_bib
    # "---" continuation entry got the carried-forward author and its own key.
    assert "@article{smith2001second2," in abiogenesis_bib
    assert "Abiogenesis" in abiogenesis_plain
    assert "Show BibTeX" in abiogenesis_page
    assert "toggleBibtex" in abiogenesis_page
    assert "@article{smith1998first1," in full_bib
    assert "Evolution" in full_plain
    assert "Full BibTeX bibliography" in site_index
|
|
|
|
|
|
def test_talkorigins_parser_prefers_book_for_publisher_like_venues():
    """A venue reading as places + publisher should classify as @book."""
    scraper = TalkOriginsScraper(source_client=FakeSourceClient({}))
    reference = (
        "Rutten, M. G., 1971, The Origin of Life by Natural Causes: "
        "Amsterdam, London, New York, Elsevier."
    )

    entry = scraper.parse_reference_entry(reference, 1)

    assert entry is not None
    assert entry.entry_type == "book"
    assert entry.fields["publisher"] == "Amsterdam, London, New York, Elsevier"
|
|
|
|
|
|
def test_talkorigins_parser_promotes_edited_volume_chapter_to_incollection():
    """'..., in <editors>, eds., <volume>' references become @incollection."""
    scraper = TalkOriginsScraper(source_client=FakeSourceClient({}))
    reference = (
        "Carpenter, C. R., 1958, Territoriality: A Review of Concepts and "
        "Problems, in Roe, A., and Simpson, G. G., eds., Behavior and "
        "Evolution: New Haven, Yale University Press, p. 224-250."
    )

    entry = scraper.parse_reference_entry(reference, 1)

    assert entry is not None
    assert entry.entry_type == "incollection"
    assert entry.fields["title"] == "Territoriality: A Review of Concepts and Problems"
    assert entry.fields["editor"] == "Roe, A. and Simpson, G. G."
    assert entry.fields["booktitle"] == "Behavior and Evolution"
    assert "Yale University Press" in entry.fields["publisher"]
|
|
|
|
|
|
def test_talkorigins_scraper_resume_uses_saved_snapshot(tmp_path: Path):
    """A resumed scrape reuses the saved snapshot instead of re-parsing the page."""
    base_url = "https://www.talkorigins.org/origins/biblio/"
    scraper = TalkOriginsScraper(
        source_client=FakeSourceClient(
            {
                base_url: INDEX_HTML,
                f"{base_url}abiogenesis.html": ABIOGENESIS_HTML,
                f"{base_url}evolution.html": EVOLUTION_HTML,
            }
        )
    )

    first_export = scraper.scrape_to_directory(base_url=base_url, output_dir=tmp_path, limit_topics=1)
    snapshot_path = Path(first_export.seed_sets[0].snapshot_path)
    snapshot = json.loads(snapshot_path.read_text(encoding="utf-8"))
    assert snapshot["raw_entries"][0].startswith("Smith, J.")

    # Second run serves a broken topic page into the SAME output_dir; if resume
    # works, the scraper never needs the fresh page content.
    scraper_with_broken_page = TalkOriginsScraper(
        source_client=FakeSourceClient(
            {
                base_url: INDEX_HTML,
                f"{base_url}abiogenesis.html": "<html><body>broken</body></html>",
            }
        )
    )
    resumed_export = scraper_with_broken_page.scrape_to_directory(base_url=base_url, output_dir=tmp_path, limit_topics=1)

    # Both fixture entries survive the resume, proving the snapshot was used.
    assert resumed_export.entry_count == 2
|
|
|
|
|
|
def test_talkorigins_validation_reports_suspicious_entries(tmp_path: Path):
    """validate_export flags entries whose type/venue combination looks wrong."""
    base_url = "https://www.talkorigins.org/origins/biblio/"
    scraper = TalkOriginsScraper(
        source_client=FakeSourceClient(
            {
                base_url: INDEX_HTML,
                f"{base_url}abiogenesis.html": ABIOGENESIS_HTML,
            }
        )
    )

    export = scraper.scrape_to_directory(base_url=base_url, output_dir=tmp_path, limit_topics=1)
    seed_bib_path = Path(export.seed_sets[0].seed_bib)
    # Replace the scraped seed bib with an @article whose "journal" is a
    # publisher name ("Elsevier") — a misclassification validation must catch.
    seed_bib_path.write_text(
        """
@article{bad1,
  author = "Example, A",
  year = "1999",
  title = "Bad Venue Classification",
  journal = "Elsevier"
}
""",
        encoding="utf-8",
    )

    report = scraper.validate_export(export.manifest_path)

    assert report.topic_count == 1
    # entry_count is 2 even though the bib now holds one entry — presumably it
    # reflects the manifest's parsed_entry_count; confirm against validate_export.
    assert report.entry_count == 2
    assert report.suspicious_entry_type_count >= 1
    assert report.suspicious_examples[0]["citation_key"] == "bad1"
|
|
|
|
|
|
def test_talkorigins_validation_does_not_flag_legitimate_incollection(tmp_path: Path):
    """A well-formed @incollection with editor/booktitle must not be flagged."""
    base_url = "https://www.talkorigins.org/origins/biblio/"
    scraper = TalkOriginsScraper(
        source_client=FakeSourceClient(
            {
                base_url: INDEX_HTML,
                f"{base_url}abiogenesis.html": ABIOGENESIS_HTML,
            }
        )
    )

    export = scraper.scrape_to_directory(base_url=base_url, output_dir=tmp_path, limit_topics=1)
    seed_bib_path = Path(export.seed_sets[0].seed_bib)
    # Legitimate chapter entry: publisher contains a place list, which on its
    # own should not make the validator suspicious.
    seed_bib_path.write_text(
        """
@incollection{good1,
  author = "Example, A",
  editor = "Editor, E",
  year = "1999",
  title = "Good Chapter",
  booktitle = "Collected Essays",
  publisher = "New Haven, Yale University Press"
}
""",
        encoding="utf-8",
    )

    report = scraper.validate_export(export.manifest_path)

    assert all(item["citation_key"] != "good1" for item in report.suspicious_examples)
|
|
|
|
|
|
def test_talkorigins_validation_reports_duplicate_clusters(tmp_path: Path):
    """Same author/year/title across two topics is reported as a duplicate cluster."""
    base_url = "https://www.talkorigins.org/origins/biblio/"
    scraper = TalkOriginsScraper(
        source_client=FakeSourceClient(
            {
                base_url: INDEX_HTML,
                f"{base_url}abiogenesis.html": ABIOGENESIS_HTML,
                f"{base_url}evolution.html": EVOLUTION_HTML,
            }
        )
    )

    export = scraper.scrape_to_directory(base_url=base_url, output_dir=tmp_path)
    # Same paper in both topics' seed bibs, differing only in journal and key.
    Path(export.seed_sets[0].seed_bib).write_text(
        """
@article{dup1,
  author = "Smith, Jane",
  year = "1999",
  title = "Duplicate Paper",
  journal = "Journal A"
}
""",
        encoding="utf-8",
    )
    Path(export.seed_sets[1].seed_bib).write_text(
        """
@article{dup2,
  author = "Smith, Jane",
  year = "1999",
  title = "Duplicate Paper",
  journal = "Journal B"
}
""",
        encoding="utf-8",
    )

    report = scraper.validate_export(export.manifest_path)

    assert report.duplicate_cluster_count >= 1
    assert report.duplicate_entry_count >= 2
    assert report.duplicate_examples[0]["items"][0]["citation_key"] in {"dup1", "dup2"}
|
|
|
|
|
|
def test_talkorigins_can_suggest_topic_phrases(tmp_path: Path):
    """suggest_topic_phrases mines recurring title keywords per topic."""
    base_url = "https://www.talkorigins.org/origins/biblio/"
    scraper = TalkOriginsScraper(
        source_client=FakeSourceClient(
            {
                base_url: INDEX_HTML,
                f"{base_url}abiogenesis.html": ABIOGENESIS_HTML,
            }
        )
    )

    export = scraper.scrape_to_directory(base_url=base_url, output_dir=tmp_path, limit_topics=1)
    # Two titles sharing "prebiotic", "chemistry", "ribozyme" so keyword
    # extraction has a clear signal.
    Path(export.seed_sets[0].seed_bib).write_text(
        """
@article{bio1,
  author = "Smith, Jane",
  year = "1999",
  title = "Prebiotic chemistry and ribozyme catalysis",
  journal = "Origins"
}

@article{bio2,
  author = "Smith, Jane",
  year = "2001",
  title = "Ribozyme networks in prebiotic chemistry",
  journal = "Origins"
}
""",
        encoding="utf-8",
    )

    suggestions = scraper.suggest_topic_phrases(export.manifest_path)

    assert len(suggestions) == 1
    assert suggestions[0].slug == "abiogenesis"
    assert suggestions[0].suggested_phrase.startswith("Abiogenesis ")
    assert "chemistry" in suggestions[0].keywords
    assert "prebiotic" in suggestions[0].keywords
    # Only two entries: flagged for review as a small topic, but the keywords
    # themselves are clean.
    assert suggestions[0].review_required is True
    assert "small_topic" in (suggestions[0].review_reasons or [])
    assert "noisy_keywords" not in (suggestions[0].review_reasons or [])
|
|
|
|
|
|
def test_talkorigins_duplicate_inspection_filters_by_topic_and_match(tmp_path: Path):
    """inspect_duplicate_clusters honors topic_slug and match filters."""
    base_url = "https://www.talkorigins.org/origins/biblio/"
    scraper = TalkOriginsScraper(
        source_client=FakeSourceClient(
            {
                base_url: INDEX_HTML,
                f"{base_url}abiogenesis.html": ABIOGENESIS_HTML,
                f"{base_url}evolution.html": EVOLUTION_HTML,
            }
        )
    )

    export = scraper.scrape_to_directory(base_url=base_url, output_dir=tmp_path)
    # Topic 0: an intra-topic duplicate pair.  Topic 1: one unrelated entry
    # that must be filtered out by topic_slug="abiogenesis".
    Path(export.seed_sets[0].seed_bib).write_text(
        """
@article{dup1,
  author = "Smith, Jane",
  year = "1999",
  title = "Duplicate Paper",
  journal = "Journal A"
}

@article{dup2,
  author = "Smith, Jane",
  year = "1999",
  title = "Duplicate Paper",
  journal = "Journal B"
}
""",
        encoding="utf-8",
    )
    Path(export.seed_sets[1].seed_bib).write_text(
        """
@article{other1,
  author = "Jones, Alex",
  year = "2001",
  title = "Other Topic Paper",
  journal = "Journal C"
}
""",
        encoding="utf-8",
    )

    clusters = scraper.inspect_duplicate_clusters(
        export.manifest_path,
        topic_slug="abiogenesis",
        match="duplicate",
    )

    assert len(clusters) == 1
    assert clusters[0].count == 2
    assert all(item["topic_slug"] == "abiogenesis" for item in clusters[0].items)
|
|
|
|
|
|
def test_talkorigins_duplicate_inspection_can_preview_canonical_choice(tmp_path: Path):
    """preview_canonical picks the richer duplicate (here: the one with a DOI)."""
    base_url = "https://www.talkorigins.org/origins/biblio/"
    scraper = TalkOriginsScraper(
        source_client=FakeSourceClient(
            {
                base_url: INDEX_HTML,
                f"{base_url}abiogenesis.html": ABIOGENESIS_HTML,
                f"{base_url}evolution.html": EVOLUTION_HTML,
            }
        )
    )

    export = scraper.scrape_to_directory(base_url=base_url, output_dir=tmp_path)
    # dup1 lacks a DOI; dup2 carries one, making it the stronger canonical.
    Path(export.seed_sets[0].seed_bib).write_text(
        """
@article{dup1,
  author = "Smith, Jane",
  year = "1999",
  title = "Duplicate Paper",
  journal = "Journal A"
}
""",
        encoding="utf-8",
    )
    Path(export.seed_sets[1].seed_bib).write_text(
        """
@article{dup2,
  author = "Smith, Jane",
  year = "1999",
  title = "Duplicate Paper",
  journal = "Journal B",
  doi = "10.1000/dup"
}
""",
        encoding="utf-8",
    )

    clusters = scraper.inspect_duplicate_clusters(export.manifest_path, preview_canonical=True)

    assert len(clusters) == 1
    assert clusters[0].canonical is not None
    assert clusters[0].canonical["citation_key"] == "dup2"
    assert clusters[0].canonical["fields"]["doi"] == "10.1000/dup"
    assert clusters[0].canonical["weak_reasons"] == []
|
|
|
|
|
|
def test_talkorigins_duplicate_inspection_can_filter_to_weak_canonicals(tmp_path: Path):
    """weak_only keeps only clusters whose chosen canonical has weak metadata."""
    base_url = "https://www.talkorigins.org/origins/biblio/"
    scraper = TalkOriginsScraper(
        source_client=FakeSourceClient(
            {
                base_url: INDEX_HTML,
                f"{base_url}abiogenesis.html": ABIOGENESIS_HTML,
                f"{base_url}evolution.html": EVOLUTION_HTML,
            }
        )
    )

    export = scraper.scrape_to_directory(base_url=base_url, output_dir=tmp_path)
    # Topic 0: weak duplicate pair (@misc, no DOI) — should survive weak_only.
    Path(export.seed_sets[0].seed_bib).write_text(
        """
@misc{weak1,
  author = "Smith, Jane",
  year = "1999",
  title = "Weak Duplicate"
}

@misc{weak2,
  author = "Smith, Jane",
  year = "1999",
  title = "Weak Duplicate",
  note = "Copied from legacy source"
}
""",
        encoding="utf-8",
    )
    # Topic 1: strong duplicate pair whose canonical has a DOI — filtered out.
    Path(export.seed_sets[1].seed_bib).write_text(
        """
@article{strong1,
  author = "Jones, Alex",
  year = "2001",
  title = "Strong Duplicate",
  journal = "Journal B",
  doi = "10.1000/strong"
}

@article{strong2,
  author = "Jones, Alex",
  year = "2001",
  title = "Strong Duplicate",
  journal = "Journal B"
}
""",
        encoding="utf-8",
    )

    clusters = scraper.inspect_duplicate_clusters(
        export.manifest_path,
        preview_canonical=True,
        weak_only=True,
    )

    assert len(clusters) == 1
    assert clusters[0].canonical is not None
    assert clusters[0].canonical["citation_key"] == "weak2"
    assert "entry_type:misc" in clusters[0].canonical["weak_reasons"]
    assert "missing:doi" in clusters[0].canonical["weak_reasons"]
|
|
|
|
|
|
def test_talkorigins_enrich_weak_canonicals_can_preview_resolution(tmp_path: Path):
    """apply=False previews resolver metadata without persisting anything."""
    base_url = "https://www.talkorigins.org/origins/biblio/"
    scraper = TalkOriginsScraper(
        source_client=FakeSourceClient(
            {
                base_url: INDEX_HTML,
                f"{base_url}abiogenesis.html": ABIOGENESIS_HTML,
                f"{base_url}evolution.html": EVOLUTION_HTML,
            }
        )
    )

    export = scraper.scrape_to_directory(base_url=base_url, output_dir=tmp_path)
    # Weak duplicate pair (@misc, no DOI) — the lone weak canonical to enrich.
    Path(export.seed_sets[0].seed_bib).write_text(
        """
@misc{weak1,
  author = "Smith, Jane",
  year = "1999",
  title = "Weak Duplicate"
}

@misc{weak2,
  author = "Smith, Jane",
  year = "1999",
  title = "Weak Duplicate",
  note = "Copied from legacy source"
}
""",
        encoding="utf-8",
    )
    Path(export.seed_sets[1].seed_bib).write_text("", encoding="utf-8")

    from citegeist.resolve import Resolution

    # Stub the resolver so the test is offline and deterministic: echoes the
    # entry's author/title/year and adds a DOI + journal.
    scraper.resolver.resolve_entry = lambda entry: Resolution(  # type: ignore[method-assign]
        entry=BibEntry(
            entry_type="article",
            citation_key="resolved",
            fields={
                "author": entry.fields["author"],
                "title": entry.fields["title"],
                "year": entry.fields["year"],
                "doi": "10.1000/weak",
                "journal": "Journal of Better Metadata",
            },
        ),
        source_type="resolver",
        source_label="crossref:search:Weak Duplicate",
    )

    store = BibliographyStore()
    try:
        results = scraper.enrich_weak_canonicals(export.manifest_path, store, apply=False)
    finally:
        store.close()

    assert len(results) == 1
    assert results[0].resolved is True
    assert results[0].applied is False
    # With DOI + journal supplied, no weakness reasons remain.
    assert results[0].weak_reasons_after == []
|
|
|
|
|
|
def test_talkorigins_enrich_weak_canonicals_can_apply_to_store(tmp_path: Path):
    """apply=True writes the resolved metadata into the bibliography store."""
    base_url = "https://www.talkorigins.org/origins/biblio/"
    scraper = TalkOriginsScraper(
        source_client=FakeSourceClient(
            {
                base_url: INDEX_HTML,
                f"{base_url}abiogenesis.html": ABIOGENESIS_HTML,
                f"{base_url}evolution.html": EVOLUTION_HTML,
            }
        )
    )

    export = scraper.scrape_to_directory(base_url=base_url, output_dir=tmp_path)
    # One weak half per topic so the pair clusters across topics.
    Path(export.seed_sets[0].seed_bib).write_text(
        """
@misc{weak2,
  author = "Smith, Jane",
  year = "1999",
  title = "Weak Duplicate"
}
""",
        encoding="utf-8",
    )
    Path(export.seed_sets[1].seed_bib).write_text(
        """
@misc{weak1,
  author = "Smith, Jane",
  year = "1999",
  title = "Weak Duplicate",
  note = "Copied from legacy source"
}
""",
        encoding="utf-8",
    )

    from citegeist.resolve import Resolution

    # Offline resolver stub: echoes author/title/year, adds DOI + journal.
    scraper.resolver.resolve_entry = lambda entry: Resolution(  # type: ignore[method-assign]
        entry=BibEntry(
            entry_type="article",
            citation_key="resolved",
            fields={
                "author": entry.fields["author"],
                "title": entry.fields["title"],
                "year": entry.fields["year"],
                "doi": "10.1000/weak",
                "journal": "Journal of Better Metadata",
            },
        ),
        source_type="resolver",
        source_label="crossref:search:Weak Duplicate",
    )

    store = BibliographyStore()
    try:
        # dedupe=False keeps the raw entries so enrichment does the merging.
        scraper.ingest_export(export.manifest_path, store, dedupe=False)
        results = scraper.enrich_weak_canonicals(export.manifest_path, store, apply=True)

        assert len(results) == 1
        assert results[0].applied is True
        entry = store.get_entry(results[0].citation_key)
        assert entry is not None
        assert entry["doi"] == "10.1000/weak"
        assert entry["journal"] == "Journal of Better Metadata"
        assert entry["review_status"] == "enriched"
    finally:
        store.close()
|
|
|
|
|
|
def test_talkorigins_enrich_weak_canonicals_rejects_unsafe_search_match(tmp_path: Path):
    """A resolver search hit that doesn't match the entry must be rejected."""
    base_url = "https://www.talkorigins.org/origins/biblio/"
    scraper = TalkOriginsScraper(
        source_client=FakeSourceClient(
            {
                base_url: INDEX_HTML,
                f"{base_url}abiogenesis.html": ABIOGENESIS_HTML,
                f"{base_url}evolution.html": EVOLUTION_HTML,
            }
        )
    )

    export = scraper.scrape_to_directory(base_url=base_url, output_dir=tmp_path)
    Path(export.seed_sets[0].seed_bib).write_text(
        """
@misc{weak2,
  author = "Adams, D",
  year = "1987",
  title = "The bigger they are, the harder they fall"
}
""",
        encoding="utf-8",
    )
    Path(export.seed_sets[1].seed_bib).write_text(
        """
@misc{weak1,
  author = "Adams, D",
  year = "1987",
  title = "The bigger they are, the harder they fall",
  note = "Copied from legacy source"
}
""",
        encoding="utf-8",
    )

    from citegeist.resolve import Resolution

    # Resolver stub returns a completely unrelated work (different author,
    # title, and year) from a "search" source — the safety check should refuse
    # to merge it.
    scraper.resolver.resolve_entry = lambda entry: Resolution(  # type: ignore[method-assign]
        entry=BibEntry(
            entry_type="misc",
            citation_key="resolved",
            fields={
                "author": "Kulik, Dean",
                "title": "The Nexus Recursive Harmonic Framework: Reality as Unbounded, Observerless Computation (SILR / RHA / CST) Ver 2",
                "year": "2026",
                "doi": "10.9999/not-a-match",
            },
        ),
        source_type="resolver",
        source_label="datacite:search:The bigger they are, the harder they fall",
    )

    store = BibliographyStore()
    try:
        scraper.ingest_export(export.manifest_path, store, dedupe=False)
        results = scraper.enrich_weak_canonicals(export.manifest_path, store, apply=True)

        assert len(results) == 1
        assert results[0].resolved is False
        assert results[0].applied is False
        assert results[0].error == "unsafe resolver match"
        # Neither stored entry picked up the bogus DOI.
        entry = store.get_entry("weak2") or store.get_entry("weak1")
        assert entry is not None
        assert entry["doi"] is None
    finally:
        store.close()
|
|
|
|
|
|
def test_talkorigins_enrich_weak_canonicals_can_allow_unsafe_search_match(tmp_path: Path):
    """allow_unsafe_matches=True opts out of the resolver-match safety check."""
    base_url = "https://www.talkorigins.org/origins/biblio/"
    scraper = TalkOriginsScraper(
        source_client=FakeSourceClient(
            {
                base_url: INDEX_HTML,
                f"{base_url}abiogenesis.html": ABIOGENESIS_HTML,
                f"{base_url}evolution.html": EVOLUTION_HTML,
            }
        )
    )

    export = scraper.scrape_to_directory(base_url=base_url, output_dir=tmp_path)
    Path(export.seed_sets[0].seed_bib).write_text(
        """
@misc{weak2,
  author = "Adams, D",
  year = "1987",
  title = "The bigger they are, the harder they fall"
}
""",
        encoding="utf-8",
    )
    Path(export.seed_sets[1].seed_bib).write_text(
        """
@misc{weak1,
  author = "Adams, D",
  year = "1987",
  title = "The bigger they are, the harder they fall",
  note = "Copied from legacy source"
}
""",
        encoding="utf-8",
    )

    from citegeist.resolve import Resolution

    # Same obviously-mismatched resolver stub as the rejection test above;
    # here the caller explicitly accepts it.
    scraper.resolver.resolve_entry = lambda entry: Resolution(  # type: ignore[method-assign]
        entry=BibEntry(
            entry_type="misc",
            citation_key="resolved",
            fields={
                "author": "Kulik, Dean",
                "title": "The Nexus Recursive Harmonic Framework: Reality as Unbounded, Observerless Computation (SILR / RHA / CST) Ver 2",
                "year": "2026",
                "doi": "10.9999/not-a-match",
            },
        ),
        source_type="resolver",
        source_label="datacite:search:The bigger they are, the harder they fall",
    )

    store = BibliographyStore()
    try:
        scraper.ingest_export(export.manifest_path, store, dedupe=False)
        results = scraper.enrich_weak_canonicals(
            export.manifest_path,
            store,
            apply=True,
            allow_unsafe_matches=True,
        )

        assert len(results) == 1
        assert results[0].resolved is True
        assert results[0].applied is True
        entry = store.get_entry(results[0].citation_key)
        assert entry is not None
        assert entry["doi"] == "10.9999/not-a-match"
    finally:
        store.close()
|
|
|
|
|
|
def test_talkorigins_build_review_export_combines_cluster_and_enrichment(tmp_path: Path):
    """build_review_export pairs each weak cluster with an enrichment preview."""
    base_url = "https://www.talkorigins.org/origins/biblio/"
    scraper = TalkOriginsScraper(
        source_client=FakeSourceClient(
            {
                base_url: INDEX_HTML,
                f"{base_url}abiogenesis.html": ABIOGENESIS_HTML,
                f"{base_url}evolution.html": EVOLUTION_HTML,
            }
        )
    )

    export = scraper.scrape_to_directory(base_url=base_url, output_dir=tmp_path)
    # Single weak duplicate pair in topic 0; topic 1 left empty.
    Path(export.seed_sets[0].seed_bib).write_text(
        """
@misc{weak1,
  author = "Smith, Jane",
  year = "1999",
  title = "Weak Duplicate"
}

@misc{weak2,
  author = "Smith, Jane",
  year = "1999",
  title = "Weak Duplicate",
  note = "Copied from legacy source"
}
""",
        encoding="utf-8",
    )
    Path(export.seed_sets[1].seed_bib).write_text("", encoding="utf-8")

    from citegeist.resolve import Resolution

    # Offline resolver stub echoing author/title/year plus DOI + journal.
    scraper.resolver.resolve_entry = lambda entry: Resolution(  # type: ignore[method-assign]
        entry=BibEntry(
            entry_type="article",
            citation_key="resolved",
            fields={
                "author": entry.fields["author"],
                "title": entry.fields["title"],
                "year": entry.fields["year"],
                "doi": "10.1000/weak",
                "journal": "Journal of Better Metadata",
            },
        ),
        source_type="resolver",
        source_label="crossref:search:Weak Duplicate",
    )

    store = BibliographyStore()
    try:
        review = scraper.build_review_export(export.manifest_path, store)
    finally:
        store.close()

    assert review.item_count == 1
    assert review.items[0]["canonical"]["citation_key"] == "weak2"
    # Review export previews enrichment but does not apply it.
    assert review.items[0]["enrichment"]["resolved"] is True
    assert review.items[0]["enrichment"]["applied"] is False
|
|
|
|
|
|
def test_talkorigins_apply_review_corrections_updates_store_entry(tmp_path: Path):
    """Corrections keyed by "author|year|title" update the stored entry in place."""
    base_url = "https://www.talkorigins.org/origins/biblio/"
    scraper = TalkOriginsScraper(
        source_client=FakeSourceClient(
            {
                base_url: INDEX_HTML,
                f"{base_url}abiogenesis.html": ABIOGENESIS_HTML,
                f"{base_url}evolution.html": EVOLUTION_HTML,
            }
        )
    )

    export = scraper.scrape_to_directory(base_url=base_url, output_dir=tmp_path)
    Path(export.seed_sets[0].seed_bib).write_text(
        """
@misc{weak1,
  author = "Smith, Jane",
  year = "1999",
  title = "Weak Duplicate"
}

@misc{weak2,
  author = "Smith, Jane",
  year = "1999",
  title = "Weak Duplicate",
  note = "Copied from legacy source"
}
""",
        encoding="utf-8",
    )
    Path(export.seed_sets[1].seed_bib).write_text("", encoding="utf-8")

    corrections_path = tmp_path / "corrections.json"
    corrections_path.write_text(
        json.dumps(
            {
                "corrections": [
                    {
                        # Normalized "author|year|title" cluster key.
                        "key": "smith jane|1999|weak duplicate",
                        "entry_type": "article",
                        "review_status": "reviewed",
                        "fields": {
                            "journal": "Journal of Better Metadata",
                            "doi": "10.1000/weak",
                            # null clears the field (asserted below).
                            "note": None,
                        },
                    }
                ]
            }
        ),
        encoding="utf-8",
    )

    store = BibliographyStore()
    try:
        scraper.ingest_export(export.manifest_path, store, dedupe=True)
        results = scraper.apply_review_corrections(export.manifest_path, corrections_path, store)

        assert len(results) == 1
        assert results[0].applied is True
        entry = store.get_entry(results[0].citation_key)
        assert entry is not None
        assert entry["entry_type"] == "article"
        assert entry["journal"] == "Journal of Better Metadata"
        assert entry["doi"] == "10.1000/weak"
        assert entry["review_status"] == "reviewed"
        assert entry.get("note") is None
    finally:
        store.close()
|
|
|
|
|
|
def test_talkorigins_scraper_assigns_topics_when_ingesting(tmp_path: Path):
    """Scraping with ingest_store attaches topic slug/name to stored entries."""
    base_url = "https://www.talkorigins.org/origins/biblio/"
    scraper = TalkOriginsScraper(
        source_client=FakeSourceClient(
            {
                base_url: INDEX_HTML,
                f"{base_url}abiogenesis.html": ABIOGENESIS_HTML,
            }
        )
    )

    store = BibliographyStore()
    try:
        export = scraper.scrape_to_directory(
            base_url=base_url,
            output_dir=tmp_path,
            limit_topics=1,
            ingest_store=store,
        )

        assert export.entry_count == 2
        entry = store.get_entry("smith1998first1")
        assert entry is not None
        assert entry["topics"][0]["slug"] == "abiogenesis"
        assert entry["topics"][0]["name"] == "Abiogenesis"
        assert store.list_topics()[0]["slug"] == "abiogenesis"
    finally:
        store.close()
|
|
|
|
|
|
def test_talkorigins_ingest_export_consolidates_duplicates_into_one_entry(tmp_path: Path):
    """Default ingest merges cross-topic duplicates into one entry with both topics."""
    base_url = "https://www.talkorigins.org/origins/biblio/"
    scraper = TalkOriginsScraper(
        source_client=FakeSourceClient(
            {
                base_url: INDEX_HTML,
                f"{base_url}abiogenesis.html": ABIOGENESIS_HTML,
                f"{base_url}evolution.html": EVOLUTION_HTML,
            }
        )
    )

    export = scraper.scrape_to_directory(base_url=base_url, output_dir=tmp_path)
    # Same paper in both topics; dup2 additionally carries a DOI, making it
    # the expected canonical survivor.
    Path(export.seed_sets[0].seed_bib).write_text(
        """
@article{dup1,
  author = "Smith, Jane",
  year = "1999",
  title = "Duplicate Paper",
  journal = "Journal A"
}
""",
        encoding="utf-8",
    )
    Path(export.seed_sets[1].seed_bib).write_text(
        """
@article{dup2,
  author = "Smith, Jane",
  year = "1999",
  title = "Duplicate Paper",
  journal = "Journal B",
  doi = "10.1000/dup"
}
""",
        encoding="utf-8",
    )

    store = BibliographyStore()
    try:
        report = scraper.ingest_export(export.manifest_path, store)

        assert report.duplicate_cluster_count >= 1
        assert report.stored_entry_count == 1
        assert report.canonicalized_count >= 1
        entry = store.get_entry("dup2")
        assert entry is not None
        assert entry["doi"] == "10.1000/dup"
        # The merged entry keeps the topic membership of BOTH source topics.
        assert [topic["slug"] for topic in entry["topics"]] == ["abiogenesis", "evolution"]
    finally:
        store.close()
|
|
|
|
|
|
def test_talkorigins_ingest_export_avoids_canonical_key_collisions(tmp_path: Path):
    """Distinct papers that share a citation key must both survive ingest."""
    base_url = "https://www.talkorigins.org/origins/biblio/"
    scraper = TalkOriginsScraper(
        source_client=FakeSourceClient(
            {
                base_url: INDEX_HTML,
                f"{base_url}abiogenesis.html": ABIOGENESIS_HTML,
                f"{base_url}evolution.html": EVOLUTION_HTML,
            }
        )
    )

    export = scraper.scrape_to_directory(base_url=base_url, output_dir=tmp_path)
    # Two different papers deliberately given the SAME key "sharedkey".
    Path(export.seed_sets[0].seed_bib).write_text(
        """
@article{sharedkey,
  author = "Smith, Jane",
  year = "1999",
  title = "First Paper",
  journal = "Journal A"
}
""",
        encoding="utf-8",
    )
    Path(export.seed_sets[1].seed_bib).write_text(
        """
@article{sharedkey,
  author = "Jones, Alex",
  year = "2001",
  title = "Second Paper",
  journal = "Journal B"
}
""",
        encoding="utf-8",
    )

    store = BibliographyStore()
    try:
        report = scraper.ingest_export(export.manifest_path, store)

        assert report.stored_entry_count == 2
        entries = store.list_entries(limit=10)
        assert len(entries) == 2
        # Keys were de-conflicted rather than one entry overwriting the other.
        assert len({entry["citation_key"] for entry in entries}) == 2
    finally:
        store.close()
|
|
|
|
|
|
def test_load_batch_jobs_resolves_relative_seed_paths(tmp_path: Path):
    """Relative seed_bib paths resolve against the jobs file's directory."""
    seed_bib = tmp_path / "seeds" / "topic.bib"
    seed_bib.parent.mkdir(parents=True)
    seed_bib.write_text("", encoding="utf-8")

    jobs_spec = """
{
  "jobs": [
    {"name": "relative-job", "seed_bib": "seeds/topic.bib", "topic": "Abiogenesis"}
  ]
}
"""
    jobs_json = tmp_path / "jobs.json"
    jobs_json.write_text(jobs_spec, encoding="utf-8")

    loaded = load_batch_jobs(jobs_json)

    assert loaded[0]["seed_bib"] == str(seed_bib.resolve())
|