import json
|
|
from pathlib import Path
|
|
|
|
from citegeist import (
|
|
available_extraction_backends,
|
|
check_extraction_comparison_summary,
|
|
compare_extraction_backends,
|
|
extract_references,
|
|
parse_bibtex,
|
|
register_extraction_backend,
|
|
summarize_extraction_comparison,
|
|
)
|
|
from citegeist.bibtex import BibEntry
|
|
from citegeist.cli import main
|
|
|
|
|
|
# Two numbered references (journal article + proceedings paper) used by most
# of the extraction and CLI tests below.
SAMPLE_REFERENCES = """
[1] Smith, Jane and Doe, Alex. 2024. Graph-first bibliography augmentation. Journal of Research Systems.
[2] Miller, Sam. 2023. Semantic search for research corpora. Proceedings of the Retrieval Workshop.
"""

# Unnumbered references: an APA-style journal citation and a plain book citation,
# separated by a blank line.
APA_AND_BOOK_REFERENCES = """
Brown, T., & Green, P. (2021). Retrieval methods for scholarly corpora. Journal of Information Retrieval.

Nguyen, An. Research Design for Literature Mapping. Example University Press, 2020.
"""

# The first reference wraps onto a continuation line; the extractor is expected
# to join the wrapped lines back into a single reference.
WRAPPED_REFERENCES = """
[1] Taylor, Ann. 2022. Multi-line reference extraction
for bibliography pipelines. Journal of Parsing Systems.
[2] Chen, Bo. 2021. Another entry. Proceedings of the Mining Workshop.
"""

# Richer reference styles: year suffix ("2024a"), DOI, volume/issue/pages,
# a PhD dissertation, a technical report, and a retrieved-from web page.
RICH_REFERENCES = """
[1] Smith, Jane. 2024a. Graph-first bibliography augmentation. Journal of Research Systems 12(3): 45-67. doi:10.1000/example-doi.
[2] Doe, Alex. 2019. Evolutionary archives. PhD dissertation, Example University.
[3] Chen, Bo. 2018. Field methods update. Technical Report No. TR-2018-05, Example Research Lab.
[4] Nguyen, An. 2022. Project page. Retrieved from https://example.org/project-page.
"""

# Shared fixture text loaded from tests/fixtures/extract_backend_fixture.txt
# (read once at import time; expected to yield five references).
FIXTURE_REFERENCES = Path(__file__).with_name("fixtures").joinpath("extract_backend_fixture.txt").read_text(encoding="utf-8")
|
|
|
|
|
|
def register_fixture_alt_backend() -> None:
    """Register the 'fixture-alt' backend, whose fixed output deliberately
    disagrees with the heuristic backend on several fields (author, volume,
    number, entry types, journal vs. booktitle, howpublished)."""

    # (entry_type, citation_key, fields) for each of the five fixture entries.
    entry_specs = [
        (
            "article",
            "smith2024graphfirst1",
            {
                "title": "Graph-first bibliography augmentation",
                "year": "2024a",
                "journal": "Journal of Research Systems",
                "pages": "45--67",
                "doi": "10.1000/example-doi",
            },
        ),
        (
            "article",
            "miller2023semantic2",
            {
                "title": "Semantic search for research corpora",
                "year": "2023",
                "journal": "Retrieval Workshop Journal",
            },
        ),
        (
            "phdthesis",
            "doe2019evolutionary3",
            {
                "title": "Evolutionary archives",
                "year": "2019",
                "school": "Example University",
            },
        ),
        (
            "techreport",
            "chen2018field4",
            {
                "title": "Field methods update",
                "year": "2018",
                "institution": "Example Research Lab",
                "number": "TR-2018-05",
            },
        ),
        (
            "misc",
            "nguyen2022project5",
            {
                "title": "Project page",
                "year": "2022",
                "url": "https://example.org/project-page",
            },
        ),
    ]

    class FixtureAltBackend:
        name = "fixture-alt"

        def extract_references(self, text: str) -> list[BibEntry]:
            # Fresh BibEntry objects (with copied field dicts) on every call.
            return [
                BibEntry(entry_type=kind, citation_key=key, fields=dict(fields))
                for kind, key, fields in entry_specs
            ]

    register_extraction_backend(FixtureAltBackend())
|
|
|
|
|
|
def test_extract_references_builds_draft_entries():
    """The default (heuristic) backend turns numbered references into drafts."""
    drafts = extract_references(SAMPLE_REFERENCES)

    keys = [draft.citation_key for draft in drafts]
    assert keys == ["smith2024graphfirst1", "miller2023semantic2"]

    first, second = drafts
    assert first.entry_type == "article"
    assert first.fields["journal"] == "Journal of Research Systems"
    assert "extracted_by = {heuristic}" in first.fields["note"]
    assert second.entry_type == "inproceedings"
    assert second.fields["booktitle"] == "Proceedings of the Retrieval Workshop"
|
|
|
|
|
|
def test_extract_cli_writes_bibtex(tmp_path):
    """`extract` reads a references file and writes a parseable BibTeX draft."""
    source = tmp_path / "references.txt"
    target = tmp_path / "draft.bib"
    source.write_text(SAMPLE_REFERENCES, encoding="utf-8")

    assert main(["extract", str(source), "--output", str(target)]) == 0

    by_key = {
        entry.citation_key: entry
        for entry in parse_bibtex(target.read_text(encoding="utf-8"))
    }
    assert by_key["smith2024graphfirst1"].fields["journal"] == "Journal of Research Systems"
    assert by_key["miller2023semantic2"].fields["booktitle"] == "Proceedings of the Retrieval Workshop"
|
|
|
|
|
|
def test_extract_references_supports_apa_and_book_styles():
    """APA journal citations and plain book citations are both recognized."""
    entries = extract_references(APA_AND_BOOK_REFERENCES)

    assert [entry.entry_type for entry in entries] == ["article", "book"]

    apa_article, book = entries
    assert apa_article.fields["journal"] == "Journal of Information Retrieval"
    assert apa_article.fields["author"] == "Brown, T., and Green, P"
    assert book.fields["publisher"] == "Example University Press"
    assert book.fields["title"] == "Research Design for Literature Mapping"
|
|
|
|
|
|
def test_extract_references_joins_wrapped_reference_lines():
    """Continuation lines are merged back into a single reference."""
    entries = extract_references(WRAPPED_REFERENCES)

    assert len(entries) == 2
    wrapped = entries[0]
    assert wrapped.fields["title"] == "Multi-line reference extraction for bibliography pipelines"
    assert wrapped.fields["journal"] == "Journal of Parsing Systems"
|
|
|
|
|
|
def test_extract_references_preserves_year_suffix_ids_and_pages():
    """Year suffixes, DOIs, and volume/issue/page details survive extraction."""
    article = extract_references(RICH_REFERENCES)[0]

    expected_fields = {
        "year": "2024a",
        "doi": "10.1000/example-doi",
        "url": "https://doi.org/10.1000/example-doi",
        "journal": "Journal of Research Systems",
        "volume": "12",
        "number": "3",
        "pages": "45--67",
    }
    for field, value in expected_fields.items():
        assert article.fields[field] == value
|
|
|
|
|
|
def test_extract_references_supports_thesis_report_and_web_entries():
    """Dissertations, technical reports, and web pages map to the right types."""
    entries = extract_references(RICH_REFERENCES)
    thesis, report, webpage = entries[1], entries[2], entries[3]

    assert thesis.entry_type == "phdthesis"
    assert thesis.fields["school"] == "Example University"

    assert report.entry_type == "techreport"
    assert report.fields["institution"] == "Example Research Lab"
    assert report.fields["number"] == "TR-2018-05"
    assert report.fields["type"] == "Technical Report"

    assert webpage.entry_type == "misc"
    assert webpage.fields["url"] == "https://example.org/project-page"
    assert webpage.fields["howpublished"] == "Retrieved from https://example.org/project-page"
|
|
|
|
|
|
def test_extract_references_supports_registered_backend():
    """A backend registered at runtime can be selected by its name."""

    class CannedBackend:
        name = "static-test"

        def extract_references(self, text: str) -> list[BibEntry]:
            entry = BibEntry(
                entry_type="misc",
                citation_key="static2024example1",
                fields={"title": text.strip(), "year": "2024"},
            )
            return [entry]

    register_extraction_backend(CannedBackend())

    results = extract_references("Custom backend input", backend="static-test")

    assert results[0].citation_key == "static2024example1"
    assert "static-test" in available_extraction_backends()
|
|
|
|
|
|
def test_extract_references_rejects_unknown_backend():
    """Requesting an unregistered backend raises a descriptive ValueError."""
    caught = None
    try:
        extract_references("anything", backend="missing-backend")
    except ValueError as exc:
        caught = exc
    if caught is None:
        raise AssertionError("expected ValueError for unknown backend")
    assert "Unknown extraction backend" in str(caught)
|
|
|
|
|
|
def test_extract_cli_accepts_backend_flag(tmp_path):
    """`extract --backend heuristic` selects the heuristic backend explicitly."""
    source = tmp_path / "references.txt"
    target = tmp_path / "draft.bib"
    source.write_text(SAMPLE_REFERENCES, encoding="utf-8")

    argv = ["extract", str(source), "--backend", "heuristic", "--output", str(target)]
    assert main(argv) == 0

    assert "@article{smith2024graphfirst1," in target.read_text(encoding="utf-8")
|
|
|
|
|
|
def test_extract_references_anystyle_backend_maps_json(monkeypatch):
    """The anystyle backend maps AnyStyle JSON output onto BibTeX fields."""
    import citegeist.extract as extract_module

    # Pretend the AnyStyle CLI binary is installed.
    monkeypatch.setattr(extract_module.shutil, "which", lambda command: "/usr/bin/anystyle")

    class FakeCompleted:
        returncode = 0
        stderr = ""
        stdout = """
[
  {
    "author": [{"family": "Smith", "given": "Jane"}],
    "date": ["2024"],
    "title": ["Graph-first bibliography augmentation"],
    "journal": ["Journal of Research Systems"],
    "volume": ["12"],
    "issue": ["3"],
    "pages": ["45-67"],
    "doi": ["10.1000/example-doi"],
    "type": "article"
  }
]
"""

    # Short-circuit subprocess.run so no external process is spawned.
    monkeypatch.setattr(extract_module.subprocess, "run", lambda *args, **kwargs: FakeCompleted())

    entries = extract_references(SAMPLE_REFERENCES, backend="anystyle")

    assert len(entries) == 1
    mapped = entries[0]
    assert mapped.entry_type == "article"
    assert mapped.fields["author"] == "Smith, Jane"
    assert mapped.fields["journal"] == "Journal of Research Systems"
    assert mapped.fields["pages"] == "45--67"
    assert mapped.fields["doi"] == "10.1000/example-doi"
    assert mapped.fields["url"] == "https://doi.org/10.1000/example-doi"
    assert "extracted_by = {anystyle}" in mapped.fields["note"]
    assert "raw_reference = {Smith, Jane and Doe, Alex. 2024. Graph-first bibliography augmentation. Journal of Research Systems.}" in mapped.fields["note"]
|
|
|
|
|
|
def test_extract_references_anystyle_backend_reports_missing_binary(monkeypatch):
    """A missing anystyle binary surfaces as a RuntimeError, not a crash."""
    import citegeist.extract as extract_module

    # Simulate the AnyStyle CLI not being on PATH.
    monkeypatch.setattr(extract_module.shutil, "which", lambda command: None)

    caught = None
    try:
        extract_references(SAMPLE_REFERENCES, backend="anystyle")
    except RuntimeError as exc:
        caught = exc
    if caught is None:
        raise AssertionError("expected RuntimeError when anystyle is unavailable")
    assert "requires the AnyStyle CLI" in str(caught)
|
|
|
|
|
|
def test_extract_references_grobid_backend_maps_bibtex(monkeypatch):
    """The grobid backend parses the service's BibTeX response and re-keys it."""
    import citegeist.extract as extract_module

    grobid_bibtex = b"""
@article{-1,
author = {Smith, Jane},
title = {Graph-first bibliography augmentation},
journal = {Journal of Research Systems},
year = {2024},
pages = {45--67},
volume = {12},
number = {3},
doi = {10.1000/example-doi}
}
"""

    class StubResponse:
        """Minimal context-manager stand-in for urlopen's response object."""

        def __enter__(self):
            return self

        def __exit__(self, *exc_info):
            return False

        def read(self) -> bytes:
            return grobid_bibtex

    monkeypatch.setattr(
        extract_module.urllib.request,
        "urlopen",
        lambda request, timeout=30: StubResponse(),
    )

    entries = extract_references(SAMPLE_REFERENCES, backend="grobid")

    assert len(entries) == 1
    mapped = entries[0]
    assert mapped.citation_key == "smith2024graphfirst1"
    assert mapped.fields["doi"] == "10.1000/example-doi"
    assert mapped.fields["url"] == "https://doi.org/10.1000/example-doi"
    assert "extracted_by = {grobid}" in mapped.fields["note"]
|
|
|
|
|
|
def test_extract_references_grobid_backend_reports_http_errors(monkeypatch):
    """HTTP failures from the GROBID service surface as RuntimeError."""
    import citegeist.extract as extract_module

    def fail_with_503(request, timeout=30):
        raise extract_module.urllib.error.HTTPError(
            url=request.full_url,
            code=503,
            msg="Busy",
            hdrs=None,
            fp=None,
        )

    monkeypatch.setattr(extract_module.urllib.request, "urlopen", fail_with_503)

    caught = None
    try:
        extract_references(SAMPLE_REFERENCES, backend="grobid")
    except RuntimeError as exc:
        caught = exc
    if caught is None:
        raise AssertionError("expected RuntimeError when grobid returns HTTP error")
    assert "GROBID extraction failed with HTTP 503" in str(caught)
|
|
|
|
|
|
def test_compare_extraction_backends_reports_field_differences():
    """Comparison rows flag the fields on which two backends disagree."""

    def make_backend(backend_name, entry_type, citation_key, fields):
        # Factory for a single-entry backend; fields are copied on each call.
        class _OneEntryBackend:
            name = backend_name

            def extract_references(self, text: str) -> list[BibEntry]:
                return [
                    BibEntry(
                        entry_type=entry_type,
                        citation_key=citation_key,
                        fields=dict(fields),
                    )
                ]

        return _OneEntryBackend()

    register_extraction_backend(
        make_backend(
            "compare-a",
            "article",
            "a1",
            {"title": "Shared Title", "year": "2024", "journal": "Journal A"},
        )
    )
    register_extraction_backend(
        make_backend(
            "compare-b",
            "inproceedings",
            "b1",
            {"title": "Shared Title", "year": "2024", "booktitle": "Proceedings B"},
        )
    )

    rows = compare_extraction_backends(SAMPLE_REFERENCES, backends=["compare-a", "compare-b"])
    first = rows[0]

    assert first.ordinal == 1
    for field in ("entry_type", "journal", "booktitle"):
        assert field in first.differing_fields
    assert first.entries["compare-a"]["fields"]["journal"] == "Journal A"
    assert first.entries["compare-b"]["fields"]["booktitle"] == "Proceedings B"
|
|
|
|
|
|
def test_compare_extract_cli_writes_json(tmp_path):
    """`compare-extract` writes a JSON report of per-reference comparisons."""
    source = tmp_path / "references.txt"
    report = tmp_path / "compare.json"
    source.write_text(SAMPLE_REFERENCES, encoding="utf-8")

    argv = ["compare-extract", str(source)]
    argv += ["--backend", "heuristic"]
    argv += ["--backend", "heuristic"]
    argv += ["--output", str(report)]

    assert main(argv) == 0

    payload = json.loads(report.read_text(encoding="utf-8"))
    first_row = payload[0]
    assert first_row["ordinal"] == 1
    assert first_row["entries"]["heuristic"]["present"] is True
|
|
|
|
|
|
def test_compare_extraction_backends_fixture_reports_expected_disagreement():
    """The fixture-alt backend disagrees with heuristic on known fields."""
    register_fixture_alt_backend()

    rows = compare_extraction_backends(FIXTURE_REFERENCES, backends=["heuristic", "fixture-alt"])

    assert len(rows) == 5
    for field in ("author", "volume", "number"):
        assert field in rows[0].differing_fields
    for field in ("entry_type", "journal", "booktitle"):
        assert field in rows[1].differing_fields
    assert "howpublished" in rows[4].differing_fields
|
|
|
|
|
|
def test_compare_extract_cli_fixture_json_contains_all_rows(tmp_path):
    """A single-backend comparison still emits one row per fixture reference."""
    source = tmp_path / "fixture.txt"
    report = tmp_path / "compare.json"
    source.write_text(FIXTURE_REFERENCES, encoding="utf-8")

    argv = ["compare-extract", str(source), "--backend", "heuristic", "--output", str(report)]
    assert main(argv) == 0

    payload = json.loads(report.read_text(encoding="utf-8"))
    assert len(payload) == 5
    assert payload[2]["entries"]["heuristic"]["entry_type"] == "phdthesis"
|
|
|
|
|
|
def test_summarize_extraction_comparison_counts_differences():
    """Comparing a backend against itself yields a difference-free summary."""
    rows = compare_extraction_backends(FIXTURE_REFERENCES, backends=["heuristic", "heuristic"])
    summary = summarize_extraction_comparison(rows)

    assert summary.row_count == 5
    assert summary.rows_with_differences == 0
    assert summary.backend_presence_counts["heuristic"] == 5
    assert summary.differing_field_counts == {}
|
|
|
|
|
|
def test_compare_extract_cli_summary_writes_counts(tmp_path):
    """`compare-extract --summary` writes aggregate counts instead of rows."""
    source = tmp_path / "fixture.txt"
    report = tmp_path / "summary.json"
    source.write_text(FIXTURE_REFERENCES, encoding="utf-8")

    argv = ["compare-extract", str(source)]
    argv += ["--backend", "heuristic"]
    argv += ["--backend", "heuristic"]
    argv += ["--summary"]
    argv += ["--output", str(report)]

    assert main(argv) == 0

    payload = json.loads(report.read_text(encoding="utf-8"))
    assert payload["row_count"] == 5
    assert payload["rows_with_differences"] == 0
    assert payload["backend_presence_counts"]["heuristic"] == 5
|
|
|
|
|
|
def test_check_extraction_comparison_summary_reports_failure():
    """A zero-difference threshold fails when the backends disagree."""
    register_fixture_alt_backend()
    rows = compare_extraction_backends(FIXTURE_REFERENCES, backends=["heuristic", "fixture-alt"])
    summary = summarize_extraction_comparison(rows)

    verdict = check_extraction_comparison_summary(summary, max_rows_with_differences=0)

    assert verdict.passed is False
    assert "rows_with_differences" in verdict.failures[0]
|
|
|
|
|
|
def test_compare_extract_cli_summary_threshold_passes(tmp_path):
    """Identical backends satisfy --max-rows-with-differences 0 (exit 0)."""
    source = tmp_path / "fixture.txt"
    report = tmp_path / "summary-pass.json"
    source.write_text(FIXTURE_REFERENCES, encoding="utf-8")

    argv = ["compare-extract", str(source)]
    argv += ["--backend", "heuristic"]
    argv += ["--backend", "heuristic"]
    argv += ["--summary"]
    argv += ["--max-rows-with-differences", "0"]
    argv += ["--output", str(report)]

    assert main(argv) == 0

    payload = json.loads(report.read_text(encoding="utf-8"))
    assert payload["check"]["passed"] is True
|
|
|
|
|
|
def test_compare_extract_cli_summary_threshold_fails(tmp_path):
    """Disagreeing backends breach the threshold: exit 1 and a failing check."""
    register_fixture_alt_backend()
    source = tmp_path / "fixture.txt"
    report = tmp_path / "summary-fail.json"
    source.write_text(FIXTURE_REFERENCES, encoding="utf-8")

    argv = ["compare-extract", str(source)]
    argv += ["--backend", "heuristic"]
    argv += ["--backend", "fixture-alt"]
    argv += ["--summary"]
    argv += ["--max-rows-with-differences", "0"]
    argv += ["--output", str(report)]

    assert main(argv) == 1

    payload = json.loads(report.read_text(encoding="utf-8"))
    assert payload["check"]["passed"] is False
    assert payload["summary"]["rows_with_differences"] > 0
|