from citegeist import OaiPmhHarvester, parse_bibtex
from citegeist.cli import main
# Canned repository responses used by the tests in this module. Each test
# parses one or more of these with ElementTree.fromstring and returns the
# result from a stubbed ``source_client.get_xml``.
#
# NOTE(review): as visible here the payloads hold only text values and no XML
# element markup, although the tests feed them to ElementTree.fromstring --
# confirm the fixtures are well-formed XML in the checked-in file.

# Single thesis record; used by the Dublin Core mapping, date-filter and CLI
# ingestion tests.
OAI_XML = """
Thesis Metadata Harvesting
Doe, Jane
2023-05-01
A dissertation about repository harvesting.
https://example.edu/items/123
Example University
Text
Dissertation
"""
# First of two payloads returned in sequence by the resumption-token test.
# Presumably "TOKEN123" is the resumption token pointing at the next page --
# verify against the original markup.
OAI_XML_PAGE_1 = """
First Harvested Thesis
Doe, Jane
2023-05-01
Dissertation
TOKEN123
"""
# Second payload returned by the resumption-token test.
OAI_XML_PAGE_2 = """
Second Harvested Thesis
Smith, John
2022-05-01
Dissertation
"""
# Payload returned for ``harvester.identify``; the test reads the
# "repositoryName" and "granularity" values from the result.
OAI_IDENTIFY_XML = """
Example Repository
https://example.edu/oai
2.0
repo@example.edu
2001-01-01
persistent
YYYY-MM-DD
"""
# Payload returned for ``harvester.list_sets``; the test checks the spec,
# name and description of the first set.
OAI_LISTSETS_XML = """
theses
Theses and Dissertations
This set contains graduate theses.
"""
# Payload returned for ``harvester.list_metadata_formats``; the test expects
# the prefixes "oai_dc" and "mods", in that order.
OAI_METADATA_FORMATS_XML = """
oai_dc
http://www.openarchives.org/OAI/2.0/oai_dc.xsd
http://www.openarchives.org/OAI/2.0/oai_dc/
mods
http://www.loc.gov/standards/mods/v3/mods-3-7.xsd
http://www.loc.gov/mods/v3
"""
# Record payload used when listing records with ``metadata_prefix="mods"``.
OAI_MODS_XML = """
MODS Thesis Title
Doe
Jane
author
Example University
2022
dissertation
MODS abstract text.
https://example.edu/mods123
"""
def test_oai_harvester_maps_dublin_core_to_bibentry():
    """Check that the ``OAI_XML`` payload is mapped onto one thesis entry.

    ``source_client.get_xml`` is replaced with a stub that returns the parsed
    ``OAI_XML`` fixture for any URL. The test then asserts the entry type,
    title, author and OAI identifier of the single harvested record.
    """
    # Use the same local ElementTree import as the other tests in this module
    # rather than reaching the parser through ``__import__``.
    from xml.etree import ElementTree as ET

    harvester = OaiPmhHarvester()
    # The payload is parsed on every call, so each request gets a fresh tree.
    harvester.source_client.get_xml = lambda _url: ET.fromstring(OAI_XML)  # type: ignore[method-assign]

    results = harvester.list_records("https://example.edu/oai")

    assert len(results) == 1
    entry = results[0].entry
    assert entry.entry_type == "phdthesis"
    assert entry.fields["title"] == "Thesis Metadata Harvesting"
    assert entry.fields["author"] == "Doe, Jane"
    assert entry.fields["oai"] == "oai:example.edu:123"
def test_oai_harvester_follows_resumption_tokens():
    """Check that ``list_records`` collects records from two successive payloads.

    The stubbed ``get_xml`` hands out ``OAI_XML_PAGE_1`` and then
    ``OAI_XML_PAGE_2``; the harvested identifiers and citation keys must
    cover both, in that order.
    """
    from xml.etree import ElementTree as ET

    harvester = OaiPmhHarvester()
    pages = iter([ET.fromstring(OAI_XML_PAGE_1), ET.fromstring(OAI_XML_PAGE_2)])

    def serve_next_page(_url):
        # Each request consumes the next canned page, whatever URL is asked for.
        return next(pages)

    harvester.source_client.get_xml = serve_next_page  # type: ignore[method-assign]

    records = harvester.list_records("https://example.edu/oai")

    identifiers = [record.identifier for record in records]
    assert identifiers == ["oai:example.edu:123", "oai:example.edu:456"]

    citation_keys = [record.entry.citation_key for record in records]
    assert citation_keys == ["doe2023first1", "smith2022second2"]
def test_oai_harvester_passes_date_filters():
    """Check that ``date_from``/``date_until`` appear in the first requested URL.

    The stubbed ``get_xml`` records every URL it is asked for and answers with
    the parsed ``OAI_XML`` fixture.
    """
    from xml.etree import ElementTree as ET

    harvester = OaiPmhHarvester()
    requested: list[str] = []

    def recording_get_xml(url: str):
        # Remember the URL so the query string can be inspected afterwards.
        requested.append(url)
        return ET.fromstring(OAI_XML)

    harvester.source_client.get_xml = recording_get_xml  # type: ignore[method-assign]

    harvester.list_records(
        "https://example.edu/oai",
        date_from="2023-01-01",
        date_until="2023-12-31",
        limit=1,
    )

    first_request = requested[0]
    assert "from=2023-01-01" in first_request
    assert "until=2023-12-31" in first_request
def test_oai_harvester_maps_mods_records():
    """Check the entry built from ``OAI_MODS_XML`` with ``metadata_prefix="mods"``.

    Exactly one record is expected, typed as a PhD thesis, with the title,
    author, publisher and abstract asserted below.
    """
    from xml.etree import ElementTree as ET

    harvester = OaiPmhHarvester()

    def serve_mods(_url):
        return ET.fromstring(OAI_MODS_XML)

    harvester.source_client.get_xml = serve_mods  # type: ignore[method-assign]

    records = harvester.list_records("https://example.edu/oai", metadata_prefix="mods")
    assert len(records) == 1

    entry = records[0].entry
    assert entry.entry_type == "phdthesis"

    # Checked in the same order as listed here.
    expected_fields = {
        "title": "MODS Thesis Title",
        "author": "Doe, Jane",
        "publisher": "Example University",
        "abstract": "MODS abstract text.",
    }
    for field_name, expected_value in expected_fields.items():
        assert entry.fields[field_name] == expected_value
def test_oai_harvester_can_identify_repository_and_list_sets():
    """Check ``identify``, ``list_sets`` and ``list_metadata_formats``.

    The stubbed ``get_xml`` returns the identify, list-sets and
    metadata-formats fixtures in that order, so the three harvester calls
    below must be made in the same order.
    """
    from xml.etree import ElementTree as ET

    harvester = OaiPmhHarvester()
    # Parse everything up front; the stub then hands the trees out one by one.
    responses = iter(
        [
            ET.fromstring(document)
            for document in (OAI_IDENTIFY_XML, OAI_LISTSETS_XML, OAI_METADATA_FORMATS_XML)
        ]
    )

    def serve_next_response(_url):
        return next(responses)

    harvester.source_client.get_xml = serve_next_response  # type: ignore[method-assign]

    identify = harvester.identify("https://example.edu/oai")
    sets = harvester.list_sets("https://example.edu/oai")
    formats = harvester.list_metadata_formats("https://example.edu/oai")

    assert identify["repositoryName"] == "Example Repository"
    assert identify["granularity"] == "YYYY-MM-DD"

    first_set = sets[0]
    assert first_set.set_spec == "theses"
    assert first_set.set_name == "Theses and Dissertations"
    assert "graduate theses" in first_set.set_description

    prefixes = [fmt.metadata_prefix for fmt in formats]
    assert prefixes == ["oai_dc", "mods"]
def test_harvest_oai_cli_ingests_records(tmp_path):
    """Check that the ``harvest-oai`` CLI command stores harvested records.

    Records are first harvested from the ``OAI_XML`` fixture through a stubbed
    ``get_xml``. ``citegeist.cli.OaiPmhHarvester.list_records`` is then patched
    to return them while ``main`` runs. Afterwards the database file passed
    via ``--db`` is opened, and the stored citation key and the ``oai`` field
    of the exported BibTeX are asserted.
    """
    from unittest.mock import patch
    from xml.etree import ElementTree as ET

    database = tmp_path / "library.sqlite3"

    harvester = OaiPmhHarvester()

    def serve_fixture(_url):
        return ET.fromstring(OAI_XML)

    harvester.source_client.get_xml = serve_fixture  # type: ignore[method-assign]
    harvested = harvester.list_records("https://example.edu/oai")

    cli_args = [
        "--db",
        str(database),
        "harvest-oai",
        "https://example.edu/oai",
        "--metadata-prefix",
        "oai_dc",
        "--from",
        "2023-01-01",
        "--until",
        "2023-12-31",
        "--limit",
        "5",
    ]
    # The patched method returns the pre-harvested records while the CLI runs.
    with patch("citegeist.cli.OaiPmhHarvester.list_records", return_value=harvested):
        exit_code = main(cli_args)
    assert exit_code == 0

    from citegeist.storage import BibliographyStore

    store = BibliographyStore(database)
    try:
        first_row = store.list_entries(limit=10)[0]
        assert first_row["citation_key"] == "doe2023thesis1"

        exported = store.get_entry_bibtex("doe2023thesis1")
        # get_entry_bibtex may yield a falsy value; parse an empty string then.
        parsed_entries = parse_bibtex(exported or "")
        assert parsed_entries[0].fields["oai"] == "oai:example.edu:123"
    finally:
        store.close()