"""Tests for the OAI-PMH harvester and the ``harvest-oai`` CLI command.

Every test replaces ``harvester.source_client.get_xml`` with a stub that
serves canned XML, so no network access takes place.
"""

from contextlib import closing
from unittest.mock import patch
from xml.etree import ElementTree as ET

from citegeist import OaiPmhHarvester, parse_bibtex
from citegeist.cli import main
from citegeist.storage import BibliographyStore

# NOTE(review): the element markup of the fixtures below was reconstructed
# from the OAI-PMH 2.0, Dublin Core and MODS vocabularies; only the text
# values are certain.  Confirm element names, namespaces and attributes
# against the harvester's parser before relying on them.

OAI_XML = """<OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/">
  <ListRecords>
    <record>
      <header>
        <identifier>oai:example.edu:123</identifier>
      </header>
      <metadata>
        <oai_dc:dc xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/"
                   xmlns:dc="http://purl.org/dc/elements/1.1/">
          <dc:title>Thesis Metadata Harvesting</dc:title>
          <dc:creator>Doe, Jane</dc:creator>
          <dc:date>2023-05-01</dc:date>
          <dc:description>A dissertation about repository harvesting.</dc:description>
          <dc:identifier>https://example.edu/items/123</dc:identifier>
          <dc:publisher>Example University</dc:publisher>
          <dc:type>Text</dc:type>
          <dc:type>Dissertation</dc:type>
        </oai_dc:dc>
      </metadata>
    </record>
  </ListRecords>
</OAI-PMH>
"""

OAI_XML_PAGE_1 = """<OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/">
  <ListRecords>
    <record>
      <header>
        <identifier>oai:example.edu:123</identifier>
      </header>
      <metadata>
        <oai_dc:dc xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/"
                   xmlns:dc="http://purl.org/dc/elements/1.1/">
          <dc:title>First Harvested Thesis</dc:title>
          <dc:creator>Doe, Jane</dc:creator>
          <dc:date>2023-05-01</dc:date>
          <dc:type>Dissertation</dc:type>
        </oai_dc:dc>
      </metadata>
    </record>
    <resumptionToken>TOKEN123</resumptionToken>
  </ListRecords>
</OAI-PMH>
"""

OAI_XML_PAGE_2 = """<OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/">
  <ListRecords>
    <record>
      <header>
        <identifier>oai:example.edu:456</identifier>
      </header>
      <metadata>
        <oai_dc:dc xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/"
                   xmlns:dc="http://purl.org/dc/elements/1.1/">
          <dc:title>Second Harvested Thesis</dc:title>
          <dc:creator>Smith, John</dc:creator>
          <dc:date>2022-05-01</dc:date>
          <dc:type>Dissertation</dc:type>
        </oai_dc:dc>
      </metadata>
    </record>
  </ListRecords>
</OAI-PMH>
"""

OAI_IDENTIFY_XML = """<OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/">
  <Identify>
    <repositoryName>Example Repository</repositoryName>
    <baseURL>https://example.edu/oai</baseURL>
    <protocolVersion>2.0</protocolVersion>
    <adminEmail>repo@example.edu</adminEmail>
    <earliestDatestamp>2001-01-01</earliestDatestamp>
    <deletedRecord>persistent</deletedRecord>
    <granularity>YYYY-MM-DD</granularity>
  </Identify>
</OAI-PMH>
"""

OAI_LISTSETS_XML = """<OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/">
  <ListSets>
    <set>
      <setSpec>theses</setSpec>
      <setName>Theses and Dissertations</setName>
      <setDescription>
        <oai_dc:dc xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/"
                   xmlns:dc="http://purl.org/dc/elements/1.1/">
          <dc:description>This set contains graduate theses.</dc:description>
        </oai_dc:dc>
      </setDescription>
    </set>
  </ListSets>
</OAI-PMH>
"""

OAI_METADATA_FORMATS_XML = """<OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/">
  <ListMetadataFormats>
    <metadataFormat>
      <metadataPrefix>oai_dc</metadataPrefix>
      <schema>http://www.openarchives.org/OAI/2.0/oai_dc.xsd</schema>
      <metadataNamespace>http://www.openarchives.org/OAI/2.0/oai_dc/</metadataNamespace>
    </metadataFormat>
    <metadataFormat>
      <metadataPrefix>mods</metadataPrefix>
      <schema>http://www.loc.gov/standards/mods/v3/mods-3-7.xsd</schema>
      <metadataNamespace>http://www.loc.gov/mods/v3</metadataNamespace>
    </metadataFormat>
  </ListMetadataFormats>
</OAI-PMH>
"""

OAI_MODS_XML = """<OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/">
  <ListRecords>
    <record>
      <header>
        <identifier>oai:example.edu:mods123</identifier>
      </header>
      <metadata>
        <mods:mods xmlns:mods="http://www.loc.gov/mods/v3">
          <mods:titleInfo>
            <mods:title>MODS Thesis Title</mods:title>
          </mods:titleInfo>
          <mods:name>
            <mods:namePart type="family">Doe</mods:namePart>
            <mods:namePart type="given">Jane</mods:namePart>
            <mods:role>
              <mods:roleTerm>author</mods:roleTerm>
            </mods:role>
          </mods:name>
          <mods:originInfo>
            <mods:publisher>Example University</mods:publisher>
            <mods:dateIssued>2022</mods:dateIssued>
          </mods:originInfo>
          <mods:genre>dissertation</mods:genre>
          <mods:abstract>MODS abstract text.</mods:abstract>
          <mods:location>
            <mods:url>https://example.edu/mods123</mods:url>
          </mods:location>
        </mods:mods>
      </metadata>
    </record>
  </ListRecords>
</OAI-PMH>
"""

REPOSITORY_URL = "https://example.edu/oai"


def _serve_xml(harvester: OaiPmhHarvester, *documents: str) -> list[str]:
    """Stub ``harvester.source_client.get_xml`` with canned XML documents.

    A single document is parsed afresh and returned for every request.
    Several documents are returned one per request, in the order given; a
    request beyond the last document fails the test with the offending URL.

    Returns:
        A list that is appended to with each requested URL, in call order.

    Raises:
        ValueError: If no documents are supplied.
    """
    if not documents:
        raise ValueError("at least one XML document is required")

    requested_urls: list[str] = []

    def fake_get_xml(url: str) -> ET.Element:
        requested_urls.append(url)
        if len(documents) == 1:
            return ET.fromstring(documents[0])
        position = len(requested_urls) - 1
        if position >= len(documents):
            # Explicit failure instead of a bare StopIteration, which is easy
            # to misread and is rewritten to RuntimeError inside generators.
            raise AssertionError(f"unexpected extra request: {url}")
        return ET.fromstring(documents[position])

    harvester.source_client.get_xml = fake_get_xml  # type: ignore[method-assign]
    return requested_urls


def test_oai_harvester_maps_dublin_core_to_bibentry():
    """A Dublin Core record becomes a ``phdthesis`` entry carrying its OAI id."""
    harvester = OaiPmhHarvester()
    _serve_xml(harvester, OAI_XML)

    results = harvester.list_records(REPOSITORY_URL)

    assert len(results) == 1
    entry = results[0].entry
    assert entry.entry_type == "phdthesis"
    assert entry.fields["title"] == "Thesis Metadata Harvesting"
    assert entry.fields["author"] == "Doe, Jane"
    assert entry.fields["oai"] == "oai:example.edu:123"


def test_oai_harvester_follows_resumption_tokens():
    """Records from both pages are returned, in order, with distinct keys."""
    harvester = OaiPmhHarvester()
    _serve_xml(harvester, OAI_XML_PAGE_1, OAI_XML_PAGE_2)

    results = harvester.list_records(REPOSITORY_URL)

    assert [result.identifier for result in results] == [
        "oai:example.edu:123",
        "oai:example.edu:456",
    ]
    assert [result.entry.citation_key for result in results] == [
        "doe2023first1",
        "smith2022second2",
    ]


def test_oai_harvester_passes_date_filters():
    """``date_from`` / ``date_until`` appear as ``from`` / ``until`` in the URL."""
    harvester = OaiPmhHarvester()
    seen_urls = _serve_xml(harvester, OAI_XML)

    harvester.list_records(
        REPOSITORY_URL,
        date_from="2023-01-01",
        date_until="2023-12-31",
        limit=1,
    )

    assert "from=2023-01-01" in seen_urls[0]
    assert "until=2023-12-31" in seen_urls[0]


def test_oai_harvester_maps_mods_records():
    """A MODS record maps title, author, publisher and abstract fields."""
    harvester = OaiPmhHarvester()
    _serve_xml(harvester, OAI_MODS_XML)

    results = harvester.list_records(REPOSITORY_URL, metadata_prefix="mods")

    assert len(results) == 1
    entry = results[0].entry
    assert entry.entry_type == "phdthesis"
    assert entry.fields["title"] == "MODS Thesis Title"
    assert entry.fields["author"] == "Doe, Jane"
    assert entry.fields["publisher"] == "Example University"
    assert entry.fields["abstract"] == "MODS abstract text."


def test_oai_harvester_can_identify_repository_and_list_sets():
    """Identify, ListSets and ListMetadataFormats responses are all parsed."""
    harvester = OaiPmhHarvester()
    # Served in the same order as the three calls below.
    _serve_xml(
        harvester,
        OAI_IDENTIFY_XML,
        OAI_LISTSETS_XML,
        OAI_METADATA_FORMATS_XML,
    )

    identify = harvester.identify(REPOSITORY_URL)
    sets = harvester.list_sets(REPOSITORY_URL)
    formats = harvester.list_metadata_formats(REPOSITORY_URL)

    assert identify["repositoryName"] == "Example Repository"
    assert identify["granularity"] == "YYYY-MM-DD"
    assert sets[0].set_spec == "theses"
    assert sets[0].set_name == "Theses and Dissertations"
    assert "graduate theses" in sets[0].set_description
    assert [item.metadata_prefix for item in formats] == ["oai_dc", "mods"]


def test_harvest_oai_cli_ingests_records(tmp_path):
    """``harvest-oai`` exits 0 and stores the harvested entry in the database."""
    database = tmp_path / "library.sqlite3"

    # Build real harvest results first, then hand them to the CLI through a
    # patched ``list_records`` so the command itself never fetches anything.
    harvester = OaiPmhHarvester()
    _serve_xml(harvester, OAI_XML)
    harvested = harvester.list_records(REPOSITORY_URL)

    with patch("citegeist.cli.OaiPmhHarvester.list_records") as mocked_list:
        mocked_list.return_value = harvested
        exit_code = main(
            [
                "--db",
                str(database),
                "harvest-oai",
                REPOSITORY_URL,
                "--metadata-prefix",
                "oai_dc",
                "--from",
                "2023-01-01",
                "--until",
                "2023-12-31",
                "--limit",
                "5",
            ]
        )

    assert exit_code == 0

    # closing() calls store.close() even when an assertion fails.
    with closing(BibliographyStore(database)) as store:
        entry = store.list_entries(limit=10)[0]
        assert entry["citation_key"] == "doe2023thesis1"
        bibtex = store.get_entry_bibtex("doe2023thesis1")
        parsed = parse_bibtex(bibtex or "")
        assert parsed[0].fields["oai"] == "oai:example.edu:123"