From 0354d6de891984d7480ce4a38db0e83019bbb385 Mon Sep 17 00:00:00 2001 From: welsberr Date: Fri, 20 Mar 2026 20:22:05 -0400 Subject: [PATCH] Tighten discovered-work admission --- README.md | 4 +- examples/cli/README.md | 4 +- src/citegeist/expand.py | 185 ++++++++++++++++++++++++++--- src/citegeist/resolve.py | 43 ++++++- src/citegeist/storage.py | 34 ++++++ tests/test_expand.py | 22 ++++ tests/test_openalex_expand.py | 217 +++++++++++++++++++++++++++++++++- tests/test_resolve.py | 2 +- 8 files changed, 482 insertions(+), 29 deletions(-) diff --git a/README.md b/README.md index c378f17..1de6a3d 100644 --- a/README.md +++ b/README.md @@ -159,7 +159,9 @@ Broad BibTeX exports skip DOI-only placeholder records such as `Referenced work Long-running CLI commands report progress on `stderr` so `stdout` remains clean for JSON, BibTeX, or tabular output. -Crossref reference expansion is intentionally conservative about weak discoveries. If a cited reference has no DOI and Crossref only exposes it as an unstructured citation blob, `expand --source crossref`, `expand-topic --source crossref`, and bootstrap flows now skip materializing that record unless the fallback metadata looks like a cleaner non-`misc` work such as conference proceedings. This reduces junk `@misc` entries whose `title` field is really a pasted citation string. +Crossref reference expansion is intentionally conservative about weak discoveries. If a cited reference has no DOI and Crossref only exposes it as an unstructured citation blob, `expand --source crossref`, `expand-topic --source crossref`, and bootstrap flows now skip materializing that record unless the fallback metadata looks like a cleaner non-`misc` work such as conference proceedings. When Crossref does expose thesis or dissertation references only as unstructured text, citegeist now also tries to extract the actual work title instead of keeping the entire ProQuest-style citation blob in the `title` field. 
This reduces junk `@misc` entries whose `title` field is really a pasted citation string and yields cleaner `@phdthesis` fallbacks. + +OpenAlex expansion is also conservative about noisy secondary records. Discoveries now prefer DOI-based citation keys when a DOI is present, which reduces parallel `doi...` and `openalex...` entries for the same work. OpenAlex abstract imports are sanitized to drop obvious webpage/export blobs, and DOI-less records are filtered when they look like venue-title stubs, generic container records, or weak review-like shadows of an already present book, chapter, proceedings paper, or dissertation with the same title. Preview mode uses the same admission rules as write-enabled expansion, so rejected candidates disappear before you commit them to the store. For live-source development, prefer fixture-backed or cache-backed source clients so resolver and expansion work can be exercised repeatedly without re-hitting upstream APIs on every run. diff --git a/examples/cli/README.md b/examples/cli/README.md index 3590a6e..67edab1 100644 --- a/examples/cli/README.md +++ b/examples/cli/README.md @@ -211,7 +211,9 @@ Re-enrich all current `@misc` entries with DOIs: .venv/bin/python -m citegeist --db library.sqlite3 resolve-stubs --doi-only --all-misc --limit 25 ``` -When Crossref expansion only yields an unstructured citation blob without a DOI, citegeist now skips materializing that discovery instead of storing it as a weak `@misc` entry. Cleaner fallback cases that infer a more specific type, such as proceedings-like titles, are still admitted. +When Crossref expansion only yields an unstructured citation blob without a DOI, citegeist now skips materializing that discovery instead of storing it as a weak `@misc` entry. Cleaner fallback cases that infer a more specific type, such as proceedings-like titles, are still admitted. 
Thesis and dissertation citation blobs are also normalized more aggressively so fallback `@phdthesis` entries keep the work title instead of the entire ProQuest-style citation string. + +OpenAlex expansion now applies the same kind of admission control before writing or previewing discoveries. DOI-backed discoveries prefer DOI-based citation keys, noisy webpage/export abstracts are dropped, generic venue-title stubs are rejected, and weak DOI-less article records that merely shadow an existing book/chapter/proceedings-paper/dissertation title in your store are suppressed instead of being materialized as parallel duplicates. ## Explore Citation Graphs diff --git a/src/citegeist/expand.py b/src/citegeist/expand.py index c91acff..2a62019 100644 --- a/src/citegeist/expand.py +++ b/src/citegeist/expand.py @@ -1,5 +1,6 @@ from __future__ import annotations +import html import re from dataclasses import dataclass from urllib.parse import quote, urlencode @@ -157,9 +158,15 @@ class OpenAlexExpander: results: list[ExpansionResult] = [] for work in works: + if _skip_openalex_work(work): + continue discovered = _openalex_work_to_entry(work) + existing_key = _existing_entry_key_for_discovered_work(store, discovered) + if existing_key is None and _skip_openalex_review_like_duplicate(store, discovered): + continue + target_key = existing_key or discovered.citation_key created = False - if store.get_entry(discovered.citation_key) is None: + if existing_key is None and store.get_entry(discovered.citation_key) is None: store.upsert_entry( discovered, raw_bibtex=None, @@ -172,9 +179,8 @@ class OpenAlexExpander: if relation_type == "cites": source_key = citation_key - target_key = discovered.citation_key else: - source_key = discovered.citation_key + source_key = target_key target_key = citation_key store.add_relation( @@ -188,7 +194,7 @@ class OpenAlexExpander: results.append( ExpansionResult( source_citation_key=source_key, - discovered_citation_key=discovered.citation_key, + 
discovered_citation_key=target_key, created_entry=created, relation_type=relation_type, source_label=f"openalex:{relation_type}:{openalex_id}", @@ -385,14 +391,20 @@ class TopicExpander: works = payload.get("results", []) rows: list[tuple[ExpansionResult, dict[str, object]]] = [] for work in works: + if _skip_openalex_work(work): + continue discovered = _openalex_work_to_entry(work) - source_key = citation_key if relation_type == "cites" else discovered.citation_key + existing_key = _existing_entry_key_for_discovered_work(store, discovered) + if existing_key is None and _skip_openalex_review_like_duplicate(store, discovered): + continue + target_key = existing_key or discovered.citation_key + source_key = citation_key if relation_type == "cites" else target_key rows.append( ( ExpansionResult( source_citation_key=source_key, - discovered_citation_key=discovered.citation_key, - created_entry=store.get_entry(discovered.citation_key) is None, + discovered_citation_key=target_key, + created_entry=existing_key is None and store.get_entry(discovered.citation_key) is None, relation_type=relation_type, source_label=f"openalex:{relation_type}:{openalex_id}", ), @@ -403,13 +415,7 @@ class TopicExpander: def _crossref_reference_to_entry(reference: dict, source_citation_key: str, ordinal: int) -> BibEntry: - title = ( - reference.get("article-title") - or reference.get("volume-title") - or reference.get("journal-title") - or reference.get("unstructured") - or f"Referenced work {ordinal}" - ) + title = _crossref_reference_title(reference, ordinal) year = str(reference.get("year") or "") author = reference.get("author") or "" doi = reference.get("DOI") or "" @@ -434,6 +440,42 @@ def _crossref_reference_to_entry(reference: dict, source_citation_key: str, ordi return BibEntry(entry_type=entry_type, citation_key=citation_key, fields=fields) +def _crossref_reference_title(reference: dict, ordinal: int) -> str: + raw_title = ( + reference.get("article-title") + or 
reference.get("volume-title") + or reference.get("journal-title") + or _extract_crossref_unstructured_title(str(reference.get("unstructured") or "")) + or f"Referenced work {ordinal}" + ) + return _normalize_text(raw_title) + + +def _extract_crossref_unstructured_title(text: str) -> str: + normalized = _normalize_text(text) + if not normalized: + return "" + + thesis_markers = ( + "(Master", + "(Doctoral", + "PhD dissertation", + "Master's thesis", + "Master’s thesis", + "Doctoral dissertation", + ) + for marker in thesis_markers: + if marker in normalized: + normalized = normalized.split(marker, 1)[0].strip(" .") + break + for marker in (" ProQuest", " UMI No.", " Dissertation Abstracts", " University Microfilms"): + if marker in normalized: + normalized = normalized.split(marker, 1)[0].strip(" .") + if any(marker in text for marker in thesis_markers) and ". " in normalized: + normalized = normalized.split(". ", 1)[1].strip() + return normalized.strip(" .") + + def _skip_crossref_reference(reference: dict, entry: BibEntry) -> bool: if reference.get("DOI"): return False @@ -485,7 +527,8 @@ def _reference_citation_key(reference: dict, title: str, year: str, ordinal: int def _normalize_text(value: str) -> str: - return " ".join(value.split()) + without_tags = re.sub(r"<[^>]+>", "", html.unescape(value)) + return " ".join(without_tags.split()) def _crossref_reference_entry_type(reference: dict, title: str, journal_title: str) -> str: @@ -635,14 +678,16 @@ def _openalex_work_to_entry(work: dict) -> BibEntry: if openalex_id: fields["openalex"] = openalex_id if abstract := work.get("abstract_inverted_index"): - fields["abstract"] = _openalex_abstract_text(abstract) + abstract_text = _openalex_abstract_text(abstract) + if abstract_text: + fields["abstract"] = abstract_text if source: if work_type == "article": fields["journal"] = source else: fields["booktitle"] = source - citation_key = _openalex_citation_key(openalex_id, authors, year, title) + citation_key = 
_openalex_citation_key(doi, openalex_id, authors, year, title) entry_type = _openalex_type_to_bibtype(work_type) return BibEntry(entry_type=entry_type, citation_key=citation_key, fields=fields) @@ -658,7 +703,8 @@ def _openalex_abstract_text(inverted_index: dict) -> str: for word, indexes in inverted_index.items(): for index in indexes: positions[int(index)] = word - return " ".join(word for _, word in sorted(positions.items())) + text = _normalize_text(" ".join(word for _, word in sorted(positions.items()))) + return "" if _looks_like_openalex_page_blob(text) else text def _openalex_type_to_bibtype(work_type: str) -> str: @@ -672,7 +718,10 @@ def _openalex_type_to_bibtype(work_type: str) -> str: return mapping.get(work_type, "misc") -def _openalex_citation_key(openalex_id: str, authors: str, year: str, title: str) -> str: +def _openalex_citation_key(doi: str, openalex_id: str, authors: str, year: str, title: str) -> str: + if doi: + suffix = re.sub(r"[^A-Za-z0-9]+", "", doi).lower() + return f"doi{suffix}" if openalex_id: return f"openalex{re.sub(r'[^A-Za-z0-9]+', '', openalex_id).lower()}" author = authors.split(" and ")[0] if authors else "ref" @@ -681,6 +730,104 @@ def _openalex_citation_key(openalex_id: str, authors: str, year: str, title: str return f"{family}{year or 'nd'}{first_word}" +def _looks_like_openalex_page_blob(text: str) -> bool: + lowered = text.casefold() + blob_markers = ( + "research article|", + "download citation file", + "this content is only available via pdf", + "get citation alerts", + "views icon", + "toolbar search", + "publisher site get access", + "authors info & claims", + "publication history", + "copyright ", + ) + return len(text) > 60 and any(marker in lowered for marker in blob_markers) + + +def _skip_openalex_work(work: dict) -> bool: + title = _normalize_text(str(work.get("display_name", "") or "")) + if not title or title.casefold() == "untitled work": + return True + + work_type = str(work.get("type", "") or "") + doi = 
_normalize_openalex_doi(work.get("doi")) + source = _normalize_text(str(((work.get("primary_location") or {}).get("source") or {}).get("display_name", "") or "")) + abstract = _openalex_abstract_text(work.get("abstract_inverted_index") or {}) if work.get("abstract_inverted_index") else "" + + if not doi and _looks_like_container_title(title, source): + return True + if not doi and not abstract and _looks_like_generic_reference_title(title, work_type): + return True + return False + + +def _looks_like_container_title(title: str, source: str) -> bool: + if not title or not source: + return False + normalized_title = re.sub(r"[^a-z0-9]+", "", title.casefold()) + normalized_source = re.sub(r"[^a-z0-9]+", "", source.casefold()) + return bool(normalized_title) and normalized_title == normalized_source + + +def _looks_like_generic_reference_title(title: str, work_type: str) -> bool: + lowered = title.casefold() + generic_exact = { + "blood", + "cladistics", + "leukemia", + "springer", + "addison-wesley", + "physica d", + "molecular biology and evolution", + "lecture notes in artificial intelligence", + "artificial life ii", + "mcgill j educ", + "j coll sci teach", + } + if lowered in generic_exact: + return True + if work_type in {"book", "book-chapter", "dissertation"}: + return False + return bool(re.fullmatch(r"(?:[A-Z][a-z]?\.?\s*){1,4}", title)) + + +def _existing_entry_key_for_discovered_work(store: BibliographyStore, entry: BibEntry) -> str | None: + doi = entry.fields.get("doi") + if doi: + existing = store.find_entry_by_identifier("doi", doi) + if existing is not None: + return str(existing["citation_key"]) + openalex_id = entry.fields.get("openalex") + if openalex_id: + existing = store.find_entry_by_identifier("openalex", openalex_id) + if existing is not None: + return str(existing["citation_key"]) + return None + + +def _skip_openalex_review_like_duplicate(store: BibliographyStore, entry: BibEntry) -> bool: + if entry.entry_type != "article": + return False + 
if entry.fields.get("abstract"): + return False + + title = _normalize_text(str(entry.fields.get("title") or "")) + if not title: + return False + + for existing in store.find_entries_by_title(title): + existing_key = str(existing.get("citation_key") or "") + if existing_key == entry.citation_key: + continue + existing_type = str(existing.get("entry_type") or "") + if existing_type in {"book", "incollection", "inproceedings", "phdthesis"}: + return True + return False + + def _normalize_openalex_id(value: str) -> str: if not value: return "" diff --git a/src/citegeist/resolve.py b/src/citegeist/resolve.py index 8957c79..1b5ba03 100644 --- a/src/citegeist/resolve.py +++ b/src/citegeist/resolve.py @@ -1,5 +1,6 @@ from __future__ import annotations +import html import re import urllib.error import urllib.parse @@ -433,7 +434,7 @@ def _make_resolution_key(author_text: str, year: str, title: str) -> str: def _openalex_work_to_entry(work: dict) -> BibEntry: - title = work.get("display_name", "") or "Untitled work" + title = _normalize_text(work.get("display_name", "") or "Untitled work") year = str(work.get("publication_year") or "") doi = _normalize_openalex_doi(work.get("doi")) openalex_id = _normalize_openalex_id(work.get("id", "")) @@ -455,14 +456,16 @@ def _openalex_work_to_entry(work: dict) -> BibEntry: fields["openalex"] = openalex_id fields.setdefault("url", f"https://openalex.org/{openalex_id}") if abstract := work.get("abstract_inverted_index"): - fields["abstract"] = _openalex_abstract_text(abstract) + abstract_text = _openalex_abstract_text(abstract) + if abstract_text: + fields["abstract"] = abstract_text if source: if work_type == "article": fields["journal"] = source else: fields["booktitle"] = source - citation_key = f"openalex{re.sub(r'[^A-Za-z0-9]+', '', openalex_id).lower()}" if openalex_id else _make_resolution_key(authors or "openalex", year or "n.d.", title or "untitled") + citation_key = _openalex_citation_key(doi, openalex_id, authors, year, 
title) return BibEntry(entry_type=_openalex_type_to_bibtype(work_type), citation_key=citation_key, fields=fields) @@ -476,7 +479,8 @@ def _openalex_abstract_text(inverted_index: dict) -> str: for word, indexes in inverted_index.items(): for index in indexes: positions[int(index)] = word - return " ".join(word for _, word in sorted(positions.items())) + text = _normalize_text(" ".join(word for _, word in sorted(positions.items()))) + return "" if _looks_like_openalex_page_blob(text) else text def _openalex_type_to_bibtype(work_type: str) -> str: @@ -504,6 +508,37 @@ def _normalize_openalex_doi(value: str | None) -> str: return value +def _normalize_text(value: str) -> str: + without_tags = re.sub(r"<[^>]+>", "", html.unescape(value)) + return " ".join(without_tags.split()) + + +def _openalex_citation_key(doi: str, openalex_id: str, authors: str, year: str, title: str) -> str: + if doi: + suffix = re.sub(r"[^A-Za-z0-9]+", "", doi).lower() + return f"doi{suffix}" + if openalex_id: + return f"openalex{re.sub(r'[^A-Za-z0-9]+', '', openalex_id).lower()}" + return _make_resolution_key(authors or "openalex", year or "n.d.", title or "untitled") + + +def _looks_like_openalex_page_blob(text: str) -> bool: + lowered = text.casefold() + blob_markers = ( + "research article|", + "download citation file", + "this content is only available via pdf", + "get citation alerts", + "views icon", + "toolbar search", + "publisher site get access", + "authors info & claims", + "publication history", + "copyright ", + ) + return len(text) > 60 and any(marker in lowered for marker in blob_markers) + + def _normalize_match_text(value: str) -> str: lowered = value.lower() lowered = re.sub(r"\W+", " ", lowered) diff --git a/src/citegeist/storage.py b/src/citegeist/storage.py index 519c152..b25111a 100644 --- a/src/citegeist/storage.py +++ b/src/citegeist/storage.py @@ -454,6 +454,40 @@ class BibliographyStore: payload["topics"] = self.get_entry_topics(citation_key) return payload + def 
find_entry_by_identifier(self, scheme: str, value: str) -> dict[str, object] | None: + row = self.connection.execute( + """ + SELECT e.* + FROM identifiers i + JOIN entries e ON e.id = i.entry_id + WHERE i.scheme = ? AND i.value = ? + LIMIT 1 + """, + (scheme, value), + ).fetchone() + if row is None: + return None + payload = self._row_to_entry_dict(row) + payload["topics"] = self.get_entry_topics(str(row["citation_key"])) + return payload + + def find_entries_by_title(self, title: str) -> list[dict[str, object]]: + rows = self.connection.execute( + """ + SELECT * + FROM entries + WHERE trim(lower(title)) = trim(lower(?)) + ORDER BY citation_key + """, + (title,), + ).fetchall() + payloads: list[dict[str, object]] = [] + for row in rows: + payload = self._row_to_entry_dict(row) + payload["topics"] = self.get_entry_topics(str(row["citation_key"])) + payloads.append(payload) + return payloads + def list_entries(self, limit: int = 50) -> list[dict[str, object]]: rows = self.connection.execute( """ diff --git a/tests/test_expand.py b/tests/test_expand.py index 0e16e70..b3f2cc6 100644 --- a/tests/test_expand.py +++ b/tests/test_expand.py @@ -211,6 +211,28 @@ def test_crossref_expander_keeps_simple_unstructured_title_without_identifier(): store.close() +def test_crossref_reference_to_entry_extracts_title_from_thesis_citation_blob(): + entry = _crossref_reference_to_entry( + { + "unstructured": ( + "Johnson WR. Evolution in action in the classroom: Engaging students in scientific " + "practices to develop a conceptual understanding of natural selection " + "(Master’s thesis). ProQuest Dissertations and Theses database. " + "(UMI No. 1517061). 2012." 
+ ), + "year": "2012", + }, + "seed2024", + 1, + ) + + assert entry.entry_type == "phdthesis" + assert entry.fields["title"] == ( + "Evolution in action in the classroom: Engaging students in scientific " + "practices to develop a conceptual understanding of natural selection" + ) + + def test_crossref_expander_returns_empty_on_fetch_error(): store = BibliographyStore() try: diff --git a/tests/test_openalex_expand.py b/tests/test_openalex_expand.py index 2c46af7..53da1ec 100644 --- a/tests/test_openalex_expand.py +++ b/tests/test_openalex_expand.py @@ -16,7 +16,7 @@ def test_openalex_work_to_entry_maps_basic_fields(): } ) - assert entry.citation_key == "openalexw12345" + assert entry.citation_key == "doi101000exampleopenalex" assert entry.fields["openalex"] == "W12345" assert entry.fields["doi"] == "10.1000/example-openalex" assert entry.fields["journal"] == "Journal of Graph Discovery" @@ -50,6 +50,7 @@ def test_openalex_expander_adds_outgoing_and_incoming_edges(): "results": [ { "id": "https://openalex.org/WDISCOVERED", + "doi": "https://doi.org/10.1000/discovered-openalex", "display_name": "Referenced OpenAlex Work", "publication_year": 2021, "type": "article", @@ -76,9 +77,219 @@ def test_openalex_expander_adds_outgoing_and_incoming_edges(): outgoing = expander.expand_entry(store, "seed2024", relation_type="cites", limit=5) incoming = expander.expand_entry(store, "seed2024", relation_type="cited_by", limit=5) - assert outgoing[0].discovered_citation_key == "openalexwdiscovered" + assert outgoing[0].discovered_citation_key == "doi101000discoveredopenalex" assert incoming[0].source_citation_key == "openalexwciting" - assert "openalexwdiscovered" in store.get_relations("seed2024", "cites") + assert "doi101000discoveredopenalex" in store.get_relations("seed2024", "cites") assert "seed2024" in store.get_relations("openalexwciting", "cites") finally: store.close() + + +def test_openalex_work_to_entry_drops_page_blob_abstract(): + entry = _openalex_work_to_entry( + { 
+ "id": "https://openalex.org/W12345", + "display_name": "Noisy OpenAlex Work", + "publication_year": 2022, + "type": "article", + "abstract_inverted_index": { + "Research": [0], + "Article|": [1], + "Download": [2], + "citation": [3], + "file": [4], + "This": [5], + "content": [6], + "is": [7], + "only": [8], + "available": [9], + "via": [10], + "PDF": [11], + }, + } + ) + + assert "abstract" not in entry.fields + + +def test_openalex_expander_reuses_existing_doi_entry(): + store = BibliographyStore() + try: + store.ingest_bibtex( + """ +@article{seed2024, + author = {Seed, Alice}, + title = {Seed Paper}, + year = {2024}, + doi = {10.1000/seed-doi} +} + +@article{doi101000discoveredopenalex, + author = {Existing, Bob}, + title = {Referenced OpenAlex Work}, + year = {2021}, + doi = {10.1000/discovered-openalex} +} +""" + ) + expander = OpenAlexExpander() + payloads = iter( + [ + {"results": [{"id": "https://openalex.org/WSEED"}]}, + { + "results": [ + { + "id": "https://openalex.org/WDISCOVERED", + "doi": "https://doi.org/10.1000/discovered-openalex", + "display_name": "Referenced OpenAlex Work", + "publication_year": 2021, + "type": "article", + "authorships": [{"author": {"display_name": "Bob Known"}}], + "primary_location": {"source": {"display_name": "OpenAlex Journal"}}, + } + ] + }, + ] + ) + expander.resolver.source_client.get_json = lambda _url: next(payloads) # type: ignore[method-assign] + + results = expander.expand_entry(store, "seed2024", relation_type="cites", limit=5) + + assert [result.discovered_citation_key for result in results] == ["doi101000discoveredopenalex"] + assert results[0].created_entry is False + assert store.get_entry("openalexwdiscovered") is None + assert "doi101000discoveredopenalex" in store.get_relations("seed2024", "cites") + finally: + store.close() + + +def test_openalex_expander_skips_generic_container_title_without_doi(): + store = BibliographyStore() + try: + store.ingest_bibtex( + """ +@article{seed2024, + author = {Seed, 
Alice}, + title = {Seed Paper}, + year = {2024}, + doi = {10.1000/seed-doi} +} +""" + ) + expander = OpenAlexExpander() + payloads = iter( + [ + {"results": [{"id": "https://openalex.org/WSEED"}]}, + { + "results": [ + { + "id": "https://openalex.org/WBAD", + "display_name": "Blood", + "publication_year": 2011, + "type": "article", + "primary_location": {"source": {"display_name": "Blood"}}, + } + ] + }, + ] + ) + expander.resolver.source_client.get_json = lambda _url: next(payloads) # type: ignore[method-assign] + + assert expander.expand_entry(store, "seed2024", relation_type="cites", limit=5) == [] + assert store.get_relations("seed2024", "cites") == [] + finally: + store.close() + + +def test_openalex_expander_skips_review_like_article_shadowing_existing_book(): + store = BibliographyStore() + try: + store.ingest_bibtex( + """ +@article{seed2024, + author = {Seed, Alice}, + title = {Seed Paper}, + year = {2024}, + doi = {10.1000/seed-doi} +} + +@book{darwin1859origin, + author = {Darwin, Charles}, + title = {On the Origin of Species by Means of Natural Selection}, + year = {1859} +} +""" + ) + expander = OpenAlexExpander() + payloads = iter( + [ + {"results": [{"id": "https://openalex.org/WSEED"}]}, + { + "results": [ + { + "id": "https://openalex.org/WREVIEWLIKE", + "display_name": "On the Origin of Species by Means of Natural Selection", + "publication_year": 1953, + "type": "article", + "authorships": [{"author": {"display_name": "R. L. 
Livezey"}}], + "primary_location": {"source": {"display_name": "The American Midland Naturalist"}}, + } + ] + }, + ] + ) + expander.resolver.source_client.get_json = lambda _url: next(payloads) # type: ignore[method-assign] + + assert expander.expand_entry(store, "seed2024", relation_type="cites", limit=5) == [] + assert store.get_entry("openalexwreviewlike") is None + assert store.get_relations("seed2024", "cites") == [] + finally: + store.close() + + +def test_openalex_expander_keeps_same_title_article_when_it_has_an_abstract(): + store = BibliographyStore() + try: + store.ingest_bibtex( + """ +@article{seed2024, + author = {Seed, Alice}, + title = {Seed Paper}, + year = {2024}, + doi = {10.1000/seed-doi} +} + +@book{darwin1859origin, + author = {Darwin, Charles}, + title = {On the Origin of Species by Means of Natural Selection}, + year = {1859} +} +""" + ) + expander = OpenAlexExpander() + payloads = iter( + [ + {"results": [{"id": "https://openalex.org/WSEED"}]}, + { + "results": [ + { + "id": "https://openalex.org/WKEPT", + "display_name": "On the Origin of Species by Means of Natural Selection", + "publication_year": 1953, + "type": "article", + "authorships": [{"author": {"display_name": "R. L. 
Livezey"}}], + "primary_location": {"source": {"display_name": "The American Midland Naturalist"}}, + "abstract_inverted_index": {"Legitimate": [0], "analysis": [1]}, + } + ] + }, + ] + ) + expander.resolver.source_client.get_json = lambda _url: next(payloads) # type: ignore[method-assign] + + results = expander.expand_entry(store, "seed2024", relation_type="cites", limit=5) + + assert [result.discovered_citation_key for result in results] == ["openalexwkept"] + assert "openalexwkept" in store.get_relations("seed2024", "cites") + finally: + store.close() diff --git a/tests/test_resolve.py b/tests/test_resolve.py index da08dc8..79d24b3 100644 --- a/tests/test_resolve.py +++ b/tests/test_resolve.py @@ -201,7 +201,7 @@ def test_openalex_work_to_entry_maps_basic_fields(): } ) - assert entry.citation_key == "openalexw12345" + assert entry.citation_key == "doi101000exampleopenalex" assert entry.fields["openalex"] == "W12345" assert entry.fields["doi"] == "10.1000/example-openalex" assert entry.fields["journal"] == "Journal of Open Graphs"