Tighten discovered-work admission

This commit is contained in:
welsberr 2026-03-20 20:22:05 -04:00
parent d6b5138660
commit 0354d6de89
8 changed files with 482 additions and 29 deletions

View File

@ -159,7 +159,9 @@ Broad BibTeX exports skip DOI-only placeholder records such as `Referenced work
Long-running CLI commands report progress on `stderr` so `stdout` remains clean for JSON, BibTeX, or tabular output. Long-running CLI commands report progress on `stderr` so `stdout` remains clean for JSON, BibTeX, or tabular output.
Crossref reference expansion is intentionally conservative about weak discoveries. If a cited reference has no DOI and Crossref only exposes it as an unstructured citation blob, `expand --source crossref`, `expand-topic --source crossref`, and bootstrap flows now skip materializing that record unless the fallback metadata looks like a cleaner non-`misc` work such as conference proceedings. This reduces junk `@misc` entries whose `title` field is really a pasted citation string. Crossref reference expansion is intentionally conservative about weak discoveries. If a cited reference has no DOI and Crossref only exposes it as an unstructured citation blob, `expand --source crossref`, `expand-topic --source crossref`, and bootstrap flows now skip materializing that record unless the fallback metadata looks like a cleaner non-`misc` work such as conference proceedings. When Crossref does expose thesis or dissertation references only as unstructured text, citegeist now also tries to extract the actual work title instead of keeping the entire ProQuest-style citation blob in the `title` field. This reduces junk `@misc` entries whose `title` field is really a pasted citation string and yields cleaner `@phdthesis` fallbacks.
OpenAlex expansion is also conservative about noisy secondary records. Discoveries now prefer DOI-based citation keys when a DOI is present, which reduces parallel `doi...` and `openalex...` entries for the same work. OpenAlex abstract imports are sanitized to drop obvious webpage/export blobs, and DOI-less records are filtered when they look like venue-title stubs, generic container records, or weak review-like shadows of an already present book, chapter, proceedings paper, or dissertation with the same title. Preview mode uses the same admission rules as write-enabled expansion, so rejected candidates disappear before you commit them to the store.
For live-source development, prefer fixture-backed or cache-backed source clients so resolver and expansion work can be exercised repeatedly without re-hitting upstream APIs on every run. For live-source development, prefer fixture-backed or cache-backed source clients so resolver and expansion work can be exercised repeatedly without re-hitting upstream APIs on every run.

View File

@ -211,7 +211,9 @@ Re-enrich all current `@misc` entries with DOIs:
.venv/bin/python -m citegeist --db library.sqlite3 resolve-stubs --doi-only --all-misc --limit 25 .venv/bin/python -m citegeist --db library.sqlite3 resolve-stubs --doi-only --all-misc --limit 25
``` ```
When Crossref expansion only yields an unstructured citation blob without a DOI, citegeist now skips materializing that discovery instead of storing it as a weak `@misc` entry. Cleaner fallback cases that infer a more specific type, such as proceedings-like titles, are still admitted. When Crossref expansion only yields an unstructured citation blob without a DOI, citegeist now skips materializing that discovery instead of storing it as a weak `@misc` entry. Cleaner fallback cases that infer a more specific type, such as proceedings-like titles, are still admitted. Thesis and dissertation citation blobs are also normalized more aggressively so fallback `@phdthesis` entries keep the work title instead of the entire ProQuest-style citation string.
OpenAlex expansion now applies the same kind of admission control before writing or previewing discoveries. DOI-backed discoveries prefer DOI-based citation keys, noisy webpage/export abstracts are dropped, generic venue-title stubs are rejected, and weak DOI-less article records that merely shadow an existing book/chapter/dissertation title in your store are suppressed instead of being materialized as parallel duplicates.
## Explore Citation Graphs ## Explore Citation Graphs

View File

@ -1,5 +1,6 @@
from __future__ import annotations from __future__ import annotations
import html
import re import re
from dataclasses import dataclass from dataclasses import dataclass
from urllib.parse import quote, urlencode from urllib.parse import quote, urlencode
@ -157,9 +158,15 @@ class OpenAlexExpander:
results: list[ExpansionResult] = [] results: list[ExpansionResult] = []
for work in works: for work in works:
if _skip_openalex_work(work):
continue
discovered = _openalex_work_to_entry(work) discovered = _openalex_work_to_entry(work)
existing_key = _existing_entry_key_for_discovered_work(store, discovered)
if existing_key is None and _skip_openalex_review_like_duplicate(store, discovered):
continue
target_key = existing_key or discovered.citation_key
created = False created = False
if store.get_entry(discovered.citation_key) is None: if existing_key is None and store.get_entry(discovered.citation_key) is None:
store.upsert_entry( store.upsert_entry(
discovered, discovered,
raw_bibtex=None, raw_bibtex=None,
@ -172,9 +179,8 @@ class OpenAlexExpander:
if relation_type == "cites": if relation_type == "cites":
source_key = citation_key source_key = citation_key
target_key = discovered.citation_key
else: else:
source_key = discovered.citation_key source_key = target_key
target_key = citation_key target_key = citation_key
store.add_relation( store.add_relation(
@ -188,7 +194,7 @@ class OpenAlexExpander:
results.append( results.append(
ExpansionResult( ExpansionResult(
source_citation_key=source_key, source_citation_key=source_key,
discovered_citation_key=discovered.citation_key, discovered_citation_key=target_key,
created_entry=created, created_entry=created,
relation_type=relation_type, relation_type=relation_type,
source_label=f"openalex:{relation_type}:{openalex_id}", source_label=f"openalex:{relation_type}:{openalex_id}",
@ -385,14 +391,20 @@ class TopicExpander:
works = payload.get("results", []) works = payload.get("results", [])
rows: list[tuple[ExpansionResult, dict[str, object]]] = [] rows: list[tuple[ExpansionResult, dict[str, object]]] = []
for work in works: for work in works:
if _skip_openalex_work(work):
continue
discovered = _openalex_work_to_entry(work) discovered = _openalex_work_to_entry(work)
source_key = citation_key if relation_type == "cites" else discovered.citation_key existing_key = _existing_entry_key_for_discovered_work(store, discovered)
if existing_key is None and _skip_openalex_review_like_duplicate(store, discovered):
continue
target_key = existing_key or discovered.citation_key
source_key = citation_key if relation_type == "cites" else target_key
rows.append( rows.append(
( (
ExpansionResult( ExpansionResult(
source_citation_key=source_key, source_citation_key=source_key,
discovered_citation_key=discovered.citation_key, discovered_citation_key=target_key,
created_entry=store.get_entry(discovered.citation_key) is None, created_entry=existing_key is None and store.get_entry(discovered.citation_key) is None,
relation_type=relation_type, relation_type=relation_type,
source_label=f"openalex:{relation_type}:{openalex_id}", source_label=f"openalex:{relation_type}:{openalex_id}",
), ),
@ -403,13 +415,7 @@ class TopicExpander:
def _crossref_reference_to_entry(reference: dict, source_citation_key: str, ordinal: int) -> BibEntry: def _crossref_reference_to_entry(reference: dict, source_citation_key: str, ordinal: int) -> BibEntry:
title = ( title = _crossref_reference_title(reference, ordinal)
reference.get("article-title")
or reference.get("volume-title")
or reference.get("journal-title")
or reference.get("unstructured")
or f"Referenced work {ordinal}"
)
year = str(reference.get("year") or "") year = str(reference.get("year") or "")
author = reference.get("author") or "" author = reference.get("author") or ""
doi = reference.get("DOI") or "" doi = reference.get("DOI") or ""
@ -434,6 +440,42 @@ def _crossref_reference_to_entry(reference: dict, source_citation_key: str, ordi
return BibEntry(entry_type=entry_type, citation_key=citation_key, fields=fields) return BibEntry(entry_type=entry_type, citation_key=citation_key, fields=fields)
def _crossref_reference_title(reference: dict, ordinal: int) -> str:
    """Pick the best available title for a Crossref reference payload.

    Structured title fields win over the unstructured citation blob; when
    nothing usable is present, fall back to a positional placeholder.
    """
    for field in ("article-title", "volume-title", "journal-title"):
        candidate = reference.get(field)
        if candidate:
            return _normalize_text(candidate)
    extracted = _extract_crossref_unstructured_title(str(reference.get("unstructured") or ""))
    if extracted:
        return _normalize_text(extracted)
    return _normalize_text(f"Referenced work {ordinal}")
def _extract_crossref_unstructured_title(text: str) -> str:
    """Heuristically pull a work title out of an unstructured citation blob.

    Thesis-style citations ("Author X. Title (Masters thesis). ProQuest ...")
    are trimmed down to the leading author+title segment, database boilerplate
    is stripped, and the author prefix is dropped.  Returns "" when the blob
    is empty after normalization.
    """
    cleaned = _normalize_text(text)
    if not cleaned:
        return ""
    thesis_markers = (
        "(Master",
        "(Doctoral",
        "PhD dissertation",
        "Master's thesis",
        "Masters thesis",
        "Doctoral dissertation",
    )
    # Cut everything from the first thesis marker onward (tuple order wins).
    matched = next((m for m in thesis_markers if m in cleaned), None)
    if matched is not None:
        cleaned = cleaned.split(matched, 1)[0].strip(" .")
    # Strip trailing database/archive boilerplate.
    for boilerplate in (" ProQuest", " UMI No.", " Dissertation Abstracts", " University Microfilms"):
        if boilerplate in cleaned:
            cleaned = cleaned.split(boilerplate, 1)[0].strip(" .")
    # Thesis blobs usually lead with "Author X. " — drop that prefix.  Note the
    # marker check runs against the raw text, since the markers themselves may
    # already have been cut out of `cleaned` above.
    if any(m in text for m in thesis_markers) and ". " in cleaned:
        cleaned = cleaned.split(". ", 1)[1].strip()
    return cleaned.strip(" .")
def _skip_crossref_reference(reference: dict, entry: BibEntry) -> bool: def _skip_crossref_reference(reference: dict, entry: BibEntry) -> bool:
if reference.get("DOI"): if reference.get("DOI"):
return False return False
@ -485,7 +527,8 @@ def _reference_citation_key(reference: dict, title: str, year: str, ordinal: int
def _normalize_text(value: str) -> str: def _normalize_text(value: str) -> str:
return " ".join(value.split()) without_tags = re.sub(r"<[^>]+>", "", html.unescape(value))
return " ".join(without_tags.split())
def _crossref_reference_entry_type(reference: dict, title: str, journal_title: str) -> str: def _crossref_reference_entry_type(reference: dict, title: str, journal_title: str) -> str:
@ -635,14 +678,16 @@ def _openalex_work_to_entry(work: dict) -> BibEntry:
if openalex_id: if openalex_id:
fields["openalex"] = openalex_id fields["openalex"] = openalex_id
if abstract := work.get("abstract_inverted_index"): if abstract := work.get("abstract_inverted_index"):
fields["abstract"] = _openalex_abstract_text(abstract) abstract_text = _openalex_abstract_text(abstract)
if abstract_text:
fields["abstract"] = abstract_text
if source: if source:
if work_type == "article": if work_type == "article":
fields["journal"] = source fields["journal"] = source
else: else:
fields["booktitle"] = source fields["booktitle"] = source
citation_key = _openalex_citation_key(openalex_id, authors, year, title) citation_key = _openalex_citation_key(doi, openalex_id, authors, year, title)
entry_type = _openalex_type_to_bibtype(work_type) entry_type = _openalex_type_to_bibtype(work_type)
return BibEntry(entry_type=entry_type, citation_key=citation_key, fields=fields) return BibEntry(entry_type=entry_type, citation_key=citation_key, fields=fields)
@ -658,7 +703,8 @@ def _openalex_abstract_text(inverted_index: dict) -> str:
for word, indexes in inverted_index.items(): for word, indexes in inverted_index.items():
for index in indexes: for index in indexes:
positions[int(index)] = word positions[int(index)] = word
return " ".join(word for _, word in sorted(positions.items())) text = _normalize_text(" ".join(word for _, word in sorted(positions.items())))
return "" if _looks_like_openalex_page_blob(text) else text
def _openalex_type_to_bibtype(work_type: str) -> str: def _openalex_type_to_bibtype(work_type: str) -> str:
@ -672,7 +718,10 @@ def _openalex_type_to_bibtype(work_type: str) -> str:
return mapping.get(work_type, "misc") return mapping.get(work_type, "misc")
def _openalex_citation_key(openalex_id: str, authors: str, year: str, title: str) -> str: def _openalex_citation_key(doi: str, openalex_id: str, authors: str, year: str, title: str) -> str:
if doi:
suffix = re.sub(r"[^A-Za-z0-9]+", "", doi).lower()
return f"doi{suffix}"
if openalex_id: if openalex_id:
return f"openalex{re.sub(r'[^A-Za-z0-9]+', '', openalex_id).lower()}" return f"openalex{re.sub(r'[^A-Za-z0-9]+', '', openalex_id).lower()}"
author = authors.split(" and ")[0] if authors else "ref" author = authors.split(" and ")[0] if authors else "ref"
@ -681,6 +730,104 @@ def _openalex_citation_key(openalex_id: str, authors: str, year: str, title: str
return f"{family}{year or 'nd'}{first_word}" return f"{family}{year or 'nd'}{first_word}"
def _looks_like_openalex_page_blob(text: str) -> bool:
lowered = text.casefold()
blob_markers = (
"research article|",
"download citation file",
"this content is only available via pdf",
"get citation alerts",
"views icon",
"toolbar search",
"publisher site get access",
"authors info & claims",
"publication history",
"copyright ",
)
return len(text) > 60 and any(marker in lowered for marker in blob_markers)
def _skip_openalex_work(work: dict) -> bool:
    """Decide whether an OpenAlex work payload is too weak to materialize.

    Untitled works are always rejected.  DOI-less works are additionally
    rejected when the title is just the venue name, or when an abstract-less
    record carries a generic reference-like title.
    """
    title = _normalize_text(str(work.get("display_name", "") or ""))
    if not title or title.casefold() == "untitled work":
        return True
    doi = _normalize_openalex_doi(work.get("doi"))
    if doi:
        # DOI-backed works always pass this admission gate.
        return False
    primary_source = (work.get("primary_location") or {}).get("source") or {}
    source = _normalize_text(str(primary_source.get("display_name", "") or ""))
    if _looks_like_container_title(title, source):
        return True
    inverted = work.get("abstract_inverted_index")
    abstract = _openalex_abstract_text(inverted or {}) if inverted else ""
    work_type = str(work.get("type", "") or "")
    return not abstract and _looks_like_generic_reference_title(title, work_type)
def _looks_like_container_title(title: str, source: str) -> bool:
if not title or not source:
return False
normalized_title = re.sub(r"[^a-z0-9]+", "", title.casefold())
normalized_source = re.sub(r"[^a-z0-9]+", "", source.casefold())
return bool(normalized_title) and normalized_title == normalized_source
def _looks_like_generic_reference_title(title: str, work_type: str) -> bool:
lowered = title.casefold()
generic_exact = {
"blood",
"cladistics",
"leukemia",
"springer",
"addison-wesley",
"physica d",
"molecular biology and evolution",
"lecture notes in artificial intelligence",
"artificial life ii",
"mcgill j educ",
"j coll sci teach",
}
if lowered in generic_exact:
return True
if work_type in {"book", "book-chapter", "dissertation"}:
return False
return bool(re.fullmatch(r"(?:[A-Z][a-z]?\.?\s*){1,4}", title))
def _existing_entry_key_for_discovered_work(store: BibliographyStore, entry: BibEntry) -> str | None:
    """Return the citation key of a stored entry matching this work, if any.

    DOI identifiers are checked first, then OpenAlex ids; the first hit wins.
    Returns None when neither identifier resolves to an existing entry.
    """
    for scheme in ("doi", "openalex"):
        identifier = entry.fields.get(scheme)
        if not identifier:
            continue
        match = store.find_entry_by_identifier(scheme, identifier)
        if match is not None:
            return str(match["citation_key"])
    return None
def _skip_openalex_review_like_duplicate(store: BibliographyStore, entry: BibEntry) -> bool:
    """True when an abstract-less article merely shadows a stored non-article work.

    A discovered @article without an abstract whose title exactly matches an
    existing book, chapter, proceedings paper, or dissertation is treated as a
    weak review-like shadow and suppressed.  The entry's own key is excluded
    so an already-stored copy of the same record does not trip the filter.
    """
    if entry.entry_type != "article" or entry.fields.get("abstract"):
        return False
    title = _normalize_text(str(entry.fields.get("title") or ""))
    if not title:
        return False
    shadowed_types = {"book", "incollection", "inproceedings", "phdthesis"}
    return any(
        str(existing.get("entry_type") or "") in shadowed_types
        for existing in store.find_entries_by_title(title)
        if str(existing.get("citation_key") or "") != entry.citation_key
    )
def _normalize_openalex_id(value: str) -> str: def _normalize_openalex_id(value: str) -> str:
if not value: if not value:
return "" return ""

View File

@ -1,5 +1,6 @@
from __future__ import annotations from __future__ import annotations
import html
import re import re
import urllib.error import urllib.error
import urllib.parse import urllib.parse
@ -433,7 +434,7 @@ def _make_resolution_key(author_text: str, year: str, title: str) -> str:
def _openalex_work_to_entry(work: dict) -> BibEntry: def _openalex_work_to_entry(work: dict) -> BibEntry:
title = work.get("display_name", "") or "Untitled work" title = _normalize_text(work.get("display_name", "") or "Untitled work")
year = str(work.get("publication_year") or "") year = str(work.get("publication_year") or "")
doi = _normalize_openalex_doi(work.get("doi")) doi = _normalize_openalex_doi(work.get("doi"))
openalex_id = _normalize_openalex_id(work.get("id", "")) openalex_id = _normalize_openalex_id(work.get("id", ""))
@ -455,14 +456,16 @@ def _openalex_work_to_entry(work: dict) -> BibEntry:
fields["openalex"] = openalex_id fields["openalex"] = openalex_id
fields.setdefault("url", f"https://openalex.org/{openalex_id}") fields.setdefault("url", f"https://openalex.org/{openalex_id}")
if abstract := work.get("abstract_inverted_index"): if abstract := work.get("abstract_inverted_index"):
fields["abstract"] = _openalex_abstract_text(abstract) abstract_text = _openalex_abstract_text(abstract)
if abstract_text:
fields["abstract"] = abstract_text
if source: if source:
if work_type == "article": if work_type == "article":
fields["journal"] = source fields["journal"] = source
else: else:
fields["booktitle"] = source fields["booktitle"] = source
citation_key = f"openalex{re.sub(r'[^A-Za-z0-9]+', '', openalex_id).lower()}" if openalex_id else _make_resolution_key(authors or "openalex", year or "n.d.", title or "untitled") citation_key = _openalex_citation_key(doi, openalex_id, authors, year, title)
return BibEntry(entry_type=_openalex_type_to_bibtype(work_type), citation_key=citation_key, fields=fields) return BibEntry(entry_type=_openalex_type_to_bibtype(work_type), citation_key=citation_key, fields=fields)
@ -476,7 +479,8 @@ def _openalex_abstract_text(inverted_index: dict) -> str:
for word, indexes in inverted_index.items(): for word, indexes in inverted_index.items():
for index in indexes: for index in indexes:
positions[int(index)] = word positions[int(index)] = word
return " ".join(word for _, word in sorted(positions.items())) text = _normalize_text(" ".join(word for _, word in sorted(positions.items())))
return "" if _looks_like_openalex_page_blob(text) else text
def _openalex_type_to_bibtype(work_type: str) -> str: def _openalex_type_to_bibtype(work_type: str) -> str:
@ -504,6 +508,37 @@ def _normalize_openalex_doi(value: str | None) -> str:
return value return value
def _normalize_text(value: str) -> str:
without_tags = re.sub(r"<[^>]+>", "", html.unescape(value))
return " ".join(without_tags.split())
def _openalex_citation_key(doi: str, openalex_id: str, authors: str, year: str, title: str) -> str:
if doi:
suffix = re.sub(r"[^A-Za-z0-9]+", "", doi).lower()
return f"doi{suffix}"
if openalex_id:
return f"openalex{re.sub(r'[^A-Za-z0-9]+', '', openalex_id).lower()}"
return _make_resolution_key(authors or "openalex", year or "n.d.", title or "untitled")
def _looks_like_openalex_page_blob(text: str) -> bool:
lowered = text.casefold()
blob_markers = (
"research article|",
"download citation file",
"this content is only available via pdf",
"get citation alerts",
"views icon",
"toolbar search",
"publisher site get access",
"authors info & claims",
"publication history",
"copyright ",
)
return len(text) > 60 and any(marker in lowered for marker in blob_markers)
def _normalize_match_text(value: str) -> str: def _normalize_match_text(value: str) -> str:
lowered = value.lower() lowered = value.lower()
lowered = re.sub(r"\W+", " ", lowered) lowered = re.sub(r"\W+", " ", lowered)

View File

@ -454,6 +454,40 @@ class BibliographyStore:
payload["topics"] = self.get_entry_topics(citation_key) payload["topics"] = self.get_entry_topics(citation_key)
return payload return payload
def find_entry_by_identifier(self, scheme: str, value: str) -> dict[str, object] | None:
    """Return the stored entry holding identifier (scheme, value), or None.

    The returned payload contains the entry columns plus its topic list.
    """
    statement = """
        SELECT e.*
        FROM identifiers i
        JOIN entries e ON e.id = i.entry_id
        WHERE i.scheme = ? AND i.value = ?
        LIMIT 1
    """
    row = self.connection.execute(statement, (scheme, value)).fetchone()
    if row is None:
        return None
    entry_payload = self._row_to_entry_dict(row)
    entry_payload["topics"] = self.get_entry_topics(str(row["citation_key"]))
    return entry_payload
def find_entries_by_title(self, title: str) -> list[dict[str, object]]:
    """Return all entries whose title matches case-insensitively (trimmed).

    Results are ordered by citation key; each payload includes the entry
    columns plus its topic list.  Empty list when nothing matches.
    """
    statement = """
        SELECT *
        FROM entries
        WHERE trim(lower(title)) = trim(lower(?))
        ORDER BY citation_key
    """
    results: list[dict[str, object]] = []
    for row in self.connection.execute(statement, (title,)).fetchall():
        item = self._row_to_entry_dict(row)
        item["topics"] = self.get_entry_topics(str(row["citation_key"]))
        results.append(item)
    return results
def list_entries(self, limit: int = 50) -> list[dict[str, object]]: def list_entries(self, limit: int = 50) -> list[dict[str, object]]:
rows = self.connection.execute( rows = self.connection.execute(
""" """

View File

@ -211,6 +211,28 @@ def test_crossref_expander_keeps_simple_unstructured_title_without_identifier():
store.close() store.close()
def test_crossref_reference_to_entry_extracts_title_from_thesis_citation_blob():
    """An unstructured ProQuest-style thesis citation yields a clean title and @phdthesis type."""
    blob = (
        "Johnson WR. Evolution in action in the classroom: Engaging students in scientific "
        "practices to develop a conceptual understanding of natural selection "
        "(Masters thesis). ProQuest Dissertations and Theses database. "
        "(UMI No. 1517061). 2012."
    )
    entry = _crossref_reference_to_entry({"unstructured": blob, "year": "2012"}, "seed2024", 1)
    assert entry.entry_type == "phdthesis"
    expected_title = (
        "Evolution in action in the classroom: Engaging students in scientific "
        "practices to develop a conceptual understanding of natural selection"
    )
    assert entry.fields["title"] == expected_title
def test_crossref_expander_returns_empty_on_fetch_error(): def test_crossref_expander_returns_empty_on_fetch_error():
store = BibliographyStore() store = BibliographyStore()
try: try:

View File

@ -16,7 +16,7 @@ def test_openalex_work_to_entry_maps_basic_fields():
} }
) )
assert entry.citation_key == "openalexw12345" assert entry.citation_key == "doi101000exampleopenalex"
assert entry.fields["openalex"] == "W12345" assert entry.fields["openalex"] == "W12345"
assert entry.fields["doi"] == "10.1000/example-openalex" assert entry.fields["doi"] == "10.1000/example-openalex"
assert entry.fields["journal"] == "Journal of Graph Discovery" assert entry.fields["journal"] == "Journal of Graph Discovery"
@ -50,6 +50,7 @@ def test_openalex_expander_adds_outgoing_and_incoming_edges():
"results": [ "results": [
{ {
"id": "https://openalex.org/WDISCOVERED", "id": "https://openalex.org/WDISCOVERED",
"doi": "https://doi.org/10.1000/discovered-openalex",
"display_name": "Referenced OpenAlex Work", "display_name": "Referenced OpenAlex Work",
"publication_year": 2021, "publication_year": 2021,
"type": "article", "type": "article",
@ -76,9 +77,219 @@ def test_openalex_expander_adds_outgoing_and_incoming_edges():
outgoing = expander.expand_entry(store, "seed2024", relation_type="cites", limit=5) outgoing = expander.expand_entry(store, "seed2024", relation_type="cites", limit=5)
incoming = expander.expand_entry(store, "seed2024", relation_type="cited_by", limit=5) incoming = expander.expand_entry(store, "seed2024", relation_type="cited_by", limit=5)
assert outgoing[0].discovered_citation_key == "openalexwdiscovered" assert outgoing[0].discovered_citation_key == "doi101000discoveredopenalex"
assert incoming[0].source_citation_key == "openalexwciting" assert incoming[0].source_citation_key == "openalexwciting"
assert "openalexwdiscovered" in store.get_relations("seed2024", "cites") assert "doi101000discoveredopenalex" in store.get_relations("seed2024", "cites")
assert "seed2024" in store.get_relations("openalexwciting", "cites") assert "seed2024" in store.get_relations("openalexwciting", "cites")
finally: finally:
store.close() store.close()
def test_openalex_work_to_entry_drops_page_blob_abstract():
    """A scraped page-chrome "abstract" must not be stored in the abstract field."""
    blob_words = (
        "Research Article| Download citation file "
        "This content is only available via PDF"
    ).split()
    # Build the inverted index positionally, mirroring OpenAlex's format.
    work = {
        "id": "https://openalex.org/W12345",
        "display_name": "Noisy OpenAlex Work",
        "publication_year": 2022,
        "type": "article",
        "abstract_inverted_index": {word: [index] for index, word in enumerate(blob_words)},
    }
    entry = _openalex_work_to_entry(work)
    assert "abstract" not in entry.fields
def test_openalex_expander_reuses_existing_doi_entry():
    """Expansion links to the existing DOI-keyed entry instead of duplicating it.

    The discovered work carries the same DOI as ``doi101000discoveredopenalex``,
    so no new entry (and no parallel ``openalex...`` key) may be created.
    """
    store = BibliographyStore()
    try:
        store.ingest_bibtex(
            """
            @article{seed2024,
                author = {Seed, Alice},
                title = {Seed Paper},
                year = {2024},
                doi = {10.1000/seed-doi}
            }
            @article{doi101000discoveredopenalex,
                author = {Existing, Bob},
                title = {Referenced OpenAlex Work},
                year = {2021},
                doi = {10.1000/discovered-openalex}
            }
            """
        )
        expander = OpenAlexExpander()
        # First payload resolves the seed entry's OpenAlex id; the second is the
        # referenced-works listing.  iter() enforces the expected call order.
        payloads = iter(
            [
                {"results": [{"id": "https://openalex.org/WSEED"}]},
                {
                    "results": [
                        {
                            "id": "https://openalex.org/WDISCOVERED",
                            "doi": "https://doi.org/10.1000/discovered-openalex",
                            "display_name": "Referenced OpenAlex Work",
                            "publication_year": 2021,
                            "type": "article",
                            "authorships": [{"author": {"display_name": "Bob Known"}}],
                            "primary_location": {"source": {"display_name": "OpenAlex Journal"}},
                        }
                    ]
                },
            ]
        )
        # Stub the HTTP client so no network request is made.
        expander.resolver.source_client.get_json = lambda _url: next(payloads)  # type: ignore[method-assign]
        results = expander.expand_entry(store, "seed2024", relation_type="cites", limit=5)
        assert [result.discovered_citation_key for result in results] == ["doi101000discoveredopenalex"]
        assert results[0].created_entry is False
        # No parallel openalex-keyed duplicate must exist.
        assert store.get_entry("openalexwdiscovered") is None
        assert "doi101000discoveredopenalex" in store.get_relations("seed2024", "cites")
    finally:
        store.close()
def test_openalex_expander_skips_generic_container_title_without_doi():
    """A DOI-less discovery whose title equals its venue name is rejected.

    "Blood" published in source "Blood" is a container-title stub, so neither
    an expansion result nor a relation should be produced.
    """
    store = BibliographyStore()
    try:
        store.ingest_bibtex(
            """
            @article{seed2024,
                author = {Seed, Alice},
                title = {Seed Paper},
                year = {2024},
                doi = {10.1000/seed-doi}
            }
            """
        )
        expander = OpenAlexExpander()
        # Seed-resolution payload first, then the referenced-works payload.
        payloads = iter(
            [
                {"results": [{"id": "https://openalex.org/WSEED"}]},
                {
                    "results": [
                        {
                            "id": "https://openalex.org/WBAD",
                            "display_name": "Blood",
                            "publication_year": 2011,
                            "type": "article",
                            "primary_location": {"source": {"display_name": "Blood"}},
                        }
                    ]
                },
            ]
        )
        # Stub the HTTP client so no network request is made.
        expander.resolver.source_client.get_json = lambda _url: next(payloads)  # type: ignore[method-assign]
        assert expander.expand_entry(store, "seed2024", relation_type="cites", limit=5) == []
        assert store.get_relations("seed2024", "cites") == []
    finally:
        store.close()
def test_openalex_expander_skips_review_like_article_shadowing_existing_book():
    """A DOI-less, abstract-less article with a stored book's exact title is dropped.

    The 1953 article shares its title with the Darwin book already in the
    store, so it is treated as a review-like shadow and not materialized.
    """
    store = BibliographyStore()
    try:
        store.ingest_bibtex(
            """
            @article{seed2024,
                author = {Seed, Alice},
                title = {Seed Paper},
                year = {2024},
                doi = {10.1000/seed-doi}
            }
            @book{darwin1859origin,
                author = {Darwin, Charles},
                title = {On the Origin of Species by Means of Natural Selection},
                year = {1859}
            }
            """
        )
        expander = OpenAlexExpander()
        # Seed-resolution payload first, then the referenced-works payload.
        payloads = iter(
            [
                {"results": [{"id": "https://openalex.org/WSEED"}]},
                {
                    "results": [
                        {
                            "id": "https://openalex.org/WREVIEWLIKE",
                            "display_name": "On the Origin of Species by Means of Natural Selection",
                            "publication_year": 1953,
                            "type": "article",
                            "authorships": [{"author": {"display_name": "R. L. Livezey"}}],
                            "primary_location": {"source": {"display_name": "The American Midland Naturalist"}},
                        }
                    ]
                },
            ]
        )
        # Stub the HTTP client so no network request is made.
        expander.resolver.source_client.get_json = lambda _url: next(payloads)  # type: ignore[method-assign]
        assert expander.expand_entry(store, "seed2024", relation_type="cites", limit=5) == []
        assert store.get_entry("openalexwreviewlike") is None
        assert store.get_relations("seed2024", "cites") == []
    finally:
        store.close()
def test_openalex_expander_keeps_same_title_article_when_it_has_an_abstract():
    """An article sharing a stored book's title is kept when it has a real abstract.

    The abstract signals genuine content, so the review-like-duplicate filter
    must not suppress it: the discovery is admitted and related to the seed.
    """
    store = BibliographyStore()
    try:
        store.ingest_bibtex(
            """
            @article{seed2024,
                author = {Seed, Alice},
                title = {Seed Paper},
                year = {2024},
                doi = {10.1000/seed-doi}
            }
            @book{darwin1859origin,
                author = {Darwin, Charles},
                title = {On the Origin of Species by Means of Natural Selection},
                year = {1859}
            }
            """
        )
        expander = OpenAlexExpander()
        # Seed-resolution payload first, then the referenced-works payload.
        payloads = iter(
            [
                {"results": [{"id": "https://openalex.org/WSEED"}]},
                {
                    "results": [
                        {
                            "id": "https://openalex.org/WKEPT",
                            "display_name": "On the Origin of Species by Means of Natural Selection",
                            "publication_year": 1953,
                            "type": "article",
                            "authorships": [{"author": {"display_name": "R. L. Livezey"}}],
                            "primary_location": {"source": {"display_name": "The American Midland Naturalist"}},
                            "abstract_inverted_index": {"Legitimate": [0], "analysis": [1]},
                        }
                    ]
                },
            ]
        )
        # Stub the HTTP client so no network request is made.
        expander.resolver.source_client.get_json = lambda _url: next(payloads)  # type: ignore[method-assign]
        results = expander.expand_entry(store, "seed2024", relation_type="cites", limit=5)
        assert [result.discovered_citation_key for result in results] == ["openalexwkept"]
        assert "openalexwkept" in store.get_relations("seed2024", "cites")
    finally:
        store.close()

View File

@ -201,7 +201,7 @@ def test_openalex_work_to_entry_maps_basic_fields():
} }
) )
assert entry.citation_key == "openalexw12345" assert entry.citation_key == "doi101000exampleopenalex"
assert entry.fields["openalex"] == "W12345" assert entry.fields["openalex"] == "W12345"
assert entry.fields["doi"] == "10.1000/example-openalex" assert entry.fields["doi"] == "10.1000/example-openalex"
assert entry.fields["journal"] == "Journal of Open Graphs" assert entry.fields["journal"] == "Journal of Open Graphs"