Add PubMed support to CiteGeist

This commit is contained in:
welsberr 2026-04-07 01:41:53 -04:00
parent 7bdaf37c59
commit 663fb1973a
7 changed files with 444 additions and 6 deletions

View File

@ -51,7 +51,7 @@ The initial repo includes:
- staged plaintext reference extraction that now preserves more structured metadata from legacy references, including year suffixes, identifiers, volume/issue/pages, and thesis/report/web-style venue hints;
- a reference-extraction backend seam with the local `heuristic` parser as the default implementation, so optional external backends can be added later without changing the core extract workflow;
- standalone verification and disambiguation of free-text references or partial BibTeX into auditable BibTeX/JSON results with `x_status`, `x_confidence`, `x_source`, `x_query`, and alternate-candidate traces;
- identifier-first metadata resolution for DOI, OpenAlex, DBLP, arXiv, and DataCite-backed entries, with OpenAlex/DataCite title-search fallback;
- identifier-first metadata resolution for DOI, PMID/PubMed, OpenAlex, DBLP, arXiv, and DataCite-backed entries, with OpenAlex/DataCite/PubMed title-search fallback;
- local citation-graph traversal over stored `cites`, `cited_by`, and `crossref` edges;
- Crossref- and OpenAlex-backed graph expansion that materializes draft related works and edge provenance;
- a dedicated source-client layer with fixture/cache support for live-source development;

View File

@ -358,15 +358,12 @@ class Bootstrapper:
})
return results
def _deadline_reached(deadline: float | None) -> bool:
    """Return True when *deadline* (a ``time.monotonic()`` timestamp) has passed.

    A ``None`` deadline means "no deadline" and is never considered reached.
    """
    return deadline is not None and time.monotonic() >= deadline
def _topic_candidates(self, topic: str, seed_keys: list[str], limit: int) -> list[tuple[BibEntry, float]]:
scored: dict[str, tuple[BibEntry, float]] = {}
for source_name, base_score, entries in (
for _source_name, base_score, entries in (
("openalex", 3.0, self.resolver.search_openalex(topic, limit=limit)),
("pubmed", 2.5, self.resolver.search_pubmed(topic, limit=limit)),
("crossref", 2.0, self.resolver.search_crossref(topic, limit=limit)),
("datacite", 1.5, self.resolver.search_datacite(topic, limit=limit)),
):
@ -383,6 +380,10 @@ def _deadline_reached(deadline: float | None) -> bool:
return ranked[:limit]
def _deadline_reached(deadline: float | None) -> bool:
return deadline is not None and time.monotonic() >= deadline
def _topic_relevance_score(entry: BibEntry, topic: str) -> float:
topic_terms = _tokenize(topic)
title_terms = _tokenize(entry.fields.get("title", ""))

View File

@ -36,6 +36,11 @@ class MetadataResolver:
if resolved is not None:
return resolved
if pmid := entry.fields.get("pmid"):
resolved = self.resolve_pmid(pmid)
if resolved is not None:
return resolved
if openalex_id := entry.fields.get("openalex"):
resolved = self.resolve_openalex(openalex_id)
if resolved is not None:
@ -73,6 +78,13 @@ class MetadataResolver:
)
if resolved is not None:
return resolved
resolved = self.search_pubmed_best_match(
title=title,
author_text=entry.fields.get("author", ""),
year=entry.fields.get("year", ""),
)
if resolved is not None:
return resolved
return None
@ -166,6 +178,23 @@ class MetadataResolver:
source_label=f"arxiv:id:{arxiv_id}",
)
def resolve_pmid(self, pmid: str) -> Resolution | None:
    """Resolve a PubMed ID to metadata via the NCBI efetch endpoint.

    Returns ``None`` when the PMID normalizes to empty, the request
    fails, or the response contains no article with that PMID.
    """
    pmid_digits = _normalize_pmid(pmid)
    if not pmid_digits:
        return None
    params = urllib.parse.urlencode({"db": "pubmed", "id": pmid_digits, "retmode": "xml"})
    document = self._safe_get_xml(
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?" + params
    )
    if document is None:
        return None
    matched = _find_pubmed_article(document, pmid_digits)
    if matched is None:
        return None
    return Resolution(
        entry=_pubmed_article_to_entry(matched, fallback_pmid=pmid_digits),
        source_type="resolver",
        source_label=f"pubmed:pmid:{pmid_digits}",
    )
def resolve_openalex(self, openalex_id: str) -> Resolution | None:
normalized_id = _normalize_openalex_id(openalex_id)
payload = self._safe_get_json(f"https://api.openalex.org/works/{normalized_id}")
@ -227,6 +256,30 @@ class MetadataResolver:
return []
return [_openalex_work_to_entry(item) for item in payload.get("results", [])]
def search_pubmed(self, title: str, limit: int = 5) -> list[BibEntry]:
    """Search PubMed (esearch) for *title* and fetch up to *limit* entries.

    Whitespace in the query is collapsed first; an empty query, a failed
    request, or an empty id list each yield an empty result.
    """
    term = " ".join(title.split())
    if not term:
        return []
    params = urllib.parse.urlencode(
        {
            "db": "pubmed",
            "retmode": "json",
            "retmax": max(1, limit),
            "term": term,
        }
    )
    payload = self._safe_get_json(
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?" + params
    )
    if payload is None:
        return []
    pmids: list[str] = []
    for raw_id in payload.get("esearchresult", {}).get("idlist", []):
        digits = _normalize_pmid(str(raw_id))
        if digits:
            pmids.append(digits)
    if not pmids:
        return []
    return self._fetch_pubmed_entries(pmids[:limit])
def _safe_get_json(self, url: str) -> dict | None:
try:
return self.source_client.get_json(url)
@ -265,6 +318,51 @@ class MetadataResolver:
source_label=f"openalex:search:{title}",
)
def search_pubmed_best_match(
    self,
    title: str,
    author_text: str = "",
    year: str = "",
) -> Resolution | None:
    """Pick the best PubMed search hit for a title/author/year triple.

    Candidate selection is delegated to ``_select_best_title_match``;
    ``None`` is returned when it rejects every candidate.
    """
    candidates = self.search_pubmed(title, limit=5)
    best = _select_best_title_match(
        candidates,
        title=title,
        author_text=author_text,
        year=year,
    )
    if best is None:
        return None
    return Resolution(
        entry=best,
        source_type="resolver",
        source_label=f"pubmed:search:{title}",
    )
def _fetch_pubmed_entries(self, pmids: list[str]) -> list[BibEntry]:
    """Fetch BibEntry records for *pmids* by combining esummary and efetch.

    Input order is preserved and duplicates/empties are dropped. A PMID
    that yields neither a summary nor an XML article is skipped.
    """
    unique_pmids: list[str] = []
    seen: set[str] = set()
    for pmid in pmids:
        if pmid and pmid not in seen:
            seen.add(pmid)
            unique_pmids.append(pmid)
    if not unique_pmids:
        return []
    joined_ids = ",".join(unique_pmids)
    summary_params = urllib.parse.urlencode({"db": "pubmed", "retmode": "json", "id": joined_ids})
    summary_payload = self._safe_get_json(
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?" + summary_params
    ) or {}
    summary_map = summary_payload.get("result", {})
    fetch_params = urllib.parse.urlencode({"db": "pubmed", "id": joined_ids, "retmode": "xml"})
    document = self._safe_get_xml(
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?" + fetch_params
    )
    article_map = _pubmed_articles_by_pmid(document)
    results: list[BibEntry] = []
    for pmid in unique_pmids:
        summary = summary_map.get(pmid)
        article = article_map.get(pmid)
        if summary or article is not None:
            results.append(_pubmed_record_to_entry(summary or {}, article, fallback_pmid=pmid))
    return results
def merge_entries(base: BibEntry, resolved: BibEntry) -> BibEntry:
    """Merge *resolved* metadata into *base* and return only the merged entry.

    Thin wrapper over ``merge_entries_with_conflicts`` for callers that do
    not need the conflict details.
    """
    merged, _ = merge_entries_with_conflicts(base, resolved)
    return merged
@ -651,6 +749,214 @@ def _candidate_matches_author_tokens(candidate: BibEntry, author_tokens: set[str
return bool(author_tokens & candidate_tokens)
def _normalize_pmid(value: str) -> str:
return "".join(ch for ch in str(value) if ch.isdigit())
def _pubmed_articles_by_pmid(root: ET.Element | None) -> dict[str, ET.Element]:
    """Index every ``<PubmedArticle>`` under *root* by its normalized PMID.

    Articles with no readable PMID are omitted; a ``None`` root maps to an
    empty dict. On duplicate PMIDs the last article wins.
    """
    if root is None:
        return {}
    return {
        key: node
        for node in root.findall(".//PubmedArticle")
        if (key := _normalize_pmid(_node_text(node.find("./MedlineCitation/PMID"))))
    }
def _find_pubmed_article(root: ET.Element, pmid: str) -> ET.Element | None:
    """Return the ``<PubmedArticle>`` under *root* whose PMID matches, if any."""
    return _pubmed_articles_by_pmid(root).get(_normalize_pmid(pmid))
def _pubmed_record_to_entry(summary: dict, article: ET.Element | None, fallback_pmid: str) -> BibEntry:
    """Build a BibEntry from an esummary record and/or an efetch article.

    The XML *article* is the primary source when present; summary values
    then only fill fields the article left blank. With no article, the
    entry is built from the summary alone.
    """
    if article is not None:
        entry = _pubmed_article_to_entry(article, fallback_pmid=fallback_pmid)
        _merge_pubmed_summary_into_fields(entry.fields, summary, fallback_pmid)
        return entry
    fields = _pubmed_summary_fields(summary, fallback_pmid)
    citation_key = _pubmed_citation_key(
        fields.get("doi", ""),
        fields.get("pmid", ""),
        fields.get("author", ""),
        fields.get("year", ""),
        fields.get("title", ""),
    )
    return BibEntry(entry_type="article", citation_key=citation_key, fields=fields)
def _pubmed_article_to_entry(article: ET.Element, fallback_pmid: str = "") -> BibEntry:
    """Map an efetch ``<PubmedArticle>`` element onto a BibEntry.

    Only non-empty values are written into the fields dict. The URL
    prefers the PMC article page when a PMCID is present and falls back
    to the PubMed abstract page when only a PMID is known.
    """
    medline = article.find("./MedlineCitation")
    article_node = medline.find("./Article") if medline is not None else None
    pubmed_data = article.find("./PubmedData")
    # Prefer the PMID embedded in the citation; fall back to the caller-supplied one.
    pmid = _normalize_pmid(_node_text(medline.find("./PMID")) if medline is not None else fallback_pmid) or _normalize_pmid(
        fallback_pmid
    )
    title = _normalize_text(_element_text(article_node.find("./ArticleTitle")) if article_node is not None else "")
    authors = " and ".join(
        name
        for name in (_pubmed_author_name(author) for author in article.findall(".//AuthorList/Author"))
        if name
    )
    journal = _normalize_text(_node_text(article.find(".//Journal/Title")))
    year = _pubmed_article_year(article)
    abstract = _pubmed_abstract_text(article)
    doi = _pubmed_article_identifier(article, "doi")
    # PMC ids live in <PubmedData>, not in the article body.
    pmcid = _pubmed_article_identifier(pubmed_data, "pmc")
    fields: dict[str, str] = {}
    if title:
        fields["title"] = title
    if authors:
        fields["author"] = authors
    if year:
        fields["year"] = year
    if journal:
        fields["journal"] = journal
    if abstract:
        fields["abstract"] = abstract
    if doi:
        fields["doi"] = doi
    if pmid:
        fields["pmid"] = pmid
    if pmcid:
        fields["pmcid"] = pmcid
        fields["url"] = f"https://pmc.ncbi.nlm.nih.gov/articles/{pmcid}/"
    elif pmid:
        fields["url"] = f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/"
    citation_key = _pubmed_citation_key(doi, pmid, authors, year, title)
    return BibEntry(entry_type="article", citation_key=citation_key, fields=fields)
def _merge_pubmed_summary_into_fields(fields: dict[str, str], summary: dict, fallback_pmid: str) -> None:
    """Copy summary-derived values into *fields* without overwriting existing ones."""
    supplemental = _pubmed_summary_fields(summary, fallback_pmid)
    for key in supplemental:
        value = supplemental[key]
        if value and not fields.get(key):
            fields[key] = value
def _pubmed_summary_fields(summary: dict, fallback_pmid: str) -> dict[str, str]:
    """Translate an esummary JSON record into BibEntry-style fields.

    Only non-empty values are emitted. The URL prefers the PMC page when
    a PMCID is available, else the PubMed page for the PMID.
    """
    pmid = _normalize_pmid(str(summary.get("uid") or fallback_pmid))
    title = _normalize_text(str(summary.get("title") or ""))
    year = _pubmed_year_from_text(str(summary.get("pubdate") or ""))
    journal = _normalize_text(str(summary.get("fulljournalname") or ""))
    authors = " and ".join(
        name
        for name in (
            _normalize_person_display_name(str(author.get("name") or ""))
            for author in summary.get("authors", [])
        )
        if name
    )
    doi = ""
    pmcid = ""
    # Scan alternate identifiers; on duplicates the last DOI/PMC value wins.
    for article_id in summary.get("articleids", []) or []:
        id_type = str(article_id.get("idtype") or "").lower()
        value = str(article_id.get("value") or "")
        if id_type == "doi" and value:
            doi = value
        elif id_type in {"pmc", "pmcid"} and value:
            pmcid = value
    fields: dict[str, str] = {}
    if title:
        fields["title"] = title
    if authors:
        fields["author"] = authors
    if year:
        fields["year"] = year
    if journal:
        fields["journal"] = journal
    if doi:
        fields["doi"] = doi
    if pmid:
        fields["pmid"] = pmid
    if pmcid:
        fields["pmcid"] = pmcid
        fields["url"] = f"https://pmc.ncbi.nlm.nih.gov/articles/{pmcid}/"
    elif pmid:
        fields["url"] = f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/"
    return fields
def _pubmed_author_name(author: ET.Element) -> str:
    """Format one ``<Author>`` element as 'Family, Given'.

    Collective (group) names take precedence. When only initials are
    available they are normalized to dotted form ('J. R.').
    """
    group_name = _normalize_text(_node_text(author.find("./CollectiveName")))
    if group_name:
        return group_name
    last = _normalize_text(_node_text(author.find("./LastName")))
    fore = _normalize_text(_node_text(author.find("./ForeName")))
    raw_initials = _normalize_text(_node_text(author.find("./Initials")))
    if last and fore:
        return f"{last}, {fore}"
    if last and raw_initials:
        dotted = " ".join(letter + "." for letter in re.findall(r"[A-Za-z]", raw_initials))
        if dotted:
            return f"{last}, {dotted}"
        return last
    return last or fore
def _pubmed_article_year(article: ET.Element) -> str:
    """Extract a publication year from an article element.

    Explicit ``<Year>`` nodes are tried first; failing that, a year is
    parsed out of free-text ``<MedlineDate>`` nodes. Empty string when
    nothing matches.
    """
    explicit_paths = (
        ".//JournalIssue/PubDate/Year",
        ".//ArticleDate/Year",
        ".//PubDate/Year",
    )
    for path in explicit_paths:
        value = _node_text(article.find(path))
        if value:
            return value
    freetext_paths = (
        ".//JournalIssue/PubDate/MedlineDate",
        ".//PubDate/MedlineDate",
    )
    for path in freetext_paths:
        parsed = _pubmed_year_from_text(_node_text(article.find(path)))
        if parsed:
            return parsed
    return ""
def _pubmed_year_from_text(value: str) -> str:
match = re.search(r"\b(1[6-9]\d{2}|20\d{2}|21\d{2})\b", value)
return match.group(1) if match else ""
def _pubmed_abstract_text(article: ET.Element) -> str:
    """Join all ``<AbstractText>`` sections into one string.

    Labeled sections are rendered as 'Label: text'; empty sections are
    skipped.
    """
    sections: list[str] = []
    for section in article.findall(".//Abstract/AbstractText"):
        body = _normalize_text(_element_text(section))
        if not body:
            continue
        heading = _normalize_text(section.attrib.get("Label", ""))
        if heading:
            sections.append(f"{heading}: {body}")
        else:
            sections.append(body)
    return " ".join(sections)
def _pubmed_article_identifier(root: ET.Element | None, identifier_type: str) -> str:
    """Find an identifier of *identifier_type* (e.g. 'doi', 'pmc') under *root*.

    ``<ArticleId>`` elements are checked first; for DOIs, ``<ELocationID>``
    elements serve as a fallback. Empty string when nothing matches.
    """
    if root is None:
        return ""
    wanted = identifier_type.lower()
    for candidate in root.findall(".//ArticleId"):
        if str(candidate.attrib.get("IdType") or "").lower() == wanted:
            return _normalize_text(_element_text(candidate))
    if wanted != "doi":
        return ""
    for candidate in root.findall(".//ELocationID"):
        if str(candidate.attrib.get("EIdType") or "").lower() == "doi":
            return _normalize_text(_element_text(candidate))
    return ""
def _pubmed_citation_key(doi: str, pmid: str, authors: str, year: str, title: str) -> str:
if doi:
suffix = re.sub(r"[^A-Za-z0-9]+", "", doi).lower()
return f"doi{suffix}"
if pmid:
return f"pmid{pmid}"
return _make_resolution_key(authors or "pubmed", year or "n.d.", title or "untitled")
def _element_text(node: ET.Element | None) -> str:
if node is None:
return ""
return " ".join("".join(node.itertext()).split())
def _datacite_work_to_entry(data: dict) -> BibEntry:
attributes = data.get("attributes", {})
doi = str(attributes.get("doi") or "")

View File

@ -149,6 +149,20 @@ class BibliographyVerifier:
input_type=input_type,
input_key=input_key,
)
if source_entry is not None and source_entry.fields.get("pmid"):
direct = self.resolver.resolve_pmid(source_entry.fields["pmid"])
if direct is not None:
return VerificationResult(
query=query,
context=context,
status="exact",
confidence=1.0,
entry=direct.entry,
source_label=direct.source_label,
alternates=[],
input_type=input_type,
input_key=input_key,
)
candidate_limit = max(1, limit)
candidates = self._collect_candidates(
@ -209,6 +223,7 @@ class BibliographyVerifier:
("crossref", self.resolver.search_crossref(search_title, limit=limit)),
("openalex", self.resolver.search_openalex(search_title, limit=limit)),
("datacite", self.resolver.search_datacite(search_title, limit=limit)),
("pubmed", self.resolver.search_pubmed(search_title, limit=limit)),
):
for entry in source_entries:
signature = _candidate_signature(entry)

View File

@ -34,6 +34,7 @@ def test_bootstrap_from_topic_only():
try:
bootstrapper = Bootstrapper()
bootstrapper.resolver.search_openalex = lambda topic, limit=5: [] # type: ignore[method-assign]
bootstrapper.resolver.search_pubmed = lambda topic, limit=5: [] # type: ignore[method-assign]
bootstrapper.resolver.search_crossref = lambda topic, limit=5: [] # type: ignore[method-assign]
bootstrapper.resolver.search_datacite = lambda topic, limit=5: [ # type: ignore[method-assign]
__import__("citegeist").BibEntry(
@ -139,6 +140,7 @@ def test_bootstrap_ranks_and_deduplicates_topic_candidates():
fields={"title": "Graph Topic Ranking", "abstract": "graph topic graph"},
)
]
bootstrapper.resolver.search_pubmed = lambda topic, limit=5: [] # type: ignore[method-assign]
bootstrapper.resolver.search_crossref = lambda topic, limit=5: [ # type: ignore[method-assign]
BibEntry(
entry_type="article",
@ -172,6 +174,7 @@ def test_bootstrap_preview_does_not_write_to_database():
bootstrapper.resolver.search_openalex = lambda topic, limit=5: [ # type: ignore[method-assign]
BibEntry(entry_type="article", citation_key="preview2024graph", fields={"title": "Preview Graph Topic"})
]
bootstrapper.resolver.search_pubmed = lambda topic, limit=5: [] # type: ignore[method-assign]
bootstrapper.resolver.search_crossref = lambda topic, limit=5: [] # type: ignore[method-assign]
bootstrapper.resolver.search_datacite = lambda topic, limit=5: [] # type: ignore[method-assign]
@ -194,6 +197,7 @@ def test_bootstrap_topic_commit_limit_restricts_persisted_candidates():
BibEntry(entry_type="article", citation_key="rank1", fields={"title": "Graph Topic One"}),
BibEntry(entry_type="article", citation_key="rank2", fields={"title": "Graph Topic Two"}),
]
bootstrapper.resolver.search_pubmed = lambda topic, limit=5: [] # type: ignore[method-assign]
bootstrapper.resolver.search_crossref = lambda topic, limit=5: [] # type: ignore[method-assign]
bootstrapper.resolver.search_datacite = lambda topic, limit=5: [] # type: ignore[method-assign]
bootstrapper.crossref_expander.expand_entry_references = lambda _store, _key: [] # type: ignore[method-assign]
@ -227,6 +231,7 @@ def test_bootstrap_topic_candidates_are_attached_to_topic():
fields={"title": "Graph Topic Result", "year": "2024"},
)
]
bootstrapper.resolver.search_pubmed = lambda topic, limit=5: [] # type: ignore[method-assign]
bootstrapper.resolver.search_crossref = lambda topic, limit=5: [] # type: ignore[method-assign]
bootstrapper.resolver.search_datacite = lambda topic, limit=5: [] # type: ignore[method-assign]
bootstrapper.crossref_expander.expand_entry_references = lambda _store, _key: [] # type: ignore[method-assign]
@ -278,6 +283,7 @@ def test_bootstrap_topic_commit_requires_title_anchor():
},
),
]
bootstrapper.resolver.search_pubmed = lambda topic, limit=5: [] # type: ignore[method-assign]
bootstrapper.resolver.search_crossref = lambda topic, limit=5: [] # type: ignore[method-assign]
bootstrapper.resolver.search_datacite = lambda topic, limit=5: [] # type: ignore[method-assign]
bootstrapper.crossref_expander.expand_entry_references = lambda _store, _key: [] # type: ignore[method-assign]
@ -482,6 +488,7 @@ def test_bootstrap_preview_uses_topic_commit_limit_when_larger_than_topic_limit(
)
for index in range(1, 8)
][:limit]
bootstrapper.resolver.search_pubmed = lambda topic, limit=5: [] # type: ignore[method-assign]
bootstrapper.resolver.search_crossref = lambda topic, limit=5: [] # type: ignore[method-assign]
bootstrapper.resolver.search_datacite = lambda topic, limit=5: [] # type: ignore[method-assign]

View File

@ -8,6 +8,7 @@ from citegeist.resolve import (
_crossref_message_to_entry,
_datacite_work_to_entry,
_openalex_work_to_entry,
_pubmed_article_to_entry,
merge_entries_with_conflicts,
merge_entries,
)
@ -88,6 +89,52 @@ def test_arxiv_atom_entry_to_bib_maps_basic_fields():
assert entry.fields["doi"] == "10.1000/arxiv-example"
def test_pubmed_article_to_entry_maps_basic_fields():
    # Maps a representative efetch <PubmedArticle> onto title/author/journal/
    # year, identifier (doi/pmid/pmcid), and labeled-abstract fields.
    xml = ET.fromstring(
        """
<PubmedArticle>
<MedlineCitation>
<PMID>12345678</PMID>
<Article>
<ArticleTitle>PubMed Resolved Work</ArticleTitle>
<Abstract>
<AbstractText Label="Background">Evidence summary.</AbstractText>
<AbstractText>Second paragraph.</AbstractText>
</Abstract>
<Journal>
<JournalIssue>
<PubDate><Year>2021</Year></PubDate>
</JournalIssue>
<Title>Journal of Evidence</Title>
</Journal>
<AuthorList>
<Author><LastName>Smith</LastName><ForeName>Jane</ForeName></Author>
</AuthorList>
<ELocationID EIdType="doi">10.1000/pubmed-example</ELocationID>
</Article>
</MedlineCitation>
<PubmedData>
<ArticleIdList>
<ArticleId IdType="pubmed">12345678</ArticleId>
<ArticleId IdType="pmc">PMC123456</ArticleId>
</ArticleIdList>
</PubmedData>
</PubmedArticle>
"""
    )
    entry = _pubmed_article_to_entry(xml)
    # DOI-based key: "doi" + alphanumerics of the DOI, lowercased.
    assert entry.citation_key == "doi101000pubmedexample"
    assert entry.fields["title"] == "PubMed Resolved Work"
    assert entry.fields["author"] == "Smith, Jane"
    assert entry.fields["journal"] == "Journal of Evidence"
    assert entry.fields["year"] == "2021"
    assert entry.fields["pmid"] == "12345678"
    assert entry.fields["pmcid"] == "PMC123456"
    # Labeled sections are prefixed with "Label: " and joined by spaces.
    assert entry.fields["abstract"] == "Background: Evidence summary. Second paragraph."
def test_merge_entries_prefers_existing_values_and_adds_missing_fields():
base = BibEntry(
entry_type="article",
@ -209,6 +256,35 @@ def test_resolver_tries_doi_before_dblp():
]
def test_resolver_tries_pmid_before_dblp():
    # PMID resolution must be attempted before DBLP when an entry carries both.
    resolver = MetadataResolver()
    calls: list[tuple[str, str]] = []

    def make_fake(source: str):
        # Each fake records its invocation and resolves nothing,
        # forcing the resolver to continue down the identifier chain.
        def fake(value: str):
            calls.append((source, value))
            return None

        return fake

    resolver.resolve_pmid = make_fake("pmid")  # type: ignore[method-assign]
    resolver.resolve_dblp = make_fake("dblp")  # type: ignore[method-assign]
    resolver.resolve_entry(
        BibEntry(
            entry_type="article",
            citation_key="smith2024graphs",
            fields={"pmid": "12345678", "dblp": "conf/test/Smith24"},
        )
    )
    assert calls == [
        ("pmid", "12345678"),
        ("dblp", "conf/test/Smith24"),
    ]
def test_openalex_work_to_entry_maps_basic_fields():
entry = _openalex_work_to_entry(
{

View File

@ -36,6 +36,37 @@ def test_verifier_uses_direct_doi_resolution_for_bib_entries():
assert result.source_label == "crossref:doi:10.1000/example"
def test_verifier_uses_direct_pmid_resolution_for_bib_entries():
    # An entry carrying a PMID should be resolved directly and reported as exact.
    verifier = BibliographyVerifier()

    def fake_resolve_pmid(value: str) -> Resolution:
        resolved = BibEntry(
            entry_type="article",
            citation_key="pmid12345678",
            fields={
                "author": "Smith, Jane",
                "title": "Resolved PubMed Work",
                "year": "2024",
                "pmid": value,
            },
        )
        return Resolution(
            entry=resolved,
            source_type="resolver",
            source_label=f"pubmed:pmid:{value}",
        )

    verifier.resolver.resolve_pmid = fake_resolve_pmid  # type: ignore[method-assign]
    result = verifier.verify_bib_entry(
        BibEntry(
            entry_type="misc",
            citation_key="seed2024",
            fields={"title": "Rough Work", "pmid": "12345678"},
        )
    )
    assert result.status == "exact"
    assert result.confidence == 1.0
    assert result.entry.fields["title"] == "Resolved PubMed Work"
    assert result.source_label == "pubmed:pmid:12345678"
def test_verifier_scores_and_sorts_search_candidates():
verifier = BibliographyVerifier()
verifier.resolver.search_crossref = lambda title, limit=5: [ # type: ignore[method-assign]
@ -61,6 +92,7 @@ def test_verifier_scores_and_sorts_search_candidates():
]
verifier.resolver.search_openalex = lambda title, limit=5: [] # type: ignore[method-assign]
verifier.resolver.search_datacite = lambda title, limit=5: [] # type: ignore[method-assign]
verifier.resolver.search_pubmed = lambda title, limit=5: [] # type: ignore[method-assign]
result = verifier.verify_string('"Graph-first bibliography augmentation" Smith 2024')
@ -74,6 +106,7 @@ def test_verification_result_to_bib_entry_contains_audit_fields():
verifier.resolver.search_crossref = lambda title, limit=5: [] # type: ignore[method-assign]
verifier.resolver.search_openalex = lambda title, limit=5: [] # type: ignore[method-assign]
verifier.resolver.search_datacite = lambda title, limit=5: [] # type: ignore[method-assign]
verifier.resolver.search_pubmed = lambda title, limit=5: [] # type: ignore[method-assign]
result = verifier._verify_query( # type: ignore[attr-defined]
{"title": "Missing Work", "authors": [], "year": "", "venue": ""},