Add PubMed support to CiteGeist
This commit is contained in:
parent
7bdaf37c59
commit
663fb1973a
|
|
@ -51,7 +51,7 @@ The initial repo includes:
|
|||
- staged plaintext reference extraction that now preserves more structured metadata from legacy references, including year suffixes, identifiers, volume/issue/pages, and thesis/report/web-style venue hints;
|
||||
- a reference-extraction backend seam with the local `heuristic` parser as the default implementation, so optional external backends can be added later without changing the core extract workflow;
|
||||
- standalone verification and disambiguation of free-text references or partial BibTeX into auditable BibTeX/JSON results with `x_status`, `x_confidence`, `x_source`, `x_query`, and alternate-candidate traces;
|
||||
- identifier-first metadata resolution for DOI, OpenAlex, DBLP, arXiv, and DataCite-backed entries, with OpenAlex/DataCite title-search fallback;
|
||||
- identifier-first metadata resolution for DOI, PMID/PubMed, OpenAlex, DBLP, arXiv, and DataCite-backed entries, with OpenAlex/DataCite/PubMed title-search fallback;
|
||||
- local citation-graph traversal over stored `cites`, `cited_by`, and `crossref` edges;
|
||||
- Crossref- and OpenAlex-backed graph expansion that materializes draft related works and edge provenance;
|
||||
- a dedicated source-client layer with fixture/cache support for live-source development;
|
||||
|
|
|
|||
|
|
@ -358,15 +358,12 @@ class Bootstrapper:
|
|||
})
|
||||
return results
|
||||
|
||||
|
||||
def _deadline_reached(deadline: float | None) -> bool:
|
||||
return deadline is not None and time.monotonic() >= deadline
|
||||
|
||||
def _topic_candidates(self, topic: str, seed_keys: list[str], limit: int) -> list[tuple[BibEntry, float]]:
|
||||
scored: dict[str, tuple[BibEntry, float]] = {}
|
||||
|
||||
for source_name, base_score, entries in (
|
||||
for _source_name, base_score, entries in (
|
||||
("openalex", 3.0, self.resolver.search_openalex(topic, limit=limit)),
|
||||
("pubmed", 2.5, self.resolver.search_pubmed(topic, limit=limit)),
|
||||
("crossref", 2.0, self.resolver.search_crossref(topic, limit=limit)),
|
||||
("datacite", 1.5, self.resolver.search_datacite(topic, limit=limit)),
|
||||
):
|
||||
|
|
@ -383,6 +380,10 @@ def _deadline_reached(deadline: float | None) -> bool:
|
|||
return ranked[:limit]
|
||||
|
||||
|
||||
def _deadline_reached(deadline: float | None) -> bool:
|
||||
return deadline is not None and time.monotonic() >= deadline
|
||||
|
||||
|
||||
def _topic_relevance_score(entry: BibEntry, topic: str) -> float:
|
||||
topic_terms = _tokenize(topic)
|
||||
title_terms = _tokenize(entry.fields.get("title", ""))
|
||||
|
|
|
|||
|
|
@ -36,6 +36,11 @@ class MetadataResolver:
|
|||
if resolved is not None:
|
||||
return resolved
|
||||
|
||||
if pmid := entry.fields.get("pmid"):
|
||||
resolved = self.resolve_pmid(pmid)
|
||||
if resolved is not None:
|
||||
return resolved
|
||||
|
||||
if openalex_id := entry.fields.get("openalex"):
|
||||
resolved = self.resolve_openalex(openalex_id)
|
||||
if resolved is not None:
|
||||
|
|
@ -73,6 +78,13 @@ class MetadataResolver:
|
|||
)
|
||||
if resolved is not None:
|
||||
return resolved
|
||||
resolved = self.search_pubmed_best_match(
|
||||
title=title,
|
||||
author_text=entry.fields.get("author", ""),
|
||||
year=entry.fields.get("year", ""),
|
||||
)
|
||||
if resolved is not None:
|
||||
return resolved
|
||||
|
||||
return None
|
||||
|
||||
|
|
@ -166,6 +178,23 @@ class MetadataResolver:
|
|||
source_label=f"arxiv:id:{arxiv_id}",
|
||||
)
|
||||
|
||||
def resolve_pmid(self, pmid: str) -> Resolution | None:
    """Resolve a PubMed identifier through the E-utilities ``efetch`` endpoint.

    Returns ``None`` when ``pmid`` normalizes to an empty string, when the
    XML fetch returns ``None``, or when the response holds no article for
    the normalized PMID.
    """
    clean_pmid = _normalize_pmid(pmid)
    if not clean_pmid:
        return None

    encoded = urllib.parse.urlencode({"db": "pubmed", "id": clean_pmid, "retmode": "xml"})
    document = self._safe_get_xml(f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?{encoded}")
    record = None if document is None else _find_pubmed_article(document, clean_pmid)
    if record is None:
        return None

    return Resolution(
        entry=_pubmed_article_to_entry(record, fallback_pmid=clean_pmid),
        source_type="resolver",
        source_label=f"pubmed:pmid:{clean_pmid}",
    )
|
||||
|
||||
def resolve_openalex(self, openalex_id: str) -> Resolution | None:
|
||||
normalized_id = _normalize_openalex_id(openalex_id)
|
||||
payload = self._safe_get_json(f"https://api.openalex.org/works/{normalized_id}")
|
||||
|
|
@ -227,6 +256,30 @@ class MetadataResolver:
|
|||
return []
|
||||
return [_openalex_work_to_entry(item) for item in payload.get("results", [])]
|
||||
|
||||
def search_pubmed(self, title: str, limit: int = 5) -> list[BibEntry]:
    """Search PubMed for ``title`` and return entries for the matching PMIDs.

    The whitespace-collapsed ``title`` is sent as the ``term`` of an
    E-utilities ``esearch`` request; the returned ids are then expanded
    into entries via ``_fetch_pubmed_entries``.

    Args:
        title: Free-text query; blank input short-circuits to ``[]``.
        limit: Maximum number of entries to return. Values below 1 are
            treated as 1.

    Returns:
        A list of entries, empty when the query is blank, the JSON fetch
        returns ``None``, or no usable ids come back.
    """
    query_text = " ".join(title.split())
    if not query_text:
        return []

    # Clamp once so the requested page size and the slice below agree.
    # Slicing with the raw ``limit`` discarded the hit that ``retmax`` had
    # just asked for whenever ``limit <= 0``.
    max_results = max(1, limit)
    query = urllib.parse.urlencode(
        {
            "db": "pubmed",
            "retmode": "json",
            "retmax": max_results,
            "term": query_text,
        }
    )
    payload = self._safe_get_json(f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?{query}")
    if payload is None:
        return []

    # ``or`` fallbacks tolerate a present-but-null ``esearchresult``/``idlist``.
    raw_ids = (payload.get("esearchresult") or {}).get("idlist") or []
    ids = [normalized for value in raw_ids if (normalized := _normalize_pmid(str(value)))]
    if not ids:
        return []
    return self._fetch_pubmed_entries(ids[:max_results])
|
||||
|
||||
def _safe_get_json(self, url: str) -> dict | None:
|
||||
try:
|
||||
return self.source_client.get_json(url)
|
||||
|
|
@ -265,6 +318,51 @@ class MetadataResolver:
|
|||
source_label=f"openalex:search:{title}",
|
||||
)
|
||||
|
||||
def search_pubmed_best_match(
    self,
    title: str,
    author_text: str = "",
    year: str = "",
) -> Resolution | None:
    """Pick the best PubMed title-search hit for the given title, author and year.

    Up to five search results are handed to ``_select_best_title_match``;
    ``None`` is returned when it selects nothing.
    """
    hits = self.search_pubmed(title, limit=5)
    best = _select_best_title_match(hits, title=title, author_text=author_text, year=year)
    if best is not None:
        return Resolution(
            entry=best,
            source_type="resolver",
            source_label=f"pubmed:search:{title}",
        )
    return None
|
||||
|
||||
def _fetch_pubmed_entries(self, pmids: list[str]) -> list[BibEntry]:
    """Build entries for ``pmids`` from ``esummary`` and ``efetch`` responses.

    Empty and duplicate identifiers are dropped while first-seen order is
    kept. A PMID for which neither a summary nor an article record came
    back is skipped.
    """
    unique_pmids = list(dict.fromkeys(candidate for candidate in pmids if candidate))
    if not unique_pmids:
        return []

    joined_ids = ",".join(unique_pmids)

    # Summary request (JSON) is issued first, then the full XML fetch.
    summary_params = urllib.parse.urlencode({"db": "pubmed", "retmode": "json", "id": joined_ids})
    summary_payload = self._safe_get_json(
        f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?{summary_params}"
    )
    if not summary_payload:
        summary_payload = {}
    summary_by_pmid = summary_payload.get("result", {})

    fetch_params = urllib.parse.urlencode({"db": "pubmed", "id": joined_ids, "retmode": "xml"})
    article_by_pmid = _pubmed_articles_by_pmid(
        self._safe_get_xml(f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?{fetch_params}")
    )

    collected: list[BibEntry] = []
    for pmid in unique_pmids:
        summary = summary_by_pmid.get(pmid)
        article = article_by_pmid.get(pmid)
        if summary or article is not None:
            collected.append(_pubmed_record_to_entry(summary or {}, article, fallback_pmid=pmid))
    return collected
|
||||
|
||||
def merge_entries(base: BibEntry, resolved: BibEntry) -> BibEntry:
    """Merge ``resolved`` into ``base`` and return only the merged entry.

    Thin wrapper over ``merge_entries_with_conflicts`` that discards the
    second element of its result.
    """
    merged_entry, _conflicts = merge_entries_with_conflicts(base, resolved)
    return merged_entry
|
||||
|
|
@ -651,6 +749,214 @@ def _candidate_matches_author_tokens(candidate: BibEntry, author_tokens: set[str
|
|||
return bool(author_tokens & candidate_tokens)
|
||||
|
||||
|
||||
def _normalize_pmid(value: str) -> str:
    """Reduce ``value`` to the ASCII digits of a PubMed identifier.

    Any non-digit characters (prefixes such as ``PMID:``, whitespace,
    URL fragments) are stripped. Returns ``""`` when no digit remains.
    """
    # ``str.isdigit`` alone also accepts superscripts and non-Latin digits,
    # which are not valid in a PMID or in the URLs built from it.
    return "".join(ch for ch in str(value) if ch.isascii() and ch.isdigit())
|
||||
|
||||
|
||||
def _pubmed_articles_by_pmid(root: ET.Element | None) -> dict[str, ET.Element]:
|
||||
if root is None:
|
||||
return {}
|
||||
articles: dict[str, ET.Element] = {}
|
||||
for article in root.findall(".//PubmedArticle"):
|
||||
pmid = _normalize_pmid(_node_text(article.find("./MedlineCitation/PMID")))
|
||||
if pmid:
|
||||
articles[pmid] = article
|
||||
return articles
|
||||
|
||||
|
||||
def _find_pubmed_article(root: ET.Element, pmid: str) -> ET.Element | None:
    """Return the ``PubmedArticle`` under ``root`` matching ``pmid``, or ``None``."""
    by_pmid = _pubmed_articles_by_pmid(root)
    wanted = _normalize_pmid(pmid)
    return by_pmid.get(wanted)
|
||||
|
||||
|
||||
def _pubmed_record_to_entry(summary: dict, article: ET.Element | None, fallback_pmid: str) -> BibEntry:
    """Turn a PubMed summary and/or article element into one entry.

    With an article element, the entry comes from the XML and the summary
    only fills fields the XML left empty. Without one, the entry is built
    from the summary alone.
    """
    if article is None:
        summary_fields = _pubmed_summary_fields(summary, fallback_pmid)
        key = _pubmed_citation_key(
            *(summary_fields.get(name, "") for name in ("doi", "pmid", "author", "year", "title"))
        )
        return BibEntry(entry_type="article", citation_key=key, fields=summary_fields)

    entry = _pubmed_article_to_entry(article, fallback_pmid=fallback_pmid)
    _merge_pubmed_summary_into_fields(entry.fields, summary, fallback_pmid)
    return entry
|
||||
|
||||
|
||||
def _pubmed_article_to_entry(article: ET.Element, fallback_pmid: str = "") -> BibEntry:
    """Convert a ``PubmedArticle`` element into an ``article`` entry.

    Only non-empty values become fields. The ``url`` field points at PMC
    when a PMC id is present and at PubMed otherwise (when a PMID is known).
    ``fallback_pmid`` is used when the XML carries no usable PMID.
    """
    medline = article.find("./MedlineCitation")
    article_node = None if medline is None else medline.find("./Article")
    pubmed_data = article.find("./PubmedData")

    raw_pmid = fallback_pmid if medline is None else _node_text(medline.find("./PMID"))
    pmid = _normalize_pmid(raw_pmid) or _normalize_pmid(fallback_pmid)

    raw_title = "" if article_node is None else _element_text(article_node.find("./ArticleTitle"))
    title = _normalize_text(raw_title)

    author_names = (_pubmed_author_name(node) for node in article.findall(".//AuthorList/Author"))
    authors = " and ".join(name for name in author_names if name)
    journal = _normalize_text(_node_text(article.find(".//Journal/Title")))
    year = _pubmed_article_year(article)
    abstract = _pubmed_abstract_text(article)
    doi = _pubmed_article_identifier(article, "doi")
    pmcid = _pubmed_article_identifier(pubmed_data, "pmc")

    # Insertion order below is the field order of the resulting entry.
    candidates = (
        ("title", title),
        ("author", authors),
        ("year", year),
        ("journal", journal),
        ("abstract", abstract),
        ("doi", doi),
        ("pmid", pmid),
        ("pmcid", pmcid),
    )
    fields: dict[str, str] = {name: value for name, value in candidates if value}
    if pmcid:
        fields["url"] = f"https://pmc.ncbi.nlm.nih.gov/articles/{pmcid}/"
    elif pmid:
        fields["url"] = f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/"

    return BibEntry(
        entry_type="article",
        citation_key=_pubmed_citation_key(doi, pmid, authors, year, title),
        fields=fields,
    )
|
||||
|
||||
|
||||
def _merge_pubmed_summary_into_fields(fields: dict[str, str], summary: dict, fallback_pmid: str) -> None:
    """Fill gaps in ``fields`` in place from the summary-derived fields.

    A key is written only when the summary has a non-empty value and
    ``fields`` has none; existing non-empty values are never replaced.
    """
    summary_fields = _pubmed_summary_fields(summary, fallback_pmid)
    for name, incoming in summary_fields.items():
        if not incoming or fields.get(name):
            continue
        fields[name] = incoming
|
||||
|
||||
|
||||
def _pubmed_summary_fields(summary: dict, fallback_pmid: str) -> dict[str, str]:
    """Map an E-utilities ``esummary`` record onto entry fields.

    Args:
        summary: One per-PMID record from the ``esummary`` JSON result.
            Missing or null keys are tolerated.
        fallback_pmid: PMID to use when the record has no ``uid``.

    Returns:
        A dict containing only the non-empty values among ``title``,
        ``author``, ``year``, ``journal``, ``doi``, ``pmid``, ``pmcid``
        and ``url``. ``url`` points at PMC when a PMC id is known and at
        PubMed otherwise.
    """
    pmid = _normalize_pmid(str(summary.get("uid") or fallback_pmid))
    title = _normalize_text(str(summary.get("title") or ""))
    year = _pubmed_year_from_text(str(summary.get("pubdate") or ""))
    journal = _normalize_text(str(summary.get("fulljournalname") or ""))
    # ``or []`` guards a present-but-null ``authors`` value, matching how
    # ``articleids`` is read below.
    authors = " and ".join(
        name
        for name in (
            _normalize_person_display_name(str(author.get("name") or ""))
            for author in summary.get("authors") or []
        )
        if name
    )

    doi = ""
    pmcid = ""
    for article_id in summary.get("articleids") or []:
        id_type = str(article_id.get("idtype") or "").lower()
        value = str(article_id.get("value") or "")
        if not value:
            continue
        if id_type == "doi":
            doi = value
        elif id_type in {"pmc", "pmcid"}:
            # The value may wrap the accession in extra text (for example
            # ``pmc-id: PMC123456;``). Storing that verbatim would produce a
            # broken PMC URL, so extract the bare accession when present.
            accession = re.search(r"PMC\d+", value, flags=re.IGNORECASE)
            if accession:
                pmcid = accession.group(0).upper()
            elif not pmcid:
                # No recognizable accession: keep the raw value as before,
                # but never let it displace a clean one already found.
                pmcid = value

    fields: dict[str, str] = {}
    if title:
        fields["title"] = title
    if authors:
        fields["author"] = authors
    if year:
        fields["year"] = year
    if journal:
        fields["journal"] = journal
    if doi:
        fields["doi"] = doi
    if pmid:
        fields["pmid"] = pmid
    if pmcid:
        fields["pmcid"] = pmcid
        fields["url"] = f"https://pmc.ncbi.nlm.nih.gov/articles/{pmcid}/"
    elif pmid:
        fields["url"] = f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/"
    return fields
|
||||
|
||||
|
||||
def _pubmed_author_name(author: ET.Element) -> str:
    """Format one PubMed ``Author`` element as a display name.

    A ``CollectiveName`` wins outright. Otherwise the result is
    ``"Last, Fore"``, or ``"Last, I. N."`` built from the letters of
    ``Initials``, falling back to whichever single name part exists.
    """

    def child_text(path: str) -> str:
        return _normalize_text(_node_text(author.find(path)))

    group_name = child_text("./CollectiveName")
    if group_name:
        return group_name

    last = child_text("./LastName")
    first = child_text("./ForeName")
    raw_initials = child_text("./Initials")

    if not last:
        return first
    if first:
        return f"{last}, {first}"
    if raw_initials:
        dotted = " ".join(f"{letter}." for letter in re.findall(r"[A-Za-z]", raw_initials))
        return f"{last}, {dotted}" if dotted else last
    return last
|
||||
|
||||
|
||||
def _pubmed_article_year(article: ET.Element) -> str:
    """Find the publication year of a PubMed article element.

    Explicit ``Year`` elements are tried first, in a fixed order of paths;
    only if none has text are the ``MedlineDate`` strings scanned for a
    year. Returns ``""`` when nothing is found.
    """
    year_paths = (
        ".//JournalIssue/PubDate/Year",
        ".//ArticleDate/Year",
        ".//PubDate/Year",
    )
    explicit_years = (_node_text(article.find(path)) for path in year_paths)
    explicit = next((text for text in explicit_years if text), "")
    if explicit:
        return explicit

    medline_paths = (
        ".//JournalIssue/PubDate/MedlineDate",
        ".//PubDate/MedlineDate",
    )
    parsed_years = (_pubmed_year_from_text(_node_text(article.find(path))) for path in medline_paths)
    return next((text for text in parsed_years if text), "")
|
||||
|
||||
|
||||
def _pubmed_year_from_text(value: str) -> str:
    """Return the first standalone four-digit year (1600-2199) in ``value``.

    Returns ``""`` when the text contains no such year.
    """
    found = re.search(r"\b(1[6-9]\d{2}|20\d{2}|21\d{2})\b", value)
    if found is None:
        return ""
    return found.group(1)
|
||||
|
||||
|
||||
def _pubmed_abstract_text(article: ET.Element) -> str:
    """Join the article's ``AbstractText`` sections into one string.

    Empty sections are skipped. A section with a ``Label`` attribute is
    rendered as ``"Label: text"``. Sections are separated by one space.
    """
    segments: list[str] = []
    for section in article.findall(".//Abstract/AbstractText"):
        body = _normalize_text(_element_text(section))
        if body:
            heading = _normalize_text(section.attrib.get("Label", ""))
            segments.append(body if not heading else f"{heading}: {body}")
    return " ".join(segments)
|
||||
|
||||
|
||||
def _pubmed_article_identifier(root: ET.Element | None, identifier_type: str) -> str:
|
||||
if root is None:
|
||||
return ""
|
||||
normalized_type = identifier_type.lower()
|
||||
for node in root.findall(".//ArticleId"):
|
||||
if str(node.attrib.get("IdType") or "").lower() == normalized_type:
|
||||
return _normalize_text(_element_text(node))
|
||||
if normalized_type == "doi":
|
||||
for node in root.findall(".//ELocationID"):
|
||||
if str(node.attrib.get("EIdType") or "").lower() == "doi":
|
||||
return _normalize_text(_element_text(node))
|
||||
return ""
|
||||
|
||||
|
||||
def _pubmed_citation_key(doi: str, pmid: str, authors: str, year: str, title: str) -> str:
    """Derive a citation key, preferring DOI, then PMID, then a generated key.

    A DOI is lowercased with every non-alphanumeric run removed and prefixed
    with ``doi``. A PMID is prefixed with ``pmid``. With neither, the key is
    built by ``_make_resolution_key`` from author/year/title placeholders.
    """
    if not doi:
        if not pmid:
            return _make_resolution_key(authors or "pubmed", year or "n.d.", title or "untitled")
        return f"pmid{pmid}"
    compact = re.sub(r"[^A-Za-z0-9]+", "", doi).lower()
    return f"doi{compact}"
|
||||
|
||||
|
||||
def _element_text(node: ET.Element | None) -> str:
|
||||
if node is None:
|
||||
return ""
|
||||
return " ".join("".join(node.itertext()).split())
|
||||
|
||||
|
||||
def _datacite_work_to_entry(data: dict) -> BibEntry:
|
||||
attributes = data.get("attributes", {})
|
||||
doi = str(attributes.get("doi") or "")
|
||||
|
|
|
|||
|
|
@ -149,6 +149,20 @@ class BibliographyVerifier:
|
|||
input_type=input_type,
|
||||
input_key=input_key,
|
||||
)
|
||||
if source_entry is not None and source_entry.fields.get("pmid"):
|
||||
direct = self.resolver.resolve_pmid(source_entry.fields["pmid"])
|
||||
if direct is not None:
|
||||
return VerificationResult(
|
||||
query=query,
|
||||
context=context,
|
||||
status="exact",
|
||||
confidence=1.0,
|
||||
entry=direct.entry,
|
||||
source_label=direct.source_label,
|
||||
alternates=[],
|
||||
input_type=input_type,
|
||||
input_key=input_key,
|
||||
)
|
||||
|
||||
candidate_limit = max(1, limit)
|
||||
candidates = self._collect_candidates(
|
||||
|
|
@ -209,6 +223,7 @@ class BibliographyVerifier:
|
|||
("crossref", self.resolver.search_crossref(search_title, limit=limit)),
|
||||
("openalex", self.resolver.search_openalex(search_title, limit=limit)),
|
||||
("datacite", self.resolver.search_datacite(search_title, limit=limit)),
|
||||
("pubmed", self.resolver.search_pubmed(search_title, limit=limit)),
|
||||
):
|
||||
for entry in source_entries:
|
||||
signature = _candidate_signature(entry)
|
||||
|
|
|
|||
|
|
@ -34,6 +34,7 @@ def test_bootstrap_from_topic_only():
|
|||
try:
|
||||
bootstrapper = Bootstrapper()
|
||||
bootstrapper.resolver.search_openalex = lambda topic, limit=5: [] # type: ignore[method-assign]
|
||||
bootstrapper.resolver.search_pubmed = lambda topic, limit=5: [] # type: ignore[method-assign]
|
||||
bootstrapper.resolver.search_crossref = lambda topic, limit=5: [] # type: ignore[method-assign]
|
||||
bootstrapper.resolver.search_datacite = lambda topic, limit=5: [ # type: ignore[method-assign]
|
||||
__import__("citegeist").BibEntry(
|
||||
|
|
@ -139,6 +140,7 @@ def test_bootstrap_ranks_and_deduplicates_topic_candidates():
|
|||
fields={"title": "Graph Topic Ranking", "abstract": "graph topic graph"},
|
||||
)
|
||||
]
|
||||
bootstrapper.resolver.search_pubmed = lambda topic, limit=5: [] # type: ignore[method-assign]
|
||||
bootstrapper.resolver.search_crossref = lambda topic, limit=5: [ # type: ignore[method-assign]
|
||||
BibEntry(
|
||||
entry_type="article",
|
||||
|
|
@ -172,6 +174,7 @@ def test_bootstrap_preview_does_not_write_to_database():
|
|||
bootstrapper.resolver.search_openalex = lambda topic, limit=5: [ # type: ignore[method-assign]
|
||||
BibEntry(entry_type="article", citation_key="preview2024graph", fields={"title": "Preview Graph Topic"})
|
||||
]
|
||||
bootstrapper.resolver.search_pubmed = lambda topic, limit=5: [] # type: ignore[method-assign]
|
||||
bootstrapper.resolver.search_crossref = lambda topic, limit=5: [] # type: ignore[method-assign]
|
||||
bootstrapper.resolver.search_datacite = lambda topic, limit=5: [] # type: ignore[method-assign]
|
||||
|
||||
|
|
@ -194,6 +197,7 @@ def test_bootstrap_topic_commit_limit_restricts_persisted_candidates():
|
|||
BibEntry(entry_type="article", citation_key="rank1", fields={"title": "Graph Topic One"}),
|
||||
BibEntry(entry_type="article", citation_key="rank2", fields={"title": "Graph Topic Two"}),
|
||||
]
|
||||
bootstrapper.resolver.search_pubmed = lambda topic, limit=5: [] # type: ignore[method-assign]
|
||||
bootstrapper.resolver.search_crossref = lambda topic, limit=5: [] # type: ignore[method-assign]
|
||||
bootstrapper.resolver.search_datacite = lambda topic, limit=5: [] # type: ignore[method-assign]
|
||||
bootstrapper.crossref_expander.expand_entry_references = lambda _store, _key: [] # type: ignore[method-assign]
|
||||
|
|
@ -227,6 +231,7 @@ def test_bootstrap_topic_candidates_are_attached_to_topic():
|
|||
fields={"title": "Graph Topic Result", "year": "2024"},
|
||||
)
|
||||
]
|
||||
bootstrapper.resolver.search_pubmed = lambda topic, limit=5: [] # type: ignore[method-assign]
|
||||
bootstrapper.resolver.search_crossref = lambda topic, limit=5: [] # type: ignore[method-assign]
|
||||
bootstrapper.resolver.search_datacite = lambda topic, limit=5: [] # type: ignore[method-assign]
|
||||
bootstrapper.crossref_expander.expand_entry_references = lambda _store, _key: [] # type: ignore[method-assign]
|
||||
|
|
@ -278,6 +283,7 @@ def test_bootstrap_topic_commit_requires_title_anchor():
|
|||
},
|
||||
),
|
||||
]
|
||||
bootstrapper.resolver.search_pubmed = lambda topic, limit=5: [] # type: ignore[method-assign]
|
||||
bootstrapper.resolver.search_crossref = lambda topic, limit=5: [] # type: ignore[method-assign]
|
||||
bootstrapper.resolver.search_datacite = lambda topic, limit=5: [] # type: ignore[method-assign]
|
||||
bootstrapper.crossref_expander.expand_entry_references = lambda _store, _key: [] # type: ignore[method-assign]
|
||||
|
|
@ -482,6 +488,7 @@ def test_bootstrap_preview_uses_topic_commit_limit_when_larger_than_topic_limit(
|
|||
)
|
||||
for index in range(1, 8)
|
||||
][:limit]
|
||||
bootstrapper.resolver.search_pubmed = lambda topic, limit=5: [] # type: ignore[method-assign]
|
||||
bootstrapper.resolver.search_crossref = lambda topic, limit=5: [] # type: ignore[method-assign]
|
||||
bootstrapper.resolver.search_datacite = lambda topic, limit=5: [] # type: ignore[method-assign]
|
||||
|
||||
|
|
|
|||
|
|
@ -8,6 +8,7 @@ from citegeist.resolve import (
|
|||
_crossref_message_to_entry,
|
||||
_datacite_work_to_entry,
|
||||
_openalex_work_to_entry,
|
||||
_pubmed_article_to_entry,
|
||||
merge_entries_with_conflicts,
|
||||
merge_entries,
|
||||
)
|
||||
|
|
@ -88,6 +89,52 @@ def test_arxiv_atom_entry_to_bib_maps_basic_fields():
|
|||
assert entry.fields["doi"] == "10.1000/arxiv-example"
|
||||
|
||||
|
||||
def test_pubmed_article_to_entry_maps_basic_fields():
    """A minimal ``PubmedArticle`` document maps onto the expected key and fields."""
    article_xml = ET.fromstring(
        """
        <PubmedArticle>
          <MedlineCitation>
            <PMID>12345678</PMID>
            <Article>
              <ArticleTitle>PubMed Resolved Work</ArticleTitle>
              <Abstract>
                <AbstractText Label="Background">Evidence summary.</AbstractText>
                <AbstractText>Second paragraph.</AbstractText>
              </Abstract>
              <Journal>
                <JournalIssue>
                  <PubDate><Year>2021</Year></PubDate>
                </JournalIssue>
                <Title>Journal of Evidence</Title>
              </Journal>
              <AuthorList>
                <Author><LastName>Smith</LastName><ForeName>Jane</ForeName></Author>
              </AuthorList>
              <ELocationID EIdType="doi">10.1000/pubmed-example</ELocationID>
            </Article>
          </MedlineCitation>
          <PubmedData>
            <ArticleIdList>
              <ArticleId IdType="pubmed">12345678</ArticleId>
              <ArticleId IdType="pmc">PMC123456</ArticleId>
            </ArticleIdList>
          </PubmedData>
        </PubmedArticle>
        """
    )

    entry = _pubmed_article_to_entry(article_xml)

    assert entry.citation_key == "doi101000pubmedexample"
    expected_fields = {
        "title": "PubMed Resolved Work",
        "author": "Smith, Jane",
        "journal": "Journal of Evidence",
        "year": "2021",
        "pmid": "12345678",
        "pmcid": "PMC123456",
        "abstract": "Background: Evidence summary. Second paragraph.",
    }
    for field_name, expected_value in expected_fields.items():
        assert entry.fields[field_name] == expected_value
|
||||
|
||||
|
||||
def test_merge_entries_prefers_existing_values_and_adds_missing_fields():
|
||||
base = BibEntry(
|
||||
entry_type="article",
|
||||
|
|
@ -209,6 +256,35 @@ def test_resolver_tries_doi_before_dblp():
|
|||
]
|
||||
|
||||
|
||||
def test_resolver_tries_pmid_before_dblp():
    """An entry carrying both identifiers consults the PMID resolver before DBLP."""
    seen: list[tuple[str, str]] = []

    def recording_stub(kind: str):
        # Builds a resolver stand-in that logs the call and resolves nothing.
        def _stub(value: str):
            seen.append((kind, value))
            return None

        return _stub

    resolver = MetadataResolver()
    resolver.resolve_pmid = recording_stub("pmid")  # type: ignore[method-assign]
    resolver.resolve_dblp = recording_stub("dblp")  # type: ignore[method-assign]

    seed = BibEntry(
        entry_type="article",
        citation_key="smith2024graphs",
        fields={"pmid": "12345678", "dblp": "conf/test/Smith24"},
    )
    resolver.resolve_entry(seed)

    assert seen == [
        ("pmid", "12345678"),
        ("dblp", "conf/test/Smith24"),
    ]
|
||||
|
||||
|
||||
def test_openalex_work_to_entry_maps_basic_fields():
|
||||
entry = _openalex_work_to_entry(
|
||||
{
|
||||
|
|
|
|||
|
|
@ -36,6 +36,37 @@ def test_verifier_uses_direct_doi_resolution_for_bib_entries():
|
|||
assert result.source_label == "crossref:doi:10.1000/example"
|
||||
|
||||
|
||||
def test_verifier_uses_direct_pmid_resolution_for_bib_entries():
    """A BibTeX entry with a ``pmid`` field is verified as exact via direct PMID lookup."""

    def fake_resolve_pmid(value):
        resolved_entry = BibEntry(
            entry_type="article",
            citation_key="pmid12345678",
            fields={
                "author": "Smith, Jane",
                "title": "Resolved PubMed Work",
                "year": "2024",
                "pmid": value,
            },
        )
        return Resolution(
            entry=resolved_entry,
            source_type="resolver",
            source_label=f"pubmed:pmid:{value}",
        )

    verifier = BibliographyVerifier()
    verifier.resolver.resolve_pmid = fake_resolve_pmid  # type: ignore[method-assign]

    seed = BibEntry(
        entry_type="misc",
        citation_key="seed2024",
        fields={"title": "Rough Work", "pmid": "12345678"},
    )
    result = verifier.verify_bib_entry(seed)

    assert result.status == "exact"
    assert result.confidence == 1.0
    assert result.entry.fields["title"] == "Resolved PubMed Work"
    assert result.source_label == "pubmed:pmid:12345678"
|
||||
|
||||
|
||||
def test_verifier_scores_and_sorts_search_candidates():
|
||||
verifier = BibliographyVerifier()
|
||||
verifier.resolver.search_crossref = lambda title, limit=5: [ # type: ignore[method-assign]
|
||||
|
|
@ -61,6 +92,7 @@ def test_verifier_scores_and_sorts_search_candidates():
|
|||
]
|
||||
verifier.resolver.search_openalex = lambda title, limit=5: [] # type: ignore[method-assign]
|
||||
verifier.resolver.search_datacite = lambda title, limit=5: [] # type: ignore[method-assign]
|
||||
verifier.resolver.search_pubmed = lambda title, limit=5: [] # type: ignore[method-assign]
|
||||
|
||||
result = verifier.verify_string('"Graph-first bibliography augmentation" Smith 2024')
|
||||
|
||||
|
|
@ -74,6 +106,7 @@ def test_verification_result_to_bib_entry_contains_audit_fields():
|
|||
verifier.resolver.search_crossref = lambda title, limit=5: [] # type: ignore[method-assign]
|
||||
verifier.resolver.search_openalex = lambda title, limit=5: [] # type: ignore[method-assign]
|
||||
verifier.resolver.search_datacite = lambda title, limit=5: [] # type: ignore[method-assign]
|
||||
verifier.resolver.search_pubmed = lambda title, limit=5: [] # type: ignore[method-assign]
|
||||
|
||||
result = verifier._verify_query( # type: ignore[attr-defined]
|
||||
{"title": "Missing Work", "authors": [], "year": "", "venue": ""},
|
||||
|
|
|
|||
Loading…
Reference in New Issue