Add PubMed support to CiteGeist
This commit is contained in:
parent
7bdaf37c59
commit
663fb1973a
|
|
@ -51,7 +51,7 @@ The initial repo includes:
|
||||||
- staged plaintext reference extraction that now preserves more structured metadata from legacy references, including year suffixes, identifiers, volume/issue/pages, and thesis/report/web-style venue hints;
|
- staged plaintext reference extraction that now preserves more structured metadata from legacy references, including year suffixes, identifiers, volume/issue/pages, and thesis/report/web-style venue hints;
|
||||||
- a reference-extraction backend seam with the local `heuristic` parser as the default implementation, so optional external backends can be added later without changing the core extract workflow;
|
- a reference-extraction backend seam with the local `heuristic` parser as the default implementation, so optional external backends can be added later without changing the core extract workflow;
|
||||||
- standalone verification and disambiguation of free-text references or partial BibTeX into auditable BibTeX/JSON results with `x_status`, `x_confidence`, `x_source`, `x_query`, and alternate-candidate traces;
|
- standalone verification and disambiguation of free-text references or partial BibTeX into auditable BibTeX/JSON results with `x_status`, `x_confidence`, `x_source`, `x_query`, and alternate-candidate traces;
|
||||||
- identifier-first metadata resolution for DOI, OpenAlex, DBLP, arXiv, and DataCite-backed entries, with OpenAlex/DataCite title-search fallback;
|
- identifier-first metadata resolution for DOI, PMID/PubMed, OpenAlex, DBLP, arXiv, and DataCite-backed entries, with OpenAlex/DataCite/PubMed title-search fallback;
|
||||||
- local citation-graph traversal over stored `cites`, `cited_by`, and `crossref` edges;
|
- local citation-graph traversal over stored `cites`, `cited_by`, and `crossref` edges;
|
||||||
- Crossref- and OpenAlex-backed graph expansion that materializes draft related works and edge provenance;
|
- Crossref- and OpenAlex-backed graph expansion that materializes draft related works and edge provenance;
|
||||||
- a dedicated source-client layer with fixture/cache support for live-source development;
|
- a dedicated source-client layer with fixture/cache support for live-source development;
|
||||||
|
|
|
||||||
|
|
@ -358,15 +358,12 @@ class Bootstrapper:
|
||||||
})
|
})
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
|
||||||
def _deadline_reached(deadline: float | None) -> bool:
|
|
||||||
return deadline is not None and time.monotonic() >= deadline
|
|
||||||
|
|
||||||
def _topic_candidates(self, topic: str, seed_keys: list[str], limit: int) -> list[tuple[BibEntry, float]]:
|
def _topic_candidates(self, topic: str, seed_keys: list[str], limit: int) -> list[tuple[BibEntry, float]]:
|
||||||
scored: dict[str, tuple[BibEntry, float]] = {}
|
scored: dict[str, tuple[BibEntry, float]] = {}
|
||||||
|
|
||||||
for source_name, base_score, entries in (
|
for _source_name, base_score, entries in (
|
||||||
("openalex", 3.0, self.resolver.search_openalex(topic, limit=limit)),
|
("openalex", 3.0, self.resolver.search_openalex(topic, limit=limit)),
|
||||||
|
("pubmed", 2.5, self.resolver.search_pubmed(topic, limit=limit)),
|
||||||
("crossref", 2.0, self.resolver.search_crossref(topic, limit=limit)),
|
("crossref", 2.0, self.resolver.search_crossref(topic, limit=limit)),
|
||||||
("datacite", 1.5, self.resolver.search_datacite(topic, limit=limit)),
|
("datacite", 1.5, self.resolver.search_datacite(topic, limit=limit)),
|
||||||
):
|
):
|
||||||
|
|
@ -383,6 +380,10 @@ def _deadline_reached(deadline: float | None) -> bool:
|
||||||
return ranked[:limit]
|
return ranked[:limit]
|
||||||
|
|
||||||
|
|
||||||
|
def _deadline_reached(deadline: float | None) -> bool:
|
||||||
|
return deadline is not None and time.monotonic() >= deadline
|
||||||
|
|
||||||
|
|
||||||
def _topic_relevance_score(entry: BibEntry, topic: str) -> float:
|
def _topic_relevance_score(entry: BibEntry, topic: str) -> float:
|
||||||
topic_terms = _tokenize(topic)
|
topic_terms = _tokenize(topic)
|
||||||
title_terms = _tokenize(entry.fields.get("title", ""))
|
title_terms = _tokenize(entry.fields.get("title", ""))
|
||||||
|
|
|
||||||
|
|
@ -36,6 +36,11 @@ class MetadataResolver:
|
||||||
if resolved is not None:
|
if resolved is not None:
|
||||||
return resolved
|
return resolved
|
||||||
|
|
||||||
|
if pmid := entry.fields.get("pmid"):
|
||||||
|
resolved = self.resolve_pmid(pmid)
|
||||||
|
if resolved is not None:
|
||||||
|
return resolved
|
||||||
|
|
||||||
if openalex_id := entry.fields.get("openalex"):
|
if openalex_id := entry.fields.get("openalex"):
|
||||||
resolved = self.resolve_openalex(openalex_id)
|
resolved = self.resolve_openalex(openalex_id)
|
||||||
if resolved is not None:
|
if resolved is not None:
|
||||||
|
|
@ -73,6 +78,13 @@ class MetadataResolver:
|
||||||
)
|
)
|
||||||
if resolved is not None:
|
if resolved is not None:
|
||||||
return resolved
|
return resolved
|
||||||
|
resolved = self.search_pubmed_best_match(
|
||||||
|
title=title,
|
||||||
|
author_text=entry.fields.get("author", ""),
|
||||||
|
year=entry.fields.get("year", ""),
|
||||||
|
)
|
||||||
|
if resolved is not None:
|
||||||
|
return resolved
|
||||||
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
@ -166,6 +178,23 @@ class MetadataResolver:
|
||||||
source_label=f"arxiv:id:{arxiv_id}",
|
source_label=f"arxiv:id:{arxiv_id}",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def resolve_pmid(self, pmid: str) -> Resolution | None:
    """Resolve one PubMed identifier through the NCBI EFetch endpoint.

    Returns ``None`` when the identifier contains no digits, when
    ``_safe_get_xml`` returns ``None``, or when the returned document holds
    no article for that PMID.
    """
    clean_pmid = _normalize_pmid(pmid)
    if not clean_pmid:
        return None

    params = urllib.parse.urlencode({"db": "pubmed", "id": clean_pmid, "retmode": "xml"})
    document = self._safe_get_xml(f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?{params}")
    record = None if document is None else _find_pubmed_article(document, clean_pmid)
    if record is None:
        return None

    return Resolution(
        entry=_pubmed_article_to_entry(record, fallback_pmid=clean_pmid),
        source_type="resolver",
        source_label=f"pubmed:pmid:{clean_pmid}",
    )
|
||||||
|
|
||||||
def resolve_openalex(self, openalex_id: str) -> Resolution | None:
|
def resolve_openalex(self, openalex_id: str) -> Resolution | None:
|
||||||
normalized_id = _normalize_openalex_id(openalex_id)
|
normalized_id = _normalize_openalex_id(openalex_id)
|
||||||
payload = self._safe_get_json(f"https://api.openalex.org/works/{normalized_id}")
|
payload = self._safe_get_json(f"https://api.openalex.org/works/{normalized_id}")
|
||||||
|
|
@ -227,6 +256,30 @@ class MetadataResolver:
|
||||||
return []
|
return []
|
||||||
return [_openalex_work_to_entry(item) for item in payload.get("results", [])]
|
return [_openalex_work_to_entry(item) for item in payload.get("results", [])]
|
||||||
|
|
||||||
|
def search_pubmed(self, title: str, limit: int = 5) -> list[BibEntry]:
    """Search PubMed through ESearch and return the matching records as entries.

    Args:
        title: Free-text query; runs of whitespace are collapsed before sending.
        limit: Maximum number of entries to return. Values below 1 are treated
            as 1, so the ``retmax`` sent to ESearch and the number of PMIDs
            fetched afterwards always agree.

    Returns:
        Entries built by ``_fetch_pubmed_entries`` for the PMIDs ESearch
        listed, or an empty list when the query is blank, ``_safe_get_json``
        returns ``None``, or no usable PMID comes back.
    """
    query_text = " ".join(title.split())
    if not query_text:
        return []

    # Clamp once and reuse: previously ``retmax`` was clamped but the final
    # slice used the raw ``limit``, so a non-positive limit paid for a request
    # and then threw its result away.
    max_results = max(1, limit)
    query = urllib.parse.urlencode(
        {
            "db": "pubmed",
            "retmode": "json",
            "retmax": max_results,
            "term": query_text,
        }
    )
    payload = self._safe_get_json(f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?{query}")
    if payload is None:
        return []

    ids = [
        normalized
        for value in payload.get("esearchresult", {}).get("idlist", [])
        if (normalized := _normalize_pmid(str(value)))
    ]
    if not ids:
        return []
    return self._fetch_pubmed_entries(ids[:max_results])
|
||||||
|
|
||||||
def _safe_get_json(self, url: str) -> dict | None:
|
def _safe_get_json(self, url: str) -> dict | None:
|
||||||
try:
|
try:
|
||||||
return self.source_client.get_json(url)
|
return self.source_client.get_json(url)
|
||||||
|
|
@ -265,6 +318,51 @@ class MetadataResolver:
|
||||||
source_label=f"openalex:search:{title}",
|
source_label=f"openalex:search:{title}",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def search_pubmed_best_match(
    self,
    title: str,
    author_text: str = "",
    year: str = "",
) -> Resolution | None:
    """Search PubMed for *title* and wrap the best-matching hit, if any.

    Up to five search results are handed to ``_select_best_title_match``
    together with the title, author text and year; ``None`` is returned when
    that helper selects nothing.
    """
    hits = self.search_pubmed(title, limit=5)
    best = _select_best_title_match(hits, title=title, author_text=author_text, year=year)
    if best is not None:
        return Resolution(
            entry=best,
            source_type="resolver",
            source_label=f"pubmed:search:{title}",
        )
    return None
|
||||||
|
|
||||||
|
def _fetch_pubmed_entries(self, pmids: list[str]) -> list[BibEntry]:
    """Build entries for *pmids* from one ESummary and one EFetch request.

    Duplicate and empty PMIDs are dropped while keeping first-seen order.
    A PMID is skipped only when neither the JSON summary nor the XML article
    is available for it.
    """
    unique_ids = list(filter(None, dict.fromkeys(pmids)))
    if not unique_ids:
        return []

    joined_ids = ",".join(unique_ids)

    # JSON summaries first, then the full XML records, for the same id list.
    summary_params = urllib.parse.urlencode({"db": "pubmed", "retmode": "json", "id": joined_ids})
    summary_response = self._safe_get_json(
        f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?{summary_params}"
    )
    summary_by_pmid = (summary_response or {}).get("result", {})

    fetch_params = urllib.parse.urlencode({"db": "pubmed", "id": joined_ids, "retmode": "xml"})
    document = self._safe_get_xml(f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?{fetch_params}")
    article_by_pmid = _pubmed_articles_by_pmid(document)

    collected: list[BibEntry] = []
    for current in unique_ids:
        summary = summary_by_pmid.get(current)
        record = article_by_pmid.get(current)
        if summary or record is not None:
            collected.append(_pubmed_record_to_entry(summary or {}, record, fallback_pmid=current))
    return collected
|
||||||
|
|
||||||
def merge_entries(base: BibEntry, resolved: BibEntry) -> BibEntry:
|
def merge_entries(base: BibEntry, resolved: BibEntry) -> BibEntry:
|
||||||
merged, _ = merge_entries_with_conflicts(base, resolved)
|
merged, _ = merge_entries_with_conflicts(base, resolved)
|
||||||
return merged
|
return merged
|
||||||
|
|
@ -651,6 +749,214 @@ def _candidate_matches_author_tokens(candidate: BibEntry, author_tokens: set[str
|
||||||
return bool(author_tokens & candidate_tokens)
|
return bool(author_tokens & candidate_tokens)
|
||||||
|
|
||||||
|
|
||||||
|
def _normalize_pmid(value: str) -> str:
|
||||||
|
return "".join(ch for ch in str(value) if ch.isdigit())
|
||||||
|
|
||||||
|
|
||||||
|
def _pubmed_articles_by_pmid(root: ET.Element | None) -> dict[str, ET.Element]:
    """Index the ``PubmedArticle`` descendants of *root* by normalized PMID.

    Articles without a usable ``MedlineCitation/PMID`` are left out; a
    ``None`` root yields an empty mapping. When a PMID occurs more than once
    the later article replaces the earlier one.
    """
    indexed: dict[str, ET.Element] = {}
    if root is not None:
        for record in root.iterfind(".//PubmedArticle"):
            key = _normalize_pmid(_node_text(record.find("./MedlineCitation/PMID")))
            if key:
                indexed[key] = record
    return indexed
|
||||||
|
|
||||||
|
|
||||||
|
def _find_pubmed_article(root: ET.Element, pmid: str) -> ET.Element | None:
    """Return the ``PubmedArticle`` under *root* whose PMID matches *pmid*, else ``None``."""
    by_pmid = _pubmed_articles_by_pmid(root)
    wanted = _normalize_pmid(pmid)
    return by_pmid.get(wanted)
|
||||||
|
|
||||||
|
|
||||||
|
def _pubmed_record_to_entry(summary: dict, article: ET.Element | None, fallback_pmid: str) -> BibEntry:
    """Combine an ESummary record and an optional EFetch article into one entry.

    With an article present, the XML is the primary source and the summary
    only fills fields the XML left empty. Without one, the entry is built
    from the summary alone.
    """
    if article is None:
        summary_fields = _pubmed_summary_fields(summary, fallback_pmid)
        key = _pubmed_citation_key(
            *(summary_fields.get(name, "") for name in ("doi", "pmid", "author", "year", "title"))
        )
        return BibEntry(entry_type="article", citation_key=key, fields=summary_fields)

    parsed = _pubmed_article_to_entry(article, fallback_pmid=fallback_pmid)
    _merge_pubmed_summary_into_fields(parsed.fields, summary, fallback_pmid)
    return parsed
|
||||||
|
|
||||||
|
|
||||||
|
def _pubmed_article_to_entry(article: ET.Element, fallback_pmid: str = "") -> BibEntry:
    """Convert one ``PubmedArticle`` element into an ``article`` entry.

    Only non-empty values are stored. The ``url`` field points at PMC when a
    PMC id is found and at PubMed otherwise; *fallback_pmid* is used when the
    XML carries no usable PMID.
    """
    citation = article.find("./MedlineCitation")
    body = None if citation is None else citation.find("./Article")

    raw_pmid = fallback_pmid if citation is None else _node_text(citation.find("./PMID"))
    pmid = _normalize_pmid(raw_pmid) or _normalize_pmid(fallback_pmid)

    raw_title = "" if body is None else _element_text(body.find("./ArticleTitle"))
    title = _normalize_text(raw_title)

    author_names = [_pubmed_author_name(node) for node in article.findall(".//AuthorList/Author")]
    authors = " and ".join(filter(None, author_names))

    journal = _normalize_text(_node_text(article.find(".//Journal/Title")))
    year = _pubmed_article_year(article)
    abstract = _pubmed_abstract_text(article)
    doi = _pubmed_article_identifier(article, "doi")
    pmcid = _pubmed_article_identifier(article.find("./PubmedData"), "pmc")

    if pmcid:
        url = f"https://pmc.ncbi.nlm.nih.gov/articles/{pmcid}/"
    elif pmid:
        url = f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/"
    else:
        url = ""

    # Tuple order is the insertion order of the resulting fields dict.
    candidates = (
        ("title", title),
        ("author", authors),
        ("year", year),
        ("journal", journal),
        ("abstract", abstract),
        ("doi", doi),
        ("pmid", pmid),
        ("pmcid", pmcid),
        ("url", url),
    )
    fields: dict[str, str] = {name: value for name, value in candidates if value}

    return BibEntry(
        entry_type="article",
        citation_key=_pubmed_citation_key(doi, pmid, authors, year, title),
        fields=fields,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _merge_pubmed_summary_into_fields(fields: dict[str, str], summary: dict, fallback_pmid: str) -> None:
    """Fill gaps in *fields* (in place) with values derived from *summary*.

    A summary value is copied only when it is non-empty and *fields* has no
    truthy value under the same key, so existing data is never overwritten.
    """
    extras = _pubmed_summary_fields(summary, fallback_pmid)
    fields.update({name: text for name, text in extras.items() if text and not fields.get(name)})
|
||||||
|
|
||||||
|
|
||||||
|
def _pubmed_summary_fields(summary: dict, fallback_pmid: str) -> dict[str, str]:
    """Map one ESummary JSON record onto BibTeX-style fields.

    Reads ``uid``, ``title``, ``pubdate``, ``fulljournalname``,
    ``authors[].name`` and ``articleids[].idtype/value`` from *summary*.
    Only non-empty values are returned. ``url`` points at PMC when a PMC id
    is present and at PubMed otherwise.

    Args:
        summary: ESummary record for a single PMID; may be empty.
        fallback_pmid: PMID to use when the record has no ``uid``.
    """
    pmid = _normalize_pmid(str(summary.get("uid") or fallback_pmid))
    title = _normalize_text(str(summary.get("title") or ""))
    year = _pubmed_year_from_text(str(summary.get("pubdate") or ""))
    journal = _normalize_text(str(summary.get("fulljournalname") or ""))
    # ``or []`` mirrors the ``articleids`` handling below: a JSON null under
    # "authors" must not raise while iterating.
    authors = " and ".join(
        name
        for name in (
            _normalize_person_display_name(str(author.get("name") or ""))
            for author in summary.get("authors") or []
        )
        if name
    )
    doi = ""
    pmcid = ""
    for article_id in summary.get("articleids", []) or []:
        id_type = str(article_id.get("idtype") or "").lower()
        value = str(article_id.get("value") or "")
        if not value:
            continue
        if id_type == "doi":
            doi = value
        elif id_type in {"pmc", "pmcid"}:
            # ESummary has been observed to list the PMC id twice, once bare
            # ("PMC123456") and once decorated ("pmc-id: PMC123456;"). Keep
            # only the accession so the decorated form never reaches the URL.
            accession = re.search(r"PMC\d+", value, flags=re.IGNORECASE)
            if accession:
                pmcid = accession.group(0).upper()
            elif not pmcid:
                # Unrecognized shape: keep the raw value rather than lose it.
                pmcid = value

    fields: dict[str, str] = {}
    if title:
        fields["title"] = title
    if authors:
        fields["author"] = authors
    if year:
        fields["year"] = year
    if journal:
        fields["journal"] = journal
    if doi:
        fields["doi"] = doi
    if pmid:
        fields["pmid"] = pmid
    if pmcid:
        fields["pmcid"] = pmcid
        fields["url"] = f"https://pmc.ncbi.nlm.nih.gov/articles/{pmcid}/"
    elif pmid:
        fields["url"] = f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/"
    return fields
|
||||||
|
|
||||||
|
|
||||||
|
def _pubmed_author_name(author: ET.Element) -> str:
    """Format one PubMed ``Author`` element as a display name.

    Preference order: ``CollectiveName`` as-is, then ``"LastName, ForeName"``,
    then ``"LastName, I. J."`` built from ``Initials``, then whichever of
    last name or fore name exists. Returns an empty string when none do.
    """
    collective = _normalize_text(_node_text(author.find("./CollectiveName")))
    if collective:
        return collective
    family = _normalize_text(_node_text(author.find("./LastName")))
    given = _normalize_text(_node_text(author.find("./ForeName")))
    initials = _normalize_text(_node_text(author.find("./Initials")))
    if family and given:
        return f"{family}, {given}"
    if family and initials:
        # ``str.isalpha`` keeps non-ASCII initials such as "Ö" or "É"; an
        # ASCII-only filter silently dropped them and lost the given name.
        normalized_initials = " ".join(f"{letter}." for letter in initials if letter.isalpha())
        return f"{family}, {normalized_initials}" if normalized_initials else family
    return family or given
|
||||||
|
|
||||||
|
|
||||||
|
def _pubmed_article_year(article: ET.Element) -> str:
    """Return the publication year text for *article*, or ``""`` if none is found.

    Explicit ``Year`` elements are tried first, in the order listed below;
    only then are free-text ``MedlineDate`` values scanned for a year.
    """
    explicit = (
        _node_text(article.find(path))
        for path in (
            ".//JournalIssue/PubDate/Year",
            ".//ArticleDate/Year",
            ".//PubDate/Year",
        )
    )
    # Generators keep the lookups lazy: later paths are not queried once a
    # non-empty value has been found.
    found = next((text for text in explicit if text), "")
    if found:
        return found

    from_medline_date = (
        _pubmed_year_from_text(_node_text(article.find(path)))
        for path in (
            ".//JournalIssue/PubDate/MedlineDate",
            ".//PubDate/MedlineDate",
        )
    )
    return next((text for text in from_medline_date if text), "")
|
||||||
|
|
||||||
|
|
||||||
|
def _pubmed_year_from_text(value: str) -> str:
|
||||||
|
match = re.search(r"\b(1[6-9]\d{2}|20\d{2}|21\d{2})\b", value)
|
||||||
|
return match.group(1) if match else ""
|
||||||
|
|
||||||
|
|
||||||
|
def _pubmed_abstract_text(article: ET.Element) -> str:
    """Join the ``Abstract/AbstractText`` sections of *article* into one string.

    Empty sections are skipped; a section with a ``Label`` attribute is
    rendered as ``"Label: text"``. Sections are separated by single spaces.
    """
    sections: list[str] = []
    for node in article.iterfind(".//Abstract/AbstractText"):
        body = _normalize_text(_element_text(node))
        if body:
            heading = _normalize_text(node.attrib.get("Label", ""))
            sections.append(body if not heading else f"{heading}: {body}")
    return " ".join(sections)
|
||||||
|
|
||||||
|
|
||||||
|
def _pubmed_article_identifier(root: ET.Element | None, identifier_type: str) -> str:
    """Return the article's own identifier of *identifier_type*, or ``""``.

    Looks for an ``ArticleId`` whose ``IdType`` matches (case-insensitively).
    For ``"doi"`` only, falls back to an ``ELocationID`` with
    ``EIdType="doi"``.

    Args:
        root: Element to search below (a ``PubmedArticle`` or its
            ``PubmedData``); ``None`` yields ``""``.
        identifier_type: Identifier kind, e.g. ``"doi"`` or ``"pmc"``.
    """
    if root is None:
        return ""
    normalized_type = identifier_type.lower()
    # ``ArticleId`` elements also occur inside ``ReferenceList``, where they
    # identify *cited* works. Exclude them, otherwise an article without its
    # own DOI/PMC id would inherit one from its bibliography.
    cited_ids = {id(node) for refs in root.iter("ReferenceList") for node in refs.iter("ArticleId")}
    for node in root.findall(".//ArticleId"):
        if id(node) in cited_ids:
            continue
        if str(node.attrib.get("IdType") or "").lower() == normalized_type:
            return _normalize_text(_element_text(node))
    if normalized_type == "doi":
        for node in root.findall(".//ELocationID"):
            if str(node.attrib.get("EIdType") or "").lower() == "doi":
                return _normalize_text(_element_text(node))
    return ""
|
||||||
|
|
||||||
|
|
||||||
|
def _pubmed_citation_key(doi: str, pmid: str, authors: str, year: str, title: str) -> str:
|
||||||
|
if doi:
|
||||||
|
suffix = re.sub(r"[^A-Za-z0-9]+", "", doi).lower()
|
||||||
|
return f"doi{suffix}"
|
||||||
|
if pmid:
|
||||||
|
return f"pmid{pmid}"
|
||||||
|
return _make_resolution_key(authors or "pubmed", year or "n.d.", title or "untitled")
|
||||||
|
|
||||||
|
|
||||||
|
def _element_text(node: ET.Element | None) -> str:
|
||||||
|
if node is None:
|
||||||
|
return ""
|
||||||
|
return " ".join("".join(node.itertext()).split())
|
||||||
|
|
||||||
|
|
||||||
def _datacite_work_to_entry(data: dict) -> BibEntry:
|
def _datacite_work_to_entry(data: dict) -> BibEntry:
|
||||||
attributes = data.get("attributes", {})
|
attributes = data.get("attributes", {})
|
||||||
doi = str(attributes.get("doi") or "")
|
doi = str(attributes.get("doi") or "")
|
||||||
|
|
|
||||||
|
|
@ -149,6 +149,20 @@ class BibliographyVerifier:
|
||||||
input_type=input_type,
|
input_type=input_type,
|
||||||
input_key=input_key,
|
input_key=input_key,
|
||||||
)
|
)
|
||||||
|
if source_entry is not None and source_entry.fields.get("pmid"):
|
||||||
|
direct = self.resolver.resolve_pmid(source_entry.fields["pmid"])
|
||||||
|
if direct is not None:
|
||||||
|
return VerificationResult(
|
||||||
|
query=query,
|
||||||
|
context=context,
|
||||||
|
status="exact",
|
||||||
|
confidence=1.0,
|
||||||
|
entry=direct.entry,
|
||||||
|
source_label=direct.source_label,
|
||||||
|
alternates=[],
|
||||||
|
input_type=input_type,
|
||||||
|
input_key=input_key,
|
||||||
|
)
|
||||||
|
|
||||||
candidate_limit = max(1, limit)
|
candidate_limit = max(1, limit)
|
||||||
candidates = self._collect_candidates(
|
candidates = self._collect_candidates(
|
||||||
|
|
@ -209,6 +223,7 @@ class BibliographyVerifier:
|
||||||
("crossref", self.resolver.search_crossref(search_title, limit=limit)),
|
("crossref", self.resolver.search_crossref(search_title, limit=limit)),
|
||||||
("openalex", self.resolver.search_openalex(search_title, limit=limit)),
|
("openalex", self.resolver.search_openalex(search_title, limit=limit)),
|
||||||
("datacite", self.resolver.search_datacite(search_title, limit=limit)),
|
("datacite", self.resolver.search_datacite(search_title, limit=limit)),
|
||||||
|
("pubmed", self.resolver.search_pubmed(search_title, limit=limit)),
|
||||||
):
|
):
|
||||||
for entry in source_entries:
|
for entry in source_entries:
|
||||||
signature = _candidate_signature(entry)
|
signature = _candidate_signature(entry)
|
||||||
|
|
|
||||||
|
|
@ -34,6 +34,7 @@ def test_bootstrap_from_topic_only():
|
||||||
try:
|
try:
|
||||||
bootstrapper = Bootstrapper()
|
bootstrapper = Bootstrapper()
|
||||||
bootstrapper.resolver.search_openalex = lambda topic, limit=5: [] # type: ignore[method-assign]
|
bootstrapper.resolver.search_openalex = lambda topic, limit=5: [] # type: ignore[method-assign]
|
||||||
|
bootstrapper.resolver.search_pubmed = lambda topic, limit=5: [] # type: ignore[method-assign]
|
||||||
bootstrapper.resolver.search_crossref = lambda topic, limit=5: [] # type: ignore[method-assign]
|
bootstrapper.resolver.search_crossref = lambda topic, limit=5: [] # type: ignore[method-assign]
|
||||||
bootstrapper.resolver.search_datacite = lambda topic, limit=5: [ # type: ignore[method-assign]
|
bootstrapper.resolver.search_datacite = lambda topic, limit=5: [ # type: ignore[method-assign]
|
||||||
__import__("citegeist").BibEntry(
|
__import__("citegeist").BibEntry(
|
||||||
|
|
@ -139,6 +140,7 @@ def test_bootstrap_ranks_and_deduplicates_topic_candidates():
|
||||||
fields={"title": "Graph Topic Ranking", "abstract": "graph topic graph"},
|
fields={"title": "Graph Topic Ranking", "abstract": "graph topic graph"},
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
|
bootstrapper.resolver.search_pubmed = lambda topic, limit=5: [] # type: ignore[method-assign]
|
||||||
bootstrapper.resolver.search_crossref = lambda topic, limit=5: [ # type: ignore[method-assign]
|
bootstrapper.resolver.search_crossref = lambda topic, limit=5: [ # type: ignore[method-assign]
|
||||||
BibEntry(
|
BibEntry(
|
||||||
entry_type="article",
|
entry_type="article",
|
||||||
|
|
@ -172,6 +174,7 @@ def test_bootstrap_preview_does_not_write_to_database():
|
||||||
bootstrapper.resolver.search_openalex = lambda topic, limit=5: [ # type: ignore[method-assign]
|
bootstrapper.resolver.search_openalex = lambda topic, limit=5: [ # type: ignore[method-assign]
|
||||||
BibEntry(entry_type="article", citation_key="preview2024graph", fields={"title": "Preview Graph Topic"})
|
BibEntry(entry_type="article", citation_key="preview2024graph", fields={"title": "Preview Graph Topic"})
|
||||||
]
|
]
|
||||||
|
bootstrapper.resolver.search_pubmed = lambda topic, limit=5: [] # type: ignore[method-assign]
|
||||||
bootstrapper.resolver.search_crossref = lambda topic, limit=5: [] # type: ignore[method-assign]
|
bootstrapper.resolver.search_crossref = lambda topic, limit=5: [] # type: ignore[method-assign]
|
||||||
bootstrapper.resolver.search_datacite = lambda topic, limit=5: [] # type: ignore[method-assign]
|
bootstrapper.resolver.search_datacite = lambda topic, limit=5: [] # type: ignore[method-assign]
|
||||||
|
|
||||||
|
|
@ -194,6 +197,7 @@ def test_bootstrap_topic_commit_limit_restricts_persisted_candidates():
|
||||||
BibEntry(entry_type="article", citation_key="rank1", fields={"title": "Graph Topic One"}),
|
BibEntry(entry_type="article", citation_key="rank1", fields={"title": "Graph Topic One"}),
|
||||||
BibEntry(entry_type="article", citation_key="rank2", fields={"title": "Graph Topic Two"}),
|
BibEntry(entry_type="article", citation_key="rank2", fields={"title": "Graph Topic Two"}),
|
||||||
]
|
]
|
||||||
|
bootstrapper.resolver.search_pubmed = lambda topic, limit=5: [] # type: ignore[method-assign]
|
||||||
bootstrapper.resolver.search_crossref = lambda topic, limit=5: [] # type: ignore[method-assign]
|
bootstrapper.resolver.search_crossref = lambda topic, limit=5: [] # type: ignore[method-assign]
|
||||||
bootstrapper.resolver.search_datacite = lambda topic, limit=5: [] # type: ignore[method-assign]
|
bootstrapper.resolver.search_datacite = lambda topic, limit=5: [] # type: ignore[method-assign]
|
||||||
bootstrapper.crossref_expander.expand_entry_references = lambda _store, _key: [] # type: ignore[method-assign]
|
bootstrapper.crossref_expander.expand_entry_references = lambda _store, _key: [] # type: ignore[method-assign]
|
||||||
|
|
@ -227,6 +231,7 @@ def test_bootstrap_topic_candidates_are_attached_to_topic():
|
||||||
fields={"title": "Graph Topic Result", "year": "2024"},
|
fields={"title": "Graph Topic Result", "year": "2024"},
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
|
bootstrapper.resolver.search_pubmed = lambda topic, limit=5: [] # type: ignore[method-assign]
|
||||||
bootstrapper.resolver.search_crossref = lambda topic, limit=5: [] # type: ignore[method-assign]
|
bootstrapper.resolver.search_crossref = lambda topic, limit=5: [] # type: ignore[method-assign]
|
||||||
bootstrapper.resolver.search_datacite = lambda topic, limit=5: [] # type: ignore[method-assign]
|
bootstrapper.resolver.search_datacite = lambda topic, limit=5: [] # type: ignore[method-assign]
|
||||||
bootstrapper.crossref_expander.expand_entry_references = lambda _store, _key: [] # type: ignore[method-assign]
|
bootstrapper.crossref_expander.expand_entry_references = lambda _store, _key: [] # type: ignore[method-assign]
|
||||||
|
|
@ -278,6 +283,7 @@ def test_bootstrap_topic_commit_requires_title_anchor():
|
||||||
},
|
},
|
||||||
),
|
),
|
||||||
]
|
]
|
||||||
|
bootstrapper.resolver.search_pubmed = lambda topic, limit=5: [] # type: ignore[method-assign]
|
||||||
bootstrapper.resolver.search_crossref = lambda topic, limit=5: [] # type: ignore[method-assign]
|
bootstrapper.resolver.search_crossref = lambda topic, limit=5: [] # type: ignore[method-assign]
|
||||||
bootstrapper.resolver.search_datacite = lambda topic, limit=5: [] # type: ignore[method-assign]
|
bootstrapper.resolver.search_datacite = lambda topic, limit=5: [] # type: ignore[method-assign]
|
||||||
bootstrapper.crossref_expander.expand_entry_references = lambda _store, _key: [] # type: ignore[method-assign]
|
bootstrapper.crossref_expander.expand_entry_references = lambda _store, _key: [] # type: ignore[method-assign]
|
||||||
|
|
@ -482,6 +488,7 @@ def test_bootstrap_preview_uses_topic_commit_limit_when_larger_than_topic_limit(
|
||||||
)
|
)
|
||||||
for index in range(1, 8)
|
for index in range(1, 8)
|
||||||
][:limit]
|
][:limit]
|
||||||
|
bootstrapper.resolver.search_pubmed = lambda topic, limit=5: [] # type: ignore[method-assign]
|
||||||
bootstrapper.resolver.search_crossref = lambda topic, limit=5: [] # type: ignore[method-assign]
|
bootstrapper.resolver.search_crossref = lambda topic, limit=5: [] # type: ignore[method-assign]
|
||||||
bootstrapper.resolver.search_datacite = lambda topic, limit=5: [] # type: ignore[method-assign]
|
bootstrapper.resolver.search_datacite = lambda topic, limit=5: [] # type: ignore[method-assign]
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -8,6 +8,7 @@ from citegeist.resolve import (
|
||||||
_crossref_message_to_entry,
|
_crossref_message_to_entry,
|
||||||
_datacite_work_to_entry,
|
_datacite_work_to_entry,
|
||||||
_openalex_work_to_entry,
|
_openalex_work_to_entry,
|
||||||
|
_pubmed_article_to_entry,
|
||||||
merge_entries_with_conflicts,
|
merge_entries_with_conflicts,
|
||||||
merge_entries,
|
merge_entries,
|
||||||
)
|
)
|
||||||
|
|
@ -88,6 +89,52 @@ def test_arxiv_atom_entry_to_bib_maps_basic_fields():
|
||||||
assert entry.fields["doi"] == "10.1000/arxiv-example"
|
assert entry.fields["doi"] == "10.1000/arxiv-example"
|
||||||
|
|
||||||
|
|
||||||
|
def test_pubmed_article_to_entry_maps_basic_fields():
    """A representative ``PubmedArticle`` maps onto the expected entry fields.

    The fixture carries its DOI only as an ``ELocationID`` and its PMC id as
    an ``ArticleId``, so the DOI and URL assertions cover the DOI fallback
    and the PMC-preferred URL as well as the basic fields.
    """
    xml = ET.fromstring(
        """
        <PubmedArticle>
          <MedlineCitation>
            <PMID>12345678</PMID>
            <Article>
              <ArticleTitle>PubMed Resolved Work</ArticleTitle>
              <Abstract>
                <AbstractText Label="Background">Evidence summary.</AbstractText>
                <AbstractText>Second paragraph.</AbstractText>
              </Abstract>
              <Journal>
                <JournalIssue>
                  <PubDate><Year>2021</Year></PubDate>
                </JournalIssue>
                <Title>Journal of Evidence</Title>
              </Journal>
              <AuthorList>
                <Author><LastName>Smith</LastName><ForeName>Jane</ForeName></Author>
              </AuthorList>
              <ELocationID EIdType="doi">10.1000/pubmed-example</ELocationID>
            </Article>
          </MedlineCitation>
          <PubmedData>
            <ArticleIdList>
              <ArticleId IdType="pubmed">12345678</ArticleId>
              <ArticleId IdType="pmc">PMC123456</ArticleId>
            </ArticleIdList>
          </PubmedData>
        </PubmedArticle>
        """
    )

    entry = _pubmed_article_to_entry(xml)

    assert entry.citation_key == "doi101000pubmedexample"
    assert entry.fields["title"] == "PubMed Resolved Work"
    assert entry.fields["author"] == "Smith, Jane"
    assert entry.fields["journal"] == "Journal of Evidence"
    assert entry.fields["year"] == "2021"
    assert entry.fields["pmid"] == "12345678"
    assert entry.fields["pmcid"] == "PMC123456"
    assert entry.fields["abstract"] == "Background: Evidence summary. Second paragraph."
    # Previously unchecked: DOI taken from ELocationID, URL preferring PMC.
    assert entry.fields["doi"] == "10.1000/pubmed-example"
    assert entry.fields["url"] == "https://pmc.ncbi.nlm.nih.gov/articles/PMC123456/"
|
||||||
|
|
||||||
|
|
||||||
def test_merge_entries_prefers_existing_values_and_adds_missing_fields():
|
def test_merge_entries_prefers_existing_values_and_adds_missing_fields():
|
||||||
base = BibEntry(
|
base = BibEntry(
|
||||||
entry_type="article",
|
entry_type="article",
|
||||||
|
|
@ -209,6 +256,35 @@ def test_resolver_tries_doi_before_dblp():
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def test_resolver_tries_pmid_before_dblp():
    """``resolve_entry`` consults the PMID resolver before the DBLP resolver."""
    resolver = MetadataResolver()
    seen: list[tuple[str, str]] = []

    def recorder(label: str):
        # Build a stub that logs its call and reports "not resolved".
        def _record(value: str):
            seen.append((label, value))
            return None

        return _record

    resolver.resolve_pmid = recorder("pmid")  # type: ignore[method-assign]
    resolver.resolve_dblp = recorder("dblp")  # type: ignore[method-assign]

    seed = BibEntry(
        entry_type="article",
        citation_key="smith2024graphs",
        fields={"pmid": "12345678", "dblp": "conf/test/Smith24"},
    )
    resolver.resolve_entry(seed)

    expected = [
        ("pmid", "12345678"),
        ("dblp", "conf/test/Smith24"),
    ]
    assert seen == expected
|
||||||
|
|
||||||
|
|
||||||
def test_openalex_work_to_entry_maps_basic_fields():
|
def test_openalex_work_to_entry_maps_basic_fields():
|
||||||
entry = _openalex_work_to_entry(
|
entry = _openalex_work_to_entry(
|
||||||
{
|
{
|
||||||
|
|
|
||||||
|
|
@ -36,6 +36,37 @@ def test_verifier_uses_direct_doi_resolution_for_bib_entries():
|
||||||
assert result.source_label == "crossref:doi:10.1000/example"
|
assert result.source_label == "crossref:doi:10.1000/example"
|
||||||
|
|
||||||
|
|
||||||
|
def test_verifier_uses_direct_pmid_resolution_for_bib_entries():
    """A BibTeX entry carrying a PMID is verified as exact via ``resolve_pmid``."""
    verifier = BibliographyVerifier()

    def fake_resolve_pmid(value):
        # Stand-in for the network-backed resolver: echoes the PMID back.
        resolved_entry = BibEntry(
            entry_type="article",
            citation_key="pmid12345678",
            fields={
                "author": "Smith, Jane",
                "title": "Resolved PubMed Work",
                "year": "2024",
                "pmid": value,
            },
        )
        return Resolution(
            entry=resolved_entry,
            source_type="resolver",
            source_label=f"pubmed:pmid:{value}",
        )

    verifier.resolver.resolve_pmid = fake_resolve_pmid  # type: ignore[method-assign]

    seed = BibEntry(
        entry_type="misc",
        citation_key="seed2024",
        fields={"title": "Rough Work", "pmid": "12345678"},
    )
    result = verifier.verify_bib_entry(seed)

    assert result.status == "exact"
    assert result.confidence == 1.0
    assert result.entry.fields["title"] == "Resolved PubMed Work"
    assert result.source_label == "pubmed:pmid:12345678"
|
||||||
|
|
||||||
|
|
||||||
def test_verifier_scores_and_sorts_search_candidates():
|
def test_verifier_scores_and_sorts_search_candidates():
|
||||||
verifier = BibliographyVerifier()
|
verifier = BibliographyVerifier()
|
||||||
verifier.resolver.search_crossref = lambda title, limit=5: [ # type: ignore[method-assign]
|
verifier.resolver.search_crossref = lambda title, limit=5: [ # type: ignore[method-assign]
|
||||||
|
|
@ -61,6 +92,7 @@ def test_verifier_scores_and_sorts_search_candidates():
|
||||||
]
|
]
|
||||||
verifier.resolver.search_openalex = lambda title, limit=5: [] # type: ignore[method-assign]
|
verifier.resolver.search_openalex = lambda title, limit=5: [] # type: ignore[method-assign]
|
||||||
verifier.resolver.search_datacite = lambda title, limit=5: [] # type: ignore[method-assign]
|
verifier.resolver.search_datacite = lambda title, limit=5: [] # type: ignore[method-assign]
|
||||||
|
verifier.resolver.search_pubmed = lambda title, limit=5: [] # type: ignore[method-assign]
|
||||||
|
|
||||||
result = verifier.verify_string('"Graph-first bibliography augmentation" Smith 2024')
|
result = verifier.verify_string('"Graph-first bibliography augmentation" Smith 2024')
|
||||||
|
|
||||||
|
|
@ -74,6 +106,7 @@ def test_verification_result_to_bib_entry_contains_audit_fields():
|
||||||
verifier.resolver.search_crossref = lambda title, limit=5: [] # type: ignore[method-assign]
|
verifier.resolver.search_crossref = lambda title, limit=5: [] # type: ignore[method-assign]
|
||||||
verifier.resolver.search_openalex = lambda title, limit=5: [] # type: ignore[method-assign]
|
verifier.resolver.search_openalex = lambda title, limit=5: [] # type: ignore[method-assign]
|
||||||
verifier.resolver.search_datacite = lambda title, limit=5: [] # type: ignore[method-assign]
|
verifier.resolver.search_datacite = lambda title, limit=5: [] # type: ignore[method-assign]
|
||||||
|
verifier.resolver.search_pubmed = lambda title, limit=5: [] # type: ignore[method-assign]
|
||||||
|
|
||||||
result = verifier._verify_query( # type: ignore[attr-defined]
|
result = verifier._verify_query( # type: ignore[attr-defined]
|
||||||
{"title": "Missing Work", "authors": [], "year": "", "venue": ""},
|
{"title": "Missing Work", "authors": [], "year": "", "venue": ""},
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue