from __future__ import annotations import html import http.client import os import re import urllib.error import urllib.parse import xml.etree.ElementTree as ET from dataclasses import dataclass from .bibtex import BibEntry, parse_bibtex from .sources import SourceClient @dataclass(slots=True) class Resolution: entry: BibEntry source_type: str source_label: str class MetadataResolver: def __init__( self, user_agent: str = "citegeist/0.1 (local research tool)", source_client: SourceClient | None = None, ncbi_api_key: str | None = None, ncbi_tool: str | None = None, ncbi_email: str | None = None, ) -> None: self.user_agent = user_agent self.source_client = source_client or SourceClient(user_agent=user_agent) self.ncbi_api_key = ncbi_api_key if ncbi_api_key is not None else os.environ.get("NCBI_API_KEY", "") self.ncbi_tool = ncbi_tool if ncbi_tool is not None else os.environ.get("NCBI_TOOL", "citegeist") self.ncbi_email = ncbi_email if ncbi_email is not None else os.environ.get("NCBI_EMAIL", "") def resolve_entry(self, entry: BibEntry) -> Resolution | None: if doi := entry.fields.get("doi"): resolved = self.resolve_doi(doi) if resolved is not None: return resolved resolved = self.resolve_datacite_doi(doi) if resolved is not None: return resolved if pmid := entry.fields.get("pmid"): resolved = self.resolve_pmid(pmid) if resolved is not None: return resolved if openalex_id := entry.fields.get("openalex"): resolved = self.resolve_openalex(openalex_id) if resolved is not None: return resolved if dblp_key := entry.fields.get("dblp"): resolved = self.resolve_dblp(dblp_key) if resolved is not None: return resolved if arxiv_id := entry.fields.get("arxiv"): resolved = self.resolve_arxiv(arxiv_id) if resolved is not None: return resolved if title := entry.fields.get("title"): resolved = self.search_crossref_best_match( title=title, author_text=entry.fields.get("author", ""), year=entry.fields.get("year", ""), ) if resolved is not None: return resolved resolved = self.search_datacite_best_match( title=title, author_text=entry.fields.get("author", ""), year=entry.fields.get("year", ""), ) if resolved is not None: return resolved resolved = self.search_openalex_best_match( title=title, author_text=entry.fields.get("author", ""), year=entry.fields.get("year", ""), ) if resolved is not None: return resolved resolved = self.search_pubmed_best_match( title=title, author_text=entry.fields.get("author", ""), year=entry.fields.get("year", ""), ) if resolved is not None: return resolved return None def resolve_doi(self, doi: str) -> Resolution | None: encoded = urllib.parse.quote(doi, safe="") payload = self._safe_get_json(f"https://api.crossref.org/works/{encoded}") if payload is None: return None message = payload.get("message", {}) if not message: return None return Resolution( entry=_crossref_message_to_entry(message), source_type="resolver", source_label=f"crossref:doi:{doi}", ) def search_crossref(self, title: str, limit: int = 5) -> list[BibEntry]: query = urllib.parse.urlencode({"query.title": title, "rows": limit}) payload = self._safe_get_json(f"https://api.crossref.org/works?{query}") if payload is None: return [] items = payload.get("message", {}).get("items", []) return [_crossref_message_to_entry(item) for item in items] def search_crossref_best_match( self, title: str, author_text: str = "", year: str = "", ) -> Resolution | None: candidate = _select_best_title_match( self.search_crossref(title, limit=5), title=title, author_text=author_text, year=year, ) if candidate is None: return None return Resolution( entry=candidate, source_type="resolver", source_label=f"crossref:search:{title}", ) def resolve_dblp(self, dblp_key: str) -> Resolution | None: encoded_key = urllib.parse.quote(dblp_key, safe="/:") text = self._safe_get_text(f"https://dblp.org/rec/{encoded_key}.bib") if text is None: return None entries = parse_bibtex(text) if not entries: return None return Resolution( entry=entries[0], source_type="resolver", source_label=f"dblp:key:{dblp_key}", ) def search_dblp(self, query_text: str, limit: int = 5) -> list[BibEntry]: query = urllib.parse.urlencode({"q": query_text, "format": "json", "h": limit}) payload = self._safe_get_json(f"https://dblp.org/search/publ/api?{query}") if payload is None: return [] hits = payload.get("result", {}).get("hits", {}).get("hit", []) if isinstance(hits, dict): hits = [hits] results: list[BibEntry] = [] for hit in hits: info = hit.get("info", {}) dblp_key = info.get("key") if dblp_key: resolved = self.resolve_dblp(dblp_key) if resolved is not None: results.append(resolved.entry) return results def resolve_arxiv(self, arxiv_id: str) -> Resolution | None: query = urllib.parse.urlencode({"id_list": arxiv_id}) root = self._safe_get_xml(f"https://export.arxiv.org/api/query?{query}") if root is None: return None namespace = {"atom": "http://www.w3.org/2005/Atom"} entry = root.find("atom:entry", namespace) if entry is None: return None return Resolution( entry=_arxiv_atom_entry_to_bib(entry, arxiv_id), source_type="resolver", source_label=f"arxiv:id:{arxiv_id}", ) def resolve_pmid(self, pmid: str) -> Resolution | None: normalized_pmid = _normalize_pmid(pmid) if not normalized_pmid: return None query = urllib.parse.urlencode( self._ncbi_params({"db": "pubmed", "id": normalized_pmid, "retmode": "xml"}) ) root = self._safe_get_xml(f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?{query}") if root is None: return None article = _find_pubmed_article(root, normalized_pmid) if article is None: return None return Resolution( entry=_pubmed_article_to_entry(article, fallback_pmid=normalized_pmid), source_type="resolver", source_label=f"pubmed:pmid:{normalized_pmid}", ) def resolve_openalex(self, openalex_id: str) -> Resolution | None: normalized_id = _normalize_openalex_id(openalex_id) payload = self._safe_get_json(f"https://api.openalex.org/works/{normalized_id}") if payload is None: return None if not payload: return None return Resolution( entry=_openalex_work_to_entry(payload), source_type="resolver", source_label=f"openalex:id:{normalized_id}", ) def resolve_datacite_doi(self, doi: str) -> Resolution | None: encoded = urllib.parse.quote(doi, safe="") payload = self._safe_get_json(f"https://api.datacite.org/dois/{encoded}") if payload is None: return None data = payload.get("data", {}) if not data: return None return Resolution( entry=_datacite_work_to_entry(data), source_type="resolver", source_label=f"datacite:doi:{doi}", ) def search_datacite(self, title: str, limit: int = 5) -> list[BibEntry]: query = urllib.parse.urlencode({"query": title, "page[size]": limit}) payload = self._safe_get_json(f"https://api.datacite.org/dois?{query}") if payload is None: return [] return [_datacite_work_to_entry(item) for item in payload.get("data", [])] def search_datacite_best_match( self, title: str, author_text: str = "", year: str = "", ) -> Resolution | None: candidate = _select_best_title_match( self.search_datacite(title, limit=5), title=title, author_text=author_text, year=year, ) if candidate is None: return None return Resolution( entry=candidate, source_type="resolver", source_label=f"datacite:search:{title}", ) def search_openalex(self, title: str, limit: int = 5) -> list[BibEntry]: query = urllib.parse.urlencode({"search": title, "per-page": limit}) payload = self._safe_get_json(f"https://api.openalex.org/works?{query}") if payload is None: return [] return [_openalex_work_to_entry(item) for item in payload.get("results", [])] def search_pubmed(self, title: str, limit: int = 5) -> list[BibEntry]: query_text = " ".join(title.split()) if not query_text: return [] query = urllib.parse.urlencode( self._ncbi_params({ "db": "pubmed", "retmode": "json", "retmax": max(1, limit), "term": query_text, }) ) payload = self._safe_get_json(f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?{query}") if payload is None: return [] ids = [ normalized for value in payload.get("esearchresult", {}).get("idlist", []) if (normalized := _normalize_pmid(str(value))) ] if not ids: return [] return self._fetch_pubmed_entries(ids[:limit]) def _safe_get_json(self, url: str) -> dict | None: try: return self.source_client.get_json(url) except ( http.client.RemoteDisconnected, urllib.error.HTTPError, urllib.error.URLError, TimeoutError, ValueError, ): return None def _safe_get_text(self, url: str) -> str | None: try: return self.source_client.get_text(url) except ( http.client.RemoteDisconnected, urllib.error.HTTPError, urllib.error.URLError, TimeoutError, ValueError, ): return None def _safe_get_xml(self, url: str) -> ET.Element | None: try: return self.source_client.get_xml(url) except ( http.client.RemoteDisconnected, urllib.error.HTTPError, urllib.error.URLError, TimeoutError, ET.ParseError, ValueError, ): return None def search_openalex_best_match( self, title: str, author_text: str = "", year: str = "", ) -> Resolution | None: candidate = _select_best_title_match( self.search_openalex(title, limit=5), title=title, author_text=author_text, year=year, ) if candidate is None: return None return Resolution( entry=candidate, source_type="resolver", source_label=f"openalex:search:{title}", ) def search_pubmed_best_match( self, title: str, author_text: str = "", year: str = "", ) -> Resolution | None: candidate = _select_best_title_match( self.search_pubmed(title, limit=5), title=title, author_text=author_text, year=year, ) if candidate is None: return None return Resolution( entry=candidate, source_type="resolver", source_label=f"pubmed:search:{title}", ) def _fetch_pubmed_entries(self, pmids: list[str]) -> list[BibEntry]: ordered_pmids = [pmid for pmid in dict.fromkeys(pmids) if pmid] if not ordered_pmids: return [] id_param = ",".join(ordered_pmids) summary_query = urllib.parse.urlencode(self._ncbi_params({"db": "pubmed", "retmode": "json", "id": id_param})) summaries_payload = self._safe_get_json( f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?{summary_query}" ) or {} summaries = summaries_payload.get("result", {}) fetch_query = urllib.parse.urlencode(self._ncbi_params({"db": "pubmed", "id": id_param, "retmode": "xml"})) root = self._safe_get_xml(f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?{fetch_query}") articles = _pubmed_articles_by_pmid(root) entries: list[BibEntry] = [] for pmid in ordered_pmids: summary = summaries.get(pmid) article = articles.get(pmid) if not summary and article is None: continue entries.append(_pubmed_record_to_entry(summary or {}, article, fallback_pmid=pmid)) return entries def _ncbi_params(self, params: dict[str, object]) -> dict[str, object]: enriched = dict(params) if self.ncbi_api_key: enriched["api_key"] = self.ncbi_api_key if self.ncbi_tool: enriched["tool"] = self.ncbi_tool if self.ncbi_email: enriched["email"] = self.ncbi_email return enriched def merge_entries(base: BibEntry, resolved: BibEntry) -> BibEntry: merged, _ = merge_entries_with_conflicts(base, resolved) return merged def merge_entries_with_conflicts(base: BibEntry, resolved: BibEntry) -> tuple[BibEntry, list[dict[str, str]]]: merged_fields = dict(base.fields) conflicts: list[dict[str, str]] = [] for key, value in resolved.fields.items(): if not value: continue current_value = merged_fields.get(key, "") if _is_placeholder_value(key, current_value) and current_value != value: merged_fields[key] = value continue if current_value and current_value != value: conflicts.append( { "field_name": key, "current_value": current_value, "proposed_value": value, } ) continue if key not in merged_fields or not merged_fields[key]: merged_fields[key] = value return ( BibEntry( entry_type=_merged_entry_type(base.entry_type, resolved.entry_type), citation_key=base.citation_key, fields=merged_fields, ), conflicts, ) def _is_placeholder_value(field_name: str, value: str) -> bool: normalized = " ".join((value or "").split()).strip() if not normalized: return True lowered = normalized.lower() if field_name == "title": return bool(re.fullmatch(r"referenced work \d+", lowered)) or lowered.startswith("untitled") return False def _merged_entry_type(base_entry_type: str, resolved_entry_type: str) -> str: if base_entry_type == "misc" and resolved_entry_type and resolved_entry_type != "misc": return resolved_entry_type return base_entry_type or resolved_entry_type def _crossref_message_to_entry(message: dict) -> BibEntry: entry_type = _crossref_type_to_bibtype(message.get("type", "article")) title_values = message.get("title", []) title = _normalize_text(title_values[0] if title_values else "") year = _extract_crossref_year(message) authors = " and ".join(_crossref_person_to_name(person) for person in message.get("author", [])) venue = "" if container_title := message.get("container-title", []): venue = _normalize_text(container_title[0]) fields: dict[str, str] = {} if authors: fields["author"] = authors if title: fields["title"] = title if year: fields["year"] = year if doi := message.get("DOI"): fields["doi"] = doi if url := message.get("URL"): fields["url"] = url if abstract := message.get("abstract"): normalized_abstract = _normalize_abstract_text(str(abstract)) if normalized_abstract: fields["abstract"] = normalized_abstract if venue: if entry_type == "article": fields["journal"] = venue else: fields["booktitle"] = venue if volume := message.get("volume"): fields["volume"] = str(volume) if issue := message.get("issue"): fields["number"] = str(issue) if pages := message.get("page"): fields["pages"] = str(pages) citation_key = _make_resolution_key(fields.get("author", "crossref"), year or "n.d.", title or "untitled") return BibEntry(entry_type=entry_type, citation_key=citation_key, fields=fields) def _arxiv_atom_entry_to_bib(node: ET.Element, arxiv_id: str) -> BibEntry: ns = { "atom": "http://www.w3.org/2005/Atom", "arxiv": "http://arxiv.org/schemas/atom", } title = _node_text(node.find("atom:title", ns)) summary = _node_text(node.find("atom:summary", ns)) published = _node_text(node.find("atom:published", ns)) year = published[:4] if published else "" authors = " and ".join( _node_text(author.find("atom:name", ns)) for author in node.findall("atom:author", ns) ) doi = _node_text(node.find("arxiv:doi", ns)) fields: dict[str, str] = { "title": title, "author": authors, "year": year, "arxiv": arxiv_id, "url": f"https://arxiv.org/abs/{arxiv_id}", "pdf": f"https://arxiv.org/pdf/{arxiv_id}.pdf", } if summary: fields["abstract"] = summary if doi: fields["doi"] = doi return BibEntry(entry_type="article", citation_key=f"arxiv{arxiv_id.replace('.', '').replace('/', '')}", fields=fields) def _crossref_type_to_bibtype(crossref_type: str) -> str: mapping = { "journal-article": "article", "proceedings-article": "inproceedings", "book-chapter": "incollection", "book": "book", "proceedings": "proceedings", } return mapping.get(crossref_type, "misc") def _extract_crossref_year(message: dict) -> str: for field_name in ("published-print", "published-online", "issued", "created"): date_parts = message.get(field_name, {}).get("date-parts", []) if date_parts and date_parts[0]: return str(date_parts[0][0]) return "" def _crossref_person_to_name(person: dict) -> str: family = person.get("family", "") given = person.get("given", "") if family and given: return f"{family}, {given}" return family or given def _node_text(node: ET.Element | None) -> str: if node is None or node.text is None: return "" return " ".join(node.text.split()) def _make_resolution_key(author_text: str, year: str, title: str) -> str: normalized_author_text = " ".join((author_text or "").split()) first_author = normalized_author_text.split(" and ")[0].strip() if normalized_author_text else "" if "," in first_author: family_name = first_author.split(",")[0].strip() elif first_author: author_tokens = first_author.split() family_name = author_tokens[-1] if author_tokens else "" else: family_name = "" family_name = "".join(ch for ch in family_name.lower() if ch.isalnum()) or "ref" first_word = "".join(ch for ch in title.split()[0].lower() if ch.isalnum()) if title.split() else "untitled" return f"{family_name}{year}{first_word}" def _openalex_work_to_entry(work: dict) -> BibEntry: title = _normalize_text(work.get("display_name", "") or "Untitled work") year = str(work.get("publication_year") or "") doi = _normalize_openalex_doi(work.get("doi")) openalex_id = _normalize_openalex_id(work.get("id", "")) authors = " and ".join(_openalex_author_name(item) for item in work.get("authorships", [])) source_info = (work.get("primary_location") or {}).get("source") or {} source = source_info.get("display_name", "") source_type = _normalize_text(str(source_info.get("type") or "")).casefold() work_type = work.get("type", "") fields: dict[str, str] = {} if authors: fields["author"] = authors if title: fields["title"] = title if year: fields["year"] = year if doi: fields["doi"] = doi fields["url"] = f"https://doi.org/{doi}" if openalex_id: fields["openalex"] = openalex_id fields.setdefault("url", f"https://openalex.org/{openalex_id}") if abstract := work.get("abstract_inverted_index"): abstract_text = _openalex_abstract_text(abstract) if abstract_text: fields["abstract"] = abstract_text if source: if _openalex_should_use_journal_field(work_type, source_type): fields["journal"] = source else: fields["booktitle"] = source citation_key = _openalex_citation_key(doi, openalex_id, authors, year, title) return BibEntry(entry_type=_openalex_type_to_bibtype(work_type, source_type), citation_key=citation_key, fields=fields) def _openalex_author_name(authorship: dict) -> str: author = authorship.get("author") or {} return _normalize_person_display_name(str(author.get("display_name", ""))) def _openalex_abstract_text(inverted_index: dict) -> str: positions: dict[int, str] = {} for word, indexes in inverted_index.items(): for index in indexes: positions[int(index)] = word text = _normalize_text(" ".join(word for _, word in sorted(positions.items()))) return "" if _looks_like_openalex_page_blob(text) else text def _openalex_should_use_journal_field(work_type: str, source_type: str) -> bool: if work_type == "article": return True return source_type == "journal" def _openalex_type_to_bibtype(work_type: str, source_type: str = "") -> str: mapping = { "article": "article", "book": "book", "book-chapter": "incollection", "dissertation": "phdthesis", "proceedings-article": "inproceedings", } if work_type in mapping: return mapping[work_type] if source_type == "journal": return "article" if source_type == "conference": return "inproceedings" return "misc" def _normalize_openalex_id(value: str) -> str: if not value: return "" return value.rsplit("/", 1)[-1] def _normalize_openalex_doi(value: str | None) -> str: if not value: return "" if value.startswith("https://doi.org/"): return value[len("https://doi.org/") :] return value def _normalize_text(value: str) -> str: without_tags = re.sub(r"<[^>]+>", " ", html.unescape(value)) normalized = " ".join(without_tags.split()) normalized = re.sub(r"([\(\[\{])\s+", r"\1", normalized) normalized = re.sub(r"\s+([\)\]\},.;:!?])", r"\1", normalized) return normalized def _normalize_abstract_text(value: str) -> str: normalized = _normalize_text(value) normalized = re.sub(r"^abstract\s*[:.\-]?\s*", "", normalized, flags=re.IGNORECASE) return normalized def _normalize_person_display_name(value: str) -> str: normalized = _normalize_text(value) if "," not in normalized: return normalized left, right = [part.strip() for part in normalized.split(",", 1)] if not (_looks_like_initial_block(left) and right): return normalized right_tokens = right.split() trailing_initials: list[str] = [] while right_tokens and _looks_like_initial_block(right_tokens[-1]): trailing_initials.insert(0, right_tokens.pop()) if not right_tokens: return normalized family = " ".join(right_tokens).strip() given_parts = [ _initial_block_to_given_names(" ".join(trailing_initials)), _initial_block_to_given_names(left), ] given = " ".join(part for part in given_parts if part).strip() return f"{family}, {given}" if given else family def _looks_like_initial_block(value: str) -> bool: letters = re.sub(r"[^A-Za-z]+", "", value) return 0 < len(letters) <= 4 and letters.upper() == letters def _initial_block_to_given_names(value: str) -> str: letters = re.findall(r"[A-Za-z]", value) return " ".join(f"{letter.upper()}." for letter in letters) def _openalex_citation_key(doi: str, openalex_id: str, authors: str, year: str, title: str) -> str: if doi: suffix = re.sub(r"[^A-Za-z0-9]+", "", doi).lower() return f"doi{suffix}" if openalex_id: return f"openalex{re.sub(r'[^A-Za-z0-9]+', '', openalex_id).lower()}" return _make_resolution_key(authors or "openalex", year or "n.d.", title or "untitled") def _looks_like_openalex_page_blob(text: str) -> bool: lowered = text.casefold() blob_markers = ( "research article|", "download citation file", "this content is only available via pdf", "get citation alerts", "views icon", "toolbar search", "publisher site get access", "authors info & claims", "publication history", "copyright ", ) return len(text) > 60 and any(marker in lowered for marker in blob_markers) def _normalize_match_text(value: str) -> str: lowered = value.lower() lowered = re.sub(r"\W+", " ", lowered) return " ".join(lowered.split()) def _select_best_title_match( candidates: list[BibEntry], title: str, author_text: str = "", year: str = "", ) -> BibEntry | None: if not candidates: return None title_norm = _normalize_match_text(title) author_tokens = _author_match_tokens(author_text) year_text = str(year or "").strip() for candidate in candidates: candidate_title = _normalize_match_text(candidate.fields.get("title", "")) if candidate_title != title_norm: continue candidate_year = str(candidate.fields.get("year", "") or "").strip() if year_text and candidate_year and year_text != candidate_year: continue if author_tokens and not _candidate_matches_author_tokens(candidate, author_tokens): continue return candidate return None def _author_match_tokens(author_text: str) -> set[str]: normalized = _normalize_match_text(author_text) if not normalized: return set() tokens = { token for token in re.findall(r"[a-z0-9]+", normalized) if len(token) >= 2 and token not in {"and", "et", "al"} } return tokens def _candidate_matches_author_tokens(candidate: BibEntry, author_tokens: set[str]) -> bool: candidate_author = _normalize_match_text(candidate.fields.get("author", "")) if not candidate_author: return False candidate_tokens = set(re.findall(r"[a-z0-9]+", candidate_author)) return bool(author_tokens & candidate_tokens) def _normalize_pmid(value: str) -> str: return "".join(ch for ch in str(value) if ch.isdigit()) def _pubmed_articles_by_pmid(root: ET.Element | None) -> dict[str, ET.Element]: if root is None: return {} articles: dict[str, ET.Element] = {} for article in root.findall(".//PubmedArticle"): pmid = _normalize_pmid(_node_text(article.find("./MedlineCitation/PMID"))) if pmid: articles[pmid] = article return articles def _find_pubmed_article(root: ET.Element, pmid: str) -> ET.Element | None: return _pubmed_articles_by_pmid(root).get(_normalize_pmid(pmid)) def _pubmed_record_to_entry(summary: dict, article: ET.Element | None, fallback_pmid: str) -> BibEntry: if article is not None: entry = _pubmed_article_to_entry(article, fallback_pmid=fallback_pmid) _merge_pubmed_summary_into_fields(entry.fields, summary, fallback_pmid) return entry fields = _pubmed_summary_fields(summary, fallback_pmid) citation_key = _pubmed_citation_key( fields.get("doi", ""), fields.get("pmid", ""), fields.get("author", ""), fields.get("year", ""), fields.get("title", ""), ) return BibEntry(entry_type="article", citation_key=citation_key, fields=fields) def _pubmed_article_to_entry(article: ET.Element, fallback_pmid: str = "") -> BibEntry: medline = article.find("./MedlineCitation") article_node = medline.find("./Article") if medline is not None else None pubmed_data = article.find("./PubmedData") pmid = _normalize_pmid(_node_text(medline.find("./PMID")) if medline is not None else fallback_pmid) or _normalize_pmid( fallback_pmid ) title = _normalize_text(_element_text(article_node.find("./ArticleTitle")) if article_node is not None else "") authors = " and ".join( name for name in (_pubmed_author_name(author) for author in article.findall(".//AuthorList/Author")) if name ) journal = _normalize_text(_node_text(article.find(".//Journal/Title"))) year = _pubmed_article_year(article) abstract = _pubmed_abstract_text(article) doi = _pubmed_article_identifier(article, "doi") pmcid = _pubmed_article_identifier(pubmed_data, "pmc") fields: dict[str, str] = {} if title: fields["title"] = title if authors: fields["author"] = authors if year: fields["year"] = year if journal: fields["journal"] = journal if abstract: fields["abstract"] = abstract if doi: fields["doi"] = doi if pmid: fields["pmid"] = pmid if pmcid: fields["pmcid"] = pmcid fields["url"] = f"https://pmc.ncbi.nlm.nih.gov/articles/{pmcid}/" elif pmid: fields["url"] = f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/" citation_key = _pubmed_citation_key(doi, pmid, authors, year, title) return BibEntry(entry_type="article", citation_key=citation_key, fields=fields) def _merge_pubmed_summary_into_fields(fields: dict[str, str], summary: dict, fallback_pmid: str) -> None: for key, value in _pubmed_summary_fields(summary, fallback_pmid).items(): if value and not fields.get(key): fields[key] = value def _pubmed_summary_fields(summary: dict, fallback_pmid: str) -> dict[str, str]: pmid = _normalize_pmid(str(summary.get("uid") or fallback_pmid)) title = _normalize_text(str(summary.get("title") or "")) year = _pubmed_year_from_text(str(summary.get("pubdate") or "")) journal = _normalize_text(str(summary.get("fulljournalname") or "")) authors = " and ".join( name for name in ( _normalize_person_display_name(str(author.get("name") or "")) for author in summary.get("authors", []) ) if name ) doi = "" pmcid = "" for article_id in summary.get("articleids", []) or []: id_type = str(article_id.get("idtype") or "").lower() value = str(article_id.get("value") or "") if id_type == "doi" and value: doi = value elif id_type in {"pmc", "pmcid"} and value: pmcid = value fields: dict[str, str] = {} if title: fields["title"] = title if authors: fields["author"] = authors if year: fields["year"] = year if journal: fields["journal"] = journal if doi: fields["doi"] = doi if pmid: fields["pmid"] = pmid if pmcid: fields["pmcid"] = pmcid fields["url"] = f"https://pmc.ncbi.nlm.nih.gov/articles/{pmcid}/" elif pmid: fields["url"] = f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/" return fields def _pubmed_author_name(author: ET.Element) -> str: collective = _normalize_text(_node_text(author.find("./CollectiveName"))) if collective: return collective family = _normalize_text(_node_text(author.find("./LastName"))) given = _normalize_text(_node_text(author.find("./ForeName"))) initials = _normalize_text(_node_text(author.find("./Initials"))) if family and given: return f"{family}, {given}" if family and initials: normalized_initials = " ".join(f"{letter}." for letter in re.findall(r"[A-Za-z]", initials)) return f"{family}, {normalized_initials}" if normalized_initials else family return family or given def _pubmed_article_year(article: ET.Element) -> str: for path in ( ".//JournalIssue/PubDate/Year", ".//ArticleDate/Year", ".//PubDate/Year", ): year = _node_text(article.find(path)) if year: return year for path in ( ".//JournalIssue/PubDate/MedlineDate", ".//PubDate/MedlineDate", ): year = _pubmed_year_from_text(_node_text(article.find(path))) if year: return year return "" def _pubmed_year_from_text(value: str) -> str: match = re.search(r"\b(1[6-9]\d{2}|20\d{2}|21\d{2})\b", value) return match.group(1) if match else "" def _pubmed_abstract_text(article: ET.Element) -> str: parts: list[str] = [] for node in article.findall(".//Abstract/AbstractText"): text = _normalize_text(_element_text(node)) if not text: continue label = _normalize_text(node.attrib.get("Label", "")) parts.append(f"{label}: {text}" if label else text) return " ".join(parts) def _pubmed_article_identifier(root: ET.Element | None, identifier_type: str) -> str: if root is None: return "" normalized_type = identifier_type.lower() for node in root.findall(".//ArticleId"): if str(node.attrib.get("IdType") or "").lower() == normalized_type: return _normalize_text(_element_text(node)) if normalized_type == "doi": for node in root.findall(".//ELocationID"): if str(node.attrib.get("EIdType") or "").lower() == "doi": return _normalize_text(_element_text(node)) return "" def _pubmed_citation_key(doi: str, pmid: str, authors: str, year: str, title: str) -> str: if doi: suffix = re.sub(r"[^A-Za-z0-9]+", "", doi).lower() return f"doi{suffix}" if pmid: return f"pmid{pmid}" return _make_resolution_key(authors or "pubmed", year or "n.d.", title or "untitled") def _element_text(node: ET.Element | None) -> str: if node is None: return "" return " ".join("".join(node.itertext()).split()) def _datacite_work_to_entry(data: dict) -> BibEntry: attributes = data.get("attributes", {}) doi = str(attributes.get("doi") or "") titles = attributes.get("titles") or [] creators = attributes.get("creators") or [] descriptions = attributes.get("descriptions") or [] publisher = str(attributes.get("publisher") or "") year = str(attributes.get("publicationYear") or "") url = str(attributes.get("url") or "") types = attributes.get("types") or {} title = titles[0].get("title", "") if titles else "" author_names = " and ".join(_datacite_creator_name(creator) for creator in creators if _datacite_creator_name(creator)) abstract = _datacite_abstract(descriptions) entry_type = _datacite_type_to_bibtype(str(types.get("resourceTypeGeneral") or "")) fields: dict[str, str] = {} if title: fields["title"] = title if author_names: fields["author"] = author_names if year: fields["year"] = year if doi: fields["doi"] = doi if url: fields["url"] = url elif doi: fields["url"] = f"https://doi.org/{doi}" if publisher: fields["publisher"] = publisher if abstract: fields["abstract"] = abstract citation_key = _make_resolution_key(author_names or "datacite", year or "n.d.", title or doi or "untitled") return BibEntry(entry_type=entry_type, citation_key=citation_key, fields=fields) def _datacite_creator_name(creator: dict) -> str: family = str(creator.get("familyName") or "") given = str(creator.get("givenName") or "") if family and given: return f"{family}, {given}" return str(creator.get("name") or family or given) def _datacite_abstract(descriptions: list[dict]) -> str: for description in descriptions: if str(description.get("descriptionType") or "").lower() == "abstract": return str(description.get("description") or "") return "" def _datacite_type_to_bibtype(resource_type: str) -> str: lowered = resource_type.lower() mapping = { "audiovisual": "misc", "book": "book", "bookchapter": "incollection", "collection": "misc", "computationalnotebook": "misc", "conferencepaper": "inproceedings", "dataset": "misc", "dissertation": "phdthesis", "image": "misc", "journalarticle": "article", "model": "misc", "report": "techreport", "software": "misc", "text": "misc", } return mapping.get(lowered, "misc")