Normalize malformed discovered author names

This commit is contained in:
welsberr 2026-03-21 03:17:47 -04:00
parent bf33b898d6
commit f06a68aedc
6 changed files with 104 additions and 6 deletions

View File

@ -163,7 +163,7 @@ BibTeX parse/render round-trips normalize simple escaped special characters such
Crossref reference expansion is intentionally conservative about weak discoveries. If a cited reference has no DOI and Crossref only exposes it as an unstructured citation blob, `expand --source crossref`, `expand-topic --source crossref`, and bootstrap flows now skip materializing that record unless the fallback metadata looks like a cleaner non-`misc` work such as conference proceedings. When Crossref does expose thesis or dissertation references only as unstructured text, citegeist now also tries to extract the actual work title instead of keeping the entire ProQuest-style citation blob in the `title` field. This reduces junk `@misc` entries and avoids `@phdthesis` fallbacks whose `title` field is really a pasted citation string. Crossref reference expansion is intentionally conservative about weak discoveries. If a cited reference has no DOI and Crossref only exposes it as an unstructured citation blob, `expand --source crossref`, `expand-topic --source crossref`, and bootstrap flows now skip materializing that record unless the fallback metadata looks like a cleaner non-`misc` work such as conference proceedings. When Crossref does expose thesis or dissertation references only as unstructured text, citegeist now also tries to extract the actual work title instead of keeping the entire ProQuest-style citation blob in the `title` field. This reduces junk `@misc` entries and avoids `@phdthesis` fallbacks whose `title` field is really a pasted citation string.
OpenAlex expansion is also conservative about noisy secondary records. Discoveries now prefer DOI-based citation keys when a DOI is present, which reduces parallel `doi...` and `openalex...` entries for the same work. OpenAlex abstract imports are sanitized to drop obvious webpage/export blobs, and DOI-less records are filtered when they look like venue-title stubs, generic container records, or weak review-like shadows of an already present book, chapter, proceedings paper, or dissertation with the same title. Preview mode uses the same admission rules as write-enabled expansion, so rejected candidates disappear before you commit them to the store. OpenAlex expansion is also conservative about noisy secondary records. Discoveries now prefer DOI-based citation keys when a DOI is present, which reduces parallel `doi...` and `openalex...` entries for the same work. OpenAlex abstract imports are sanitized to drop obvious webpage/export blobs, and DOI-less records are filtered when they look like venue-title stubs, generic container records, or weak review-like shadows of an already present book, chapter, proceedings paper, or dissertation with the same title. Both OpenAlex and Crossref discovery paths also normalize some malformed author strings from upstream metadata, including inverted-initial patterns such as `J., Fogel L.`, into stable BibTeX names like `Fogel, L. J.`. Preview mode uses the same admission rules as write-enabled expansion, so rejected candidates disappear before you commit them to the store.
For live-source development, prefer fixture-backed or cache-backed source clients so resolver and expansion work can be exercised repeatedly without re-hitting upstream APIs on every run. For live-source development, prefer fixture-backed or cache-backed source clients so resolver and expansion work can be exercised repeatedly without re-hitting upstream APIs on every run.

View File

@ -213,7 +213,7 @@ Re-enrich all current `@misc` entries with DOIs:
When Crossref expansion only yields an unstructured citation blob without a DOI, citegeist now skips materializing that discovery instead of storing it as a weak `@misc` entry. Cleaner fallback cases that infer a more specific type, such as proceedings-like titles, are still admitted. Thesis and dissertation citation blobs are also normalized more aggressively so fallback `@phdthesis` entries keep the work title instead of the entire ProQuest-style citation string. When Crossref expansion only yields an unstructured citation blob without a DOI, citegeist now skips materializing that discovery instead of storing it as a weak `@misc` entry. Cleaner fallback cases that infer a more specific type, such as proceedings-like titles, are still admitted. Thesis and dissertation citation blobs are also normalized more aggressively so fallback `@phdthesis` entries keep the work title instead of the entire ProQuest-style citation string.
OpenAlex expansion now applies the same kind of admission control before writing or previewing discoveries. DOI-backed discoveries prefer DOI-based citation keys, noisy webpage/export abstracts are dropped, generic venue-title stubs are rejected, and weak DOI-less article records that merely shadow an existing book/chapter/dissertation title in your store are suppressed instead of being materialized as parallel duplicates. OpenAlex expansion now applies the same kind of admission control before writing or previewing discoveries. DOI-backed discoveries prefer DOI-based citation keys, noisy webpage/export abstracts are dropped, generic venue-title stubs are rejected, and weak DOI-less article records that merely shadow an existing book/chapter/dissertation title in your store are suppressed instead of being materialized as parallel duplicates. Both OpenAlex and Crossref discovery also normalize some malformed upstream author strings so records like `J., Fogel L.` are stored in stable BibTeX form as `Fogel, L. J.`.
## Explore Citation Graphs ## Explore Citation Graphs

View File

@ -417,7 +417,7 @@ class TopicExpander:
def _crossref_reference_to_entry(reference: dict, source_citation_key: str, ordinal: int) -> BibEntry: def _crossref_reference_to_entry(reference: dict, source_citation_key: str, ordinal: int) -> BibEntry:
title = _crossref_reference_title(reference, ordinal) title = _crossref_reference_title(reference, ordinal)
year = str(reference.get("year") or "") year = str(reference.get("year") or "")
author = reference.get("author") or "" author = _normalize_person_display_name(str(reference.get("author") or ""))
doi = reference.get("DOI") or "" doi = reference.get("DOI") or ""
journal_title = reference.get("journal-title") or "" journal_title = reference.get("journal-title") or ""
@ -428,7 +428,7 @@ def _crossref_reference_to_entry(reference: dict, source_citation_key: str, ordi
if year: if year:
fields["year"] = year fields["year"] = year
if author: if author:
fields["author"] = _normalize_text(author) fields["author"] = author
if doi: if doi:
fields["doi"] = doi fields["doi"] = doi
fields["url"] = f"https://doi.org/{doi}" fields["url"] = f"https://doi.org/{doi}"
@ -531,6 +531,41 @@ def _normalize_text(value: str) -> str:
return " ".join(without_tags.split()) return " ".join(without_tags.split())
def _normalize_person_display_name(value: str) -> str:
    """Rewrite inverted-initial names such as ``J., Fogel L.`` as ``Fogel, L. J.``.

    The value is first passed through ``_normalize_text``. It is only
    rearranged when it contains a comma, the part before the first comma
    looks like a block of initials, and the part after it contains at least
    one token that is not a block of initials. Any other input is returned
    in its normalized form without reordering.
    """
    cleaned = _normalize_text(value)
    head, comma, tail = cleaned.partition(",")
    if not comma:
        return cleaned

    head = head.strip()
    tail = tail.strip()
    if not tail or not _looks_like_initial_block(head):
        return cleaned

    # Walk backwards over the tokens after the comma to find where the
    # trailing run of initials starts; everything before it is the family name.
    tokens = tail.split()
    cut = len(tokens)
    while cut > 0 and _looks_like_initial_block(tokens[cut - 1]):
        cut -= 1
    if cut == 0:
        # Only initials on both sides of the comma: no family name to anchor on.
        return cleaned

    surname = " ".join(tokens[:cut])
    # Initials that trailed the family name come first, then the leading block.
    pieces = []
    for block in (" ".join(tokens[cut:]), head):
        rendered = _initial_block_to_given_names(block)
        if rendered:
            pieces.append(rendered)
    forenames = " ".join(pieces)
    if forenames:
        return f"{surname}, {forenames}"
    return surname
def _looks_like_initial_block(value: str) -> bool:
letters = re.sub(r"[^A-Za-z]+", "", value)
return 0 < len(letters) <= 4 and letters.upper() == letters
def _initial_block_to_given_names(value: str) -> str:
letters = re.findall(r"[A-Za-z]", value)
return " ".join(f"{letter.upper()}." for letter in letters)
def _crossref_reference_entry_type(reference: dict, title: str, journal_title: str) -> str: def _crossref_reference_entry_type(reference: dict, title: str, journal_title: str) -> str:
if journal_title: if journal_title:
return "article" return "article"
@ -695,7 +730,7 @@ def _openalex_work_to_entry(work: dict) -> BibEntry:
def _openalex_author_name(authorship: dict) -> str:
    """Return the normalized display name for one authorship record.

    Reads ``authorship["author"]["display_name"]`` and passes it through
    ``_normalize_person_display_name``. A missing or falsy ``author`` mapping,
    or a missing or falsy ``display_name``, yields whatever that normalizer
    returns for an empty string.
    """
    author = authorship.get("author") or {}
    # ``dict.get(key, default)`` only falls back when the key is absent. An
    # explicit null ``display_name`` would otherwise reach ``str()`` and be
    # stored as the literal author name "None".
    name = author.get("display_name") or ""
    return _normalize_person_display_name(str(name))
def _openalex_abstract_text(inverted_index: dict) -> str: def _openalex_abstract_text(inverted_index: dict) -> str:

View File

@ -471,7 +471,7 @@ def _openalex_work_to_entry(work: dict) -> BibEntry:
def _openalex_author_name(authorship: dict) -> str:
    """Return the normalized display name for one authorship record.

    Reads ``authorship["author"]["display_name"]`` and passes it through
    ``_normalize_person_display_name``. A missing or falsy ``author`` mapping,
    or a missing or falsy ``display_name``, yields whatever that normalizer
    returns for an empty string.
    """
    author = authorship.get("author") or {}
    # ``dict.get(key, default)`` only falls back when the key is absent. An
    # explicit null ``display_name`` would otherwise reach ``str()`` and be
    # stored as the literal author name "None".
    display_name = author.get("display_name") or ""
    return _normalize_person_display_name(str(display_name))
def _openalex_abstract_text(inverted_index: dict) -> str: def _openalex_abstract_text(inverted_index: dict) -> str:
@ -513,6 +513,41 @@ def _normalize_text(value: str) -> str:
return " ".join(without_tags.split()) return " ".join(without_tags.split())
def _normalize_person_display_name(value: str) -> str:
    """Rewrite inverted-initial names such as ``J., Fogel L.`` as ``Fogel, L. J.``.

    The value is first passed through ``_normalize_text``. It is only
    rearranged when it contains a comma, the part before the first comma
    looks like a block of initials, and the part after it contains at least
    one token that is not a block of initials. Any other input is returned
    in its normalized form without reordering.
    """
    cleaned = _normalize_text(value)
    head, comma, tail = cleaned.partition(",")
    if not comma:
        return cleaned

    head = head.strip()
    tail = tail.strip()
    if not tail or not _looks_like_initial_block(head):
        return cleaned

    # Walk backwards over the tokens after the comma to find where the
    # trailing run of initials starts; everything before it is the family name.
    tokens = tail.split()
    cut = len(tokens)
    while cut > 0 and _looks_like_initial_block(tokens[cut - 1]):
        cut -= 1
    if cut == 0:
        # Only initials on both sides of the comma: no family name to anchor on.
        return cleaned

    surname = " ".join(tokens[:cut])
    # Initials that trailed the family name come first, then the leading block.
    pieces = []
    for block in (" ".join(tokens[cut:]), head):
        rendered = _initial_block_to_given_names(block)
        if rendered:
            pieces.append(rendered)
    forenames = " ".join(pieces)
    if forenames:
        return f"{surname}, {forenames}"
    return surname
def _looks_like_initial_block(value: str) -> bool:
letters = re.sub(r"[^A-Za-z]+", "", value)
return 0 < len(letters) <= 4 and letters.upper() == letters
def _initial_block_to_given_names(value: str) -> str:
letters = re.findall(r"[A-Za-z]", value)
return " ".join(f"{letter.upper()}." for letter in letters)
def _openalex_citation_key(doi: str, openalex_id: str, authors: str, year: str, title: str) -> str: def _openalex_citation_key(doi: str, openalex_id: str, authors: str, year: str, title: str) -> str:
if doi: if doi:
suffix = re.sub(r"[^A-Za-z0-9]+", "", doi).lower() suffix = re.sub(r"[^A-Za-z0-9]+", "", doi).lower()

View File

@ -233,6 +233,20 @@ def test_crossref_reference_to_entry_extracts_title_from_thesis_citation_blob():
) )
def test_crossref_reference_to_entry_normalizes_reversed_initial_author_name():
    # A Crossref reference whose author arrives as "initial, family initial"
    # should be stored in "family, initials" order.
    reference = {
        "author": "J., Fogel L.",
        "article-title": "Evolutionary Programming",
        "year": "1995",
    }

    entry = _crossref_reference_to_entry(reference, "seed2024", 1)

    assert entry.fields["author"] == "Fogel, L. J."
def test_crossref_expander_returns_empty_on_fetch_error(): def test_crossref_expander_returns_empty_on_fetch_error():
store = BibliographyStore() store = BibliographyStore()
try: try:

View File

@ -208,6 +208,20 @@ def test_openalex_work_to_entry_maps_basic_fields():
assert entry.fields["abstract"] == "OpenAlex resolved" assert entry.fields["abstract"] == "OpenAlex resolved"
def test_openalex_work_to_entry_normalizes_reversed_initial_author_name():
    # An OpenAlex work whose only author display name is inverted should be
    # stored in "family, initials" order.
    authorship = {"author": {"display_name": "J., Fogel L."}}
    work = {
        "id": "https://openalex.org/W12345",
        "display_name": "Evolutionary Programming",
        "publication_year": 1995,
        "type": "book-chapter",
        "authorships": [authorship],
    }

    entry = _openalex_work_to_entry(work)

    assert entry.fields["author"] == "Fogel, L. J."
def test_resolver_can_resolve_openalex_id(): def test_resolver_can_resolve_openalex_id():
resolver = MetadataResolver() resolver = MetadataResolver()
resolver.source_client.get_json = lambda _url: { # type: ignore[method-assign] resolver.source_client.get_json = lambda _url: { # type: ignore[method-assign]