Normalize malformed discovered author names

2026-03-21 03:17:47 -04:00 · 2026-03-21 03:17:47 -04:00 · f06a68aedc
parent bf33b898d6
commit f06a68aedc
6 changed files with 104 additions and 6 deletions
--- a/README.md
+++ b/README.md
@ -163,7 +163,7 @@ BibTeX parse/render round-trips normalize simple escaped special characters such
 Crossref reference expansion is intentionally conservative about weak discoveries. If a cited reference has no DOI and Crossref only exposes it as an unstructured citation blob, `expand --source crossref`, `expand-topic --source crossref`, and bootstrap flows now skip materializing that record unless the fallback metadata looks like a cleaner non-`misc` work such as conference proceedings. When Crossref does expose thesis or dissertation references only as unstructured text, citegeist now also tries to extract the actual work title instead of keeping the entire ProQuest-style citation blob in the `title` field. This reduces junk `@misc` entries and yields cleaner `@phdthesis` fallbacks in place of ones whose `title` field is really a pasted citation string.
-OpenAlex expansion is also conservative about noisy secondary records. Discoveries now prefer DOI-based citation keys when a DOI is present, which reduces parallel `doi...` and `openalex...` entries for the same work. OpenAlex abstract imports are sanitized to drop obvious webpage/export blobs, and DOI-less records are filtered when they look like venue-title stubs, generic container records, or weak review-like shadows of an already present book, chapter, proceedings paper, or dissertation with the same title. Preview mode uses the same admission rules as write-enabled expansion, so rejected candidates disappear before you commit them to the store.
+OpenAlex expansion is also conservative about noisy secondary records. Discoveries now prefer DOI-based citation keys when a DOI is present, which reduces parallel `doi...` and `openalex...` entries for the same work. OpenAlex abstract imports are sanitized to drop obvious webpage/export blobs, and DOI-less records are filtered when they look like venue-title stubs, generic container records, or weak review-like shadows of an already present book, chapter, proceedings paper, or dissertation with the same title. Both OpenAlex and Crossref discovery paths also normalize some malformed author strings from upstream metadata, including inverted-initial patterns such as `J., Fogel L.`, into stable BibTeX names like `Fogel, L. J.`. Preview mode uses the same admission rules as write-enabled expansion, so rejected candidates disappear before you commit them to the store.
 For live-source development, prefer fixture-backed or cache-backed source clients so resolver and expansion work can be exercised repeatedly without re-hitting upstream APIs on every run.
--- a/examples/cli/README.md
+++ b/examples/cli/README.md
@ -213,7 +213,7 @@ Re-enrich all current `@misc` entries with DOIs:
 When Crossref expansion only yields an unstructured citation blob without a DOI, citegeist now skips materializing that discovery instead of storing it as a weak `@misc` entry. Cleaner fallback cases that infer a more specific type, such as proceedings-like titles, are still admitted. Thesis and dissertation citation blobs are also normalized more aggressively so fallback `@phdthesis` entries keep the work title instead of the entire ProQuest-style citation string.
-OpenAlex expansion now applies the same kind of admission control before writing or previewing discoveries. DOI-backed discoveries prefer DOI-based citation keys, noisy webpage/export abstracts are dropped, generic venue-title stubs are rejected, and weak DOI-less article records that merely shadow an existing book/chapter/dissertation title in your store are suppressed instead of being materialized as parallel duplicates.
+OpenAlex expansion now applies the same kind of admission control before writing or previewing discoveries. DOI-backed discoveries prefer DOI-based citation keys, noisy webpage/export abstracts are dropped, generic venue-title stubs are rejected, and weak DOI-less article records that merely shadow an existing book/chapter/dissertation title in your store are suppressed instead of being materialized as parallel duplicates. Both OpenAlex and Crossref discovery also normalize some malformed upstream author strings so records like `J., Fogel L.` are stored in stable BibTeX form as `Fogel, L. J.`.
 ## Explore Citation Graphs
--- a/src/citegeist/expand.py
+++ b/src/citegeist/expand.py
@ -417,7 +417,7 @@ class TopicExpander:
 def _crossref_reference_to_entry(reference: dict, source_citation_key: str, ordinal: int) -> BibEntry:
    title = _crossref_reference_title(reference, ordinal)
    year = str(reference.get("year") or "")
-    author = reference.get("author") or ""
+    author = _normalize_person_display_name(str(reference.get("author") or ""))
    doi = reference.get("DOI") or ""
    journal_title = reference.get("journal-title") or ""
@ -428,7 +428,7 @@ def _crossref_reference_to_entry(reference: dict, source_citation_key: str, ordi
    if year:
        fields["year"] = year
    if author:
-        fields["author"] = _normalize_text(author)
+        fields["author"] = author
    if doi:
        fields["doi"] = doi
        fields["url"] = f"https://doi.org/{doi}"
@ -531,6 +531,41 @@ def _normalize_text(value: str) -> str:
    return " ".join(without_tags.split())
 def _normalize_person_display_name(value: str) -> str:
    """Rewrite inverted-initial author names into ``Family, I. I.`` form.

    Some upstream records split an author's initials around the family name,
    e.g. ``"J., Fogel L."``.  When the text before the first comma looks like
    a block of initials, any initials trailing the family name are emitted
    first, followed by the leading block: ``"J., Fogel L."`` becomes
    ``"Fogel, L. J."``.

    Anything that does not match that shape is returned exactly as produced
    by ``_normalize_text``: names without a comma, names whose leading part
    is not an initial block, names with no family token left once trailing
    initials are removed, and names whose family part still contains a comma.

    NOTE(review): a short all-caps family name in ordinary ``FAMILY, Given``
    order (e.g. ``"WU, Jian"``) also passes the initial-block test and is
    inverted to ``"Jian, W. U."``; confirm this is acceptable for the
    upstream data.
    """
    normalized = _normalize_text(value)
    leading, separator, remainder = normalized.partition(",")
    leading, remainder = leading.strip(), remainder.strip()
    if not separator or not remainder or not _looks_like_initial_block(leading):
        return normalized

    # Peel initials off the end of the remainder; what is left is the family.
    family_tokens = remainder.split()
    trailing_initials: list[str] = []
    while family_tokens and _looks_like_initial_block(family_tokens[-1]):
        trailing_initials.append(family_tokens.pop())
    trailing_initials.reverse()

    # Input such as "J., Fogel, L." leaves a comma glued to the family token;
    # without stripping it the result would read "Fogel,, L. J.".
    family = " ".join(family_tokens).rstrip(", ")
    if not family or "," in family:
        # Either no family name at all, or extra commas that cannot be
        # interpreted safely -- keep the cleaned input untouched.
        return normalized

    given_parts = [
        _initial_block_to_given_names(" ".join(trailing_initials)),
        _initial_block_to_given_names(leading),
    ]
    given = " ".join(part for part in given_parts if part).strip()
    return f"{family}, {given}" if given else family
 def _looks_like_initial_block(value: str) -> bool:
    """Return True when *value* reads as one to four upper-case initials.

    Everything that is not a letter (dots, spaces, digits, ...) is ignored,
    so ``"J."``, ``"L.J."`` and ``"JL"`` all qualify.  Letters are matched
    Unicode-aware so initials such as ``"Ø."`` are recognised.  Every letter
    must be explicitly upper-case, which rejects lower-/mixed-case tokens as
    well as scripts without letter case.
    """
    # [^\W\d_] == "word character that is neither a digit nor underscore",
    # i.e. any Unicode letter.
    letters = re.findall(r"[^\W\d_]", value)
    return 0 < len(letters) <= 4 and all(letter.isupper() for letter in letters)
 def _initial_block_to_given_names(value: str) -> str:
    """Spell every letter of *value* as a dotted, space-separated initial.

    ``"L.J."`` becomes ``"L. J."``; non-letter characters are discarded and
    an input without letters yields the empty string.  Letters are matched
    Unicode-aware so non-ASCII initials are kept rather than silently dropped.
    """
    letters = re.findall(r"[^\W\d_]", value)
    return " ".join(f"{letter.upper()}." for letter in letters)
 def _crossref_reference_entry_type(reference: dict, title: str, journal_title: str) -> str:
    if journal_title:
        return "article"
@ -695,7 +730,7 @@ def _openalex_work_to_entry(work: dict) -> BibEntry:
 def _openalex_author_name(authorship: dict) -> str:
    author = authorship.get("author") or {}
    name = author.get("display_name", "")
-    return _normalize_text(name)
+    return _normalize_person_display_name(str(name))
 def _openalex_abstract_text(inverted_index: dict) -> str:
--- a/src/citegeist/resolve.py
+++ b/src/citegeist/resolve.py
@ -471,7 +471,7 @@ def _openalex_work_to_entry(work: dict) -> BibEntry:
 def _openalex_author_name(authorship: dict) -> str:
    author = authorship.get("author") or {}
-    return " ".join(str(author.get("display_name", "")).split())
+    return _normalize_person_display_name(str(author.get("display_name", "")))
 def _openalex_abstract_text(inverted_index: dict) -> str:
@ -513,6 +513,41 @@ def _normalize_text(value: str) -> str:
    return " ".join(without_tags.split())
 def _normalize_person_display_name(value: str) -> str:
    """Rewrite inverted-initial author names into ``Family, I. I.`` form.

    Some upstream records split an author's initials around the family name,
    e.g. ``"J., Fogel L."``.  When the text before the first comma looks like
    a block of initials, any initials trailing the family name are emitted
    first, followed by the leading block: ``"J., Fogel L."`` becomes
    ``"Fogel, L. J."``.

    Anything that does not match that shape is returned exactly as produced
    by ``_normalize_text``: names without a comma, names whose leading part
    is not an initial block, names with no family token left once trailing
    initials are removed, and names whose family part still contains a comma.

    NOTE(review): a short all-caps family name in ordinary ``FAMILY, Given``
    order (e.g. ``"WU, Jian"``) also passes the initial-block test and is
    inverted to ``"Jian, W. U."``; confirm this is acceptable for the
    upstream data.
    """
    normalized = _normalize_text(value)
    leading, separator, remainder = normalized.partition(",")
    leading, remainder = leading.strip(), remainder.strip()
    if not separator or not remainder or not _looks_like_initial_block(leading):
        return normalized

    # Peel initials off the end of the remainder; what is left is the family.
    family_tokens = remainder.split()
    trailing_initials: list[str] = []
    while family_tokens and _looks_like_initial_block(family_tokens[-1]):
        trailing_initials.append(family_tokens.pop())
    trailing_initials.reverse()

    # Input such as "J., Fogel, L." leaves a comma glued to the family token;
    # without stripping it the result would read "Fogel,, L. J.".
    family = " ".join(family_tokens).rstrip(", ")
    if not family or "," in family:
        # Either no family name at all, or extra commas that cannot be
        # interpreted safely -- keep the cleaned input untouched.
        return normalized

    given_parts = [
        _initial_block_to_given_names(" ".join(trailing_initials)),
        _initial_block_to_given_names(leading),
    ]
    given = " ".join(part for part in given_parts if part).strip()
    return f"{family}, {given}" if given else family
 def _looks_like_initial_block(value: str) -> bool:
    """Return True when *value* reads as one to four upper-case initials.

    Everything that is not a letter (dots, spaces, digits, ...) is ignored,
    so ``"J."``, ``"L.J."`` and ``"JL"`` all qualify.  Letters are matched
    Unicode-aware so initials such as ``"Ø."`` are recognised.  Every letter
    must be explicitly upper-case, which rejects lower-/mixed-case tokens as
    well as scripts without letter case.
    """
    # [^\W\d_] == "word character that is neither a digit nor underscore",
    # i.e. any Unicode letter.
    letters = re.findall(r"[^\W\d_]", value)
    return 0 < len(letters) <= 4 and all(letter.isupper() for letter in letters)
 def _initial_block_to_given_names(value: str) -> str:
    """Spell every letter of *value* as a dotted, space-separated initial.

    ``"L.J."`` becomes ``"L. J."``; non-letter characters are discarded and
    an input without letters yields the empty string.  Letters are matched
    Unicode-aware so non-ASCII initials are kept rather than silently dropped.
    """
    letters = re.findall(r"[^\W\d_]", value)
    return " ".join(f"{letter.upper()}." for letter in letters)
 def _openalex_citation_key(doi: str, openalex_id: str, authors: str, year: str, title: str) -> str:
    if doi:
        suffix = re.sub(r"[^A-Za-z0-9]+", "", doi).lower()
--- a/tests/test_expand.py
+++ b/tests/test_expand.py
@ -233,6 +233,20 @@ def test_crossref_reference_to_entry_extracts_title_from_thesis_citation_blob():
    )
 def test_crossref_reference_to_entry_normalizes_reversed_initial_author_name():
    """A Crossref author given as ``J., Fogel L.`` is stored as ``Fogel, L. J.``."""
    malformed_reference = {
        "author": "J., Fogel L.",
        "article-title": "Evolutionary Programming",
        "year": "1995",
    }
    converted = _crossref_reference_to_entry(malformed_reference, "seed2024", 1)
    assert converted.fields["author"] == "Fogel, L. J."
 def test_crossref_expander_returns_empty_on_fetch_error():
    store = BibliographyStore()
    try:
--- a/tests/test_resolve.py
+++ b/tests/test_resolve.py
@ -208,6 +208,20 @@ def test_openalex_work_to_entry_maps_basic_fields():
    assert entry.fields["abstract"] == "OpenAlex resolved"
 def test_openalex_work_to_entry_normalizes_reversed_initial_author_name():
    """An OpenAlex display name ``J., Fogel L.`` is stored as ``Fogel, L. J.``."""
    malformed_authorship = {"author": {"display_name": "J., Fogel L."}}
    work = {
        "id": "https://openalex.org/W12345",
        "display_name": "Evolutionary Programming",
        "publication_year": 1995,
        "type": "book-chapter",
        "authorships": [malformed_authorship],
    }
    converted = _openalex_work_to_entry(work)
    assert converted.fields["author"] == "Fogel, L. J."
 def test_resolver_can_resolve_openalex_id():
    resolver = MetadataResolver()
    resolver.source_client.get_json = lambda _url: {  # type: ignore[method-assign]