From f06a68aedcb0ac883f88e063f97ec18215de4350 Mon Sep 17 00:00:00 2001 From: welsberr Date: Sat, 21 Mar 2026 03:17:47 -0400 Subject: [PATCH] Normalize malformed discovered author names --- README.md | 2 +- examples/cli/README.md | 2 +- src/citegeist/expand.py | 41 +++++++++++++++++++++++++++++++++++++--- src/citegeist/resolve.py | 37 +++++++++++++++++++++++++++++++++++- tests/test_expand.py | 14 ++++++++++++++ tests/test_resolve.py | 14 ++++++++++++++ 6 files changed, 104 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 63cd0d8..60d717d 100644 --- a/README.md +++ b/README.md @@ -163,7 +163,7 @@ BibTeX parse/render round-trips normalize simple escaped special characters such Crossref reference expansion is intentionally conservative about weak discoveries. If a cited reference has no DOI and Crossref only exposes it as an unstructured citation blob, `expand --source crossref`, `expand-topic --source crossref`, and bootstrap flows now skip materializing that record unless the fallback metadata looks like a cleaner non-`misc` work such as conference proceedings. When Crossref does expose thesis or dissertation references only as unstructured text, citegeist now also tries to extract the actual work title instead of keeping the entire ProQuest-style citation blob in the `title` field. This reduces junk `@misc` entries and cleaner `@phdthesis` fallbacks whose `title` field is really a pasted citation string. -OpenAlex expansion is also conservative about noisy secondary records. Discoveries now prefer DOI-based citation keys when a DOI is present, which reduces parallel `doi...` and `openalex...` entries for the same work. OpenAlex abstract imports are sanitized to drop obvious webpage/export blobs, and DOI-less records are filtered when they look like venue-title stubs, generic container records, or weak review-like shadows of an already present book, chapter, proceedings paper, or dissertation with the same title. Preview mode uses the same admission rules as write-enabled expansion, so rejected candidates disappear before you commit them to the store. +OpenAlex expansion is also conservative about noisy secondary records. Discoveries now prefer DOI-based citation keys when a DOI is present, which reduces parallel `doi...` and `openalex...` entries for the same work. OpenAlex abstract imports are sanitized to drop obvious webpage/export blobs, and DOI-less records are filtered when they look like venue-title stubs, generic container records, or weak review-like shadows of an already present book, chapter, proceedings paper, or dissertation with the same title. Both OpenAlex and Crossref discovery paths also normalize some malformed author strings from upstream metadata, including inverted-initial patterns such as `J., Fogel L.`, into stable BibTeX names like `Fogel, L. J.`. Preview mode uses the same admission rules as write-enabled expansion, so rejected candidates disappear before you commit them to the store. For live-source development, prefer fixture-backed or cache-backed source clients so resolver and expansion work can be exercised repeatedly without re-hitting upstream APIs on every run. diff --git a/examples/cli/README.md b/examples/cli/README.md index 67edab1..8fbeed3 100644 --- a/examples/cli/README.md +++ b/examples/cli/README.md @@ -213,7 +213,7 @@ Re-enrich all current `@misc` entries with DOIs: When Crossref expansion only yields an unstructured citation blob without a DOI, citegeist now skips materializing that discovery instead of storing it as a weak `@misc` entry. Cleaner fallback cases that infer a more specific type, such as proceedings-like titles, are still admitted. Thesis and dissertation citation blobs are also normalized more aggressively so fallback `@phdthesis` entries keep the work title instead of the entire ProQuest-style citation string. -OpenAlex expansion now applies the same kind of admission control before writing or previewing discoveries. DOI-backed discoveries prefer DOI-based citation keys, noisy webpage/export abstracts are dropped, generic venue-title stubs are rejected, and weak DOI-less article records that merely shadow an existing book/chapter/dissertation title in your store are suppressed instead of being materialized as parallel duplicates. +OpenAlex expansion now applies the same kind of admission control before writing or previewing discoveries. DOI-backed discoveries prefer DOI-based citation keys, noisy webpage/export abstracts are dropped, generic venue-title stubs are rejected, and weak DOI-less article records that merely shadow an existing book/chapter/dissertation title in your store are suppressed instead of being materialized as parallel duplicates. Both OpenAlex and Crossref discovery also normalize some malformed upstream author strings so records like `J., Fogel L.` are stored in stable BibTeX form as `Fogel, L. J.`. ## Explore Citation Graphs diff --git a/src/citegeist/expand.py b/src/citegeist/expand.py index 2a62019..3eb617e 100644 --- a/src/citegeist/expand.py +++ b/src/citegeist/expand.py @@ -417,7 +417,7 @@ class TopicExpander: def _crossref_reference_to_entry(reference: dict, source_citation_key: str, ordinal: int) -> BibEntry: title = _crossref_reference_title(reference, ordinal) year = str(reference.get("year") or "") - author = reference.get("author") or "" + author = _normalize_person_display_name(str(reference.get("author") or "")) doi = reference.get("DOI") or "" journal_title = reference.get("journal-title") or "" @@ -428,7 +428,7 @@ def _crossref_reference_to_entry(reference: dict, source_citation_key: str, ordi if year: fields["year"] = year if author: - fields["author"] = _normalize_text(author) + fields["author"] = author if doi: fields["doi"] = doi fields["url"] = f"https://doi.org/{doi}" @@ -531,6 +531,41 @@ def _normalize_text(value: str) -> str: return " ".join(without_tags.split()) +def _normalize_person_display_name(value: str) -> str: + normalized = _normalize_text(value) + if "," not in normalized: + return normalized + + left, right = [part.strip() for part in normalized.split(",", 1)] + if not (_looks_like_initial_block(left) and right): + return normalized + + right_tokens = right.split() + trailing_initials: list[str] = [] + while right_tokens and _looks_like_initial_block(right_tokens[-1]): + trailing_initials.insert(0, right_tokens.pop()) + if not right_tokens: + return normalized + + family = " ".join(right_tokens).strip() + given_parts = [ + _initial_block_to_given_names(" ".join(trailing_initials)), + _initial_block_to_given_names(left), + ] + given = " ".join(part for part in given_parts if part).strip() + return f"{family}, {given}" if given else family + + +def _looks_like_initial_block(value: str) -> bool: + letters = re.sub(r"[^A-Za-z]+", "", value) + return 0 < len(letters) <= 4 and letters.upper() == letters + + +def _initial_block_to_given_names(value: str) -> str: + letters = re.findall(r"[A-Za-z]", value) + return " ".join(f"{letter.upper()}." for letter in letters) + + def _crossref_reference_entry_type(reference: dict, title: str, journal_title: str) -> str: if journal_title: return "article" @@ -695,7 +730,7 @@ def _openalex_work_to_entry(work: dict) -> BibEntry: def _openalex_author_name(authorship: dict) -> str: author = authorship.get("author") or {} name = author.get("display_name", "") - return _normalize_text(name) + return _normalize_person_display_name(str(name)) def _openalex_abstract_text(inverted_index: dict) -> str: diff --git a/src/citegeist/resolve.py b/src/citegeist/resolve.py index 1b5ba03..89f8e52 100644 --- a/src/citegeist/resolve.py +++ b/src/citegeist/resolve.py @@ -471,7 +471,7 @@ def _openalex_work_to_entry(work: dict) -> BibEntry: def _openalex_author_name(authorship: dict) -> str: author = authorship.get("author") or {} - return " ".join(str(author.get("display_name", "")).split()) + return _normalize_person_display_name(str(author.get("display_name", ""))) def _openalex_abstract_text(inverted_index: dict) -> str: @@ -513,6 +513,41 @@ def _normalize_text(value: str) -> str: return " ".join(without_tags.split()) +def _normalize_person_display_name(value: str) -> str: + normalized = _normalize_text(value) + if "," not in normalized: + return normalized + + left, right = [part.strip() for part in normalized.split(",", 1)] + if not (_looks_like_initial_block(left) and right): + return normalized + + right_tokens = right.split() + trailing_initials: list[str] = [] + while right_tokens and _looks_like_initial_block(right_tokens[-1]): + trailing_initials.insert(0, right_tokens.pop()) + if not right_tokens: + return normalized + + family = " ".join(right_tokens).strip() + given_parts = [ + _initial_block_to_given_names(" ".join(trailing_initials)), + _initial_block_to_given_names(left), + ] + given = " ".join(part for part in given_parts if part).strip() + return f"{family}, {given}" if given else family + + +def _looks_like_initial_block(value: str) -> bool: + letters = re.sub(r"[^A-Za-z]+", "", value) + return 0 < len(letters) <= 4 and letters.upper() == letters + + +def _initial_block_to_given_names(value: str) -> str: + letters = re.findall(r"[A-Za-z]", value) + return " ".join(f"{letter.upper()}." for letter in letters) + + def _openalex_citation_key(doi: str, openalex_id: str, authors: str, year: str, title: str) -> str: if doi: suffix = re.sub(r"[^A-Za-z0-9]+", "", doi).lower() diff --git a/tests/test_expand.py b/tests/test_expand.py index b3f2cc6..ab5951c 100644 --- a/tests/test_expand.py +++ b/tests/test_expand.py @@ -233,6 +233,20 @@ def test_crossref_reference_to_entry_extracts_title_from_thesis_citation_blob(): ) +def test_crossref_reference_to_entry_normalizes_reversed_initial_author_name(): + entry = _crossref_reference_to_entry( + { + "author": "J., Fogel L.", + "article-title": "Evolutionary Programming", + "year": "1995", + }, + "seed2024", + 1, + ) + + assert entry.fields["author"] == "Fogel, L. J." + + def test_crossref_expander_returns_empty_on_fetch_error(): store = BibliographyStore() try: diff --git a/tests/test_resolve.py b/tests/test_resolve.py index 90bba47..3c34d19 100644 --- a/tests/test_resolve.py +++ b/tests/test_resolve.py @@ -208,6 +208,20 @@ def test_openalex_work_to_entry_maps_basic_fields(): assert entry.fields["abstract"] == "OpenAlex resolved" +def test_openalex_work_to_entry_normalizes_reversed_initial_author_name(): + entry = _openalex_work_to_entry( + { + "id": "https://openalex.org/W12345", + "display_name": "Evolutionary Programming", + "publication_year": 1995, + "type": "book-chapter", + "authorships": [{"author": {"display_name": "J., Fogel L."}}], + } + ) + + assert entry.fields["author"] == "Fogel, L. J." + + def test_resolver_can_resolve_openalex_id(): resolver = MetadataResolver() resolver.source_client.get_json = lambda _url: { # type: ignore[method-assign]