Fix BibTeX escape round-tripping

This commit is contained in:
welsberr 2026-03-20 23:33:11 -04:00
parent 0354d6de89
commit bf33b898d6
3 changed files with 34 additions and 2 deletions

View File

@ -159,6 +159,8 @@ Broad BibTeX exports skip DOI-only placeholder records such as `Referenced work
Long-running CLI commands report progress on `stderr` so `stdout` remains clean for JSON, BibTeX, or tabular output.
BibTeX parse/render round-trips normalize simple escaped special characters such as `\_`, `\&`, and `\%` back to plain field values internally, then re-escape them on export. This prevents repeated commands such as `resolve` from turning a valid field like `discovered\_from = {...}` into `discovered\\_from = {...}` after rewriting an entry.
Crossref reference expansion is intentionally conservative about weak discoveries. If a cited reference has no DOI and Crossref only exposes it as an unstructured citation blob, `expand --source crossref`, `expand-topic --source crossref`, and bootstrap flows now skip materializing that record unless the fallback metadata looks like a cleaner non-`misc` work such as conference proceedings. When Crossref does expose thesis or dissertation references only as unstructured text, citegeist now also tries to extract the actual work title instead of keeping the entire ProQuest-style citation blob in the `title` field. This reduces junk `@misc` entries, and yields cleaner `@phdthesis` fallbacks rather than ones whose `title` field is really a pasted citation string.
OpenAlex expansion is also conservative about noisy secondary records. Discoveries now prefer DOI-based citation keys when a DOI is present, which reduces parallel `doi...` and `openalex...` entries for the same work. OpenAlex abstract imports are sanitized to drop obvious webpage/export blobs, and DOI-less records are filtered when they look like venue-title stubs, generic container records, or weak review-like shadows of an already present book, chapter, proceedings paper, or dissertation with the same title. Preview mode uses the same admission rules as write-enabled expansion, so rejected candidates disappear before you commit them to the store.

View File

@ -25,7 +25,7 @@ def parse_bibtex(text: str) -> list[BibEntry]:
bibliography = parse_string(text, bib_format="bibtex")
entries: list[BibEntry] = []
for citation_key, entry in bibliography.entries.items():
fields = dict(entry.fields.items())
fields = {key: _normalize_parsed_bibtex_value(value) for key, value in entry.fields.items()}
for role, persons in entry.persons.items():
fields[role] = " and ".join(str(person) for person in persons)
entries.append(
@ -114,3 +114,13 @@ def _sanitize_bibtex_value(value: str) -> str:
def _flatten_bibtex_braces(value: str) -> str:
return value.replace("{", "(").replace("}", ")")
def _normalize_parsed_bibtex_value(value: str) -> str:
return (
value.replace(r"\_", "_")
.replace(r"\&", "&")
.replace(r"\%", "%")
.replace(r"\$", "$")
.replace(r"\#", "#")
)

View File

@ -1,7 +1,7 @@
from xml.etree import ElementTree as ET
import urllib.error
from citegeist.bibtex import BibEntry, render_bibtex
from citegeist.bibtex import BibEntry, parse_bibtex, render_bibtex
from citegeist.resolve import (
MetadataResolver,
_arxiv_atom_entry_to_bib,
@ -483,3 +483,23 @@ def test_render_bibtex_tolerates_unmatched_braces_in_field_values():
assert "@misc{broken2026," in rendered
assert "Unmatched { braces } example ) tail" in rendered
assert "Open ( brace only" in rendered
def test_parse_and_render_do_not_double_escape_simple_bibtex_specials():
    """Check that escaped ``&``, ``_`` and ``%`` survive a parse/render round trip.

    The parsed field values are expected to hold the plain characters, and
    the rendered output is expected to carry a single backslash per special.
    """
    # Field text expected in the rendered BibTeX: one backslash per special.
    escaped_title = 'title = "A \\& B"'
    escaped_note = 'note = "discovered\\_from = {doi10100718462821441}; confidence = 100\\%"'

    entries = parse_bibtex(
        """
@misc{escaped2026,
title = "A \\& B",
note = "discovered\\_from = {doi10100718462821441}; confidence = 100\\%"
}
"""
    )
    entry = entries[0]

    # After parsing, field values should no longer contain the escapes.
    assert entry.fields["title"] == "A & B"
    assert entry.fields["note"] == "discovered_from = {doi10100718462821441}; confidence = 100%"

    output = render_bibtex([entry])

    # Rendering should escape once, never twice.
    assert escaped_title in output
    assert escaped_note in output
    assert 'discovered\\\\_from' not in output