diff --git a/README.md b/README.md index 1de6a3d..63cd0d8 100644 --- a/README.md +++ b/README.md @@ -159,6 +159,8 @@ Broad BibTeX exports skip DOI-only placeholder records such as `Referenced work Long-running CLI commands report progress on `stderr` so `stdout` remains clean for JSON, BibTeX, or tabular output. +BibTeX parse/render round-trips normalize simple escaped special characters such as `\_`, `\&`, and `\%` back to plain field values internally, then re-escape them on export. This prevents repeated commands such as `resolve` from turning a valid field like `discovered\_from = {...}` into `discovered\\_from = {...}` after rewriting an entry. + Crossref reference expansion is intentionally conservative about weak discoveries. If a cited reference has no DOI and Crossref only exposes it as an unstructured citation blob, `expand --source crossref`, `expand-topic --source crossref`, and bootstrap flows now skip materializing that record unless the fallback metadata looks like a cleaner non-`misc` work such as conference proceedings. When Crossref does expose thesis or dissertation references only as unstructured text, citegeist now also tries to extract the actual work title instead of keeping the entire ProQuest-style citation blob in the `title` field. This reduces junk `@misc` entries and cleaner `@phdthesis` fallbacks whose `title` field is really a pasted citation string. OpenAlex expansion is also conservative about noisy secondary records. Discoveries now prefer DOI-based citation keys when a DOI is present, which reduces parallel `doi...` and `openalex...` entries for the same work. OpenAlex abstract imports are sanitized to drop obvious webpage/export blobs, and DOI-less records are filtered when they look like venue-title stubs, generic container records, or weak review-like shadows of an already present book, chapter, proceedings paper, or dissertation with the same title. Preview mode uses the same admission rules as write-enabled expansion, so rejected candidates disappear before you commit them to the store. diff --git a/src/citegeist/bibtex.py b/src/citegeist/bibtex.py index d815b9a..f725db7 100644 --- a/src/citegeist/bibtex.py +++ b/src/citegeist/bibtex.py @@ -25,7 +25,7 @@ def parse_bibtex(text: str) -> list[BibEntry]: bibliography = parse_string(text, bib_format="bibtex") entries: list[BibEntry] = [] for citation_key, entry in bibliography.entries.items(): - fields = dict(entry.fields.items()) + fields = {key: _normalize_parsed_bibtex_value(value) for key, value in entry.fields.items()} for role, persons in entry.persons.items(): fields[role] = " and ".join(str(person) for person in persons) entries.append( @@ -114,3 +114,13 @@ def _sanitize_bibtex_value(value: str) -> str: def _flatten_bibtex_braces(value: str) -> str: return value.replace("{", "(").replace("}", ")") + + +def _normalize_parsed_bibtex_value(value: str) -> str: + return ( + value.replace(r"\_", "_") + .replace(r"\&", "&") + .replace(r"\%", "%") + .replace(r"\$", "$") + .replace(r"\#", "#") + ) diff --git a/tests/test_resolve.py b/tests/test_resolve.py index 79d24b3..90bba47 100644 --- a/tests/test_resolve.py +++ b/tests/test_resolve.py @@ -1,7 +1,7 @@ from xml.etree import ElementTree as ET import urllib.error -from citegeist.bibtex import BibEntry, render_bibtex +from citegeist.bibtex import BibEntry, parse_bibtex, render_bibtex from citegeist.resolve import ( MetadataResolver, _arxiv_atom_entry_to_bib, @@ -483,3 +483,23 @@ def test_render_bibtex_tolerates_unmatched_braces_in_field_values(): assert "@misc{broken2026," in rendered assert "Unmatched { braces } example ) tail" in rendered assert "Open ( brace only" in rendered + + +def test_parse_and_render_do_not_double_escape_simple_bibtex_specials(): + parsed = parse_bibtex( + """ +@misc{escaped2026, + title = "A \\& B", + note = "discovered\\_from = {doi10100718462821441}; confidence = 100\\%" +} +""" + )[0] + + assert parsed.fields["title"] == "A & B" + assert parsed.fields["note"] == "discovered_from = {doi10100718462821441}; confidence = 100%" + + rendered = render_bibtex([parsed]) + + assert 'title = "A \\& B"' in rendered + assert 'note = "discovered\\_from = {doi10100718462821441}; confidence = 100\\%"' in rendered + assert 'discovered\\\\_from' not in rendered