Fix BibTeX escape round-tripping
This commit is contained in:
parent
0354d6de89
commit
bf33b898d6
|
|
@ -159,6 +159,8 @@ Broad BibTeX exports skip DOI-only placeholder records such as `Referenced work
|
||||||
|
|
||||||
Long-running CLI commands report progress on `stderr` so `stdout` remains clean for JSON, BibTeX, or tabular output.
|
Long-running CLI commands report progress on `stderr` so `stdout` remains clean for JSON, BibTeX, or tabular output.
|
||||||
|
|
||||||
|
BibTeX parse/render round-trips normalize simple escaped special characters such as `\_`, `\&`, and `\%` back to plain field values internally, then re-escape them on export. This prevents repeated commands such as `resolve` from turning a valid field like `discovered\_from = {...}` into `discovered\\_from = {...}` after rewriting an entry.
|
||||||
|
|
||||||
Crossref reference expansion is intentionally conservative about weak discoveries. If a cited reference has no DOI and Crossref only exposes it as an unstructured citation blob, `expand --source crossref`, `expand-topic --source crossref`, and bootstrap flows now skip materializing that record unless the fallback metadata looks like a cleaner non-`misc` work such as conference proceedings. When Crossref does expose thesis or dissertation references only as unstructured text, citegeist now also tries to extract the actual work title instead of keeping the entire ProQuest-style citation blob in the `title` field. This yields fewer junk `@misc` entries and cleaner `@phdthesis` fallbacks, rather than records whose `title` field is really a pasted citation string.
|
Crossref reference expansion is intentionally conservative about weak discoveries. If a cited reference has no DOI and Crossref only exposes it as an unstructured citation blob, `expand --source crossref`, `expand-topic --source crossref`, and bootstrap flows now skip materializing that record unless the fallback metadata looks like a cleaner non-`misc` work such as conference proceedings. When Crossref does expose thesis or dissertation references only as unstructured text, citegeist now also tries to extract the actual work title instead of keeping the entire ProQuest-style citation blob in the `title` field. This yields fewer junk `@misc` entries and cleaner `@phdthesis` fallbacks, rather than records whose `title` field is really a pasted citation string.
|
||||||
|
|
||||||
OpenAlex expansion is also conservative about noisy secondary records. Discoveries now prefer DOI-based citation keys when a DOI is present, which reduces parallel `doi...` and `openalex...` entries for the same work. OpenAlex abstract imports are sanitized to drop obvious webpage/export blobs, and DOI-less records are filtered when they look like venue-title stubs, generic container records, or weak review-like shadows of an already present book, chapter, proceedings paper, or dissertation with the same title. Preview mode uses the same admission rules as write-enabled expansion, so rejected candidates disappear before you commit them to the store.
|
OpenAlex expansion is also conservative about noisy secondary records. Discoveries now prefer DOI-based citation keys when a DOI is present, which reduces parallel `doi...` and `openalex...` entries for the same work. OpenAlex abstract imports are sanitized to drop obvious webpage/export blobs, and DOI-less records are filtered when they look like venue-title stubs, generic container records, or weak review-like shadows of an already present book, chapter, proceedings paper, or dissertation with the same title. Preview mode uses the same admission rules as write-enabled expansion, so rejected candidates disappear before you commit them to the store.
|
||||||
|
|
|
||||||
|
|
@ -25,7 +25,7 @@ def parse_bibtex(text: str) -> list[BibEntry]:
|
||||||
bibliography = parse_string(text, bib_format="bibtex")
|
bibliography = parse_string(text, bib_format="bibtex")
|
||||||
entries: list[BibEntry] = []
|
entries: list[BibEntry] = []
|
||||||
for citation_key, entry in bibliography.entries.items():
|
for citation_key, entry in bibliography.entries.items():
|
||||||
fields = dict(entry.fields.items())
|
fields = {key: _normalize_parsed_bibtex_value(value) for key, value in entry.fields.items()}
|
||||||
for role, persons in entry.persons.items():
|
for role, persons in entry.persons.items():
|
||||||
fields[role] = " and ".join(str(person) for person in persons)
|
fields[role] = " and ".join(str(person) for person in persons)
|
||||||
entries.append(
|
entries.append(
|
||||||
|
|
@ -114,3 +114,13 @@ def _sanitize_bibtex_value(value: str) -> str:
|
||||||
|
|
||||||
def _flatten_bibtex_braces(value: str) -> str:
|
def _flatten_bibtex_braces(value: str) -> str:
|
||||||
return value.replace("{", "(").replace("}", ")")
|
return value.replace("{", "(").replace("}", ")")
|
||||||
|
|
||||||
|
|
||||||
|
def _normalize_parsed_bibtex_value(value: str) -> str:
|
||||||
|
return (
|
||||||
|
value.replace(r"\_", "_")
|
||||||
|
.replace(r"\&", "&")
|
||||||
|
.replace(r"\%", "%")
|
||||||
|
.replace(r"\$", "$")
|
||||||
|
.replace(r"\#", "#")
|
||||||
|
)
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,7 @@
|
||||||
from xml.etree import ElementTree as ET
|
from xml.etree import ElementTree as ET
|
||||||
import urllib.error
|
import urllib.error
|
||||||
|
|
||||||
from citegeist.bibtex import BibEntry, render_bibtex
|
from citegeist.bibtex import BibEntry, parse_bibtex, render_bibtex
|
||||||
from citegeist.resolve import (
|
from citegeist.resolve import (
|
||||||
MetadataResolver,
|
MetadataResolver,
|
||||||
_arxiv_atom_entry_to_bib,
|
_arxiv_atom_entry_to_bib,
|
||||||
|
|
@ -483,3 +483,23 @@ def test_render_bibtex_tolerates_unmatched_braces_in_field_values():
|
||||||
assert "@misc{broken2026," in rendered
|
assert "@misc{broken2026," in rendered
|
||||||
assert "Unmatched { braces } example ) tail" in rendered
|
assert "Unmatched { braces } example ) tail" in rendered
|
||||||
assert "Open ( brace only" in rendered
|
assert "Open ( brace only" in rendered
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_and_render_do_not_double_escape_simple_bibtex_specials():
    """Escaped ampersand, underscore and percent survive a parse/render round trip.

    Parsing must yield the plain characters in the field values, and
    rendering the parsed entry must emit each escape exactly once.
    """
    bibtex_source = """
@misc{escaped2026,
title = "A \\& B",
note = "discovered\\_from = {doi10100718462821441}; confidence = 100\\%"
}
"""
    entry = parse_bibtex(bibtex_source)[0]

    # Parsed field values hold the unescaped characters.
    title_value = entry.fields["title"]
    note_value = entry.fields["note"]
    assert title_value == "A & B"
    assert note_value == "discovered_from = {doi10100718462821441}; confidence = 100%"

    output = render_bibtex([entry])

    # Rendering re-applies a single level of escaping...
    expected_fragments = (
        'title = "A \\& B"',
        'note = "discovered\\_from = {doi10100718462821441}; confidence = 100\\%"',
    )
    for fragment in expected_fragments:
        assert fragment in output
    # ...and never a doubled backslash in front of the underscore.
    assert 'discovered\\\\_from' not in output
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue