Skip stub entries in default exports

This commit is contained in:
welsberr 2026-03-20 15:12:48 -04:00
parent 912dc59301
commit 0144bd9ef4
6 changed files with 173 additions and 7 deletions

View File

@ -154,6 +154,8 @@ PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 export --outpu
For a fuller option-by-option CLI cookbook, see [examples/cli/README.md](./examples/cli/README.md).
Broad BibTeX exports skip DOI-only placeholder records such as `Referenced work N` by default. Use `--include-stubs` on `export` or `export-topic` if you want those entries included anyway.
For live-source development, prefer fixture-backed or cache-backed source clients so resolver and expansion work can be exercised repeatedly without re-hitting upstream APIs on every run.
## Example Application

View File

@ -129,6 +129,12 @@ Write BibTeX to a file:
.venv/bin/python -m citegeist --db library.sqlite3 export --output artificial-life.bib
```
Include DOI-only placeholder records in a broad export:
```bash
.venv/bin/python -m citegeist --db library.sqlite3 export --include-stubs --output artificial-life.bib
```
## Review And Clean Metadata
Purpose: inspect merge conflicts, apply corrections, and enrich incomplete records.
@ -393,6 +399,12 @@ Write the topic slice to a file:
.venv/bin/python -m citegeist --db library.sqlite3 export-topic artificial-life --output artificial-life-topic.bib
```
Include DOI-only placeholder records in the topic export:
```bash
.venv/bin/python -m citegeist --db library.sqlite3 export-topic artificial-life --include-stubs --output artificial-life-topic.bib
```
### Bootstrap
Seed from a BibTeX file:

View File

@ -43,6 +43,11 @@ def build_parser() -> argparse.ArgumentParser:
export_parser = subparsers.add_parser("export", help="Export entries as BibTeX") export_parser = subparsers.add_parser("export", help="Export entries as BibTeX")
export_parser.add_argument("citation_keys", nargs="*", help="Optional citation keys to export") export_parser.add_argument("citation_keys", nargs="*", help="Optional citation keys to export")
export_parser.add_argument("--output", help="Write BibTeX to a file instead of stdout") export_parser.add_argument("--output", help="Write BibTeX to a file instead of stdout")
export_parser.add_argument(
"--include-stubs",
action="store_true",
help="Include DOI-only placeholder records in broad exports",
)
status_parser = subparsers.add_parser("set-status", help="Set the review status for one entry") status_parser = subparsers.add_parser("set-status", help="Set the review status for one entry")
status_parser.add_argument("citation_key", help="Citation key to update") status_parser.add_argument("citation_key", help="Citation key to update")
@ -494,6 +499,11 @@ def build_parser() -> argparse.ArgumentParser:
) )
export_topic_parser.add_argument("topic_slug", help="Topic slug to export") export_topic_parser.add_argument("topic_slug", help="Topic slug to export")
export_topic_parser.add_argument("--output", help="Write BibTeX to a file instead of stdout") export_topic_parser.add_argument("--output", help="Write BibTeX to a file instead of stdout")
export_topic_parser.add_argument(
"--include-stubs",
action="store_true",
help="Include DOI-only placeholder records in the topic export",
)
return parser return parser
@ -511,7 +521,7 @@ def main(argv: list[str] | None = None) -> int:
if args.command == "show": if args.command == "show":
return _run_show(store, args.citation_key, args.limit, args.provenance, args.conflicts) return _run_show(store, args.citation_key, args.limit, args.provenance, args.conflicts)
if args.command == "export": if args.command == "export":
return _run_export(store, args.citation_keys, args.output) return _run_export(store, args.citation_keys, args.output, args.include_stubs)
if args.command == "set-status": if args.command == "set-status":
return _run_set_status(store, args.citation_key, args.review_status) return _run_set_status(store, args.citation_key, args.review_status)
if args.command == "resolve-conflicts": if args.command == "resolve-conflicts":
@ -660,7 +670,7 @@ def main(argv: list[str] | None = None) -> int:
if args.command == "topic-entries": if args.command == "topic-entries":
return _run_topic_entries(store, args.topic_slug, args.limit) return _run_topic_entries(store, args.topic_slug, args.limit)
if args.command == "export-topic": if args.command == "export-topic":
return _run_export_topic(store, args.topic_slug, args.output) return _run_export_topic(store, args.topic_slug, args.output, args.include_stubs)
finally: finally:
store.close() store.close()
@ -715,8 +725,14 @@ def _run_show(
return 0 return 0
def _run_export(store: BibliographyStore, citation_keys: list[str], output: str | None) -> int: def _run_export(
rendered = store.export_bibtex(citation_keys or None) store: BibliographyStore,
citation_keys: list[str],
output: str | None,
include_stubs: bool,
) -> int:
explicit_keys = citation_keys or None
rendered = store.export_bibtex(explicit_keys, include_stubs=include_stubs or explicit_keys is not None)
if output: if output:
Path(output).write_text(rendered + ("\n" if rendered else ""), encoding="utf-8") Path(output).write_text(rendered + ("\n" if rendered else ""), encoding="utf-8")
else: else:
@ -1731,13 +1747,13 @@ def _run_topic_entries(store: BibliographyStore, topic_slug: str, limit: int) ->
return 0 return 0
def _run_export_topic(store: BibliographyStore, topic_slug: str, output: str | None) -> int: def _run_export_topic(store: BibliographyStore, topic_slug: str, output: str | None, include_stubs: bool) -> int:
topic = store.get_topic(topic_slug) topic = store.get_topic(topic_slug)
if topic is None: if topic is None:
print(f"Topic not found: {topic_slug}", file=sys.stderr) print(f"Topic not found: {topic_slug}", file=sys.stderr)
return 1 return 1
citation_keys = [row["citation_key"] for row in store.list_topic_entries(topic_slug, limit=100000)] citation_keys = [row["citation_key"] for row in store.list_topic_entries(topic_slug, limit=100000)]
rendered = store.export_bibtex(citation_keys) rendered = store.export_bibtex(citation_keys, include_stubs=include_stubs)
if output: if output:
Path(output).write_text(rendered + ("\n" if rendered else ""), encoding="utf-8") Path(output).write_text(rendered + ("\n" if rendered else ""), encoding="utf-8")
else: else:

View File

@ -1010,7 +1010,10 @@ class BibliographyStore:
return None return None
return render_bibtex([entry]) return render_bibtex([entry])
def export_bibtex(self, citation_keys: list[str] | None = None) -> str: def export_bibtex(self, citation_keys: list[str] | None = None, include_stubs: bool | None = None) -> str:
explicit_keys = citation_keys is not None
if include_stubs is None:
include_stubs = explicit_keys
if citation_keys is None: if citation_keys is None:
rows = self.connection.execute( rows = self.connection.execute(
"SELECT citation_key FROM entries ORDER BY COALESCE(year, ''), citation_key" "SELECT citation_key FROM entries ORDER BY COALESCE(year, ''), citation_key"
@ -1022,6 +1025,8 @@ class BibliographyStore:
for citation_key in citation_keys: for citation_key in citation_keys:
entry = self._load_bib_entry(citation_key) entry = self._load_bib_entry(citation_key)
if entry is not None: if entry is not None:
if not include_stubs and self._is_export_stub(entry):
continue
entries.append(entry) entries.append(entry)
if not entries: if not entries:
return "" return ""
@ -1091,6 +1096,22 @@ class BibliographyStore:
fields=dict(fields), fields=dict(fields),
) )
def _is_export_stub(self, entry: BibEntry) -> bool:
    """Return True when *entry* is a DOI-only placeholder record.

    An entry is treated as a stub when all of the following hold:

    * it has a non-empty ``doi`` field;
    * its ``title`` is empty, or starts with ``referenced work `` or
      ``untitled`` (compared case-insensitively);
    * it has no ``author``, ``abstract``, ``journal`` or ``booktitle``;
    * its ``url`` is empty or only points at the DOI resolver.

    Field values are compared after collapsing runs of whitespace.
    """
    # Resolver forms that carry no information beyond the DOI itself.
    doi_url_prefixes = (
        "https://doi.org/",
        "http://doi.org/",
        "https://dx.doi.org/",
        "http://dx.doi.org/",
    )

    def field(name: str) -> str:
        # Collapse internal whitespace and trim; a missing or None value
        # reads as the empty string.
        return " ".join((entry.fields.get(name) or "").split())

    if not field("doi"):
        return False

    title = field("title").lower()
    if title and not title.startswith(("referenced work ", "untitled")):
        return False

    if any(field(name) for name in ("author", "abstract", "journal", "booktitle")):
        return False

    # URL scheme and host are case-insensitive, and older records use the
    # http:// or dx.doi.org spellings of the resolver, so match those too.
    url = field("url").lower()
    return not url or url.startswith(doi_url_prefixes)
def _load_creator_names(self, citation_key: str, role: str) -> list[str]: def _load_creator_names(self, citation_key: str, role: str) -> list[str]:
rows = self.connection.execute( rows = self.connection.execute(
""" """

View File

@ -74,6 +74,42 @@ def test_cli_ingest_show_search_and_export(tmp_path: Path):
assert "@article{smith2024graphs," in exported assert "@article{smith2024graphs," in exported
def test_cli_export_skips_stub_entries_by_default_but_can_include_them(tmp_path: Path):
    """A broad ``export`` omits the DOI-only stub unless it is requested."""
    source_bib = tmp_path / "input.bib"
    source_bib.write_text(
        """
@misc{stubdoi,
title = {Referenced work 6},
doi = {10.1200/JCO.2002.04.117},
url = {https://doi.org/10.1200/JCO.2002.04.117}
}
@article{realentry,
author = {Smith, Jane},
title = {Real Entry},
year = {2024},
doi = {10.1000/real}
}
""",
        encoding="utf-8",
    )
    ingested = run_cli(tmp_path, "ingest", str(source_bib))
    assert ingested.returncode == 0

    # No keys and no flag: the real entry is exported, the stub is not.
    broad = run_cli(tmp_path, "export")
    assert broad.returncode == 0
    assert "@article{realentry," in broad.stdout
    assert "@misc{stubdoi," not in broad.stdout

    # Naming the stub's citation key exports it.
    by_key = run_cli(tmp_path, "export", "stubdoi")
    assert by_key.returncode == 0
    assert "@misc{stubdoi," in by_key.stdout

    # The opt-in flag brings the stub back into a broad export.
    with_flag = run_cli(tmp_path, "export", "--include-stubs")
    assert with_flag.returncode == 0
    assert "@misc{stubdoi," in with_flag.stdout
def test_cli_provenance_and_status_updates(tmp_path: Path): def test_cli_provenance_and_status_updates(tmp_path: Path):
bib_path = tmp_path / "input.bib" bib_path = tmp_path / "input.bib"
bib_path.write_text(SAMPLE_BIB, encoding="utf-8") bib_path.write_text(SAMPLE_BIB, encoding="utf-8")
@ -1140,6 +1176,52 @@ def test_cli_export_topic(tmp_path: Path):
assert "@article{seed2024," in exported assert "@article{seed2024," in exported
def test_cli_export_topic_skips_stub_entries_by_default(tmp_path: Path):
    """``export-topic`` omits the DOI-only stub unless ``--include-stubs`` is given."""
    source_bib = tmp_path / "input.bib"
    source_bib.write_text(
        """
@misc{stubdoi,
title = {Referenced work 6},
doi = {10.1200/JCO.2002.04.117},
url = {https://doi.org/10.1200/JCO.2002.04.117}
}
@article{seed2024,
author = {Seed, Alice},
title = {Seed Paper},
year = {2024}
}
""",
        encoding="utf-8",
    )
    ingested = run_cli(tmp_path, "ingest", str(source_bib))
    assert ingested.returncode == 0

    from citegeist.storage import BibliographyStore

    # Attach both entries to the same topic directly through the store.
    db_path = tmp_path / "library.sqlite3"
    store = BibliographyStore(db_path)
    try:
        for key in ("stubdoi", "seed2024"):
            store.add_entry_topic(
                key,
                topic_slug="graph-methods",
                topic_name="Graph Methods",
                source_label="topic-seed",
            )
        store.connection.commit()
    finally:
        store.close()

    # Default topic export: the seed paper only.
    plain = run_cli(tmp_path, "export-topic", "graph-methods")
    assert plain.returncode == 0
    assert "@article{seed2024," in plain.stdout
    assert "@misc{stubdoi," not in plain.stdout

    # With the flag the stub is exported.
    with_flag = run_cli(tmp_path, "export-topic", "graph-methods", "--include-stubs")
    assert with_flag.returncode == 0
    assert "@misc{stubdoi," in with_flag.stdout
def test_cli_search_can_filter_by_topic(tmp_path: Path): def test_cli_search_can_filter_by_topic(tmp_path: Path):
bib_path = tmp_path / "input.bib" bib_path = tmp_path / "input.bib"
bib_path.write_text( bib_path.write_text(

View File

@ -69,6 +69,39 @@ def test_store_exports_bibtex_from_normalized_rows():
store.close() store.close()
def test_store_export_skips_doi_only_stub_by_default():
    """``export_bibtex`` drops the stub on a broad export but not when asked for it."""
    bibtex_source = """
@misc{stubdoi,
title = {Referenced work 6},
doi = {10.1200/JCO.2002.04.117},
url = {https://doi.org/10.1200/JCO.2002.04.117}
}
@article{realentry,
author = {Smith, Jane},
title = {Real Entry},
year = {2024},
doi = {10.1000/real}
}
"""
    store = BibliographyStore()
    try:
        store.ingest_bibtex(bibtex_source)

        # No arguments: the real entry is exported, the stub is not.
        broad = store.export_bibtex()
        assert "@article{realentry," in broad
        assert "@misc{stubdoi," not in broad

        # An explicit key list exports the stub.
        by_key = store.export_bibtex(["stubdoi"])
        assert "@misc{stubdoi," in by_key

        # So does opting in on a broad export.
        opted_in = store.export_bibtex(include_stubs=True)
        assert "@misc{stubdoi," in opted_in
    finally:
        store.close()
def test_store_records_provenance_and_review_status(): def test_store_records_provenance_and_review_status():
store = BibliographyStore() store = BibliographyStore()
try: try: