From 0144bd9ef4405b19e3cad4d7abd9e740dfb6e2a6 Mon Sep 17 00:00:00 2001 From: welsberr Date: Fri, 20 Mar 2026 15:12:48 -0400 Subject: [PATCH] Skip stub entries in default exports --- README.md | 2 + examples/cli/README.md | 12 ++++++ src/citegeist/cli.py | 28 +++++++++++--- src/citegeist/storage.py | 23 ++++++++++- tests/test_cli.py | 82 ++++++++++++++++++++++++++++++++++++++++ tests/test_storage.py | 33 ++++++++++++++++ 6 files changed, 173 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 69dba83..f55af39 100644 --- a/README.md +++ b/README.md @@ -154,6 +154,8 @@ PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 export --outpu For a fuller option-by-option CLI cookbook, see [examples/cli/README.md](./examples/cli/README.md). +Broad BibTeX exports skip DOI-only placeholder records such as `Referenced work N` by default. Use `--include-stubs` on `export` or `export-topic` if you want those entries included anyway. + For live-source development, prefer fixture-backed or cache-backed source clients so resolver and expansion work can be exercised repeatedly without re-hitting upstream APIs on every run. ## Example Application diff --git a/examples/cli/README.md b/examples/cli/README.md index 8897137..b4c16fe 100644 --- a/examples/cli/README.md +++ b/examples/cli/README.md @@ -129,6 +129,12 @@ Write BibTeX to a file: .venv/bin/python -m citegeist --db library.sqlite3 export --output artificial-life.bib ``` +Include DOI-only placeholder records in a broad export: + +```bash +.venv/bin/python -m citegeist --db library.sqlite3 export --include-stubs --output artificial-life.bib +``` + ## Review And Clean Metadata Purpose: inspect merge conflicts, apply corrections, and enrich incomplete records. @@ -393,6 +399,12 @@ Write the topic slice to a file: .venv/bin/python -m citegeist --db library.sqlite3 export-topic artificial-life --output artificial-life-topic.bib ``` +Include DOI-only placeholder records in the topic export: + +```bash +.venv/bin/python -m citegeist --db library.sqlite3 export-topic artificial-life --include-stubs --output artificial-life-topic.bib +``` + ### Bootstrap Seed from a BibTeX file: diff --git a/src/citegeist/cli.py b/src/citegeist/cli.py index 1eb908e..b820016 100644 --- a/src/citegeist/cli.py +++ b/src/citegeist/cli.py @@ -43,6 +43,11 @@ def build_parser() -> argparse.ArgumentParser: export_parser = subparsers.add_parser("export", help="Export entries as BibTeX") export_parser.add_argument("citation_keys", nargs="*", help="Optional citation keys to export") export_parser.add_argument("--output", help="Write BibTeX to a file instead of stdout") + export_parser.add_argument( + "--include-stubs", + action="store_true", + help="Include DOI-only placeholder records in broad exports", + ) status_parser = subparsers.add_parser("set-status", help="Set the review status for one entry") status_parser.add_argument("citation_key", help="Citation key to update") @@ -494,6 +499,11 @@ def build_parser() -> argparse.ArgumentParser: ) export_topic_parser.add_argument("topic_slug", help="Topic slug to export") export_topic_parser.add_argument("--output", help="Write BibTeX to a file instead of stdout") + export_topic_parser.add_argument( + "--include-stubs", + action="store_true", + help="Include DOI-only placeholder records in the topic export", + ) return parser @@ -511,7 +521,7 @@ def main(argv: list[str] | None = None) -> int: if args.command == "show": return _run_show(store, args.citation_key, args.limit, args.provenance, args.conflicts) if args.command == "export": - return _run_export(store, args.citation_keys, args.output) + return _run_export(store, args.citation_keys, args.output, args.include_stubs) if args.command == "set-status": return _run_set_status(store, args.citation_key, args.review_status) if args.command == "resolve-conflicts": @@ -660,7 +670,7 @@ def main(argv: list[str] | None = None) -> int: if args.command == "topic-entries": return _run_topic_entries(store, args.topic_slug, args.limit) if args.command == "export-topic": - return _run_export_topic(store, args.topic_slug, args.output) + return _run_export_topic(store, args.topic_slug, args.output, args.include_stubs) finally: store.close() @@ -715,8 +725,14 @@ def _run_show( return 0 -def _run_export(store: BibliographyStore, citation_keys: list[str], output: str | None) -> int: - rendered = store.export_bibtex(citation_keys or None) +def _run_export( + store: BibliographyStore, + citation_keys: list[str], + output: str | None, + include_stubs: bool, +) -> int: + explicit_keys = citation_keys or None + rendered = store.export_bibtex(explicit_keys, include_stubs=include_stubs or explicit_keys is not None) if output: Path(output).write_text(rendered + ("\n" if rendered else ""), encoding="utf-8") else: @@ -1731,13 +1747,13 @@ def _run_topic_entries(store: BibliographyStore, topic_slug: str, limit: int) -> return 0 -def _run_export_topic(store: BibliographyStore, topic_slug: str, output: str | None) -> int: +def _run_export_topic(store: BibliographyStore, topic_slug: str, output: str | None, include_stubs: bool) -> int: topic = store.get_topic(topic_slug) if topic is None: print(f"Topic not found: {topic_slug}", file=sys.stderr) return 1 citation_keys = [row["citation_key"] for row in store.list_topic_entries(topic_slug, limit=100000)] - rendered = store.export_bibtex(citation_keys) + rendered = store.export_bibtex(citation_keys, include_stubs=include_stubs) if output: Path(output).write_text(rendered + ("\n" if rendered else ""), encoding="utf-8") else: diff --git a/src/citegeist/storage.py b/src/citegeist/storage.py index ace5c7f..ee753e5 100644 --- a/src/citegeist/storage.py +++ b/src/citegeist/storage.py @@ -1010,7 +1010,10 @@ class BibliographyStore: return None return render_bibtex([entry]) - def export_bibtex(self, citation_keys: list[str] | None = None) -> str: + def export_bibtex(self, citation_keys: list[str] | None = None, include_stubs: bool | None = None) -> str: + explicit_keys = citation_keys is not None + if include_stubs is None: + include_stubs = explicit_keys if citation_keys is None: rows = self.connection.execute( "SELECT citation_key FROM entries ORDER BY COALESCE(year, ''), citation_key" @@ -1022,6 +1025,8 @@ class BibliographyStore: for citation_key in citation_keys: entry = self._load_bib_entry(citation_key) if entry is not None: + if not include_stubs and self._is_export_stub(entry): + continue entries.append(entry) if not entries: return "" @@ -1091,6 +1096,22 @@ class BibliographyStore: fields=dict(fields), ) + def _is_export_stub(self, entry: BibEntry) -> bool: + title = " ".join(entry.fields.get("title", "").split()).strip().lower() + doi = " ".join(entry.fields.get("doi", "").split()).strip() + url = " ".join(entry.fields.get("url", "").split()).strip() + has_author = bool(" ".join(entry.fields.get("author", "").split()).strip()) + has_abstract = bool(" ".join(entry.fields.get("abstract", "").split()).strip()) + has_journal = bool(" ".join(entry.fields.get("journal", "").split()).strip()) + has_booktitle = bool(" ".join(entry.fields.get("booktitle", "").split()).strip()) + if not doi: + return False + if title and not (title.startswith("referenced work ") or title.startswith("untitled")): + return False + return not any((has_author, has_abstract, has_journal, has_booktitle)) and ( + not url or url.startswith("https://doi.org/") + ) + def _load_creator_names(self, citation_key: str, role: str) -> list[str]: rows = self.connection.execute( """ diff --git a/tests/test_cli.py b/tests/test_cli.py index 57ed995..06a33ce 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -74,6 +74,42 @@ def test_cli_ingest_show_search_and_export(tmp_path: Path): assert "@article{smith2024graphs," in exported +def test_cli_export_skips_stub_entries_by_default_but_can_include_them(tmp_path: Path): + bib_path = tmp_path / "input.bib" + bib_path.write_text( + """ +@misc{stubdoi, + title = {Referenced work 6}, + doi = {10.1200/JCO.2002.04.117}, + url = {https://doi.org/10.1200/JCO.2002.04.117} +} + +@article{realentry, + author = {Smith, Jane}, + title = {Real Entry}, + year = {2024}, + doi = {10.1000/real} +} +""", + encoding="utf-8", + ) + + assert run_cli(tmp_path, "ingest", str(bib_path)).returncode == 0 + + default_export = run_cli(tmp_path, "export") + assert default_export.returncode == 0 + assert "@article{realentry," in default_export.stdout + assert "@misc{stubdoi," not in default_export.stdout + + explicit_export = run_cli(tmp_path, "export", "stubdoi") + assert explicit_export.returncode == 0 + assert "@misc{stubdoi," in explicit_export.stdout + + include_export = run_cli(tmp_path, "export", "--include-stubs") + assert include_export.returncode == 0 + assert "@misc{stubdoi," in include_export.stdout + + def test_cli_provenance_and_status_updates(tmp_path: Path): bib_path = tmp_path / "input.bib" bib_path.write_text(SAMPLE_BIB, encoding="utf-8") @@ -1140,6 +1176,52 @@ def test_cli_export_topic(tmp_path: Path): assert "@article{seed2024," in exported +def test_cli_export_topic_skips_stub_entries_by_default(tmp_path: Path): + bib_path = tmp_path / "input.bib" + bib_path.write_text( + """ +@misc{stubdoi, + title = {Referenced work 6}, + doi = {10.1200/JCO.2002.04.117}, + url = {https://doi.org/10.1200/JCO.2002.04.117} +} + +@article{seed2024, + author = {Seed, Alice}, + title = {Seed Paper}, + year = {2024} +} +""", + encoding="utf-8", + ) + assert run_cli(tmp_path, "ingest", str(bib_path)).returncode == 0 + + from citegeist.storage import BibliographyStore + + database = tmp_path / "library.sqlite3" + store = BibliographyStore(database) + try: + for citation_key in ("stubdoi", "seed2024"): + store.add_entry_topic( + citation_key, + topic_slug="graph-methods", + topic_name="Graph Methods", + source_label="topic-seed", + ) + store.connection.commit() + finally: + store.close() + + default_export = run_cli(tmp_path, "export-topic", "graph-methods") + assert default_export.returncode == 0 + assert "@article{seed2024," in default_export.stdout + assert "@misc{stubdoi," not in default_export.stdout + + include_export = run_cli(tmp_path, "export-topic", "graph-methods", "--include-stubs") + assert include_export.returncode == 0 + assert "@misc{stubdoi," in include_export.stdout + + def test_cli_search_can_filter_by_topic(tmp_path: Path): bib_path = tmp_path / "input.bib" bib_path.write_text( diff --git a/tests/test_storage.py b/tests/test_storage.py index ad0a7fa..6cef433 100644 --- a/tests/test_storage.py +++ b/tests/test_storage.py @@ -69,6 +69,39 @@ def test_store_exports_bibtex_from_normalized_rows(): store.close() +def test_store_export_skips_doi_only_stub_by_default(): + store = BibliographyStore() + try: + store.ingest_bibtex( + """ +@misc{stubdoi, + title = {Referenced work 6}, + doi = {10.1200/JCO.2002.04.117}, + url = {https://doi.org/10.1200/JCO.2002.04.117} +} + +@article{realentry, + author = {Smith, Jane}, + title = {Real Entry}, + year = {2024}, + doi = {10.1000/real} +} +""" + ) + + exported = store.export_bibtex() + assert "@article{realentry," in exported + assert "@misc{stubdoi," not in exported + + explicit = store.export_bibtex(["stubdoi"]) + assert "@misc{stubdoi," in explicit + + with_stubs = store.export_bibtex(include_stubs=True) + assert "@misc{stubdoi," in with_stubs + finally: + store.close() + + def test_store_records_provenance_and_review_status(): store = BibliographyStore() try: