From ae68ceaa3cd19446f903055403a67aa1e41a273f Mon Sep 17 00:00:00 2001 From: welsberr Date: Fri, 20 Mar 2026 15:35:57 -0400 Subject: [PATCH] Add misc re-enrichment mode --- README.md | 1 + examples/cli/README.md | 12 ++++++ src/citegeist/cli.py | 11 +++++- src/citegeist/resolve.py | 8 +++- src/citegeist/storage.py | 4 ++ tests/test_cli.py | 85 ++++++++++++++++++++++++++++++++++++++++ tests/test_resolve.py | 24 ++++++++++++ tests/test_storage.py | 27 +++++++++++++ 8 files changed, 169 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index f55af39..5196ee9 100644 --- a/README.md +++ b/README.md @@ -134,6 +134,7 @@ PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 bootstrap --to PYTHONPATH=src .venv/bin/python -m citegeist extract references.txt --output draft.bib PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 resolve smith2024graphs PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 resolve-stubs --doi-only --preview --limit 25 +PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 resolve-stubs --doi-only --all-misc --limit 25 PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 topics PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 topic-entries abiogenesis PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 export-topic abiogenesis --output abiogenesis.bib diff --git a/examples/cli/README.md b/examples/cli/README.md index b4c16fe..bf029fd 100644 --- a/examples/cli/README.md +++ b/examples/cli/README.md @@ -199,6 +199,18 @@ Enrich DOI-bearing placeholder records inside one topic slice: .venv/bin/python -m citegeist --db library.sqlite3 resolve-stubs --doi-only --topic artificial-life --limit 25 ``` +Preview all current `@misc` entries with DOIs, not just placeholder-like stubs: + +```bash +.venv/bin/python -m citegeist --db library.sqlite3 resolve-stubs --doi-only --all-misc --preview --limit 25 +``` + +Re-enrich all current `@misc` entries with DOIs: + +```bash +.venv/bin/python -m citegeist --db library.sqlite3 resolve-stubs --doi-only --all-misc --limit 25 +``` + ## Explore Citation Graphs Purpose: traverse citation edges, export graph data, and render quick visualizations. diff --git a/src/citegeist/cli.py b/src/citegeist/cli.py index b820016..4cd3092 100644 --- a/src/citegeist/cli.py +++ b/src/citegeist/cli.py @@ -82,6 +82,11 @@ def build_parser() -> argparse.ArgumentParser: action="store_true", help="Only consider candidates that already have a DOI", ) + resolve_stubs_parser.add_argument( + "--all-misc", + action="store_true", + help="Consider all stored @misc entries instead of only placeholder-like stub records", + ) resolve_stubs_parser.add_argument( "--topic", help="Optional topic slug to limit candidate selection", @@ -533,7 +538,7 @@ def main(argv: list[str] | None = None) -> int: if args.command == "resolve": return _run_resolve(store, args.citation_keys) if args.command == "resolve-stubs": - return _run_resolve_stubs(store, args.limit, args.doi_only, args.topic, args.preview) + return _run_resolve_stubs(store, args.limit, args.doi_only, args.all_misc, args.topic, args.preview) if args.command == "graph": return _run_graph( store, @@ -824,13 +829,15 @@ def _run_resolve_stubs( store: BibliographyStore, limit: int, doi_only: bool, + all_misc: bool, topic_slug: str | None, preview: bool, ) -> int: candidates = store.list_resolution_candidates( limit=limit, doi_only=doi_only, - stub_only=True, + stub_only=not all_misc, + misc_only=all_misc, topic_slug=topic_slug, ) if preview: diff --git a/src/citegeist/resolve.py b/src/citegeist/resolve.py index f795193..8957c79 100644 --- a/src/citegeist/resolve.py +++ b/src/citegeist/resolve.py @@ -292,7 +292,7 @@ def merge_entries_with_conflicts(base: BibEntry, resolved: BibEntry) -> tuple[Bi merged_fields[key] = value return ( BibEntry( - entry_type=base.entry_type or resolved.entry_type, + entry_type=_merged_entry_type(base.entry_type, resolved.entry_type), citation_key=base.citation_key, fields=merged_fields, ), @@ -310,6 +310,12 @@ def _is_placeholder_value(field_name: str, value: str) -> bool: return False +def _merged_entry_type(base_entry_type: str, resolved_entry_type: str) -> str: + if base_entry_type == "misc" and resolved_entry_type and resolved_entry_type != "misc": + return resolved_entry_type + return base_entry_type or resolved_entry_type + + def _crossref_message_to_entry(message: dict) -> BibEntry: entry_type = _crossref_type_to_bibtype(message.get("type", "article")) title_values = message.get("title", []) diff --git a/src/citegeist/storage.py b/src/citegeist/storage.py index ee753e5..519c152 100644 --- a/src/citegeist/storage.py +++ b/src/citegeist/storage.py @@ -472,6 +472,7 @@ class BibliographyStore: limit: int = 50, doi_only: bool = False, stub_only: bool = False, + misc_only: bool = False, topic_slug: str | None = None, ) -> list[dict[str, object]]: clauses: list[str] = [] @@ -489,6 +490,9 @@ class BibliographyStore: if doi_only: clauses.append("e.doi IS NOT NULL AND TRIM(e.doi) <> ''") + if misc_only: + clauses.append("e.entry_type = 'misc'") + if stub_only: clauses.append( """ diff --git a/tests/test_cli.py b/tests/test_cli.py index 06a33ce..857c0f8 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -218,6 +218,34 @@ def test_cli_resolve_stubs_preview_lists_doi_stub_candidates(tmp_path: Path): assert payload[0]["title"] == "Referenced work 6" +def test_cli_resolve_stubs_preview_can_target_all_misc_entries(tmp_path: Path): + bib_path = tmp_path / "input.bib" + bib_path.write_text( + """ +@misc{miscwithtitle, + author = {Doe, Alex}, + title = {Avida Conference Record}, + year = {2005}, + doi = {10.1117/12.512613} +} + +@article{complete, + author = {Smith, Jane}, + title = {Complete Record}, + year = {2024}, + doi = {10.1000/complete} +} +""", + encoding="utf-8", + ) + assert run_cli(tmp_path, "ingest", str(bib_path)).returncode == 0 + + result = run_cli(tmp_path, "resolve-stubs", "--doi-only", "--all-misc", "--preview", "--limit", "10") + assert result.returncode == 0 + payload = json.loads(result.stdout) + assert [row["citation_key"] for row in payload] == ["miscwithtitle"] + + def test_cli_resolve_stubs_enriches_matching_candidates(tmp_path: Path): bib_path = tmp_path / "input.bib" bib_path.write_text( @@ -271,6 +299,63 @@ def test_cli_resolve_stubs_enriches_matching_candidates(tmp_path: Path): assert payload["review_status"] == "enriched" +def test_cli_resolve_stubs_can_enrich_all_misc_entries(tmp_path: Path): + bib_path = tmp_path / "input.bib" + bib_path.write_text( + """ +@misc{miscwithtitle, + author = {Doe, Alex}, + title = {Avida Conference Record}, + year = {2005}, + doi = {10.1117/12.512613} +} +""", + encoding="utf-8", + ) + assert run_cli(tmp_path, "ingest", str(bib_path)).returncode == 0 + + from citegeist.bibtex import BibEntry + from citegeist.resolve import Resolution + + database = tmp_path / "library.sqlite3" + + with patch("citegeist.cli.MetadataResolver.resolve_entry") as mocked_resolve: + mocked_resolve.return_value = Resolution( + entry=BibEntry( + entry_type="inproceedings", + citation_key="resolvedkey", + fields={ + "author": "Koza, J. R.", + "title": "Genetic Programming IV: Routine Human-Competitive Machine Intelligence", + "year": "2005", + "booktitle": "Genetic and Evolutionary Computation Conference", + "doi": "10.1117/12.512613", + }, + ), + source_type="resolver", + source_label="crossref:doi:10.1117/12.512613", + ) + exit_code = main( + [ + "--db", + str(database), + "resolve-stubs", + "--doi-only", + "--all-misc", + "--limit", + "10", + ] + ) + + assert exit_code == 0 + show = run_cli(tmp_path, "show", "--conflicts", "miscwithtitle") + payload = json.loads(show.stdout) + assert payload["entry_type"] == "inproceedings" + assert payload["title"] == "Avida Conference Record" + assert payload["booktitle"] == "Genetic and Evolutionary Computation Conference" + assert "title" in {item["field_name"] for item in payload["field_conflicts"]} + + def test_cli_resolve_conflicts_updates_status(tmp_path: Path): bib_path = tmp_path / "input.bib" bib_path.write_text( diff --git a/tests/test_resolve.py b/tests/test_resolve.py index 22c2c89..da08dc8 100644 --- a/tests/test_resolve.py +++ b/tests/test_resolve.py @@ -128,6 +128,30 @@ def test_merge_entries_replaces_placeholder_titles_without_conflict(): assert conflicts == [] +def test_merge_entries_upgrades_misc_type_when_resolver_has_better_type(): + base = BibEntry( + entry_type="misc", + citation_key="miscwithtitle", + fields={"title": "Avida Conference Record", "doi": "10.1117/12.512613"}, + ) + resolved = BibEntry( + entry_type="inproceedings", + citation_key="resolved", + fields={"title": "Genetic Programming IV", "booktitle": "GECCO"}, + ) + + merged, conflicts = merge_entries_with_conflicts(base, resolved) + + assert merged.entry_type == "inproceedings" + assert conflicts == [ + { + "field_name": "title", + "current_value": "Avida Conference Record", + "proposed_value": "Genetic Programming IV", + } + ] + + def test_resolver_tries_doi_before_dblp(): resolver = MetadataResolver() calls: list[tuple[str, str]] = [] diff --git a/tests/test_storage.py b/tests/test_storage.py index 6cef433..3211c00 100644 --- a/tests/test_storage.py +++ b/tests/test_storage.py @@ -354,6 +354,33 @@ def test_store_lists_stub_resolution_candidates(): store.close() +def test_store_can_list_all_misc_resolution_candidates(): + store = BibliographyStore() + try: + store.ingest_bibtex( + """ +@misc{miscwithtitle, + author = {Doe, Alex}, + title = {Avida Conference Record}, + year = {2005}, + doi = {10.1117/12.512613} +} + +@article{complete, + author = {Smith, Jane}, + title = {Complete Record}, + year = {2024}, + doi = {10.1000/complete} +} +""" + ) + + candidates = store.list_resolution_candidates(limit=10, doi_only=True, misc_only=True) + assert [row["citation_key"] for row in candidates] == ["miscwithtitle"] + finally: + store.close() + + def test_store_can_stage_and_review_topic_phrase_suggestion(): store = BibliographyStore() try: