Add misc re-enrichment mode

This commit is contained in:
welsberr 2026-03-20 15:35:57 -04:00
parent 753b8a2ccf
commit ae68ceaa3c
8 changed files with 169 additions and 3 deletions

View File

@ -134,6 +134,7 @@ PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 bootstrap --to
PYTHONPATH=src .venv/bin/python -m citegeist extract references.txt --output draft.bib
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 resolve smith2024graphs
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 resolve-stubs --doi-only --preview --limit 25
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 resolve-stubs --doi-only --all-misc --limit 25
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 topics
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 topic-entries abiogenesis
PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 export-topic abiogenesis --output abiogenesis.bib

View File

@ -199,6 +199,18 @@ Enrich DOI-bearing placeholder records inside one topic slice:
.venv/bin/python -m citegeist --db library.sqlite3 resolve-stubs --doi-only --topic artificial-life --limit 25
```
Preview all current `@misc` entries with DOIs, not just placeholder-like stubs:
```bash
.venv/bin/python -m citegeist --db library.sqlite3 resolve-stubs --doi-only --all-misc --preview --limit 25
```
Re-enrich all current `@misc` entries with DOIs:
```bash
.venv/bin/python -m citegeist --db library.sqlite3 resolve-stubs --doi-only --all-misc --limit 25
```
## Explore Citation Graphs
Purpose: traverse citation edges, export graph data, and render quick visualizations.

View File

@ -82,6 +82,11 @@ def build_parser() -> argparse.ArgumentParser:
action="store_true",
help="Only consider candidates that already have a DOI",
)
resolve_stubs_parser.add_argument(
"--all-misc",
action="store_true",
help="Consider all stored @misc entries instead of only placeholder-like stub records",
)
resolve_stubs_parser.add_argument(
"--topic",
help="Optional topic slug to limit candidate selection",
@ -533,7 +538,7 @@ def main(argv: list[str] | None = None) -> int:
if args.command == "resolve":
return _run_resolve(store, args.citation_keys)
if args.command == "resolve-stubs":
return _run_resolve_stubs(store, args.limit, args.doi_only, args.topic, args.preview)
return _run_resolve_stubs(store, args.limit, args.doi_only, args.all_misc, args.topic, args.preview)
if args.command == "graph":
return _run_graph(
store,
@ -824,13 +829,15 @@ def _run_resolve_stubs(
store: BibliographyStore,
limit: int,
doi_only: bool,
all_misc: bool,
topic_slug: str | None,
preview: bool,
) -> int:
candidates = store.list_resolution_candidates(
limit=limit,
doi_only=doi_only,
stub_only=True,
stub_only=not all_misc,
misc_only=all_misc,
topic_slug=topic_slug,
)
if preview:

View File

@ -292,7 +292,7 @@ def merge_entries_with_conflicts(base: BibEntry, resolved: BibEntry) -> tuple[Bi
merged_fields[key] = value
return (
BibEntry(
entry_type=base.entry_type or resolved.entry_type,
entry_type=_merged_entry_type(base.entry_type, resolved.entry_type),
citation_key=base.citation_key,
fields=merged_fields,
),
@ -310,6 +310,12 @@ def _is_placeholder_value(field_name: str, value: str) -> bool:
return False
def _merged_entry_type(base_entry_type: str, resolved_entry_type: str) -> str:
if base_entry_type == "misc" and resolved_entry_type and resolved_entry_type != "misc":
return resolved_entry_type
return base_entry_type or resolved_entry_type
def _crossref_message_to_entry(message: dict) -> BibEntry:
entry_type = _crossref_type_to_bibtype(message.get("type", "article"))
title_values = message.get("title", [])

View File

@ -472,6 +472,7 @@ class BibliographyStore:
limit: int = 50,
doi_only: bool = False,
stub_only: bool = False,
misc_only: bool = False,
topic_slug: str | None = None,
) -> list[dict[str, object]]:
clauses: list[str] = []
@ -489,6 +490,9 @@ class BibliographyStore:
if doi_only:
clauses.append("e.doi IS NOT NULL AND TRIM(e.doi) <> ''")
if misc_only:
clauses.append("e.entry_type = 'misc'")
if stub_only:
clauses.append(
"""

View File

@ -218,6 +218,34 @@ def test_cli_resolve_stubs_preview_lists_doi_stub_candidates(tmp_path: Path):
assert payload[0]["title"] == "Referenced work 6"
def test_cli_resolve_stubs_preview_can_target_all_misc_entries(tmp_path: Path):
    """``--all-misc --preview`` lists the DOI-bearing @misc entry and not the @article."""
    bibtex_source = """
@misc{miscwithtitle,
author = {Doe, Alex},
title = {Avida Conference Record},
year = {2005},
doi = {10.1117/12.512613}
}
@article{complete,
author = {Smith, Jane},
title = {Complete Record},
year = {2024},
doi = {10.1000/complete}
}
"""
    source_bib = tmp_path / "input.bib"
    source_bib.write_text(bibtex_source, encoding="utf-8")

    ingest = run_cli(tmp_path, "ingest", str(source_bib))
    assert ingest.returncode == 0

    preview = run_cli(
        tmp_path,
        "resolve-stubs",
        "--doi-only",
        "--all-misc",
        "--preview",
        "--limit",
        "10",
    )
    assert preview.returncode == 0

    # Only the @misc record should be offered as a candidate.
    listed_keys = [row["citation_key"] for row in json.loads(preview.stdout)]
    assert listed_keys == ["miscwithtitle"]
def test_cli_resolve_stubs_enriches_matching_candidates(tmp_path: Path):
bib_path = tmp_path / "input.bib"
bib_path.write_text(
@ -271,6 +299,63 @@ def test_cli_resolve_stubs_enriches_matching_candidates(tmp_path: Path):
assert payload["review_status"] == "enriched"
def test_cli_resolve_stubs_can_enrich_all_misc_entries(tmp_path: Path):
    """Enriching with ``--all-misc`` upgrades a titled @misc entry from resolver data.

    The stored title is kept, the resolver's entry type and ``booktitle`` are
    adopted, and the differing title is reported as a field conflict.
    """
    source_bib = tmp_path / "input.bib"
    source_bib.write_text(
        """
@misc{miscwithtitle,
author = {Doe, Alex},
title = {Avida Conference Record},
year = {2005},
doi = {10.1117/12.512613}
}
""",
        encoding="utf-8",
    )
    ingest = run_cli(tmp_path, "ingest", str(source_bib))
    assert ingest.returncode == 0

    from citegeist.bibtex import BibEntry
    from citegeist.resolve import Resolution

    database = tmp_path / "library.sqlite3"
    resolved_entry = BibEntry(
        entry_type="inproceedings",
        citation_key="resolvedkey",
        fields={
            "author": "Koza, J. R.",
            "title": "Genetic Programming IV: Routine Human-Competitive Machine Intelligence",
            "year": "2005",
            "booktitle": "Genetic and Evolutionary Computation Conference",
            "doi": "10.1117/12.512613",
        },
    )
    canned_resolution = Resolution(
        entry=resolved_entry,
        source_type="resolver",
        source_label="crossref:doi:10.1117/12.512613",
    )
    cli_args = [
        "--db",
        str(database),
        "resolve-stubs",
        "--doi-only",
        "--all-misc",
        "--limit",
        "10",
    ]

    # The resolver is stubbed so the in-process CLI run never reaches the network.
    with patch(
        "citegeist.cli.MetadataResolver.resolve_entry",
        return_value=canned_resolution,
    ):
        exit_code = main(cli_args)
    assert exit_code == 0

    shown = run_cli(tmp_path, "show", "--conflicts", "miscwithtitle")
    record = json.loads(shown.stdout)
    assert record["entry_type"] == "inproceedings"
    assert record["title"] == "Avida Conference Record"
    assert record["booktitle"] == "Genetic and Evolutionary Computation Conference"

    conflicting_fields = {item["field_name"] for item in record["field_conflicts"]}
    assert "title" in conflicting_fields
def test_cli_resolve_conflicts_updates_status(tmp_path: Path):
bib_path = tmp_path / "input.bib"
bib_path.write_text(

View File

@ -128,6 +128,30 @@ def test_merge_entries_replaces_placeholder_titles_without_conflict():
assert conflicts == []
def test_merge_entries_upgrades_misc_type_when_resolver_has_better_type():
    """A ``misc`` base entry takes the resolver's type; the differing title is a conflict."""
    stored_title = "Avida Conference Record"
    resolver_title = "Genetic Programming IV"

    base = BibEntry(
        entry_type="misc",
        citation_key="miscwithtitle",
        fields={"title": stored_title, "doi": "10.1117/12.512613"},
    )
    resolved = BibEntry(
        entry_type="inproceedings",
        citation_key="resolved",
        fields={"title": resolver_title, "booktitle": "GECCO"},
    )
    expected_conflict = {
        "field_name": "title",
        "current_value": stored_title,
        "proposed_value": resolver_title,
    }

    merged, conflicts = merge_entries_with_conflicts(base, resolved)

    assert merged.entry_type == "inproceedings"
    assert conflicts == [expected_conflict]
def test_resolver_tries_doi_before_dblp():
resolver = MetadataResolver()
calls: list[tuple[str, str]] = []

View File

@ -354,6 +354,33 @@ def test_store_lists_stub_resolution_candidates():
store.close()
def test_store_can_list_all_misc_resolution_candidates():
    """``misc_only=True`` with ``doi_only=True`` returns the @misc entry and not the @article."""
    bibtex_source = """
@misc{miscwithtitle,
author = {Doe, Alex},
title = {Avida Conference Record},
year = {2005},
doi = {10.1117/12.512613}
}
@article{complete,
author = {Smith, Jane},
title = {Complete Record},
year = {2024},
doi = {10.1000/complete}
}
"""
    store = BibliographyStore()
    try:
        store.ingest_bibtex(bibtex_source)
        rows = store.list_resolution_candidates(
            limit=10,
            doi_only=True,
            misc_only=True,
        )
        listed_keys = [row["citation_key"] for row in rows]
        assert listed_keys == ["miscwithtitle"]
    finally:
        # Close the store even when an assertion above fails.
        store.close()
def test_store_can_stage_and_review_topic_phrase_suggestion():
store = BibliographyStore()
try: