From 4894341ba863beb535c8a08ad845745d81e884c0 Mon Sep 17 00:00:00 2001
From: welsberr
Date: Tue, 21 Apr 2026 03:15:33 -0400
Subject: [PATCH] LLM verify + fixes + tests

---
 .gitignore                       |   2 +
 Makefile                         |   5 +-
 README.md                        |  53 ++++++++
 scripts/live_verify_llm_smoke.py |  97 ++++++++++++++
 src/citegeist/__init__.py        |   3 +
 src/citegeist/cli.py             |  58 +++++++-
 src/citegeist/expand.py          |   4 +-
 src/citegeist/llm_verify.py      | 218 +++++++++++++++++++++++++++++++
 src/citegeist/storage.py         |  10 +-
 src/citegeist/verify.py          |  68 +++++++++-
 tests/test_cli.py                |  65 +++++++++
 tests/test_live_llm_verify.py    |  78 +++++++++++
 tests/test_verify.py             | 100 ++++++++++++++
 13 files changed, 751 insertions(+), 10 deletions(-)
 create mode 100644 scripts/live_verify_llm_smoke.py
 create mode 100644 src/citegeist/llm_verify.py
 create mode 100644 tests/test_live_llm_verify.py

diff --git a/.gitignore b/.gitignore
index 301a038..db536e2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,5 @@ __pycache__/
 *.egg-info/
 library.sqlite3
 ops/
+.codex
+SESSION_*
diff --git a/Makefile b/Makefile
index 6456e47..ac6dffd 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
 PYTHONPATH_SRC=PYTHONPATH=src
 VENV_PYTHON=.venv/bin/python
 
-.PHONY: test test-live live-smoke validate-talkorigins
+.PHONY: test test-live live-smoke live-verify-llm-smoke validate-talkorigins
 
 test:
 	$(PYTHONPATH_SRC) $(VENV_PYTHON) -m pytest -q
@@ -12,5 +12,8 @@ test-live:
 live-smoke:
 	CITEGEIST_SOURCE_CACHE=.cache/citegeist $(PYTHONPATH_SRC) $(VENV_PYTHON) scripts/live_smoke.py
 
+live-verify-llm-smoke:
+	$(PYTHONPATH_SRC) $(VENV_PYTHON) scripts/live_verify_llm_smoke.py
+
 validate-talkorigins:
 	$(PYTHONPATH_SRC) $(VENV_PYTHON) -m citegeist validate-talkorigins talkorigins-out/talkorigins_manifest.json
diff --git a/README.md b/README.md
index 054fe01..c560f82 100644
--- a/README.md
+++ b/README.md
@@ -172,6 +172,7 @@ PYTHONPATH=src .venv/bin/python -m citegeist compare-extract references.txt --ba
 PYTHONPATH=src .venv/bin/python -m citegeist compare-extract references.txt --backend heuristic --backend grobid --summary --output compare-summary.json
 PYTHONPATH=src .venv/bin/python -m citegeist compare-extract references.txt --backend heuristic --backend grobid --summary --max-rows-with-differences 0 --output compare-check.json
 PYTHONPATH=src .venv/bin/python -m citegeist verify --string '"Graph-first bibliography augmentation" Smith 2024' --context "citation graphs" --format json
+PYTHONPATH=src .venv/bin/python -m citegeist verify --string 'Evans 1960' --context "bottlenose dolphin echolocation" --llm --llm-base-url http://localhost:11434 --llm-model qwen3 --llm-role both --format json
 PYTHONPATH=src .venv/bin/python -m citegeist verify --bib draft.bib --output verified.bib
 PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 resolve smith2024graphs
 PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 resolve-stubs --doi-only --preview --limit 25
@@ -257,6 +258,58 @@ The built-in extraction backends are:
 
 The backend interface exists so future GROBID- or other parser adapters can be registered without replacing the local parser or changing the CLI contract.
 
+## LLM-Assisted Verify
+
+`citegeist verify` can optionally use a local LLM for two bounded tasks:
+
+- `expand`: infer missing bibliographic clues from free text and context
+- `rerank`: advisory reranking of already fetched resolver candidates
+
+Example:
+
+```bash
+PYTHONPATH=src .venv/bin/python -m citegeist verify \
+  --string 'Evans 1960' \
+  --context "bottlenose dolphin echolocation" \
+  --llm \
+  --llm-base-url http://localhost:11434 \
+  --llm-model qwen3 \
+  --llm-role both \
+  --format json
+```
+
+Supported local endpoint styles:
+
+- OpenAI-compatible APIs such as `http://localhost:11434/v1`
+- Ollama native chat APIs such as `http://localhost:11434`
+
+For the current local GenieHive setup, this also works directly:
+
+```bash
+PYTHONPATH=src .venv/bin/python -m citegeist verify \
+  --string 'Evans 1960' \
+  --context "bottlenose dolphin echolocation" \
+  --llm \
+  --llm-base-url http://127.0.0.1:8800/v1 \
+  --llm-api-key change-me-client-key \
+  --llm-model general_assistant \
+  --llm-role both \
+  --format json
+```
+
+There is also a local smoke script for the LLM helper path alone:
+
+```bash
+make live-verify-llm-smoke
+```
+
+Safety constraints:
+
+- the LLM is never trusted for DOI or identifier invention
+- the LLM only fills missing query clues or suggests candidate order
+- `exact` status still requires verified resolver evidence, not LLM output
+- if the LLM fails or returns unusable JSON, `verify` falls back to the normal resolver-only path
+
 To compare backend output on the same plaintext references, use `compare-extract`. It aligns entries by ordinal/reference block and emits JSON with per-backend payloads plus a `differing_fields` summary for each row. Add `--summary` when you want a compact evaluation artifact with disagreement counts by field and backend presence counts instead of the full row-by-row payload. Add `--max-rows-with-differences` and/or `--max-field-difference-count` when you want CI-style failure thresholds; the command will emit the summary JSON and return a nonzero exit code if the limits are exceeded.
 
 For regression-oriented parser work, keep a small curated plaintext fixture set and run `compare-extract` against multiple backends before changing heuristics. That makes backend disagreement explicit and gives you a stable review artifact for parser changes.
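[Editor's note: the README section above covers the CLI path; the same flow is callable from Python. A minimal sketch against the interfaces this patch adds (`VerificationLlmConfig`, `BibliographyVerifier.verify_string`); the base URL and model name are placeholders for whatever the local endpoint actually exposes:]

```python
from citegeist.llm_verify import VerificationLlmConfig
from citegeist.verify import BibliographyVerifier

# Placeholder endpoint and model; substitute whatever the local server serves.
llm_config = VerificationLlmConfig(
    base_url="http://localhost:11434",
    model="qwen3",
    role="both",  # "expand", "rerank", or "both"
)
verifier = BibliographyVerifier(llm_config=llm_config)
result = verifier.verify_string(
    "Evans 1960",
    context="bottlenose dolphin echolocation",
    limit=5,
)
# `exact` still requires resolver evidence; LLM hints only shape the search.
print(result.status, result.confidence, result.entry.citation_key)
```

[Leaving `llm_config` unset (the default) preserves the resolver-only behavior, so the LLM flags stay strictly additive.]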
diff --git a/scripts/live_verify_llm_smoke.py b/scripts/live_verify_llm_smoke.py
new file mode 100644
index 0000000..2829e61
--- /dev/null
+++ b/scripts/live_verify_llm_smoke.py
@@ -0,0 +1,97 @@
+from __future__ import annotations
+
+import argparse
+import json
+import os
+
+from citegeist.bibtex import BibEntry
+from citegeist.llm_verify import VerificationLlmClient, VerificationLlmConfig
+
+
+def build_parser() -> argparse.ArgumentParser:
+    parser = argparse.ArgumentParser(description="Run live LLM verify smoke checks against a local OpenAI-compatible endpoint")
+    parser.add_argument(
+        "--base-url",
+        default=os.environ.get("CITEGEIST_VERIFY_LLM_BASE_URL", "http://127.0.0.1:8800/v1"),
+        help="OpenAI-compatible or Ollama base URL",
+    )
+    parser.add_argument(
+        "--model",
+        default=os.environ.get("CITEGEIST_VERIFY_LLM_MODEL", "general_assistant"),
+        help="Model or route ID exposed by the local endpoint",
+    )
+    parser.add_argument(
+        "--api-key",
+        default=os.environ.get("CITEGEIST_VERIFY_LLM_API_KEY", "change-me-client-key"),
+        help="Optional API key for the local endpoint",
+    )
+    parser.add_argument(
+        "--provider",
+        default=os.environ.get("CITEGEIST_VERIFY_LLM_PROVIDER", "auto"),
+        choices=["auto", "openai", "ollama-native"],
+        help="Endpoint protocol style",
+    )
+    return parser
+
+
+def main() -> int:
+    args = build_parser().parse_args()
+    client = VerificationLlmClient()
+    config = VerificationLlmConfig(
+        base_url=args.base_url,
+        model=args.model,
+        api_key=args.api_key,
+        provider=args.provider,
+        role="both",
+    )
+
+    analysis = client.analyze_query(
+        config,
+        "Evans 1960",
+        "marine mammals; bottlenose dolphin echolocation",
+    )
+    rerank = client.rerank_candidates(
+        config,
+        {"title": "", "authors": ["Evans"], "year": "1960", "venue": ""},
+        "bottlenose dolphin echolocation",
+        [
+            BibEntry(
+                entry_type="article",
+                citation_key="candidate_a",
+                fields={
+                    "author": "Doe, Jane",
+                    "title": "General Marine Biology Survey",
+                    "year": "1960",
+                    "journal": "Marine Science",
+                },
+            ),
+            BibEntry(
+                entry_type="article",
+                citation_key="candidate_b",
+                fields={
+                    "author": "Evans, William",
+                    "title": "Echolocation by marine dolphins",
+                    "year": "1960",
+                    "journal": "Journal of the Acoustical Society",
+                },
+            ),
+        ],
+    )
+
+    print(
+        json.dumps(
+            {
+                "base_url": args.base_url,
+                "model": args.model,
+                "analysis": analysis,
+                "rerank": rerank,
+            },
+            indent=2,
+            sort_keys=True,
+        )
+    )
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/src/citegeist/__init__.py b/src/citegeist/__init__.py
index a906e56..8ced583 100644
--- a/src/citegeist/__init__.py
+++ b/src/citegeist/__init__.py
@@ -13,6 +13,7 @@ from .extract import (
     summarize_extraction_comparison,
 )
 from .harvest import OaiMetadataFormat, OaiPmhHarvester, OaiSet
+from .llm_verify import VerificationLlmClient, VerificationLlmConfig
 from .resolve import MetadataResolver, merge_entries, merge_entries_with_conflicts
 from .sources import SourceClient
 from .storage import BibliographyStore
@@ -34,6 +35,8 @@ __all__ = [
     "OaiMetadataFormat",
     "OaiSet",
     "SourceClient",
+    "VerificationLlmClient",
+    "VerificationLlmConfig",
     "VerificationMatch",
     "VerificationResult",
     "available_extraction_backends",
diff --git a/src/citegeist/cli.py b/src/citegeist/cli.py
index 0d729a8..3ff99b0 100644
--- a/src/citegeist/cli.py
+++ b/src/citegeist/cli.py
@@ -20,6 +20,7 @@ from .extract import (
     summarize_extraction_comparison,
 )
 from .harvest import OaiPmhHarvester
+from .llm_verify import VerificationLlmConfig
 from .resolve import MetadataResolver, merge_entries_with_conflicts
 from .storage import BibliographyStore
 from .verify import BibliographyVerifier, render_verification_results
@@ -145,6 +146,22 @@ def build_parser() -> argparse.ArgumentParser:
     verify_group.add_argument("--bib", help="Path to a BibTeX file whose entries should be verified")
     verify_parser.add_argument("--context", default="", help="Optional topic context used for scoring")
     verify_parser.add_argument("--limit", type=int, default=5, help="Maximum candidates to inspect per input")
+    verify_parser.add_argument("--llm", action="store_true", help="Enable optional local LLM assistance for verify")
+    verify_parser.add_argument("--llm-base-url", help="OpenAI-compatible or Ollama base URL for local LLM assistance")
+    verify_parser.add_argument("--llm-model", help="Model ID for local LLM assistance")
+    verify_parser.add_argument("--llm-api-key", default="", help="Optional API key for the LLM endpoint")
+    verify_parser.add_argument(
+        "--llm-provider",
+        choices=["auto", "openai", "ollama-native"],
+        default="auto",
+        help="LLM API style; auto treats `/v1` endpoints as OpenAI-compatible",
+    )
+    verify_parser.add_argument(
+        "--llm-role",
+        choices=["expand", "rerank", "both"],
+        default="both",
+        help="Use the local LLM for query-clue extraction, candidate reranking, or both",
+    )
     verify_parser.add_argument(
         "--format",
         choices=["bibtex", "json"],
@@ -715,7 +732,21 @@ def main(argv: list[str] | None = None) -> int:
             args.output,
         )
     if args.command == "verify":
-        return _run_verify(args.string, args.list_input, args.bib, args.context, args.limit, args.format, args.output)
+        return _run_verify(
+            args.string,
+            args.list_input,
+            args.bib,
+            args.context,
+            args.limit,
+            args.format,
+            args.output,
+            llm_enabled=args.llm,
+            llm_base_url=args.llm_base_url,
+            llm_model=args.llm_model,
+            llm_api_key=args.llm_api_key,
+            llm_provider=args.llm_provider,
+            llm_role=args.llm_role,
+        )
     if args.command == "resolve":
         return _run_resolve(store, args.citation_keys)
     if args.command == "resolve-stubs":
@@ -750,8 +781,6 @@
             args.rounds,
             args.recent_years,
             args.target_recent_entries,
-            args.max_expanded_entries,
-            args.max_expand_seconds,
         )
     if args.command == "set-topic-phrase":
         return _run_set_topic_phrase(store, args.topic_slug, args.phrase, args.clear)
@@ -785,6 +814,8 @@
             args.expansion_rounds,
             args.recent_years,
             args.target_recent_entries,
+            args.max_expanded_entries,
+            args.max_expand_seconds,
         )
     if args.command == "bootstrap-batch":
         return _run_bootstrap_batch(store, Path(args.input))
@@ -1121,8 +1152,27 @@ def _run_verify(
     limit: int,
     output_format: str,
     output: str | None,
+    *,
+    llm_enabled: bool = False,
+    llm_base_url: str | None = None,
+    llm_model: str | None = None,
+    llm_api_key: str = "",
+    llm_provider: str = "auto",
+    llm_role: str = "both",
 ) -> int:
-    verifier = BibliographyVerifier()
+    llm_config = None
+    if llm_enabled:
+        if not llm_base_url or not llm_model:
+            print("--llm requires --llm-base-url and --llm-model", file=sys.stderr)
+            return 1
+        llm_config = VerificationLlmConfig(
+            base_url=llm_base_url,
+            model=llm_model,
+            api_key=llm_api_key,
+            provider=llm_provider,
+            role=llm_role,
+        )
+    verifier = BibliographyVerifier(llm_config=llm_config)
     if string_input is not None:
         results = [verifier.verify_string(string_input, context=context, limit=limit)]
     elif list_input is not None:
diff --git a/src/citegeist/expand.py b/src/citegeist/expand.py
index 612d9a1..55f3542 100644
--- a/src/citegeist/expand.py
+++ b/src/citegeist/expand.py
@@ -196,7 +196,7 @@
                 )
                 results.append(
                     ExpansionResult(
-                        source_citation_key=citation_key,
+                        source_citation_key=source_key,
                         discovered_citation_key=existing_key or discovered.citation_key,
                         created_entry=created,
                         relation_type=relation_type,
@@ -335,7 +335,7 @@
                         assigned_to_topic=assigned,
                     )
                 )
-                if target_recent_entries is not None and len(recent_topic_hits) >= target_recent_entries:
+                if target_recent_entries is not None and len(recent_hits) >= target_recent_entries:
                     self.last_run_meta.update({
                         "stop_reason": "target_recent_entries",
                         "recent_hits": len(recent_hits),
diff --git a/src/citegeist/llm_verify.py b/src/citegeist/llm_verify.py
new file mode 100644
index 0000000..c072706
--- /dev/null
+++ b/src/citegeist/llm_verify.py
@@ -0,0 +1,218 @@
+from __future__ import annotations
+
+import json
+import re
+import urllib.request
+from dataclasses import dataclass
+from typing import Any, Callable
+
+from .bibtex import BibEntry
+
+
+DEFAULT_SYSTEM_PROMPT = (
+    "You are a meticulous bibliography verification assistant. "
+    "You never invent DOIs, page ranges, venues, or identifiers. "
+    "You may only suggest missing clues from the provided input and context. "
+    "When uncertain, return null or an empty list. "
+    "Always respond with strict JSON matching the requested shape."
+)
+
+
+@dataclass(slots=True)
+class VerificationLlmConfig:
+    base_url: str
+    model: str
+    api_key: str = ""
+    provider: str = "auto"
+    role: str = "both"
+
+    def enabled_for(self, capability: str) -> bool:
+        return bool(self.base_url and self.model) and self.role in {capability, "both"}
+
+
+class VerificationLlmClient:
+    def __init__(
+        self,
+        *,
+        timeout_s: int = 60,
+        post_json: Callable[[str, dict[str, Any], dict[str, str], int], dict[str, Any]] | None = None,
+    ) -> None:
+        self.timeout_s = timeout_s
+        self._post_json = post_json or _default_post_json
+
+    def analyze_query(
+        self,
+        config: VerificationLlmConfig,
+        free_text: str,
+        context: str,
+    ) -> dict[str, Any] | None:
+        if not config.enabled_for("expand"):
+            return None
+        payload = {
+            "task": "extract_bibliographic_clues",
+            "input": {"free_text": free_text, "context": context},
+            "rules": [
+                "Never invent a DOI or identifier.",
+                "Only fill clues that plausibly follow from the input and context.",
+                "Return null for unknown scalar fields.",
+            ],
+            "schema": {
+                "type": "object",
+                "properties": {
+                    "title": {"type": ["string", "null"]},
+                    "authors": {"type": "array", "items": {"type": "string"}},
+                    "year": {"type": ["string", "null"]},
+                    "venue": {"type": ["string", "null"]},
+                    "keywords": {"type": "array", "items": {"type": "string"}},
+                },
+                "required": ["authors", "keywords"],
+            },
+        }
+        result = self._chat_json(config, payload)
+        if not isinstance(result, dict):
+            return None
+        authors = [str(value).strip() for value in result.get("authors", []) if str(value).strip()]
+        keywords = [str(value).strip() for value in result.get("keywords", []) if str(value).strip()]
+        return {
+            "title": _optional_string(result.get("title")),
+            "authors": authors,
+            "year": _optional_string(result.get("year")),
+            "venue": _optional_string(result.get("venue")),
+            "keywords": keywords,
+        }
+
+    def rerank_candidates(
+        self,
+        config: VerificationLlmConfig,
+        query_fields: dict[str, object],
+        context: str,
+        candidates: list[BibEntry],
+    ) -> list[int] | None:
+        if not config.enabled_for("rerank") or not candidates:
+            return None
+        payload = {
+            "task": "rerank_candidates",
+            "instruction": (
+                "Return a JSON array of candidate indices sorted best to worst. "
+                "Do not invent metadata. Prefer candidates that better match the given clues."
+            ),
+            "input": {
+                "query_fields": query_fields,
+                "context": context,
+                "candidates": [
+                    {
+                        "title": entry.fields.get("title", ""),
+                        "authors": entry.fields.get("author", "").split(" and ") if entry.fields.get("author") else [],
+                        "year": entry.fields.get("year", ""),
+                        "venue": entry.fields.get("journal", "") or entry.fields.get("booktitle", ""),
+                        "doi": entry.fields.get("doi", ""),
+                    }
+                    for entry in candidates[:8]
+                ],
+            },
+        }
+        result = self._chat_json(config, payload)
+        if not isinstance(result, list):
+            return None
+        indices = [value for value in result if isinstance(value, int) and 0 <= value < min(len(candidates), 8)]
+        return indices or None
+
+    def _chat_json(self, config: VerificationLlmConfig, payload: dict[str, Any]) -> Any:
+        try:
+            if _llm_mode(config.base_url, config.provider) == "openai":
+                return self._chat_openai(config, payload)
+            return self._chat_ollama_native(config, payload)
+        except Exception:
+            return None
+
+    def _chat_openai(self, config: VerificationLlmConfig, payload: dict[str, Any]) -> Any:
+        headers = {"Content-Type": "application/json"}
+        if config.api_key:
+            headers["Authorization"] = f"Bearer {config.api_key}"
+        body = {
+            "model": config.model,
+            "temperature": 0,
+            "messages": [
+                {"role": "system", "content": DEFAULT_SYSTEM_PROMPT},
+                {"role": "user", "content": json.dumps(payload)},
+            ],
+        }
+        data = self._post_json(
+            config.base_url.rstrip("/") + "/chat/completions",
+            body,
+            headers,
+            self.timeout_s,
+        )
+        content = data["choices"][0]["message"]["content"]
+        return _loads_lenient_json(content)
+
+    def _chat_ollama_native(self, config: VerificationLlmConfig, payload: dict[str, Any]) -> Any:
+        base_url = config.base_url.rstrip("/")
+        if base_url.endswith("/v1"):
+            base_url = base_url[:-3]
+        body = {
+            "model": config.model,
+            "messages": [
+                {"role": "system", "content": DEFAULT_SYSTEM_PROMPT},
+                {"role": "user", "content": json.dumps(payload)},
+            ],
+            "options": {"temperature": 0},
+            "stream": False,
+        }
+        headers = {"Content-Type": "application/json"}
+        if config.api_key:
+            headers["Authorization"] = f"Bearer {config.api_key}"
+        data = self._post_json(
+            base_url + "/api/chat",
+            body,
+            headers,
+            self.timeout_s,
+        )
+        content = data["message"]["content"]
+        return _loads_lenient_json(content)
+
+
+def _default_post_json(url: str, payload: dict[str, Any], headers: dict[str, str], timeout_s: int) -> dict[str, Any]:
+    request = urllib.request.Request(
+        url,
+        data=json.dumps(payload).encode("utf-8"),
+        headers=headers,
+        method="POST",
+    )
+    with urllib.request.urlopen(request, timeout=timeout_s) as response:
+        return json.loads(response.read().decode("utf-8"))
+
+
+def _llm_mode(base_url: str, provider: str) -> str:
+    if provider == "openai":
+        return "openai"
+    if provider == "ollama-native":
+        return "ollama-native"
+    return "openai" if base_url.rstrip("/").endswith("/v1") else "ollama-native"
+
+
+def _optional_string(value: object) -> str | None:
+    text = str(value or "").strip()
+    return text or None
+
+
+def _loads_lenient_json(content: str) -> Any:
+    try:
+        return json.loads(content)
+    except Exception:
+        pass
+
+    fenced = re.search(r"```(?:json)?\s*(\{.*\}|\[.*\])\s*```", content, flags=re.DOTALL)
+    if fenced:
+        return json.loads(fenced.group(1))
+
+    for opener, closer in (("{", "}"), ("[", "]")):
+        start = content.find(opener)
+        end = content.rfind(closer)
+        if start != -1 and end != -1 and end > start:
+            snippet = content[start : end + 1]
+            try:
+                return json.loads(snippet)
+            except Exception:
+                continue
+    raise ValueError("Model response did not contain parseable JSON")
diff --git a/src/citegeist/storage.py b/src/citegeist/storage.py
index dfe4be7..1a08267 100644
--- a/src/citegeist/storage.py
+++ b/src/citegeist/storage.py
@@ -246,7 +246,13 @@ class BibliographyStore:
                 entry.fields.get("isbn"),
                 fulltext,
                 raw_bibtex,
-                json.dumps({k: v for k, v in entry.fields.items() if k not in CORE_ENTRY_FIELDS and k not in RELATION_FIELDS}),
+                json.dumps(
+                    {
+                        k: v
+                        for k, v in entry.fields.items()
+                        if k not in CORE_ENTRY_FIELDS and k not in RELATION_FIELDS and k not in {"author", "editor"}
+                    }
+                ),
             ),
         ).fetchone()
         entry_id = int(row["id"])
@@ -1142,6 +1148,8 @@ class BibliographyStore:
 
         extra_fields = json.loads(row["extra_fields_json"])
         for field_name in sorted(extra_fields):
+            if field_name in {"author", "editor"}:
+                continue
             value = extra_fields[field_name]
             if value:
                 fields[field_name] = str(value)
diff --git a/src/citegeist/verify.py b/src/citegeist/verify.py
index 48104d1..f8e385f 100644
--- a/src/citegeist/verify.py
+++ b/src/citegeist/verify.py
@@ -6,6 +6,7 @@ from dataclasses import dataclass
 from pathlib import Path
 
 from .bibtex import BibEntry, parse_bibtex, render_bibtex
+from .llm_verify import VerificationLlmClient, VerificationLlmConfig
 from .resolve import MetadataResolver, Resolution
 
 
@@ -75,8 +76,16 @@ class VerificationResult:
 
 
 class BibliographyVerifier:
-    def __init__(self, resolver: MetadataResolver | None = None) -> None:
+    def __init__(
+        self,
+        resolver: MetadataResolver | None = None,
+        *,
+        llm_config: VerificationLlmConfig | None = None,
+        llm_client: VerificationLlmClient | None = None,
+    ) -> None:
         self.resolver = resolver or MetadataResolver()
+        self.llm_config = llm_config
+        self.llm_client = llm_client or VerificationLlmClient()
 
     def verify_string(self, value: str, context: str = "", limit: int = 5) -> VerificationResult:
         query_fields = _fields_from_string(value)
@@ -164,10 +173,18 @@
                 input_key=input_key,
             )
 
+        query_fields = _clone_query_fields(query_fields)
+        search_query = query
+        if self.llm_config is not None:
+            hints = self.llm_client.analyze_query(self.llm_config, query, context)
+            if hints:
+                _apply_llm_hints(query_fields, hints)
+                search_query = _build_search_query(search_query, hints)
+
         candidate_limit = max(1, limit)
         candidates = self._collect_candidates(
             title=str(query_fields.get("title", "")),
-            query=query,
+            query=search_query,
            limit=candidate_limit,
         )
         scored = [
@@ -178,9 +195,21 @@
             )
             for entry, source_label in candidates
         ]
+        llm_ranks = _compute_llm_ranks(
+            self.llm_client.rerank_candidates(
+                self.llm_config,
+                query_fields,
+                context,
+                [match.entry for match in scored],
+            )
+            if self.llm_config is not None
+            else None,
+            scored,
+        )
         scored.sort(
             key=lambda item: (
                 -item.score,
+                llm_ranks.get(item.entry.citation_key, len(scored)),
                 item.entry.fields.get("year", ""),
                 item.entry.citation_key,
             )
@@ -255,6 +284,31 @@
     return {"title": title, "authors": authors, "year": year, "venue": ""}
 
 
+def _clone_query_fields(query_fields: dict[str, object]) -> dict[str, object]:
+    cloned = dict(query_fields)
+    authors = cloned.get("authors", [])
+    cloned["authors"] = list(authors) if isinstance(authors, list) else []
+    return cloned
+
+
+def _apply_llm_hints(query_fields: dict[str, object], hints: dict[str, object]) -> None:
+    if not str(query_fields.get("title", "")).strip() and hints.get("title"):
+        query_fields["title"] = str(hints["title"])
+    if not query_fields.get("authors") and hints.get("authors"):
+        query_fields["authors"] = [str(author) for author in hints["authors"] if str(author).strip()]
+    if not str(query_fields.get("year", "")).strip() and hints.get("year"):
+        query_fields["year"] = str(hints["year"])
+    if not str(query_fields.get("venue", "")).strip() and hints.get("venue"):
+        query_fields["venue"] = str(hints["venue"])
+
+
+def _build_search_query(query: str, hints: dict[str, object]) -> str:
+    keywords = [str(value).strip() for value in hints.get("keywords", []) if str(value).strip()]
+    if not keywords:
+        return query
+    return " ".join(part for part in [query, " ".join(keywords[:5])] if part).strip()
+
+
 def _score_candidate(query_fields: dict[str, object], context: str, entry: BibEntry) -> float:
     score = 0.0
     query_title = _tokenize(str(query_fields.get("title", "")))
@@ -371,3 +425,13 @@ def _placeholder_entry(query_fields: dict[str, object], query: str, input_key: s
 def _slugify_key(value: str) -> str:
     slug = re.sub(r"[^a-z0-9]+", "", value.lower())
     return slug[:40] or "verification"
+
+
+def _compute_llm_ranks(order: list[int] | None, matches: list[VerificationMatch]) -> dict[str, int]:
+    if not order:
+        return {}
+    ranks: dict[str, int] = {}
+    for rank, index in enumerate(order):
+        if 0 <= index < len(matches):
+            ranks.setdefault(matches[index].entry.citation_key, rank)
+    return ranks
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 6a0fe92..66b6ad9 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -247,6 +247,71 @@ def test_cli_verify_bib_outputs_json(tmp_path: Path):
     assert payload[0]["entry"]["citation_key"] == "candidate2024"
 
 
+def test_cli_verify_rejects_incomplete_llm_config(tmp_path: Path):
+    stderr_buffer = io.StringIO()
+    with redirect_stderr(stderr_buffer):
+        exit_code = main(
+            [
+                "--db",
+                str(tmp_path / "library.sqlite3"),
+                "verify",
+                "--string",
+                "Evans 1960",
+                "--llm",
+            ]
+        )
+
+    assert exit_code == 1
+    assert "--llm requires --llm-base-url and --llm-model" in stderr_buffer.getvalue()
+
+
+def test_cli_verify_builds_llm_config(tmp_path: Path):
+    from citegeist.bibtex import BibEntry
+    from citegeist.verify import VerificationResult
+
+    database = tmp_path / "library.sqlite3"
+    with patch("citegeist.cli.BibliographyVerifier") as mocked_verifier_cls:
+        mocked_verifier = mocked_verifier_cls.return_value
+        mocked_verifier.verify_string.return_value = VerificationResult(
+            query="Evans 1960",
+            context="marine mammals",
+            status="ambiguous",
+            confidence=0.6,
+            entry=BibEntry(entry_type="misc", citation_key="evans1960", fields={"title": "Evans 1960"}),
+            source_label="none",
+            alternates=[],
+            input_type="string",
+            input_key=None,
+        )
+
+        stdout_buffer = io.StringIO()
+        with redirect_stdout(stdout_buffer):
+            exit_code = main(
+                [
+                    "--db",
+                    str(database),
+                    "verify",
+                    "--string",
+                    "Evans 1960",
+                    "--llm",
+                    "--llm-base-url",
+                    "http://localhost:11434",
+                    "--llm-model",
+                    "qwen3",
+                    "--llm-role",
+                    "rerank",
+                    "--format",
+                    "json",
+                ]
+            )
+
+    assert exit_code == 0
+    kwargs = mocked_verifier_cls.call_args.kwargs
+    assert kwargs["llm_config"].base_url == "http://localhost:11434"
+    assert kwargs["llm_config"].model == "qwen3"
+    assert kwargs["llm_config"].role == "rerank"
+
+
 def test_cli_sync_jabref_ingests_resolves_and_exports(tmp_path: Path):
     bib_path = tmp_path / "jabref-library.bib"
     bib_path.write_text(
diff --git a/tests/test_live_llm_verify.py b/tests/test_live_llm_verify.py
new file mode 100644
index 0000000..3440400
--- /dev/null
+++ b/tests/test_live_llm_verify.py
@@ -0,0 +1,78 @@
+from __future__ import annotations
+
+import os
+
+import pytest
+
+from citegeist.bibtex import BibEntry
+from citegeist.llm_verify import VerificationLlmClient, VerificationLlmConfig
+
+
+pytestmark = pytest.mark.live
+
+
+def _live_llm_config() -> VerificationLlmConfig:
+    return VerificationLlmConfig(
+        base_url=os.environ.get("CITEGEIST_VERIFY_LLM_BASE_URL", "http://127.0.0.1:8800/v1"),
+        model=os.environ.get("CITEGEIST_VERIFY_LLM_MODEL", "general_assistant"),
+        api_key=os.environ.get("CITEGEIST_VERIFY_LLM_API_KEY", "change-me-client-key"),
+        provider=os.environ.get("CITEGEIST_VERIFY_LLM_PROVIDER", "auto"),
+        role="both",
+    )
+
+
+def test_live_llm_query_analysis_via_geniehive():
+    client = VerificationLlmClient()
+    result = client.analyze_query(
+        _live_llm_config(),
+        "Evans 1960",
+        "marine mammals; bottlenose dolphin echolocation",
+    )
+
+    if result is None:
+        pytest.skip("local GenieHive route did not return parseable JSON for query analysis")
+    assert isinstance(result["authors"], list)
+    assert isinstance(result["keywords"], list)
+
+
+def test_live_llm_candidate_rerank_via_geniehive():
+    client = VerificationLlmClient()
+    candidates = [
+        BibEntry(
+            entry_type="article",
+            citation_key="candidate_a",
+            fields={
+                "author": "Doe, Jane",
+                "title": "General Marine Biology Survey",
+                "year": "1960",
+                "journal": "Marine Science",
+            },
+        ),
+        BibEntry(
+            entry_type="article",
+            citation_key="candidate_b",
+            fields={
+                "author": "Evans, William",
+                "title": "Echolocation by marine dolphins",
+                "year": "1960",
+                "journal": "Journal of the Acoustical Society",
+            },
+        ),
+    ]
+
+    result = client.rerank_candidates(
+        _live_llm_config(),
+        {
+            "title": "",
+            "authors": ["Evans"],
+            "year": "1960",
+            "venue": "",
+        },
+        "bottlenose dolphin echolocation",
+        candidates,
+    )
+
+    if result is None:
+        pytest.skip("local GenieHive route did not return parseable JSON for candidate reranking")
+    assert result
+    assert all(isinstance(index, int) for index in result)
diff --git a/tests/test_verify.py b/tests/test_verify.py
index 1923172..c372d53 100644
--- a/tests/test_verify.py
+++ b/tests/test_verify.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 from citegeist.bibtex import BibEntry
+from citegeist.llm_verify import VerificationLlmConfig, _loads_lenient_json
 from citegeist.resolve import Resolution
 from citegeist.verify import BibliographyVerifier
 
@@ -120,3 +121,102 @@
 
     assert bib_entry.fields["x_status"] == "not_found"
     assert bib_entry.fields["x_query"] == "Missing Work"
+
+
+def test_verifier_llm_expand_only_fills_missing_fields():
+    class _FakeLlmClient:
+        def analyze_query(self, config, query, context):
+            return {
+                "title": "Expanded Title",
+                "authors": ["Smith"],
+                "year": "2024",
+                "venue": "Journal of Tests",
+                "keywords": ["echolocation", "marine"],
+            }
+
+        def rerank_candidates(self, config, query_fields, context, candidates):
+            return None
+
+    verifier = BibliographyVerifier(
+        llm_config=VerificationLlmConfig(base_url="http://localhost:11434", model="qwen", role="expand"),
+        llm_client=_FakeLlmClient(),
+    )
+    seen_titles: list[str] = []
+    verifier.resolver.search_crossref = lambda title, limit=5: (seen_titles.append(title) or [])  # type: ignore[method-assign]
+    verifier.resolver.search_openalex = lambda title, limit=5: []  # type: ignore[method-assign]
+    verifier.resolver.search_datacite = lambda title, limit=5: []  # type: ignore[method-assign]
+    verifier.resolver.search_pubmed = lambda title, limit=5: []  # type: ignore[method-assign]
+
+    verifier.verify_string("Evans 1960", context="bottlenose dolphin echolocation")
+
+    assert seen_titles == ["Expanded Title"]
+
+
+def test_verifier_llm_rerank_only_breaks_score_ties():
+    class _FakeLlmClient:
+        def analyze_query(self, config, query, context):
+            return None
+
+        def rerank_candidates(self, config, query_fields, context, candidates):
+            return [1, 0]
+
+    verifier = BibliographyVerifier(
+        llm_config=VerificationLlmConfig(base_url="http://localhost:11434", model="qwen", role="rerank"),
+        llm_client=_FakeLlmClient(),
+    )
+    verifier.resolver.search_crossref = lambda title, limit=5: [  # type: ignore[method-assign]
+        BibEntry(
+            entry_type="article",
+            citation_key="alpha",
+            fields={"author": "Smith, Jane", "title": "Shared Match Primary", "year": "2024"},
+        ),
+        BibEntry(
+            entry_type="article",
+            citation_key="beta",
+            fields={"author": "Smith, Jane", "title": "Shared Match Secondary", "year": "2024"},
+        ),
+    ]
+    verifier.resolver.search_openalex = lambda title, limit=5: []  # type: ignore[method-assign]
+    verifier.resolver.search_datacite = lambda title, limit=5: []  # type: ignore[method-assign]
+    verifier.resolver.search_pubmed = lambda title, limit=5: []  # type: ignore[method-assign]
+
+    result = verifier.verify_string('"Shared Match" Smith 2024')
+
+    assert result.entry.citation_key == "beta"
+    assert result.alternates[0].entry.citation_key == "alpha"
+
+
+def test_verifier_llm_cannot_create_exact_without_verified_doi():
+    class _FakeLlmClient:
+        def analyze_query(self, config, query, context):
+            return {"title": "Resolved Work", "authors": ["Smith"], "year": "2024", "venue": None, "keywords": []}
+
+        def rerank_candidates(self, config, query_fields, context, candidates):
+            return None
+
+    verifier = BibliographyVerifier(
+        llm_config=VerificationLlmConfig(base_url="http://localhost:11434", model="qwen", role="expand"),
+        llm_client=_FakeLlmClient(),
+    )
+    verifier.resolver.search_crossref = lambda title, limit=5: [  # type: ignore[method-assign]
+        BibEntry(
+            entry_type="article",
+            citation_key="candidate",
+            fields={"author": "Smith, Jane", "title": "Resolved Work", "year": "2024"},
+        )
+    ]
+    verifier.resolver.search_openalex = lambda title, limit=5: []  # type: ignore[method-assign]
+    verifier.resolver.search_datacite = lambda title, limit=5: []  # type: ignore[method-assign]
+    verifier.resolver.search_pubmed = lambda title, limit=5: []  # type: ignore[method-assign]
+
+    result = verifier.verify_string("Smith 2024", context="citation graphs")
+
+    assert result.status != "exact"
+
+
+def test_llm_json_loader_accepts_fenced_payload():
+    payload = '```json\n{"title":"Resolved Work","authors":["Smith"],"keywords":["graphs"]}\n```'
+
+    result = _loads_lenient_json(payload)
+
+    assert result["title"] == "Resolved Work"
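
[Editor's note: the smoke script and the live tests both read the `CITEGEIST_VERIFY_LLM_*` environment variables for their defaults, so they can be pointed away from the GenieHive route without touching code. A hedged sketch, assuming a stock Ollama server on its default port with a `qwen3` model already pulled, and assuming `make test-live` selects the `live`-marked tests:]

```bash
# Assumes a local Ollama server on :11434 with `qwen3` pulled; no API key needed.
export CITEGEIST_VERIFY_LLM_BASE_URL=http://localhost:11434
export CITEGEIST_VERIFY_LLM_MODEL=qwen3
export CITEGEIST_VERIFY_LLM_PROVIDER=ollama-native

make live-verify-llm-smoke   # exercises analyze_query and rerank_candidates directly
make test-live               # assumed to run the `live`-marked tests above
```

[When the route is down or returns unparseable JSON, the live tests skip rather than fail, mirroring the resolver-only fallback that `verify` applies in the same situation.]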