diff --git a/src/citegeist/claim_support.py b/src/citegeist/claim_support.py
index d6e9491..0e6a7a5 100644
--- a/src/citegeist/claim_support.py
+++ b/src/citegeist/claim_support.py
@@ -41,6 +41,7 @@ NON_CLAIM_START_PATTERN = re.compile(
     re.IGNORECASE,
 )
 DOI_PATTERN = re.compile(r"\b10\.\d{4,9}/[-._;()/:A-Z0-9]+\b", re.IGNORECASE)
+TOKEN_PATTERN = re.compile(r"[a-z0-9]{4,}")
 
 
 @dataclass(slots=True)
@@ -146,6 +147,13 @@ def analyze_support_gaps(
                 "journal": str(entry.fields.get("journal") or entry.fields.get("booktitle") or ""),
                 "source_label": source_label,
                 "score": round(float(score), 4),
+                "reason": _build_reference_reason(
+                    claim.text,
+                    title=title,
+                    journal=str(entry.fields.get("journal") or entry.fields.get("booktitle") or ""),
+                    source_label=source_label,
+                    is_primary=entry is verification.entry,
+                ),
             }
         )
 
@@ -332,6 +340,43 @@ def _normalize_doi(value: str) -> str:
     return value.strip().lower()
 
 
+def _build_reference_reason(
+    claim_text: str,
+    *,
+    title: str,
+    journal: str,
+    source_label: str,
+    is_primary: bool,
+) -> str:
+    claim_terms = _meaningful_tokens(claim_text)
+    title_terms = _meaningful_tokens(title)
+    journal_terms = _meaningful_tokens(journal)
+    overlap = sorted(claim_terms & title_terms)
+    overlap_preview = ", ".join(overlap[:3])
+
+    reasons: list[str] = []
+    reasons.append("Top candidate match." if is_primary else "Alternate candidate retained after verification.")
+    if overlap_preview:
+        reasons.append(f"Shares claim terms: {overlap_preview}.")
+    elif claim_terms & journal_terms:
+        reasons.append("Venue terms overlap with the claim topic.")
+    elif source_label.startswith("openalex:search:"):
+        reasons.append("Returned from topic-oriented OpenAlex search for this claim.")
+    elif source_label.startswith("crossref:search:"):
+        reasons.append("Returned from Crossref search for this claim.")
+    else:
+        reasons.append("Returned by the bibliography verifier for this claim.")
+    return " ".join(reasons)
+
+
+def _meaningful_tokens(value: str) -> set[str]:
+    return {
+        token
+        for token in TOKEN_PATTERN.findall(value.lower())
+        if token not in {"this", "that", "with", "from", "their", "there", "into", "about", "through", "using"}
+    }
+
+
 def _build_note(markers: list[str], titles: list[str]) -> str | None:
     if not markers:
         return "No existing inline citation markers detected for this claim."
diff --git a/tests/test_claim_support.py b/tests/test_claim_support.py
index 93eba00..988e9f2 100644
--- a/tests/test_claim_support.py
+++ b/tests/test_claim_support.py
@@ -64,6 +64,7 @@ J. J. Hopfield, David W. Tank
     assert suggestion["existing_reference_titles"] == ["Neural computation of decisions in optimization problems"]
     assert suggestion["suggested_references"][0]["title"] == "A Better Support Paper"
     assert suggestion["needs_support_score"] > 0
+    assert suggestion["suggested_references"][0]["reason"].startswith("Top candidate match.")
 
     titles = [item["title"] for item in suggestion["suggested_references"]]
     assert "Neural computation of decisions in optimization problems" not in titles
@@ -171,3 +172,20 @@ doi: 10.1000/existing
     suggested_titles = [item["title"] for item in payload["suggestions"][0]["suggested_references"]]
     assert "A Better Support Paper Retitled" not in suggested_titles
     assert "A Different Support Paper" in suggested_titles
+
+
+def test_analyze_support_gaps_includes_reason_for_alternate_candidates():
+    verifier = FakeVerifier()
+    text = """
+Computational research touching on movement of agents spans many different fields. Movement strategies in artificial life systems can improve resource exploitation under selection pressures [1].
+
+References
+
+[[1]]Earlier Cited Paper
+"""
+    payload = analyze_support_gaps(text, verifier=verifier, max_claims=3, min_claim_chars=40)
+    suggestion = payload["suggestions"][0]
+    assert len(suggestion["suggested_references"]) == 2
+    primary, alternate = suggestion["suggested_references"]
+    assert "Top candidate match." in primary["reason"]
+    assert "Alternate candidate retained after verification." in alternate["reason"]