Tighten claim support deduping
This commit is contained in:
parent
72c3e3131e
commit
ef61366eea
|
|
@ -15,6 +15,7 @@ AUTHOR_YEAR_INLINE_PATTERN = re.compile(
|
||||||
r"\b([A-Z][A-Za-z'’.-]+(?:\s+(?:and|&|et al\.?))?(?:\s+[A-Z][A-Za-z'’.-]+)*)\s*\((\d{4}[a-z]?)\)"
|
r"\b([A-Z][A-Za-z'’.-]+(?:\s+(?:and|&|et al\.?))?(?:\s+[A-Z][A-Za-z'’.-]+)*)\s*\((\d{4}[a-z]?)\)"
|
||||||
)
|
)
|
||||||
REFERENCE_ENTRY_PATTERN = re.compile(r"^\s*\[\[(\d+)\]\]\s*(.+)$", re.MULTILINE)
|
REFERENCE_ENTRY_PATTERN = re.compile(r"^\s*\[\[(\d+)\]\]\s*(.+)$", re.MULTILINE)
|
||||||
|
REFERENCE_BLOCK_PATTERN = re.compile(r"^\s*\[\[(\d+)\]\]\s*(.+?)(?=^\s*\[\[\d+\]\]|\Z)", re.MULTILINE | re.DOTALL)
|
||||||
SENTENCE_SPLIT_PATTERN = re.compile(r'(?<=[.!?])\s+(?=[A-Z0-9"\[])')
|
SENTENCE_SPLIT_PATTERN = re.compile(r'(?<=[.!?])\s+(?=[A-Z0-9"\[])')
|
||||||
SECTION_HEADER_PATTERN = re.compile(r"^(?:[IVX]+\.|[A-Z]\.)\s+[A-Z]")
|
SECTION_HEADER_PATTERN = re.compile(r"^(?:[IVX]+\.|[A-Z]\.)\s+[A-Z]")
|
||||||
CONTINUATION_START_PATTERN = re.compile(
|
CONTINUATION_START_PATTERN = re.compile(
|
||||||
|
|
@ -39,6 +40,7 @@ NON_CLAIM_START_PATTERN = re.compile(
|
||||||
r"new references found)",
|
r"new references found)",
|
||||||
re.IGNORECASE,
|
re.IGNORECASE,
|
||||||
)
|
)
|
||||||
|
DOI_PATTERN = re.compile(r"\b10\.\d{4,9}/[-._;()/:A-Z0-9]+\b", re.IGNORECASE)
|
||||||
|
|
||||||
|
|
||||||
@dataclass(slots=True)
|
@dataclass(slots=True)
|
||||||
|
|
@ -68,6 +70,12 @@ class ClaimCandidate:
|
||||||
needs_support_score: float
|
needs_support_score: float
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(slots=True)
|
||||||
|
class ExistingReference:
|
||||||
|
title: str
|
||||||
|
doi: str = ""
|
||||||
|
|
||||||
|
|
||||||
def analyze_support_gaps(
|
def analyze_support_gaps(
|
||||||
text: str,
|
text: str,
|
||||||
*,
|
*,
|
||||||
|
|
@ -79,15 +87,24 @@ def analyze_support_gaps(
|
||||||
) -> dict[str, object]:
|
) -> dict[str, object]:
|
||||||
verifier = verifier or BibliographyVerifier()
|
verifier = verifier or BibliographyVerifier()
|
||||||
existing_references = _extract_existing_references(text)
|
existing_references = _extract_existing_references(text)
|
||||||
existing_titles_normalized = {_normalize_title(title) for title in existing_references.values() if title}
|
existing_titles_normalized = {
|
||||||
|
_normalize_title(reference.title)
|
||||||
|
for reference in existing_references.values()
|
||||||
|
if reference.title
|
||||||
|
}
|
||||||
|
existing_dois_normalized = {
|
||||||
|
_normalize_doi(reference.doi)
|
||||||
|
for reference in existing_references.values()
|
||||||
|
if reference.doi
|
||||||
|
}
|
||||||
claims = _extract_claim_candidates(text, max_claims=max_claims, min_claim_chars=min_claim_chars)
|
claims = _extract_claim_candidates(text, max_claims=max_claims, min_claim_chars=min_claim_chars)
|
||||||
|
|
||||||
suggestions: list[ClaimSupportSuggestion] = []
|
suggestions: list[ClaimSupportSuggestion] = []
|
||||||
for claim in claims:
|
for claim in claims:
|
||||||
referenced_titles = [
|
referenced_titles = [
|
||||||
existing_references[marker]
|
existing_references[marker].title
|
||||||
for marker in claim.citation_markers
|
for marker in claim.citation_markers
|
||||||
if marker in existing_references and existing_references[marker]
|
if marker in existing_references and existing_references[marker].title
|
||||||
]
|
]
|
||||||
verification = verifier.verify_string(claim.text, context=context, limit=limit)
|
verification = verifier.verify_string(claim.text, context=context, limit=limit)
|
||||||
candidates = [verification.entry, *[alt.entry for alt in verification.alternates]]
|
candidates = [verification.entry, *[alt.entry for alt in verification.alternates]]
|
||||||
|
|
@ -96,20 +113,36 @@ def analyze_support_gaps(
|
||||||
|
|
||||||
rendered: list[dict[str, object]] = []
|
rendered: list[dict[str, object]] = []
|
||||||
seen_titles: set[str] = set()
|
seen_titles: set[str] = set()
|
||||||
|
seen_dois: set[str] = set()
|
||||||
|
seen_keys: set[str] = set()
|
||||||
for entry, source_label, score in zip(candidates, sources, scores):
|
for entry, source_label, score in zip(candidates, sources, scores):
|
||||||
title = str(entry.fields.get("title") or "").strip()
|
title = str(entry.fields.get("title") or "").strip()
|
||||||
|
doi = str(entry.fields.get("doi") or "").strip()
|
||||||
normalized_title = _normalize_title(title)
|
normalized_title = _normalize_title(title)
|
||||||
if not title or normalized_title in existing_titles_normalized or normalized_title in seen_titles:
|
normalized_doi = _normalize_doi(doi)
|
||||||
|
citation_key = str(entry.citation_key or "").strip()
|
||||||
|
normalized_key = citation_key.lower()
|
||||||
|
if not title:
|
||||||
|
continue
|
||||||
|
if normalized_title in existing_titles_normalized or normalized_title in seen_titles:
|
||||||
|
continue
|
||||||
|
if normalized_doi and (normalized_doi in existing_dois_normalized or normalized_doi in seen_dois):
|
||||||
|
continue
|
||||||
|
if normalized_key and normalized_key in seen_keys:
|
||||||
continue
|
continue
|
||||||
seen_titles.add(normalized_title)
|
seen_titles.add(normalized_title)
|
||||||
|
if normalized_doi:
|
||||||
|
seen_dois.add(normalized_doi)
|
||||||
|
if normalized_key:
|
||||||
|
seen_keys.add(normalized_key)
|
||||||
rendered.append(
|
rendered.append(
|
||||||
{
|
{
|
||||||
"citation_key": entry.citation_key,
|
"citation_key": citation_key,
|
||||||
"entry_type": entry.entry_type,
|
"entry_type": entry.entry_type,
|
||||||
"title": title,
|
"title": title,
|
||||||
"authors": str(entry.fields.get("author") or ""),
|
"authors": str(entry.fields.get("author") or ""),
|
||||||
"year": str(entry.fields.get("year") or ""),
|
"year": str(entry.fields.get("year") or ""),
|
||||||
"doi": str(entry.fields.get("doi") or ""),
|
"doi": doi,
|
||||||
"journal": str(entry.fields.get("journal") or entry.fields.get("booktitle") or ""),
|
"journal": str(entry.fields.get("journal") or entry.fields.get("booktitle") or ""),
|
||||||
"source_label": source_label,
|
"source_label": source_label,
|
||||||
"score": round(float(score), 4),
|
"score": round(float(score), 4),
|
||||||
|
|
@ -223,15 +256,20 @@ def _should_merge_continuation(current: str, next_sentence: str, *, min_claim_ch
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
def _extract_existing_references(text: str) -> dict[str, str]:
|
def _extract_existing_references(text: str) -> dict[str, ExistingReference]:
|
||||||
if "References" not in text:
|
if "References" not in text:
|
||||||
return {}
|
return {}
|
||||||
_, _, tail = text.partition("References")
|
_, _, tail = text.partition("References")
|
||||||
references: dict[str, str] = {}
|
references: dict[str, ExistingReference] = {}
|
||||||
for match in REFERENCE_ENTRY_PATTERN.finditer(tail):
|
for match in REFERENCE_BLOCK_PATTERN.finditer(tail):
|
||||||
marker = match.group(1)
|
marker = match.group(1)
|
||||||
title = match.group(2).strip()
|
block = match.group(2).strip()
|
||||||
references[marker] = title
|
first_line = block.splitlines()[0].strip() if block else ""
|
||||||
|
doi_match = DOI_PATTERN.search(block)
|
||||||
|
references[marker] = ExistingReference(
|
||||||
|
title=first_line,
|
||||||
|
doi=doi_match.group(0) if doi_match else "",
|
||||||
|
)
|
||||||
return references
|
return references
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -290,6 +328,10 @@ def _normalize_title(value: str) -> str:
|
||||||
return re.sub(r"[^a-z0-9]+", " ", value.lower()).strip()
|
return re.sub(r"[^a-z0-9]+", " ", value.lower()).strip()
|
||||||
|
|
||||||
|
|
||||||
|
def _normalize_doi(value: str) -> str:
|
||||||
|
return value.strip().lower()
|
||||||
|
|
||||||
|
|
||||||
def _build_note(markers: list[str], titles: list[str]) -> str | None:
|
def _build_note(markers: list[str], titles: list[str]) -> str | None:
|
||||||
if not markers:
|
if not markers:
|
||||||
return "No existing inline citation markers detected for this claim."
|
return "No existing inline citation markers detected for this claim."
|
||||||
|
|
|
||||||
|
|
@ -114,3 +114,60 @@ References
|
||||||
assert first["existing_citation_markers"] == []
|
assert first["existing_citation_markers"] == []
|
||||||
assert second["existing_citation_markers"] == ["1"]
|
assert second["existing_citation_markers"] == ["1"]
|
||||||
assert first["needs_support_score"] > second["needs_support_score"]
|
assert first["needs_support_score"] > second["needs_support_score"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_analyze_support_gaps_filters_existing_reference_dois():
    """A candidate sharing a DOI with an existing reference is not re-suggested.

    The stub verifier returns a primary entry whose title differs from the
    cited reference but whose DOI is the one listed under ``[[1]]``, plus one
    alternate with a distinct title and DOI. Only the alternate should remain
    in the suggested references.
    """

    class DoiVerifier(FakeVerifier):
        """Stub that records each query and always returns the same two entries."""

        def verify_string(self, value: str, context: str = "", limit: int = 5) -> VerificationResult:
            self.queries.append(value)
            # Same DOI as reference [[1]] in the fixture, different title.
            duplicate_entry = BibEntry(
                entry_type="article",
                citation_key="dup2020support",
                fields={
                    "title": "A Better Support Paper Retitled",
                    "author": "Smith, Jane",
                    "year": "2020",
                    "doi": "10.1000/existing",
                    "journal": "Journal of Better Support",
                },
            )
            # Distinct title and DOI; expected to survive filtering.
            novel_match = VerificationMatch(
                entry=BibEntry(
                    entry_type="article",
                    citation_key="novel2021support",
                    fields={
                        "title": "A Different Support Paper",
                        "author": "Doe, Alex",
                        "year": "2021",
                        "doi": "10.1000/new-distinct",
                    },
                ),
                score=0.7,
                source_label="crossref:search:A Different Support Paper",
            )
            return VerificationResult(
                query=value,
                context=context,
                status="high_confidence",
                confidence=0.91,
                entry=duplicate_entry,
                source_label="openalex:search:A Better Support Paper Retitled",
                alternates=[novel_match],
                input_type="string",
                input_key=None,
            )

    verifier = DoiVerifier()
    text = """
Computational research touching on movement of agents spans many different fields. Movement may not be modeled at all, but simply assigned a cost value, as in work in artificial neural systems applied to the traveling salesman problem [1].

References

[[1]]Existing cited paper
doi: 10.1000/existing
"""
    result = analyze_support_gaps(text, verifier=verifier, max_claims=3, min_claim_chars=40)
    assert result["suggestion_count"] == 1

    first_suggestion = result["suggestions"][0]
    suggested_titles = [reference["title"] for reference in first_suggestion["suggested_references"]]
    assert "A Better Support Paper Retitled" not in suggested_titles
    assert "A Different Support Paper" in suggested_titles
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue