884 lines
33 KiB
Python
884 lines
33 KiB
Python
from __future__ import annotations
|
||
|
||
import json
|
||
import os
|
||
import re
|
||
import shutil
|
||
import subprocess
|
||
import tempfile
|
||
import urllib.error
|
||
import urllib.parse
|
||
import urllib.request
|
||
from dataclasses import dataclass
|
||
from typing import Protocol
|
||
|
||
from .bibtex import BibEntry, parse_bibtex
|
||
|
||
# --- Compiled regexes shared by the heuristic parsers below. ---
# NOTE: the pattern strings are runtime behavior; do not edit them casually.

# A 4-digit year 1600-2199, optionally suffixed with a disambiguating letter ("2001a").
YEAR_PATTERN = re.compile(r"\b(?:1[6-9]|20|21)\d{2}[a-z]?\b", re.IGNORECASE)
# Same year shape but wrapped in parentheses, as in APA-style "(2001a)".
YEAR_PAREN_PATTERN = re.compile(r"\((?:1[6-9]|20|21)\d{2}[a-z]?\)", re.IGNORECASE)
# Leading reference-list markers: "[12]", "12.", or "(12)" at the start of a line.
REF_START_PATTERN = re.compile(r"^(?:\[\d+\]|\d+\.|\(\d+\))\s*")
# DOI per the common "10.<registrant>/<suffix>" form.
DOI_PATTERN = re.compile(r"\b10\.\d{4,9}/[-._;()/:A-Z0-9]+\b", re.IGNORECASE)
# Any http/https URL up to the next whitespace (trailing punctuation trimmed later).
URL_PATTERN = re.compile(r"https?://\S+", re.IGNORECASE)
# "arXiv: <id>" — group(1) captures the identifier.
ARXIV_PATTERN = re.compile(r"\barXiv:\s*([A-Za-z0-9.\-]+)", re.IGNORECASE)
# ISBN-10/13 with optional label suffix; group(1) captures digits/X/hyphens.
ISBN_PATTERN = re.compile(r"\bISBN(?:-1[03])?:?\s*([0-9Xx\-]{10,20})\b")
# ISSN; group(1) captures the 8-character (possibly hyphenated) code.
ISSN_PATTERN = re.compile(r"\bISSN:?\s*([0-9Xx\-]{8,12})\b", re.IGNORECASE)
# "12(3): 45-67" style volume/issue/pages; pages accept ASCII hyphen or en dash.
VOLUME_ISSUE_PAGES_PATTERN = re.compile(
    r"(?P<volume>\d+)\s*(?:\((?P<number>[^)]+)\))?\s*[:;,]\s*(?P<pages>\d+\s*[-\u2013]\s*\d+)\b"
)
# "p. 12-34" / "pp. 12-34" page ranges.
PAGES_PATTERN = re.compile(r"\bpp?\.\s*(?P<pages>\d+\s*[-\u2013]\s*\d+)\b", re.IGNORECASE)
# A bare "…, 12-34." page range at the very end of a venue fragment.
TRAILING_PAGE_PATTERN = re.compile(r"[,;]\s*(?P<pages>\d+\s*[-\u2013]\s*\d+)\.?$")
# "Technical Report No. XYZ-1" style report identifiers; group "number" is the id.
REPORT_NUMBER_PATTERN = re.compile(r"\b(?:technical\s+report|report|working\s+paper|bulletin)\s+(?:no\.?|number)?\s*(?P<number>[A-Za-z0-9.\-]+)\b", re.IGNORECASE)
# Markers that a reference describes a thesis/dissertation (both straight and
# curly apostrophe spellings of "master's" are covered).
THESIS_MARKER_PATTERN = re.compile(
    r"\((?:master|doctoral).*?\)|phd dissertation|master'?s thesis|master’s thesis|doctoral dissertation",
    re.IGNORECASE,
)
|
||
|
||
|
||
@dataclass(slots=True)
class ParsedReferenceParts:
    """Intermediate result of splitting one raw reference line.

    Produced by ``_make_reference_parts`` and consumed by ``_build_entry``.
    """

    # The original reference line, verbatim (kept for the BibTeX note field).
    raw_line: str
    # Normalized author string ("Family, Given and …").
    authors: str
    # Four-digit year, possibly with a letter suffix (e.g. "2001a").
    year: str
    # Cleaned title text.
    title: str
    # Remaining venue text (journal/booktitle/publisher/…), possibly empty.
    venue: str
|
||
|
||
|
||
@dataclass(slots=True)
|
||
class ExtractionComparisonRow:
|
||
ordinal: int
|
||
raw_reference: str
|
||
entries: dict[str, dict[str, object]]
|
||
differing_fields: list[str]
|
||
|
||
def to_dict(self) -> dict[str, object]:
|
||
return {
|
||
"ordinal": self.ordinal,
|
||
"raw_reference": self.raw_reference,
|
||
"entries": self.entries,
|
||
"differing_fields": self.differing_fields,
|
||
}
|
||
|
||
|
||
@dataclass(slots=True)
|
||
class ExtractionComparisonSummary:
|
||
backends: list[str]
|
||
row_count: int
|
||
rows_with_differences: int
|
||
differing_field_counts: dict[str, int]
|
||
backend_presence_counts: dict[str, int]
|
||
|
||
def to_dict(self) -> dict[str, object]:
|
||
return {
|
||
"backends": self.backends,
|
||
"row_count": self.row_count,
|
||
"rows_with_differences": self.rows_with_differences,
|
||
"differing_field_counts": self.differing_field_counts,
|
||
"backend_presence_counts": self.backend_presence_counts,
|
||
}
|
||
|
||
|
||
@dataclass(slots=True)
|
||
class ExtractionComparisonCheckResult:
|
||
passed: bool
|
||
failures: list[str]
|
||
|
||
def to_dict(self) -> dict[str, object]:
|
||
return {
|
||
"passed": self.passed,
|
||
"failures": self.failures,
|
||
}
|
||
|
||
|
||
class ReferenceExtractionBackend(Protocol):
    """Structural (duck-typed) interface every extraction backend satisfies."""

    # Registry key used by get_extraction_backend / register_extraction_backend.
    name: str

    def extract_references(self, text: str) -> list[BibEntry]:
        """Parse the reference list found in *text* into BibTeX entries."""
        ...
|
||
|
||
|
||
@dataclass(slots=True)
class HeuristicReferenceExtractionBackend:
    """Pure-Python backend: regex heuristics, no external tools required."""

    name: str = "heuristic"

    def extract_references(self, text: str) -> list[BibEntry]:
        # Delegates entirely to the module-level heuristic pipeline.
        return _extract_references_heuristic(text)
|
||
|
||
|
||
@dataclass(slots=True)
class AnystyleCliReferenceExtractionBackend:
    """Backend that shells out to the AnyStyle CLI (Ruby gem).

    Requires the ``anystyle`` binary on PATH (or CITEGEIST_ANYSTYLE_BIN).
    An optional trained parser model can be supplied via ``parser_model`` or
    the CITEGEIST_ANYSTYLE_PARSER_MODEL environment variable.
    """

    name: str = "anystyle"
    # Explicit binary path; falls back to the env var, then plain "anystyle".
    command: str | None = None
    # Path to a custom AnyStyle parser model (passed as ``-P``).
    parser_model: str | None = None

    def extract_references(self, text: str) -> list[BibEntry]:
        """Run AnyStyle over the reference blocks in *text*.

        Raises RuntimeError if the binary is missing, exits non-zero, or
        emits a payload that is not a JSON list.
        """
        command = self.command or os.getenv("CITEGEIST_ANYSTYLE_BIN", "anystyle")
        parser_model = self.parser_model or os.getenv("CITEGEIST_ANYSTYLE_PARSER_MODEL")
        if shutil.which(command) is None:
            raise RuntimeError(
                "The 'anystyle' extraction backend requires the AnyStyle CLI to be installed and on PATH. "
                "Set CITEGEIST_ANYSTYLE_BIN if the binary is elsewhere."
            )

        blocks = _iter_reference_blocks(text)
        if not blocks:
            return []

        # AnyStyle reads from a file, one reference per line; delete=False so
        # the file survives the context manager and can be passed to the CLI
        # (required on platforms where an open temp file cannot be reopened).
        with tempfile.NamedTemporaryFile("w", encoding="utf-8", suffix=".txt", delete=False) as handle:
            handle.write("\n".join(blocks) + "\n")
            input_path = handle.name

        # Global options go before the "parse" subcommand.
        args = [command, "--stdout", "-f", "json"]
        if parser_model:
            args.extend(["-P", parser_model])
        args.extend(["parse", input_path])

        try:
            result = subprocess.run(args, capture_output=True, text=True, check=False)
        finally:
            # Best-effort cleanup of the temp file, even if subprocess raised.
            try:
                os.unlink(input_path)
            except OSError:
                pass

        if result.returncode != 0:
            message = result.stderr.strip() or result.stdout.strip() or "unknown AnyStyle error"
            raise RuntimeError(f"AnyStyle extraction failed: {message}")

        payload = json.loads(result.stdout or "[]")
        if not isinstance(payload, list):
            raise RuntimeError("AnyStyle extraction returned an unexpected payload")
        # One JSON object per parsed reference, in input order.
        return [_anystyle_item_to_entry(item, index) for index, item in enumerate(payload, start=1)]
|
||
|
||
|
||
@dataclass(slots=True)
class GrobidReferenceExtractionBackend:
    """Backend that POSTs reference blocks to a running GROBID service.

    The service URL defaults to http://127.0.0.1:8070 and can be overridden
    via ``base_url`` or the CITEGEIST_GROBID_URL environment variable.
    """

    name: str = "grobid"
    # Root URL of the GROBID server (no trailing slash needed).
    base_url: str | None = None
    # GROBID consolidation level (0 = off).
    consolidate_citations: int = 0
    # Ask GROBID to echo the raw citation strings back (1 = on).
    include_raw_citations: int = 1

    def extract_references(self, text: str) -> list[BibEntry]:
        """Send the reference blocks to GROBID and parse its BibTeX reply.

        Raises RuntimeError on HTTP/network failure or unparsable output.
        """
        blocks = _iter_reference_blocks(text)
        if not blocks:
            return []

        base_url = (self.base_url or os.getenv("CITEGEIST_GROBID_URL", "http://127.0.0.1:8070")).rstrip("/")
        # doseq=True repeats the "citations" key once per block, which is how
        # processCitationList expects multiple citations.
        payload = urllib.parse.urlencode(
            {
                "citations": blocks,
                "consolidateCitations": str(self.consolidate_citations),
                "includeRawCitations": str(self.include_raw_citations),
            },
            doseq=True,
        ).encode("utf-8")
        request = urllib.request.Request(
            f"{base_url}/api/processCitationList",
            data=payload,
            headers={
                # Ask GROBID for BibTeX rather than its default TEI XML.
                "Accept": "application/x-bibtex",
                "Content-Type": "application/x-www-form-urlencoded",
            },
            method="POST",
        )

        try:
            with urllib.request.urlopen(request, timeout=30) as response:
                body = response.read().decode("utf-8")
        except urllib.error.HTTPError as exc:
            # Surface the server's error body (when present) in the message.
            error_body = exc.read()
            if isinstance(error_body, bytes):
                detail = error_body.decode("utf-8", errors="replace").strip()
            else:
                detail = str(error_body or "").strip()
            raise RuntimeError(f"GROBID extraction failed with HTTP {exc.code}: {detail or exc.reason}") from exc
        except urllib.error.URLError as exc:
            raise RuntimeError(f"GROBID extraction failed: {exc.reason}") from exc

        if not body.strip():
            return []

        try:
            entries = parse_bibtex(body)
        except Exception as exc:
            raise RuntimeError("GROBID extraction returned invalid BibTeX output") from exc

        # GROBID emits placeholder keys ("-1", "1") when it cannot derive one;
        # replace those with our deterministic key scheme.
        for index, entry in enumerate(entries, start=1):
            if entry.citation_key in {"-1", "1", ""}:
                entry.citation_key = _make_citation_key(
                    entry.fields.get("author", "ref"),
                    entry.fields.get("year", "nd"),
                    entry.fields.get("title", "untitled"),
                    index,
                )
        return entries
|
||
|
||
|
||
# Registry of built-in backends, keyed by backend name. Extended at runtime
# via register_extraction_backend().
_EXTRACTION_BACKENDS: dict[str, ReferenceExtractionBackend] = {
    "heuristic": HeuristicReferenceExtractionBackend(),
    "anystyle": AnystyleCliReferenceExtractionBackend(),
    "grobid": GrobidReferenceExtractionBackend(),
}
|
||
|
||
|
||
def available_extraction_backends() -> list[str]:
    """Return the names of all registered backends, sorted alphabetically."""
    return sorted(_EXTRACTION_BACKENDS.keys())
|
||
|
||
|
||
def get_extraction_backend(name: str = "heuristic") -> ReferenceExtractionBackend:
    """Look up a registered backend by name.

    Raises ValueError (chained from the KeyError) when *name* is unknown,
    listing the available choices in the message.
    """
    try:
        backend = _EXTRACTION_BACKENDS[name]
    except KeyError as exc:
        choices = ", ".join(available_extraction_backends())
        raise ValueError(f"Unknown extraction backend: {name}. Available backends: {choices}") from exc
    return backend
|
||
|
||
|
||
def register_extraction_backend(backend: ReferenceExtractionBackend) -> None:
    """Register *backend* under its ``name``, replacing any existing entry."""
    _EXTRACTION_BACKENDS[backend.name] = backend
|
||
|
||
|
||
def extract_references(text: str, backend: str = "heuristic") -> list[BibEntry]:
    """Extract BibTeX entries from the reference list in *text*.

    Runs the named backend, then normalizes every entry (field cleanup plus
    provenance notes keyed to the raw reference blocks).
    """
    backend_impl = get_extraction_backend(backend)
    extracted = backend_impl.extract_references(text)
    raw_blocks = _iter_reference_blocks(text)
    return _normalize_extracted_entries(extracted, raw_blocks, backend_impl.name)
|
||
|
||
|
||
def render_extracted_bibtex(text: str, backend: str = "heuristic") -> str:
    """Extract references from *text* and render them as a BibTeX string."""
    # Imported lazily to avoid a circular import at module load time.
    from .bibtex import render_bibtex

    entries = extract_references(text, backend=backend)
    return render_bibtex(entries)
|
||
|
||
|
||
def compare_extraction_backends(text: str, backends: list[str] | None = None) -> list[ExtractionComparisonRow]:
    """Run several backends over *text* and align their outputs row by row.

    Each row carries one payload per backend (``present=False`` when that
    backend produced fewer entries) plus the names of fields — including the
    pseudo-field ``entry_type`` — whose values disagree across backends.
    """
    chosen = backends or available_extraction_backends()
    raw_refs = _iter_reference_blocks(text)
    results = {name: extract_references(text, backend=name) for name in chosen}

    rows: list[ExtractionComparisonRow] = []
    total = max([len(raw_refs), *(len(found) for found in results.values())], default=0)
    for position in range(total):
        payloads: dict[str, dict[str, object]] = {}
        field_names: set[str] = set()
        for name in chosen:
            found = results[name]
            candidate = found[position] if position < len(found) else None
            payload = _entry_to_comparison_payload(candidate)
            payloads[name] = payload
            field_names.update(str(field) for field in payload.get("fields", {}))

        diffs: list[str] = []
        # entry_type is compared separately since it is not a BibTeX field.
        type_values = {str(payloads[name].get("entry_type") or "") for name in chosen}
        if len(type_values) > 1:
            diffs.append("entry_type")
        for field in sorted(field_names):
            observed = {
                str(payloads[name].get("fields", {}).get(field, "<missing>"))
                for name in chosen
            }
            if len(observed) > 1:
                diffs.append(field)

        rows.append(
            ExtractionComparisonRow(
                ordinal=position + 1,
                raw_reference=raw_refs[position] if position < len(raw_refs) else "",
                entries=payloads,
                differing_fields=diffs,
            )
        )
    return rows
|
||
|
||
|
||
def summarize_extraction_comparison(rows: list[ExtractionComparisonRow]) -> ExtractionComparisonSummary:
    """Aggregate comparison rows into per-field and per-backend counts."""
    names = sorted({name for row in rows for name in row.entries})
    field_diff_counts: dict[str, int] = {}
    presence_counts: dict[str, int] = dict.fromkeys(names, 0)
    diff_row_total = 0

    for row in rows:
        if row.differing_fields:
            diff_row_total += 1
        for field in row.differing_fields:
            field_diff_counts[field] = field_diff_counts.get(field, 0) + 1
        for name, payload in row.entries.items():
            # "present" is False for padding payloads added for short backends.
            if payload.get("present"):
                presence_counts[name] = presence_counts.get(name, 0) + 1

    return ExtractionComparisonSummary(
        backends=names,
        row_count=len(rows),
        rows_with_differences=diff_row_total,
        differing_field_counts=dict(sorted(field_diff_counts.items())),
        backend_presence_counts=dict(sorted(presence_counts.items())),
    )
|
||
|
||
|
||
def check_extraction_comparison_summary(
    summary: ExtractionComparisonSummary,
    *,
    max_rows_with_differences: int | None = None,
    max_field_difference_count: int | None = None,
) -> ExtractionComparisonCheckResult:
    """Check *summary* against optional thresholds; None disables a check.

    Returns a result whose ``failures`` list describes every exceeded limit.
    """
    problems: list[str] = []

    if max_rows_with_differences is not None and summary.rows_with_differences > max_rows_with_differences:
        problems.append(
            f"rows_with_differences {summary.rows_with_differences} exceeds limit {max_rows_with_differences}"
        )

    if max_field_difference_count is not None:
        for field_name, count in summary.differing_field_counts.items():
            if count > max_field_difference_count:
                problems.append(
                    f"field '{field_name}' difference count {count} exceeds limit {max_field_difference_count}"
                )

    return ExtractionComparisonCheckResult(passed=not problems, failures=problems)
|
||
|
||
|
||
def _extract_references_heuristic(text: str) -> list[BibEntry]:
    """Parse every reference block in *text*, dropping unparseable ones."""
    candidates = (
        _parse_reference_line(block, ordinal)
        for ordinal, block in enumerate(_iter_reference_blocks(text), start=1)
    )
    return [entry for entry in candidates if entry is not None]
|
||
|
||
|
||
def _entry_to_comparison_payload(entry: BibEntry | None) -> dict[str, object]:
|
||
if entry is None:
|
||
return {"present": False, "citation_key": None, "entry_type": None, "fields": {}}
|
||
return {
|
||
"present": True,
|
||
"citation_key": entry.citation_key,
|
||
"entry_type": entry.entry_type,
|
||
"fields": dict(entry.fields),
|
||
}
|
||
|
||
|
||
def _normalize_extracted_entries(
    entries: list[BibEntry],
    raw_references: list[str],
    backend_name: str,
) -> list[BibEntry]:
    """Normalize each entry, pairing it with its raw reference by position."""

    def raw_at(position: int) -> str:
        # Backends may emit more entries than we have raw blocks; pad with "".
        return raw_references[position] if position < len(raw_references) else ""

    return [
        _normalize_extracted_entry(entry, backend_name, raw_at(position))
        for position, entry in enumerate(entries)
    ]
|
||
|
||
|
||
def _normalize_extracted_entry(entry: BibEntry, backend_name: str, raw_reference: str) -> BibEntry:
    """Return a cleaned copy of *entry* with provenance merged into ``note``.

    Cleans free-text fields, canonicalizes year/pages/doi/url, and guarantees
    the note records the backend name and raw reference exactly once.
    """
    cleaned = dict(entry.fields)

    # Collapse whitespace and trim stray punctuation on free-text fields.
    for key in (
        "title",
        "journal",
        "booktitle",
        "publisher",
        "school",
        "institution",
        "howpublished",
        "address",
    ):
        if cleaned.get(key):
            cleaned[key] = _clean_title(cleaned[key])

    year_value = cleaned.get("year")
    if year_value:
        year_match = YEAR_PATTERN.search(year_value)
        if year_match:
            # Keep just the 4-digit(+letter) year, dropping surrounding text.
            cleaned["year"] = year_match.group(0)

    if cleaned.get("pages"):
        cleaned["pages"] = _normalize_pages(cleaned["pages"])

    doi_value = cleaned.get("doi")
    if doi_value:
        doi_value = doi_value.strip().rstrip(".,;)")
        cleaned["doi"] = doi_value
        # A DOI always wins: derive the canonical resolver URL from it.
        cleaned["url"] = f"https://doi.org/{doi_value}"
    else:
        url_value = cleaned.get("url")
        if url_value:
            cleaned["url"] = url_value.strip().rstrip(".,;)")

    cleaned["note"] = _merge_extraction_note(cleaned.get("note", ""), backend_name, raw_reference)
    return BibEntry(entry_type=entry.entry_type, citation_key=entry.citation_key, fields=cleaned)
|
||
|
||
|
||
def _merge_extraction_note(existing: str, backend_name: str, raw_reference: str) -> str:
|
||
parts: list[str] = []
|
||
existing_clean = existing.strip()
|
||
if existing_clean:
|
||
parts.append(existing_clean)
|
||
lowered = existing_clean.casefold()
|
||
if "extracted_reference" not in lowered:
|
||
parts.append("extracted_reference = {true}")
|
||
if "extracted_by" not in lowered:
|
||
parts.append(f"extracted_by = {{{backend_name}}}")
|
||
if raw_reference and "raw_reference" not in lowered:
|
||
parts.append(f"raw_reference = {{{raw_reference}}}")
|
||
return "; ".join(parts)
|
||
|
||
|
||
def _anystyle_item_to_entry(item: object, ordinal: int) -> BibEntry:
    """Convert one AnyStyle JSON object into a BibEntry.

    *ordinal* is the 1-based position of the reference, used to make the
    generated citation key unique. Raises RuntimeError when *item* is not a
    JSON object.
    """
    if not isinstance(item, dict):
        raise RuntimeError("AnyStyle extraction item is not an object")

    # AnyStyle values may be strings, lists, or nested objects; _first_text
    # flattens them to the first non-empty string.
    title = _clean_title(_first_text(item.get("title")))
    authors = _anystyle_people_to_names(item.get("author"))
    year = _extract_year_from_values(item.get("date"))
    entry_type = _map_anystyle_type(_first_text(item.get("type")))
    citation_key = _make_citation_key(authors or "ref", year or "nd", title or "untitled", ordinal)

    fields: dict[str, str] = {
        "note": "extracted_reference = {true}; extracted_by = {anystyle}",
    }
    if authors:
        fields["author"] = authors
    if year:
        fields["year"] = year
    if title:
        fields["title"] = title

    # Optional fields: only emitted when AnyStyle produced a non-empty value.
    if editors := _anystyle_people_to_names(item.get("editor")):
        fields["editor"] = editors
    if publisher := _first_text(item.get("publisher")):
        fields["publisher"] = publisher
    if location := _first_text(item.get("location")):
        fields["address"] = location
    if pages := _first_text(item.get("pages")):
        fields["pages"] = _normalize_pages(pages)
    if volume := _first_text(item.get("volume")):
        fields["volume"] = volume
    if number := _first_text(item.get("issue")) or _first_text(item.get("number")):
        fields["number"] = number
    if doi := _first_text(item.get("doi")):
        fields["doi"] = doi
        fields["url"] = f"https://doi.org/{doi}"
    elif url := _first_text(item.get("url")):
        fields["url"] = url

    # Pick a "container" (journal/proceedings/institution) and map it to the
    # BibTeX field appropriate for the entry type.
    container = _first_text(item.get("journal")) or _first_text(item.get("container-title"))
    if not container and entry_type in {"book", "phdthesis", "mastersthesis", "techreport"}:
        container = _first_text(item.get("organization")) or _first_text(item.get("institution")) or _first_text(item.get("school"))

    if container:
        if entry_type == "article":
            fields["journal"] = container
        elif entry_type in {"inproceedings", "incollection"}:
            fields["booktitle"] = container
        elif entry_type == "techreport":
            fields["institution"] = container
        elif entry_type in {"phdthesis", "mastersthesis"}:
            fields["school"] = container
        elif entry_type == "book" and "publisher" not in fields:
            fields["publisher"] = container
        else:
            # Fallback for misc entries (and books that already have a publisher).
            fields["howpublished"] = container

    return BibEntry(entry_type=entry_type, citation_key=citation_key, fields=fields)
|
||
|
||
|
||
def _iter_reference_blocks(text: str) -> list[str]:
    """Split *text* into one whitespace-normalized string per reference.

    A new reference starts at a blank line or at a leading list marker such
    as "[3]", "3.", or "(3)". Lines shorter than 20 characters (after
    stripping the marker) are ignored as noise; continuation lines are joined
    onto the current reference with single spaces.
    """
    blocks: list[str] = []
    pending: list[str] = []

    def flush() -> None:
        # Emit the accumulated reference, if any, and reset the buffer.
        if pending:
            blocks.append(" ".join(pending))
            pending.clear()

    for raw_line in text.splitlines():
        stripped = raw_line.strip()
        if not stripped:
            flush()
            continue
        starts_new = REF_START_PATTERN.match(stripped) is not None
        body = " ".join(REF_START_PATTERN.sub("", stripped).split())
        if len(body) < 20:
            continue
        if starts_new:
            flush()
        pending.append(body)

    flush()
    return blocks
|
||
|
||
|
||
def _parse_reference_line(line: str, ordinal: int) -> BibEntry | None:
    """Try each reference style in priority order; return the first hit."""
    parsers = (
        _parse_apa_style_reference,
        _parse_publisher_style_reference,
        _parse_plain_year_reference,
    )
    for parse in parsers:
        entry = parse(line, ordinal)
        if entry is not None:
            return entry
    return None
|
||
|
||
|
||
def _parse_apa_style_reference(line: str, ordinal: int) -> BibEntry | None:
    """Parse an APA-style reference: "Authors (Year). Title. Venue."."""
    match = YEAR_PAREN_PATTERN.search(line)
    if match is None:
        return None

    year = match.group(0).strip("()")
    authors_text = line[: match.start()].strip(" .")
    tail = line[match.end() :].strip(" .")
    if not (authors_text and tail):
        return None

    # The tail must split into at least a title segment.
    if not _segments_after_year(tail):
        return None

    parts = _make_reference_parts(line, authors_text, year, tail)
    if parts is None:
        return None
    return _build_entry(parts, ordinal)
|
||
|
||
|
||
def _parse_publisher_style_reference(line: str, ordinal: int) -> BibEntry | None:
    """Parse "Authors. Title. Publisher, Year." style references.

    The text before the first year is split on periods into author part,
    title, and publisher; the entry type is guessed from the publisher text
    and coerced to one of book/phdthesis/mastersthesis/techreport.
    Returns None when the line does not fit this shape.
    """
    year_match = YEAR_PATTERN.search(line)
    if year_match is None:
        return None

    prefix = line[: year_match.start()].strip(" .,;")
    # Need at least "authors. title. publisher" before the year.
    if "." not in prefix:
        return None

    head, publisher = prefix.rsplit(".", 1)
    if "." not in head:
        return None
    author_part, title = head.split(".", 1)

    authors = _normalize_authors(author_part)
    title = _clean_title(title)
    publisher = publisher.strip(" .,;")
    if not authors or not title or not publisher:
        return None

    year = year_match.group(0)
    citation_key = _make_citation_key(authors, year, title, ordinal)
    identifiers = _extract_identifier_fields(line)
    metadata = _parse_venue_metadata(publisher)
    entry_type = str(metadata.get("entry_type") or _guess_entry_type(publisher))
    # This parser only handles monograph-like references; anything else is
    # treated as a book.
    if entry_type not in {"book", "phdthesis", "mastersthesis", "techreport"}:
        entry_type = "book"
    fields: dict[str, str] = {
        "author": authors,
        "year": year,
        "title": title,
        "note": f"extracted_reference = {{true}}; raw_reference = {{{line}}}",
        **identifiers,
    }
    # Map the venue to the field appropriate for the entry type.
    if entry_type == "book":
        fields["publisher"] = str(metadata.get("venue") or publisher)
    elif entry_type in {"phdthesis", "mastersthesis"}:
        fields["school"] = str(metadata.get("venue") or publisher)
    else:
        fields["institution"] = str(metadata.get("venue") or publisher)
    for key in ("number", "type", "series"):
        value = metadata.get(key)
        if value:
            fields[key] = str(value)
    return BibEntry(
        entry_type=entry_type,
        citation_key=citation_key,
        fields=fields,
    )
|
||
|
||
|
||
def _parse_plain_year_reference(line: str, ordinal: int) -> BibEntry | None:
    """Parse "Authors Year. Title. Venue." with a bare (unparenthesized) year."""
    match = YEAR_PATTERN.search(line)
    if match is None:
        return None

    year = match.group(0)
    authors_text = line[: match.start()].strip(" .")
    tail = line[match.end() :].strip(" .")
    if not (authors_text and tail):
        return None

    parts = _make_reference_parts(line, authors_text, year, tail)
    if parts is None:
        return None
    return _build_entry(parts, ordinal)
|
||
|
||
|
||
def _normalize_authors(author_part: str) -> str:
|
||
normalized = author_part.replace(" & ", " and ")
|
||
normalized = re.sub(r"\bet al\.?$", "and others", normalized)
|
||
normalized = re.sub(r"\s+and\s+", " and ", normalized)
|
||
normalized = re.sub(r"\s*,\s*", ", ", normalized)
|
||
return normalized.strip(" .")
|
||
|
||
|
||
def _segments_after_year(remainder: str) -> list[str]:
|
||
return [segment.strip(" .") for segment in remainder.split(". ") if segment.strip(" .")]
|
||
|
||
|
||
def _split_title_and_venue(remainder: str, *, prefer_colon: bool = False) -> tuple[str, str]:
    """Split post-year text into (title, venue).

    With ``prefer_colon`` and a ": " present, the colon is the separator;
    otherwise the first ". "-delimited segment is the title and the rest,
    rejoined, becomes the venue (possibly empty).
    """
    if prefer_colon and ": " in remainder:
        head, tail = remainder.split(": ", 1)
        return _clean_title(head), _clean_title(tail)

    pieces = _segments_after_year(remainder)
    if not pieces:
        return "", ""
    main_title = _clean_title(pieces[0])
    rest = ". ".join(pieces[1:]) if len(pieces) > 1 else ""
    return main_title, (_clean_title(rest) if rest else "")
|
||
|
||
|
||
def _clean_title(title: str) -> str:
|
||
cleaned = title.strip(" .,;:\"'")
|
||
cleaned = re.sub(r"\s+", " ", cleaned)
|
||
return cleaned
|
||
|
||
|
||
def _make_reference_parts(raw_line: str, author_part: str, year: str, remainder: str) -> ParsedReferenceParts | None:
    """Assemble parsed components into ParsedReferenceParts.

    Returns None when either the normalized authors or the title is empty,
    which signals the caller to reject the candidate parse.
    """
    title, venue = _split_title_and_venue(remainder)
    normalized_authors = _normalize_authors(author_part)
    if not (normalized_authors and title):
        return None
    return ParsedReferenceParts(
        raw_line=raw_line,
        authors=normalized_authors,
        year=year,
        title=title,
        venue=venue,
    )
|
||
|
||
|
||
def _build_entry(parts: ParsedReferenceParts, ordinal: int) -> BibEntry:
    """Turn parsed reference parts into a BibEntry.

    The entry type is guessed from the venue text, then possibly overridden
    by _parse_venue_metadata; the venue string is routed to the BibTeX field
    that matches the final entry type.
    """
    citation_key = _make_citation_key(parts.authors, parts.year, parts.title, ordinal)
    entry_type = _guess_entry_type(parts.venue)
    metadata = _parse_venue_metadata(parts.venue)
    # Venue metadata parsing may refine the initial type guess.
    if metadata.get("entry_type"):
        entry_type = str(metadata["entry_type"])

    fields: dict[str, str] = {
        "author": parts.authors,
        "year": parts.year,
        "title": parts.title,
        "note": f"extracted_reference = {{true}}; raw_reference = {{{parts.raw_line}}}",
    }
    # DOIs/URLs/arXiv/ISBN/ISSN found anywhere in the raw line.
    fields.update(_extract_identifier_fields(parts.raw_line))
    if metadata.get("venue"):
        venue_value = str(metadata["venue"])
        # Route the venue to the field appropriate for the entry type.
        if entry_type == "article":
            fields["journal"] = venue_value
        elif entry_type in {"inproceedings", "incollection"}:
            fields["booktitle"] = venue_value
        elif entry_type == "book":
            fields["publisher"] = venue_value
        elif entry_type in {"phdthesis", "mastersthesis"}:
            fields["school"] = venue_value
        elif entry_type == "techreport":
            fields["institution"] = venue_value
        else:
            fields["howpublished"] = venue_value
    # Copy through any structured metadata the venue parser recovered.
    for key in ("volume", "number", "pages", "publisher", "institution", "school", "type", "series"):
        value = metadata.get(key)
        if value:
            fields[key] = str(value)

    return BibEntry(entry_type=entry_type, citation_key=citation_key, fields=fields)
|
||
|
||
|
||
def _make_citation_key(authors: str, year: str, title: str, ordinal: int) -> str:
|
||
first_author = authors.split(" and ")[0]
|
||
family_name = first_author.split(",")[0] if "," in first_author else first_author.split()[-1]
|
||
family_name = re.sub(r"[^A-Za-z0-9]+", "", family_name).lower() or "ref"
|
||
|
||
first_word = re.sub(r"[^A-Za-z0-9]+", "", title.split()[0]).lower() if title.split() else "untitled"
|
||
if not first_word:
|
||
first_word = "untitled"
|
||
return f"{family_name}{year}{first_word}{ordinal}"
|
||
|
||
|
||
def _guess_entry_type(venue: str) -> str:
|
||
lowered = venue.lower()
|
||
if "master" in lowered and "thesis" in lowered:
|
||
return "mastersthesis"
|
||
if any(token in lowered for token in ("ph.d", "phd", "doctoral dissertation", "doctor's thesis", "thesis", "dissertation")):
|
||
return "phdthesis"
|
||
if any(token in lowered for token in ("technical report", "tech report", "report no", "working paper", "bulletin")):
|
||
return "techreport"
|
||
if any(token in lowered for token in ("retrieved from", "available at", "accessed", "http://", "https://", "www.")):
|
||
return "misc"
|
||
if any(token in lowered for token in ("journal", "transactions", "review", "letters")):
|
||
return "article"
|
||
if any(token in lowered for token in ("proceedings", "conference", "workshop", "symposium")):
|
||
return "inproceedings"
|
||
if any(token in lowered for token in ("press", "publisher", "publications", "springer", "wiley", "elsevier", "university")):
|
||
return "book"
|
||
return "misc"
|
||
|
||
|
||
def _extract_identifier_fields(text: str) -> dict[str, str]:
    """Pull DOI/URL/arXiv/ISBN/ISSN identifiers out of a raw reference.

    A DOI takes precedence: when present, the URL is derived from it and any
    literal URL in the text is ignored.
    """
    identifiers: dict[str, str] = {}

    doi_match = DOI_PATTERN.search(text)
    if doi_match is not None:
        doi = doi_match.group(0).rstrip(".,;)")
        identifiers["doi"] = doi
        identifiers["url"] = f"https://doi.org/{doi}"
    else:
        url_match = URL_PATTERN.search(text)
        if url_match is not None:
            identifiers["url"] = url_match.group(0).rstrip(".,;)")

    arxiv_match = ARXIV_PATTERN.search(text)
    if arxiv_match is not None:
        identifiers["arxiv"] = arxiv_match.group(1).rstrip(".,;)")
    isbn_match = ISBN_PATTERN.search(text)
    if isbn_match is not None:
        identifiers["isbn"] = isbn_match.group(1).strip()
    issn_match = ISSN_PATTERN.search(text)
    if issn_match is not None:
        identifiers["issn"] = issn_match.group(1).strip()
    return identifiers
|
||
|
||
|
||
def _looks_like_citation_blob(text: str) -> bool:
|
||
lowered = text.casefold()
|
||
if any(token in lowered for token in ("http://", "https://", "www.", " accessed ", " url ")):
|
||
return True
|
||
if any(token in lowered for token in ("supporting material", "grant", "poll results", "prescribing information")):
|
||
return True
|
||
if text.count(",") >= 3 or text.count(";") >= 2:
|
||
return True
|
||
if re.search(r"\(\d{4}[a-z]?\)", text, flags=re.IGNORECASE):
|
||
return True
|
||
if re.search(r"\b[A-Z]\.[ ]?[A-Z]?\.", text):
|
||
return True
|
||
return False
|
||
|
||
|
||
def _extract_thesis_like_title(text: str) -> str:
    """Recover the bare title from thesis-style text.

    Cuts the text at the first thesis marker (e.g. "(Doctoral …)"), strips
    catalog boilerplate (ProQuest/UMI/etc.), and — when a marker was found —
    drops a leading "Author. " segment before the title.
    """
    candidate = _clean_title(" ".join(text.split()))
    if not candidate:
        return ""

    marker = THESIS_MARKER_PATTERN.search(candidate)
    if marker is not None:
        candidate = candidate[: marker.start()].strip(" .")

    for noise in (" ProQuest", " UMI No.", " Dissertation Abstracts", " University Microfilms"):
        if noise in candidate:
            candidate = candidate.split(noise, 1)[0].strip(" .")

    if marker is not None and ". " in candidate:
        # With a marker present, the first segment is the author part.
        candidate = candidate.split(". ", 1)[1].strip()
    return candidate.strip(" .")
|
||
|
||
|
||
def _parse_venue_metadata(venue: str) -> dict[str, str]:
    """Recover structured metadata (venue, type, volume/number/pages) from a
    venue fragment.

    Returns an empty dict for empty input; otherwise always includes "venue"
    and "entry_type", plus whichever of volume/number/pages/type could be
    recovered. Only the first matching page-pattern branch applies.
    """
    if not venue:
        return {}

    # These recovery heuristics intentionally mirror patterns already used in
    # citegeist.talkorigins / citegeist.expand and were scoped using GROBID-like
    # staged parsing concerns: preserve identifiers, venue fragments, and page structure.
    normalized = venue.strip(" .")
    metadata: dict[str, str] = {"venue": normalized}
    entry_type = _guess_entry_type(normalized)
    metadata["entry_type"] = entry_type

    lowered = normalized.lower()
    if entry_type == "misc" and ("retrieved from" in lowered or "available at" in lowered):
        # Web citation: just tidy the venue text, no further structure to mine.
        metadata["venue"] = _clean_title(normalized)

    # Try page-structure patterns from most to least specific; whichever hits
    # first also truncates the venue to the text before the match.
    if volume_match := VOLUME_ISSUE_PAGES_PATTERN.search(normalized):
        metadata["volume"] = volume_match.group("volume").strip()
        if volume_match.group("number"):
            metadata["number"] = volume_match.group("number").strip()
        metadata["pages"] = _normalize_pages(volume_match.group("pages"))
        venue_prefix = normalized[: volume_match.start()].strip(" ,;:.")
        if venue_prefix:
            metadata["venue"] = venue_prefix
    elif pages_match := PAGES_PATTERN.search(normalized):
        metadata["pages"] = _normalize_pages(pages_match.group("pages"))
        venue_prefix = normalized[: pages_match.start()].strip(" ,;:.")
        if venue_prefix:
            metadata["venue"] = venue_prefix
    elif trailing_pages_match := TRAILING_PAGE_PATTERN.search(normalized):
        metadata["pages"] = _normalize_pages(trailing_pages_match.group("pages"))
        venue_prefix = normalized[: trailing_pages_match.start()].strip(" ,;:.")
        if venue_prefix:
            metadata["venue"] = venue_prefix

    if entry_type == "techreport":
        # Pull the report number out, then strip report boilerplate so the
        # remaining text can serve as the institution.
        if report_match := REPORT_NUMBER_PATTERN.search(normalized):
            metadata["number"] = report_match.group("number").strip()
            metadata["type"] = "Technical Report"
        institution = _strip_report_prefix(metadata.get("venue", normalized))
        if institution:
            metadata["venue"] = institution
    elif entry_type in {"phdthesis", "mastersthesis"}:
        # Strip thesis boilerplate so the remainder can serve as the school.
        school = _strip_thesis_prefix(metadata.get("venue", normalized))
        if school:
            metadata["venue"] = school
    return metadata
|
||
|
||
|
||
def _normalize_pages(value: str) -> str:
|
||
compact = re.sub(r"\s*[\u2013-]+\s*", "--", value.strip())
|
||
return re.sub(r"-{3,}", "--", compact)
|
||
|
||
|
||
def _strip_report_prefix(value: str) -> str:
    """Remove report-series labels and numbers, leaving the institution text."""
    without_label = re.sub(r"\b(?:technical\s+report|tech report|report|working\s+paper|bulletin)\b", "", value, flags=re.IGNORECASE)
    without_number = re.sub(r"\b(?:no\.?|number)\s*[A-Za-z0-9.\-]+\b", "", without_label, flags=re.IGNORECASE)
    return _clean_title(without_number)
|
||
|
||
|
||
def _strip_thesis_prefix(value: str) -> str:
    """Remove thesis/dissertation labels, leaving the school text."""
    without_label = re.sub(r"\b(?:ph\.?d\.?|doctoral|doctor's|master'?s)\s+(?:dissertation|thesis)\b", "", value, flags=re.IGNORECASE)
    without_paren = re.sub(r"^\((?:master|doctoral).*?\)\s*", "", without_label, flags=re.IGNORECASE)
    return _clean_title(without_paren)
|
||
|
||
|
||
def _first_text(value: object) -> str:
|
||
if isinstance(value, list):
|
||
for item in value:
|
||
text = _first_text(item)
|
||
if text:
|
||
return text
|
||
return ""
|
||
if isinstance(value, dict):
|
||
for key in ("literal", "value", "text", "name"):
|
||
text = _first_text(value.get(key))
|
||
if text:
|
||
return text
|
||
return ""
|
||
if value is None:
|
||
return ""
|
||
return _clean_title(str(value))
|
||
|
||
|
||
def _extract_year_from_values(value: object) -> str:
    """Pull the first recognizable year out of an AnyStyle date value."""
    candidate = YEAR_PATTERN.search(_first_text(value))
    return "" if candidate is None else candidate.group(0)
|
||
|
||
|
||
def _anystyle_people_to_names(value: object) -> str:
|
||
if not isinstance(value, list):
|
||
return ""
|
||
names: list[str] = []
|
||
for item in value:
|
||
if isinstance(item, dict):
|
||
family = _first_text(item.get("family"))
|
||
given = _first_text(item.get("given"))
|
||
literal = _first_text(item.get("literal"))
|
||
if family and given:
|
||
names.append(f"{family}, {given}")
|
||
elif literal:
|
||
names.append(literal)
|
||
elif family:
|
||
names.append(family)
|
||
else:
|
||
text = _first_text(item)
|
||
if text:
|
||
names.append(text)
|
||
return " and ".join(name for name in names if name)
|
||
|
||
|
||
def _map_anystyle_type(value: str) -> str:
|
||
lowered = value.casefold()
|
||
if lowered in {"article", "journal_article", "article-journal"}:
|
||
return "article"
|
||
if lowered in {"chapter", "incollection"}:
|
||
return "incollection"
|
||
if lowered in {"paper-conference", "inproceedings", "proceedings"}:
|
||
return "inproceedings"
|
||
if lowered in {"thesis", "phdthesis", "dissertation"}:
|
||
return "phdthesis"
|
||
if lowered in {"mastersthesis", "master-thesis"}:
|
||
return "mastersthesis"
|
||
if lowered in {"report", "techreport"}:
|
||
return "techreport"
|
||
if lowered == "book":
|
||
return "book"
|
||
return "misc"
|