"""Reference extraction backends and backend-comparison helpers.

The module turns free-text bibliography sections into ``BibEntry`` objects.
Three backends are registered by default: a built-in heuristic parser, the
AnyStyle command-line tool, and a GROBID HTTP service.
"""

from __future__ import annotations

import json
import os
import re
import shutil
import subprocess
import tempfile
import urllib.error
import urllib.parse
import urllib.request
from dataclasses import dataclass
from typing import Protocol

from .bibtex import BibEntry, parse_bibtex

YEAR_PATTERN = re.compile(r"\b(?:1[6-9]|20|21)\d{2}[a-z]?\b", re.IGNORECASE)
YEAR_PAREN_PATTERN = re.compile(r"\((?:1[6-9]|20|21)\d{2}[a-z]?\)", re.IGNORECASE)
REF_START_PATTERN = re.compile(r"^(?:\[\d+\]|\d+\.|\(\d+\))\s*")
DOI_PATTERN = re.compile(r"\b10\.\d{4,9}/[-._;()/:A-Z0-9]+\b", re.IGNORECASE)
URL_PATTERN = re.compile(r"https?://\S+", re.IGNORECASE)
ARXIV_PATTERN = re.compile(r"\barXiv:\s*([A-Za-z0-9.\-]+)", re.IGNORECASE)
ISBN_PATTERN = re.compile(r"\bISBN(?:-1[03])?:?\s*([0-9Xx\-]{10,20})\b")
ISSN_PATTERN = re.compile(r"\bISSN:?\s*([0-9Xx\-]{8,12})\b", re.IGNORECASE)
# The group names below are read by _parse_venue_metadata via
# .group("volume") / .group("number") / .group("pages").
VOLUME_ISSUE_PAGES_PATTERN = re.compile(
    r"(?P<volume>\d+)\s*(?:\((?P<number>[^)]+)\))?\s*[:;,]\s*(?P<pages>\d+\s*[-\u2013]\s*\d+)\b"
)
PAGES_PATTERN = re.compile(r"\bpp?\.\s*(?P<pages>\d+\s*[-\u2013]\s*\d+)\b", re.IGNORECASE)
TRAILING_PAGE_PATTERN = re.compile(r"[,;]\s*(?P<pages>\d+\s*[-\u2013]\s*\d+)\.?$")
REPORT_NUMBER_PATTERN = re.compile(
    r"\b(?:technical\s+report|report|working\s+paper|bulletin)\s+(?:no\.?|number)?\s*(?P<number>[A-Za-z0-9.\-]+)\b",
    re.IGNORECASE,
)
THESIS_MARKER_PATTERN = re.compile(
    r"\((?:master|doctoral).*?\)|phd dissertation|master'?s thesis|master’s thesis|doctoral dissertation",
    re.IGNORECASE,
)


@dataclass(slots=True)
class ParsedReferenceParts:
    """Intermediate pieces of one heuristically parsed reference line."""

    raw_line: str
    authors: str
    year: str
    title: str
    venue: str


@dataclass(slots=True)
class ExtractionComparisonRow:
    """Per-reference comparison of what each backend extracted."""

    ordinal: int
    raw_reference: str
    # Backend name -> payload produced by _entry_to_comparison_payload.
    entries: dict[str, dict[str, object]]
    differing_fields: list[str]

    def to_dict(self) -> dict[str, object]:
        """Return the row as a plain dictionary."""
        return {
            "ordinal": self.ordinal,
            "raw_reference": self.raw_reference,
            "entries": self.entries,
            "differing_fields": self.differing_fields,
        }


@dataclass(slots=True)
class ExtractionComparisonSummary:
    """Aggregate counts computed over a list of comparison rows."""

    backends: list[str]
    row_count: int
    rows_with_differences: int
    differing_field_counts: dict[str, int]
    backend_presence_counts: dict[str, int]

    def to_dict(self) -> dict[str, object]:
        """Return the summary as a plain dictionary."""
        return {
            "backends": self.backends,
            "row_count": self.row_count,
            "rows_with_differences": self.rows_with_differences,
            "differing_field_counts": self.differing_field_counts,
            "backend_presence_counts": self.backend_presence_counts,
        }


@dataclass(slots=True)
class ExtractionComparisonCheckResult:
    """Outcome of checking a comparison summary against thresholds."""

    passed: bool
    failures: list[str]

    def to_dict(self) -> dict[str, object]:
        """Return the check result as a plain dictionary."""
        return {
            "passed": self.passed,
            "failures": self.failures,
        }


class ReferenceExtractionBackend(Protocol):
    """Structural interface every extraction backend must satisfy."""

    name: str

    def extract_references(self, text: str) -> list[BibEntry]:
        """Extract bibliography entries from *text*."""
        ...


@dataclass(slots=True)
class HeuristicReferenceExtractionBackend:
    """Backend that uses the regex-based parsers defined in this module."""

    name: str = "heuristic"

    def extract_references(self, text: str) -> list[BibEntry]:
        """Extract entries from *text* with the built-in heuristics."""
        return _extract_references_heuristic(text)


@dataclass(slots=True)
class AnystyleCliReferenceExtractionBackend:
    """Backend that shells out to the AnyStyle command-line tool."""

    name: str = "anystyle"
    # Executable name/path; falls back to CITEGEIST_ANYSTYLE_BIN, then "anystyle".
    command: str | None = None
    # Optional parser model; falls back to CITEGEIST_ANYSTYLE_PARSER_MODEL.
    parser_model: str | None = None

    def extract_references(self, text: str) -> list[BibEntry]:
        """Run AnyStyle on the reference blocks of *text* and convert its JSON.

        Raises:
            RuntimeError: if the executable is not found, the process exits
                with a non-zero status, or the decoded payload is not a list.
        """
        command = self.command or os.getenv("CITEGEIST_ANYSTYLE_BIN", "anystyle")
        parser_model = self.parser_model or os.getenv("CITEGEIST_ANYSTYLE_PARSER_MODEL")
        if shutil.which(command) is None:
            raise RuntimeError(
                "The 'anystyle' extraction backend requires the AnyStyle CLI to be installed and on PATH. "
                "Set CITEGEIST_ANYSTYLE_BIN if the binary is elsewhere."
            )

        blocks = _iter_reference_blocks(text)
        if not blocks:
            return []

        # delete=False: the file must outlive the ``with`` block so the child
        # process can read it; it is removed in the ``finally`` below.
        with tempfile.NamedTemporaryFile("w", encoding="utf-8", suffix=".txt", delete=False) as handle:
            handle.write("\n".join(blocks) + "\n")
            input_path = handle.name

        args = [command, "--stdout", "-f", "json"]
        if parser_model:
            args.extend(["-P", parser_model])
        args.extend(["parse", input_path])

        try:
            result = subprocess.run(args, capture_output=True, text=True, check=False)
        finally:
            try:
                os.unlink(input_path)
            except OSError:
                # Best-effort cleanup; a leftover temp file is not fatal.
                pass

        if result.returncode != 0:
            message = result.stderr.strip() or result.stdout.strip() or "unknown AnyStyle error"
            raise RuntimeError(f"AnyStyle extraction failed: {message}")

        payload = json.loads(result.stdout or "[]")
        if not isinstance(payload, list):
            raise RuntimeError("AnyStyle extraction returned an unexpected payload")
        return [_anystyle_item_to_entry(item, index) for index, item in enumerate(payload, start=1)]


@dataclass(slots=True)
class GrobidReferenceExtractionBackend:
    """Backend that posts reference strings to a GROBID HTTP service."""

    name: str = "grobid"
    # Service root; falls back to CITEGEIST_GROBID_URL, then http://127.0.0.1:8070.
    base_url: str | None = None
    consolidate_citations: int = 0
    include_raw_citations: int = 1

    def extract_references(self, text: str) -> list[BibEntry]:
        """Send the reference blocks of *text* to GROBID and parse the BibTeX reply.

        Raises:
            RuntimeError: on HTTP/URL errors or when the response body cannot
                be parsed as BibTeX.
        """
        blocks = _iter_reference_blocks(text)
        if not blocks:
            return []

        base_url = (self.base_url or os.getenv("CITEGEIST_GROBID_URL", "http://127.0.0.1:8070")).rstrip("/")
        # doseq=True emits one "citations" form field per block.
        payload = urllib.parse.urlencode(
            {
                "citations": blocks,
                "consolidateCitations": str(self.consolidate_citations),
                "includeRawCitations": str(self.include_raw_citations),
            },
            doseq=True,
        ).encode("utf-8")
        request = urllib.request.Request(
            f"{base_url}/api/processCitationList",
            data=payload,
            headers={
                "Accept": "application/x-bibtex",
                "Content-Type": "application/x-www-form-urlencoded",
            },
            method="POST",
        )

        try:
            with urllib.request.urlopen(request, timeout=30) as response:
                body = response.read().decode("utf-8")
        except urllib.error.HTTPError as exc:
            error_body = exc.read()
            if isinstance(error_body, bytes):
                detail = error_body.decode("utf-8", errors="replace").strip()
            else:
                detail = str(error_body or "").strip()
            raise RuntimeError(f"GROBID extraction failed with HTTP {exc.code}: {detail or exc.reason}") from exc
        except urllib.error.URLError as exc:
            raise RuntimeError(f"GROBID extraction failed: {exc.reason}") from exc

        if not body.strip():
            return []

        try:
            entries = parse_bibtex(body)
        except Exception as exc:
            raise RuntimeError("GROBID extraction returned invalid BibTeX output") from exc

        for index, entry in enumerate(entries, start=1):
            # Replace these placeholder/empty keys with a generated one.
            if entry.citation_key in {"-1", "1", ""}:
                entry.citation_key = _make_citation_key(
                    entry.fields.get("author", "ref"),
                    entry.fields.get("year", "nd"),
                    entry.fields.get("title", "untitled"),
                    index,
                )
        return entries


_EXTRACTION_BACKENDS: dict[str, ReferenceExtractionBackend] = {
    "heuristic": HeuristicReferenceExtractionBackend(),
    "anystyle": AnystyleCliReferenceExtractionBackend(),
    "grobid": GrobidReferenceExtractionBackend(),
}


def available_extraction_backends() -> list[str]:
    """Return the registered backend names in sorted order."""
    return sorted(_EXTRACTION_BACKENDS)


def get_extraction_backend(name: str = "heuristic") -> ReferenceExtractionBackend:
    """Look up a registered backend by *name*.

    Raises:
        ValueError: if no backend is registered under *name*.
    """
    try:
        return _EXTRACTION_BACKENDS[name]
    except KeyError as exc:
        choices = ", ".join(available_extraction_backends())
        raise ValueError(f"Unknown extraction backend: {name}. Available backends: {choices}") from exc


def register_extraction_backend(backend: ReferenceExtractionBackend) -> None:
    """Register *backend* under ``backend.name``, replacing any existing one."""
    _EXTRACTION_BACKENDS[backend.name] = backend


def extract_references(text: str, backend: str = "heuristic") -> list[BibEntry]:
    """Extract entries from *text* with the named backend and normalize them."""
    backend_impl = get_extraction_backend(backend)
    entries = backend_impl.extract_references(text)
    raw_references = _iter_reference_blocks(text)
    return _normalize_extracted_entries(entries, raw_references, backend_impl.name)


def render_extracted_bibtex(text: str, backend: str = "heuristic") -> str:
    """Extract references from *text* and render them with ``render_bibtex``."""
    from .bibtex import render_bibtex

    return render_bibtex(extract_references(text, backend=backend))


def compare_extraction_backends(text: str, backends: list[str] | None = None) -> list[ExtractionComparisonRow]:
    """Run several backends on *text* and compare their output position by position.

    Entries are aligned by index. A backend that produced fewer entries is
    recorded as not present for the remaining rows.

    Args:
        text: Raw bibliography text.
        backends: Backend names to compare; all registered backends when
            ``None`` or empty.
    """
    selected = backends or available_extraction_backends()
    raw_references = _iter_reference_blocks(text)
    extracted_by_backend = {backend: extract_references(text, backend=backend) for backend in selected}

    rows: list[ExtractionComparisonRow] = []
    max_count = max([len(raw_references), *(len(entries) for entries in extracted_by_backend.values())], default=0)
    for index in range(max_count):
        entries_payload: dict[str, dict[str, object]] = {}
        all_field_names: set[str] = set()
        for backend in selected:
            entry = extracted_by_backend[backend][index] if index < len(extracted_by_backend[backend]) else None
            payload = _entry_to_comparison_payload(entry)
            entries_payload[backend] = payload
            all_field_names.update(str(field_name) for field_name in payload.get("fields", {}))

        differing_fields: list[str] = []
        entry_type_values = {str(entries_payload[backend].get("entry_type") or "") for backend in selected}
        if len(entry_type_values) > 1:
            differing_fields.append("entry_type")
        for field_name in sorted(all_field_names):
            # A missing field compares as "" so presence vs absence counts as a difference.
            values = {
                str(entries_payload[backend].get("fields", {}).get(field_name, ""))
                for backend in selected
            }
            if len(values) > 1:
                differing_fields.append(field_name)

        rows.append(
            ExtractionComparisonRow(
                ordinal=index + 1,
                raw_reference=raw_references[index] if index < len(raw_references) else "",
                entries=entries_payload,
                differing_fields=differing_fields,
            )
        )
    return rows


def summarize_extraction_comparison(rows: list[ExtractionComparisonRow]) -> ExtractionComparisonSummary:
    """Aggregate difference and presence counts over comparison *rows*."""
    backend_names = sorted({backend for row in rows for backend in row.entries})
    differing_field_counts: dict[str, int] = {}
    backend_presence_counts: dict[str, int] = {backend: 0 for backend in backend_names}
    rows_with_differences = 0
    for row in rows:
        if row.differing_fields:
            rows_with_differences += 1
        for field_name in row.differing_fields:
            differing_field_counts[field_name] = differing_field_counts.get(field_name, 0) + 1
        for backend, payload in row.entries.items():
            if payload.get("present"):
                backend_presence_counts[backend] = backend_presence_counts.get(backend, 0) + 1
    return ExtractionComparisonSummary(
        backends=backend_names,
        row_count=len(rows),
        rows_with_differences=rows_with_differences,
        differing_field_counts=dict(sorted(differing_field_counts.items())),
        backend_presence_counts=dict(sorted(backend_presence_counts.items())),
    )


def check_extraction_comparison_summary(
    summary: ExtractionComparisonSummary,
    *,
    max_rows_with_differences: int | None = None,
    max_field_difference_count: int | None = None,
) -> ExtractionComparisonCheckResult:
    """Check *summary* against optional limits and collect failure messages.

    Args:
        summary: Summary to check.
        max_rows_with_differences: Upper bound on rows that differ; not
            checked when ``None``.
        max_field_difference_count: Upper bound applied to every per-field
            difference count; not checked when ``None``.
    """
    failures: list[str] = []
    if max_rows_with_differences is not None and summary.rows_with_differences > max_rows_with_differences:
        failures.append(
            f"rows_with_differences {summary.rows_with_differences} exceeds limit {max_rows_with_differences}"
        )
    if max_field_difference_count is not None:
        for field_name, count in summary.differing_field_counts.items():
            if count > max_field_difference_count:
                failures.append(
                    f"field '{field_name}' difference count {count} exceeds limit {max_field_difference_count}"
                )
    return ExtractionComparisonCheckResult(passed=not failures, failures=failures)


def _extract_references_heuristic(text: str) -> list[BibEntry]:
    """Parse each reference block of *text*, skipping blocks no parser accepts."""
    entries: list[BibEntry] = []
    for index, line in enumerate(_iter_reference_blocks(text), start=1):
        parsed = _parse_reference_line(line, index)
        if parsed is not None:
            entries.append(parsed)
    return entries


def _entry_to_comparison_payload(entry: BibEntry | None) -> dict[str, object]:
    """Convert *entry* (or ``None`` for "backend produced nothing") to a dict."""
    if entry is None:
        return {"present": False, "citation_key": None, "entry_type": None, "fields": {}}
    return {
        "present": True,
        "citation_key": entry.citation_key,
        "entry_type": entry.entry_type,
        "fields": dict(entry.fields),
    }


def _normalize_extracted_entries(
    entries: list[BibEntry],
    raw_references: list[str],
    backend_name: str,
) -> list[BibEntry]:
    """Normalize every entry, pairing it by index with its raw reference text."""
    normalized_entries: list[BibEntry] = []
    for index, entry in enumerate(entries):
        raw_reference = raw_references[index] if index < len(raw_references) else ""
        normalized_entries.append(_normalize_extracted_entry(entry, backend_name, raw_reference))
    return normalized_entries


def _normalize_extracted_entry(entry: BibEntry, backend_name: str, raw_reference: str) -> BibEntry:
    """Return a cleaned copy of *entry* with provenance merged into its note."""
    fields = dict(entry.fields)
    for key in (
        "title",
        "journal",
        "booktitle",
        "publisher",
        "school",
        "institution",
        "howpublished",
        "address",
    ):
        if fields.get(key):
            fields[key] = _clean_title(fields[key])
    if year := fields.get("year"):
        if match := YEAR_PATTERN.search(year):
            fields["year"] = match.group(0)
    if pages := fields.get("pages"):
        fields["pages"] = _normalize_pages(pages)
    if doi := fields.get("doi"):
        normalized_doi = doi.strip().rstrip(".,;)")
        fields["doi"] = normalized_doi
        # A DOI, when present, overrides any existing URL.
        fields["url"] = f"https://doi.org/{normalized_doi}"
    elif url := fields.get("url"):
        fields["url"] = url.strip().rstrip(".,;)")
    fields["note"] = _merge_extraction_note(fields.get("note", ""), backend_name, raw_reference)
    return BibEntry(entry_type=entry.entry_type, citation_key=entry.citation_key, fields=fields)


def _merge_extraction_note(existing: str, backend_name: str, raw_reference: str) -> str:
    """Append provenance markers to *existing* unless already mentioned there."""
    parts: list[str] = []
    existing_clean = existing.strip()
    if existing_clean:
        parts.append(existing_clean)
    lowered = existing_clean.casefold()
    if "extracted_reference" not in lowered:
        parts.append("extracted_reference = {true}")
    if "extracted_by" not in lowered:
        parts.append(f"extracted_by = {{{backend_name}}}")
    if raw_reference and "raw_reference" not in lowered:
        parts.append(f"raw_reference = {{{raw_reference}}}")
    return "; ".join(parts)


def _anystyle_item_to_entry(item: object, ordinal: int) -> BibEntry:
    """Convert one decoded AnyStyle JSON object into a ``BibEntry``.

    Raises:
        RuntimeError: if *item* is not a dict.
    """
    if not isinstance(item, dict):
        raise RuntimeError("AnyStyle extraction item is not an object")

    title = _clean_title(_first_text(item.get("title")))
    authors = _anystyle_people_to_names(item.get("author"))
    year = _extract_year_from_values(item.get("date"))
    entry_type = _map_anystyle_type(_first_text(item.get("type")))
    citation_key = _make_citation_key(authors or "ref", year or "nd", title or "untitled", ordinal)

    fields: dict[str, str] = {
        "note": "extracted_reference = {true}; extracted_by = {anystyle}",
    }
    if authors:
        fields["author"] = authors
    if year:
        fields["year"] = year
    if title:
        fields["title"] = title
    if editors := _anystyle_people_to_names(item.get("editor")):
        fields["editor"] = editors
    if publisher := _first_text(item.get("publisher")):
        fields["publisher"] = publisher
    if location := _first_text(item.get("location")):
        fields["address"] = location
    if pages := _first_text(item.get("pages")):
        fields["pages"] = _normalize_pages(pages)
    if volume := _first_text(item.get("volume")):
        fields["volume"] = volume
    if number := _first_text(item.get("issue")) or _first_text(item.get("number")):
        fields["number"] = number
    if doi := _first_text(item.get("doi")):
        fields["doi"] = doi
        fields["url"] = f"https://doi.org/{doi}"
    elif url := _first_text(item.get("url")):
        fields["url"] = url

    container = _first_text(item.get("journal")) or _first_text(item.get("container-title"))
    if not container and entry_type in {"book", "phdthesis", "mastersthesis", "techreport"}:
        container = (
            _first_text(item.get("organization"))
            or _first_text(item.get("institution"))
            or _first_text(item.get("school"))
        )
    if container:
        # Store the container under the field that matches the entry type.
        if entry_type == "article":
            fields["journal"] = container
        elif entry_type in {"inproceedings", "incollection"}:
            fields["booktitle"] = container
        elif entry_type == "techreport":
            fields["institution"] = container
        elif entry_type in {"phdthesis", "mastersthesis"}:
            fields["school"] = container
        elif entry_type == "book" and "publisher" not in fields:
            fields["publisher"] = container
        else:
            fields["howpublished"] = container

    return BibEntry(entry_type=entry_type, citation_key=citation_key, fields=fields)


def _iter_reference_blocks(text: str) -> list[str]:
    """Split *text* into one whitespace-normalized string per reference.

    A blank line or a line starting with a list marker (``[1]``, ``1.``,
    ``(1)``) begins a new reference; other lines continue the current one.
    Lines shorter than 20 characters after normalization are dropped.
    """
    lines: list[str] = []
    current: list[str] = []
    for raw_line in text.splitlines():
        line = raw_line.strip()
        if not line:
            if current:
                lines.append(" ".join(current))
                current = []
            continue
        starts_new = bool(REF_START_PATTERN.match(line))
        line = REF_START_PATTERN.sub("", line)
        normalized = " ".join(line.split())
        if len(normalized) < 20:
            continue
        if starts_new and current:
            lines.append(" ".join(current))
            current = [normalized]
        else:
            current.append(normalized)
    if current:
        lines.append(" ".join(current))
    return lines


def _parse_reference_line(line: str, ordinal: int) -> BibEntry | None:
    """Try each heuristic parser in order and return the first result."""
    for parser in (_parse_apa_style_reference, _parse_publisher_style_reference, _parse_plain_year_reference):
        parsed = parser(line, ordinal)
        if parsed is not None:
            return parsed
    return None


def _parse_apa_style_reference(line: str, ordinal: int) -> BibEntry | None:
    """Parse ``Authors (Year). Title. Venue`` style lines; ``None`` if no match."""
    year_match = YEAR_PAREN_PATTERN.search(line)
    if year_match is None:
        return None
    year = year_match.group(0).strip("()")
    author_part = line[: year_match.start()].strip(" .")
    remainder = line[year_match.end() :].strip(" .")
    if not author_part or not remainder:
        return None
    segments = _segments_after_year(remainder)
    if not segments:
        return None
    parts = _make_reference_parts(line, author_part, year, remainder)
    if parts is None:
        return None
    return _build_entry(parts, ordinal)


def _parse_publisher_style_reference(line: str, ordinal: int) -> BibEntry | None:
    """Parse ``Authors. Title. Publisher, Year`` style lines; ``None`` if no match.

    The result type is restricted to book, thesis, or techreport; anything
    else is treated as a book.
    """
    year_match = YEAR_PATTERN.search(line)
    if year_match is None:
        return None
    prefix = line[: year_match.start()].strip(" .,;")
    if "." not in prefix:
        return None
    head, publisher = prefix.rsplit(".", 1)
    if "." not in head:
        return None
    author_part, title = head.split(".", 1)
    authors = _normalize_authors(author_part)
    title = _clean_title(title)
    publisher = publisher.strip(" .,;")
    if not authors or not title or not publisher:
        return None

    year = year_match.group(0)
    citation_key = _make_citation_key(authors, year, title, ordinal)
    identifiers = _extract_identifier_fields(line)
    metadata = _parse_venue_metadata(publisher)
    entry_type = str(metadata.get("entry_type") or _guess_entry_type(publisher))
    if entry_type not in {"book", "phdthesis", "mastersthesis", "techreport"}:
        entry_type = "book"

    fields: dict[str, str] = {
        "author": authors,
        "year": year,
        "title": title,
        "note": f"extracted_reference = {{true}}; raw_reference = {{{line}}}",
        **identifiers,
    }
    if entry_type == "book":
        fields["publisher"] = str(metadata.get("venue") or publisher)
    elif entry_type in {"phdthesis", "mastersthesis"}:
        fields["school"] = str(metadata.get("venue") or publisher)
    else:
        fields["institution"] = str(metadata.get("venue") or publisher)
    for key in ("number", "type", "series"):
        value = metadata.get(key)
        if value:
            fields[key] = str(value)
    return BibEntry(
        entry_type=entry_type,
        citation_key=citation_key,
        fields=fields,
    )


def _parse_plain_year_reference(line: str, ordinal: int) -> BibEntry | None:
    """Parse ``Authors Year Title. Venue`` style lines; ``None`` if no match."""
    year_match = YEAR_PATTERN.search(line)
    if year_match is None:
        return None
    year = year_match.group(0)
    author_part = line[: year_match.start()].strip(" .")
    remainder = line[year_match.end() :].strip(" .")
    if not author_part or not remainder:
        return None
    parts = _make_reference_parts(line, author_part, year, remainder)
    if parts is None:
        return None
    return _build_entry(parts, ordinal)


def _normalize_authors(author_part: str) -> str:
    """Rewrite an author string to use `` and `` separators and ``, `` commas."""
    normalized = author_part.replace(" & ", " and ")
    normalized = re.sub(r"\bet al\.?$", "and others", normalized)
    normalized = re.sub(r"\s+and\s+", " and ", normalized)
    normalized = re.sub(r"\s*,\s*", ", ", normalized)
    return normalized.strip(" .")


def _segments_after_year(remainder: str) -> list[str]:
    """Split *remainder* on ``". "`` and drop segments that are empty once stripped."""
    return [segment.strip(" .") for segment in remainder.split(". ") if segment.strip(" .")]


def _split_title_and_venue(remainder: str, *, prefer_colon: bool = False) -> tuple[str, str]:
    """Split the post-year text into ``(title, venue)``.

    With ``prefer_colon`` the first ``": "`` is the separator; otherwise the
    first sentence is the title and the rest is the venue.
    """
    if prefer_colon and ": " in remainder:
        title, venue = remainder.split(": ", 1)
        return _clean_title(title), _clean_title(venue)
    segments = _segments_after_year(remainder)
    if not segments:
        return "", ""
    title = _clean_title(segments[0])
    venue = ". ".join(segments[1:]) if len(segments) > 1 else ""
    return title, _clean_title(venue) if venue else ""


def _clean_title(title: str) -> str:
    """Strip surrounding punctuation/quotes and collapse internal whitespace."""
    cleaned = title.strip(" .,;:\"'")
    cleaned = re.sub(r"\s+", " ", cleaned)
    return cleaned


def _make_reference_parts(raw_line: str, author_part: str, year: str, remainder: str) -> ParsedReferenceParts | None:
    """Build ``ParsedReferenceParts``; ``None`` when authors or title are empty."""
    title, venue = _split_title_and_venue(remainder)
    authors = _normalize_authors(author_part)
    if not authors or not title:
        return None
    return ParsedReferenceParts(
        raw_line=raw_line,
        authors=authors,
        year=year,
        title=title,
        venue=venue,
    )


def _build_entry(parts: ParsedReferenceParts, ordinal: int) -> BibEntry:
    """Assemble a ``BibEntry`` from parsed parts plus venue-derived metadata."""
    citation_key = _make_citation_key(parts.authors, parts.year, parts.title, ordinal)
    entry_type = _guess_entry_type(parts.venue)
    metadata = _parse_venue_metadata(parts.venue)
    if metadata.get("entry_type"):
        entry_type = str(metadata["entry_type"])

    fields: dict[str, str] = {
        "author": parts.authors,
        "year": parts.year,
        "title": parts.title,
        "note": f"extracted_reference = {{true}}; raw_reference = {{{parts.raw_line}}}",
    }
    fields.update(_extract_identifier_fields(parts.raw_line))

    if metadata.get("venue"):
        venue_value = str(metadata["venue"])
        if entry_type == "article":
            fields["journal"] = venue_value
        elif entry_type in {"inproceedings", "incollection"}:
            fields["booktitle"] = venue_value
        elif entry_type == "book":
            fields["publisher"] = venue_value
        elif entry_type in {"phdthesis", "mastersthesis"}:
            fields["school"] = venue_value
        elif entry_type == "techreport":
            fields["institution"] = venue_value
        else:
            fields["howpublished"] = venue_value

    # Explicit metadata keys override the venue-derived field set above.
    for key in ("volume", "number", "pages", "publisher", "institution", "school", "type", "series"):
        value = metadata.get(key)
        if value:
            fields[key] = str(value)
    return BibEntry(entry_type=entry_type, citation_key=citation_key, fields=fields)


def _make_citation_key(authors: str, year: str, title: str, ordinal: int) -> str:
    """Build ``<family><year><first-title-word><ordinal>`` in lowercase ASCII.

    Falls back to ``"ref"`` for a missing family name and ``"untitled"`` for
    a missing title word.
    """
    first_author = authors.split(" and ")[0]
    if "," in first_author:
        family_name = first_author.split(",")[0]
    else:
        # Guard against an empty/whitespace-only author string, which would
        # otherwise raise IndexError on ``split()[-1]``.
        author_tokens = first_author.split()
        family_name = author_tokens[-1] if author_tokens else ""
    family_name = re.sub(r"[^A-Za-z0-9]+", "", family_name).lower() or "ref"

    title_tokens = title.split()
    first_word = re.sub(r"[^A-Za-z0-9]+", "", title_tokens[0]).lower() if title_tokens else "untitled"
    if not first_word:
        first_word = "untitled"
    return f"{family_name}{year}{first_word}{ordinal}"


def _guess_entry_type(venue: str) -> str:
    """Guess a BibTeX entry type from keywords in *venue*; ``"misc"`` by default.

    The checks are ordered, so the first matching keyword group wins.
    """
    lowered = venue.lower()
    if "master" in lowered and "thesis" in lowered:
        return "mastersthesis"
    if any(
        token in lowered
        for token in ("ph.d", "phd", "doctoral dissertation", "doctor's thesis", "thesis", "dissertation")
    ):
        return "phdthesis"
    if any(
        token in lowered
        for token in ("technical report", "tech report", "report no", "working paper", "bulletin")
    ):
        return "techreport"
    if any(
        token in lowered
        for token in ("retrieved from", "available at", "accessed", "http://", "https://", "www.")
    ):
        return "misc"
    if any(token in lowered for token in ("journal", "transactions", "review", "letters")):
        return "article"
    if any(token in lowered for token in ("proceedings", "conference", "workshop", "symposium")):
        return "inproceedings"
    if any(
        token in lowered
        for token in ("press", "publisher", "publications", "springer", "wiley", "elsevier", "university")
    ):
        return "book"
    return "misc"


def _extract_identifier_fields(text: str) -> dict[str, str]:
    """Collect DOI/URL, arXiv, ISBN and ISSN identifiers found in *text*."""
    fields: dict[str, str] = {}
    if doi_match := DOI_PATTERN.search(text):
        doi = doi_match.group(0).rstrip(".,;)")
        fields["doi"] = doi
        fields["url"] = f"https://doi.org/{doi}"
    elif url_match := URL_PATTERN.search(text):
        fields["url"] = url_match.group(0).rstrip(".,;)")
    if arxiv_match := ARXIV_PATTERN.search(text):
        fields["arxiv"] = arxiv_match.group(1).rstrip(".,;)")
    if isbn_match := ISBN_PATTERN.search(text):
        fields["isbn"] = isbn_match.group(1).strip()
    if issn_match := ISSN_PATTERN.search(text):
        fields["issn"] = issn_match.group(1).strip()
    return fields


def _looks_like_citation_blob(text: str) -> bool:
    """Return True when *text* matches any of several citation-like cues."""
    lowered = text.casefold()
    if any(token in lowered for token in ("http://", "https://", "www.", " accessed ", " url ")):
        return True
    if any(
        token in lowered
        for token in ("supporting material", "grant", "poll results", "prescribing information")
    ):
        return True
    if text.count(",") >= 3 or text.count(";") >= 2:
        return True
    if re.search(r"\(\d{4}[a-z]?\)", text, flags=re.IGNORECASE):
        return True
    if re.search(r"\b[A-Z]\.[ ]?[A-Z]?\.", text):
        return True
    return False


def _extract_thesis_like_title(text: str) -> str:
    """Return the title portion of a thesis citation, cut at thesis/archive markers."""
    normalized = _clean_title(" ".join(text.split()))
    if not normalized:
        return ""
    match = THESIS_MARKER_PATTERN.search(normalized)
    if match is not None:
        normalized = normalized[: match.start()].strip(" .")
    for marker in (" ProQuest", " UMI No.", " Dissertation Abstracts", " University Microfilms"):
        if marker in normalized:
            normalized = normalized.split(marker, 1)[0].strip(" .")
    # When a thesis marker was found, drop the first sentence before the title.
    if match is not None and ". " in normalized:
        normalized = normalized.split(". ", 1)[1].strip()
    return normalized.strip(" .")


def _parse_venue_metadata(venue: str) -> dict[str, str]:
    """Derive venue name, entry type, volume/number/pages, etc. from *venue*.

    Returns an empty dict for an empty *venue*; otherwise the dict always
    contains ``"venue"`` and ``"entry_type"``.
    """
    if not venue:
        return {}

    # These recovery heuristics intentionally mirror patterns already used in
    # citegeist.talkorigins / citegeist.expand and were scoped using GROBID-like
    # staged parsing concerns: preserve identifiers, venue fragments, and page structure.
    normalized = venue.strip(" .")
    metadata: dict[str, str] = {"venue": normalized}
    entry_type = _guess_entry_type(normalized)
    metadata["entry_type"] = entry_type
    lowered = normalized.lower()

    if entry_type == "misc" and ("retrieved from" in lowered or "available at" in lowered):
        metadata["venue"] = _clean_title(normalized)

    # Try the most specific page layout first; text before the match is the venue name.
    if volume_match := VOLUME_ISSUE_PAGES_PATTERN.search(normalized):
        metadata["volume"] = volume_match.group("volume").strip()
        if volume_match.group("number"):
            metadata["number"] = volume_match.group("number").strip()
        metadata["pages"] = _normalize_pages(volume_match.group("pages"))
        venue_prefix = normalized[: volume_match.start()].strip(" ,;:.")
        if venue_prefix:
            metadata["venue"] = venue_prefix
    elif pages_match := PAGES_PATTERN.search(normalized):
        metadata["pages"] = _normalize_pages(pages_match.group("pages"))
        venue_prefix = normalized[: pages_match.start()].strip(" ,;:.")
        if venue_prefix:
            metadata["venue"] = venue_prefix
    elif trailing_pages_match := TRAILING_PAGE_PATTERN.search(normalized):
        metadata["pages"] = _normalize_pages(trailing_pages_match.group("pages"))
        venue_prefix = normalized[: trailing_pages_match.start()].strip(" ,;:.")
        if venue_prefix:
            metadata["venue"] = venue_prefix

    if entry_type == "techreport":
        if report_match := REPORT_NUMBER_PATTERN.search(normalized):
            metadata["number"] = report_match.group("number").strip()
        # NOTE(review): assumed to apply to every techreport, not only when a
        # report number was matched - confirm against upstream history.
        metadata["type"] = "Technical Report"
        institution = _strip_report_prefix(metadata.get("venue", normalized))
        if institution:
            metadata["venue"] = institution
    elif entry_type in {"phdthesis", "mastersthesis"}:
        school = _strip_thesis_prefix(metadata.get("venue", normalized))
        if school:
            metadata["venue"] = school
    return metadata


def _normalize_pages(value: str) -> str:
    """Rewrite a page range so its separator is the BibTeX-style ``--``."""
    compact = re.sub(r"\s*[\u2013-]+\s*", "--", value.strip())
    return re.sub(r"-{3,}", "--", compact)


def _strip_report_prefix(value: str) -> str:
    """Remove report-type words and ``no./number <id>`` fragments from *value*."""
    cleaned = re.sub(
        r"\b(?:technical\s+report|tech report|report|working\s+paper|bulletin)\b",
        "",
        value,
        flags=re.IGNORECASE,
    )
    cleaned = re.sub(r"\b(?:no\.?|number)\s*[A-Za-z0-9.\-]+\b", "", cleaned, flags=re.IGNORECASE)
    return _clean_title(cleaned)


def _strip_thesis_prefix(value: str) -> str:
    """Remove thesis/dissertation labels from *value*."""
    cleaned = re.sub(
        r"\b(?:ph\.?d\.?|doctoral|doctor's|master'?s)\s+(?:dissertation|thesis)\b",
        "",
        value,
        flags=re.IGNORECASE,
    )
    cleaned = re.sub(r"^\((?:master|doctoral).*?\)\s*", "", cleaned, flags=re.IGNORECASE)
    return _clean_title(cleaned)


def _first_text(value: object) -> str:
    """Return the first non-empty cleaned text found in a nested JSON-like value.

    Lists are searched in order; dicts are searched under the keys
    ``literal``, ``value``, ``text``, ``name``; ``None`` yields ``""``.
    """
    if isinstance(value, list):
        for item in value:
            text = _first_text(item)
            if text:
                return text
        return ""
    if isinstance(value, dict):
        for key in ("literal", "value", "text", "name"):
            text = _first_text(value.get(key))
            if text:
                return text
        return ""
    if value is None:
        return ""
    return _clean_title(str(value))


def _extract_year_from_values(value: object) -> str:
    """Return the first year-like token in *value*, or ``""`` if none."""
    text = _first_text(value)
    match = YEAR_PATTERN.search(text)
    return match.group(0) if match is not None else ""


def _anystyle_people_to_names(value: object) -> str:
    """Join a list of AnyStyle person items into a BibTeX `` and `` name list.

    Returns ``""`` when *value* is not a list.
    """
    if not isinstance(value, list):
        return ""
    names: list[str] = []
    for item in value:
        if isinstance(item, dict):
            family = _first_text(item.get("family"))
            given = _first_text(item.get("given"))
            literal = _first_text(item.get("literal"))
            if family and given:
                names.append(f"{family}, {given}")
            elif literal:
                names.append(literal)
            elif family:
                names.append(family)
        else:
            # NOTE(review): this branch is taken for non-dict items (e.g.
            # plain strings) - confirm the pairing against upstream history.
            text = _first_text(item)
            if text:
                names.append(text)
    return " and ".join(name for name in names if name)


def _map_anystyle_type(value: str) -> str:
    """Map an AnyStyle type label to a BibTeX entry type; ``"misc"`` if unknown."""
    lowered = value.casefold()
    if lowered in {"article", "journal_article", "article-journal"}:
        return "article"
    if lowered in {"chapter", "incollection"}:
        return "incollection"
    if lowered in {"paper-conference", "inproceedings", "proceedings"}:
        return "inproceedings"
    if lowered in {"thesis", "phdthesis", "dissertation"}:
        return "phdthesis"
    if lowered in {"mastersthesis", "master-thesis"}:
        return "mastersthesis"
    if lowered in {"report", "techreport"}:
        return "techreport"
    if lowered == "book":
        return "book"
    return "misc"