"""TalkOrigins example implementation. This module backs the example-facing namespace at ``citegeist.examples.talkorigins``. New code should prefer importing from the examples namespace rather than treating TalkOrigins support as part of the core top-level package surface. """ from __future__ import annotations from collections import Counter from dataclasses import asdict, dataclass from html.parser import HTMLParser import hashlib import json import re from pathlib import Path from urllib.parse import urljoin, urlparse from .bibtex import BibEntry, render_bibtex from .resolve import MetadataResolver, merge_entries, merge_entries_with_conflicts from .sources import SourceClient from .storage import BibliographyStore YEAR_PATTERN = re.compile(r"\b(18|19|20)\d{2}\b") REPEATED_AUTHOR_PATTERN = re.compile(r"^\s*[-_]{3,}\s*,?\s*") WHITESPACE_PATTERN = re.compile(r"\s+") TOPIC_PHRASE_STOPWORDS = { "about", "across", "after", "among", "analysis", "book", "books", "conference", "data", "edition", "effects", "example", "first", "from", "human", "humans", "journal", "method", "methods", "paper", "papers", "review", "science", "second", "studies", "study", "system", "their", "theory", "title", "using", } @dataclass(slots=True) class TalkOriginsTopic: topic: str url: str raw_entries: list[str] @dataclass(slots=True) class TalkOriginsSeedSet: topic: str slug: str url: str raw_entry_count: int parsed_entry_count: int seed_bib: str plaintext_path: str = "" page_path: str = "" snapshot_path: str = "" @dataclass(slots=True) class TalkOriginsBatchExport: base_url: str output_dir: str topic_count: int entry_count: int jobs_path: str manifest_path: str seed_sets: list[TalkOriginsSeedSet] full_bib_path: str = "" full_plaintext_path: str = "" site_index_path: str = "" @dataclass(slots=True) class TalkOriginsValidationReport: manifest_path: str topic_count: int entry_count: int parsed_ratio: float missing_author_count: int missing_title_count: int missing_year_count: int suspicious_entry_type_count: int suspicious_examples: list[dict[str, str]] duplicate_cluster_count: int duplicate_entry_count: int duplicate_examples: list[dict[str, object]] @dataclass(slots=True) class TalkOriginsIngestReport: manifest_path: str topic_count: int raw_entry_count: int stored_entry_count: int duplicate_cluster_count: int duplicate_entry_count: int canonicalized_count: int @dataclass(slots=True) class TalkOriginsDuplicateCluster: key: str count: int items: list[dict[str, str]] canonical: dict[str, object] | None = None @dataclass(slots=True) class TalkOriginsEnrichmentResult: key: str citation_key: str weak_reasons_before: list[str] resolved: bool applied: bool source_label: str = "" weak_reasons_after: list[str] | None = None conflicts: list[dict[str, str]] | None = None error: str = "" @dataclass(slots=True) class TalkOriginsReviewExport: manifest_path: str item_count: int items: list[dict[str, object]] @dataclass(slots=True) class TalkOriginsCorrectionResult: key: str citation_key: str applied: bool error: str = "" @dataclass(slots=True) class TalkOriginsTopicPhraseSuggestion: slug: str topic: str entry_count: int suggested_phrase: str keywords: list[str] review_required: bool = False review_reasons: list[str] | None = None class TalkOriginsScraper: def __init__( self, source_client: SourceClient | None = None, resolver: MetadataResolver | None = None, ) -> None: self.source_client = source_client or SourceClient() self.resolver = resolver or MetadataResolver(source_client=self.source_client) def scrape_to_directory( self, 
base_url: str, output_dir: str | Path, limit_topics: int | None = None, limit_entries_per_topic: int | None = None, resolve_seeds: bool = False, ingest_store: BibliographyStore | None = None, review_status: str = "draft", expand: bool = False, topic_limit: int = 5, topic_commit_limit: int | None = None, resume: bool = True, ) -> TalkOriginsBatchExport: output_root = Path(output_dir) seeds_dir = output_root / "seeds" plaintext_dir = output_root / "plaintext" snapshots_dir = output_root / "snapshots" site_dir = output_root / "site" topics_dir = site_dir / "topics" seeds_dir.mkdir(parents=True, exist_ok=True) plaintext_dir.mkdir(parents=True, exist_ok=True) snapshots_dir.mkdir(parents=True, exist_ok=True) topics_dir.mkdir(parents=True, exist_ok=True) seed_sets: list[TalkOriginsSeedSet] = [] total_entries = 0 jobs: list[dict[str, object]] = [] full_entries: list[BibEntry] = [] full_plaintext_blocks: list[str] = [] for topic in self.scrape_topics( base_url, snapshots_dir=snapshots_dir, limit_topics=limit_topics, resume=resume, ): raw_entries = topic.raw_entries[:limit_entries_per_topic] if limit_entries_per_topic else topic.raw_entries entry_pairs = [ (raw_entry, self.parse_reference_entry(raw_entry, index + 1)) for index, raw_entry in enumerate(raw_entries) ] parsed_entries = [entry for _, entry in entry_pairs if entry is not None] if resolve_seeds: parsed_entries = [self._augment_entry(entry) for entry in parsed_entries] if parsed_entries: augmented_iter = iter(parsed_entries) entry_pairs = [ (raw_entry, next(augmented_iter) if parsed_entry is not None else None) for raw_entry, parsed_entry in entry_pairs ] slug = _slugify(topic.topic) seed_path = (seeds_dir / f"{slug}.bib").resolve() plaintext_path = (plaintext_dir / f"{slug}.txt").resolve() page_path = (topics_dir / f"{slug}.html").resolve() snapshot_path = (snapshots_dir / f"{slug}.json").resolve() rendered = render_bibtex(parsed_entries) if parsed_entries else "" seed_path.write_text(rendered + ("\n" if rendered else ""), encoding="utf-8") plaintext_path.write_text(_render_plaintext_topic(topic.topic, raw_entries), encoding="utf-8") page_path.write_text( _render_topic_page(topic.topic, entry_pairs, seed_path.name), encoding="utf-8", ) if ingest_store is not None and parsed_entries: ingest_store.ingest_bibtex( rendered, source_label=topic.url, review_status=review_status, ) for entry in parsed_entries: ingest_store.add_entry_topic( entry.citation_key, topic_slug=slug, topic_name=topic.topic, source_type="talkorigins", source_url=topic.url, source_label=topic.url, ) ingest_store.connection.commit() seed_set = TalkOriginsSeedSet( topic=topic.topic, slug=slug, url=topic.url, raw_entry_count=len(raw_entries), parsed_entry_count=len(parsed_entries), seed_bib=str(seed_path), plaintext_path=str(plaintext_path), page_path=str(page_path), snapshot_path=str(snapshot_path), ) seed_sets.append(seed_set) total_entries += len(parsed_entries) full_entries.extend(parsed_entries) full_plaintext_blocks.append(_render_plaintext_topic(topic.topic, raw_entries).rstrip()) jobs.append( { "name": f"talkorigins:{slug}", "topic": topic.topic, "topic_slug": slug, "topic_name": topic.topic, "topic_phrase": topic.topic, "seed_bib": str(seed_path), "expand": expand, "status": review_status, "topic_limit": topic_limit, "topic_commit_limit": topic_commit_limit, } ) output_root.mkdir(parents=True, exist_ok=True) manifest_path = (output_root / "talkorigins_manifest.json").resolve() jobs_path = (output_root / "talkorigins_jobs.json").resolve() full_bib_path = (output_root / 
"talkorigins_full.bib").resolve() full_plaintext_path = (output_root / "talkorigins_full.txt").resolve() site_index_path = (site_dir / "index.html").resolve() full_bib_path.write_text(render_bibtex(full_entries) + ("\n" if full_entries else ""), encoding="utf-8") full_plaintext_path.write_text("\n\n".join(block for block in full_plaintext_blocks if block) + "\n", encoding="utf-8") site_index_path.write_text( _render_site_index(seed_sets, Path(full_bib_path).name, Path(full_plaintext_path).name), encoding="utf-8", ) manifest_payload = { "base_url": base_url, "resume": resume, "seed_sets": [asdict(item) for item in seed_sets], "full_bib_path": str(full_bib_path), "full_plaintext_path": str(full_plaintext_path), "site_index_path": str(site_index_path), } manifest_path.write_text(json.dumps(manifest_payload, indent=2, sort_keys=True) + "\n", encoding="utf-8") jobs_path.write_text(json.dumps({"jobs": jobs}, indent=2, sort_keys=True) + "\n", encoding="utf-8") return TalkOriginsBatchExport( base_url=base_url, output_dir=str(output_root.resolve()), topic_count=len(seed_sets), entry_count=total_entries, jobs_path=str(jobs_path), manifest_path=str(manifest_path), seed_sets=seed_sets, full_bib_path=str(full_bib_path), full_plaintext_path=str(full_plaintext_path), site_index_path=str(site_index_path), ) def validate_export(self, manifest_path: str | Path) -> TalkOriginsValidationReport: manifest = json.loads(Path(manifest_path).read_text(encoding="utf-8")) seed_sets = manifest.get("seed_sets", []) topic_count = len(seed_sets) raw_total = sum(int(item.get("raw_entry_count", 0)) for item in seed_sets) parsed_total = sum(int(item.get("parsed_entry_count", 0)) for item in seed_sets) missing_author_count = 0 missing_title_count = 0 missing_year_count = 0 suspicious_entry_type_count = 0 suspicious_examples: list[dict[str, str]] = [] duplicate_groups: dict[str, list[dict[str, str]]] = {} for seed_set in seed_sets: seed_bib = seed_set.get("seed_bib") if not isinstance(seed_bib, str) or not seed_bib: continue path = Path(seed_bib) if not path.exists(): continue entries = parse_bib_file(path) for entry in entries: if not entry.fields.get("author"): missing_author_count += 1 if not entry.fields.get("title"): missing_title_count += 1 if not entry.fields.get("year"): missing_year_count += 1 if _is_suspicious_entry_type(entry): suspicious_entry_type_count += 1 if len(suspicious_examples) < 20: suspicious_examples.append( { "citation_key": entry.citation_key, "entry_type": entry.entry_type, "title": entry.fields.get("title", ""), "journal": entry.fields.get("journal", ""), "publisher": entry.fields.get("publisher", ""), "howpublished": entry.fields.get("howpublished", ""), } ) duplicate_key = _duplicate_key(entry) if duplicate_key: duplicate_groups.setdefault(duplicate_key, []).append( { "citation_key": entry.citation_key, "title": entry.fields.get("title", ""), "author": entry.fields.get("author", ""), "year": entry.fields.get("year", ""), "seed_bib": str(path), } ) parsed_ratio = (parsed_total / raw_total) if raw_total else 0.0 duplicate_examples: list[dict[str, object]] = [] duplicate_cluster_count = 0 duplicate_entry_count = 0 for group_key, items in sorted(duplicate_groups.items()): if len(items) < 2: continue duplicate_cluster_count += 1 duplicate_entry_count += len(items) if len(duplicate_examples) < 20: duplicate_examples.append( { "key": group_key, "count": len(items), "items": items[:5], } ) return TalkOriginsValidationReport( manifest_path=str(Path(manifest_path).resolve()), topic_count=topic_count, 
entry_count=parsed_total, parsed_ratio=parsed_ratio, missing_author_count=missing_author_count, missing_title_count=missing_title_count, missing_year_count=missing_year_count, suspicious_entry_type_count=suspicious_entry_type_count, suspicious_examples=suspicious_examples, duplicate_cluster_count=duplicate_cluster_count, duplicate_entry_count=duplicate_entry_count, duplicate_examples=duplicate_examples, ) def suggest_topic_phrases( self, manifest_path: str | Path, limit: int | None = None, topic_slug: str | None = None, ) -> list[TalkOriginsTopicPhraseSuggestion]: manifest = json.loads(Path(manifest_path).read_text(encoding="utf-8")) seed_sets = manifest.get("seed_sets", []) suggestions: list[TalkOriginsTopicPhraseSuggestion] = [] for seed_set in seed_sets: current_topic_slug = str(seed_set.get("slug") or _slugify(str(seed_set.get("topic") or ""))) if topic_slug and current_topic_slug != topic_slug: continue seed_bib = seed_set.get("seed_bib") if not isinstance(seed_bib, str) or not seed_bib: continue path = Path(seed_bib) if not path.exists(): continue entries = parse_bib_file(path) topic_name = str(seed_set.get("topic") or current_topic_slug) keywords = _suggest_topic_keywords(entries, topic_name) review_reasons = _topic_phrase_review_reasons(entries, keywords) suggestions.append( TalkOriginsTopicPhraseSuggestion( slug=current_topic_slug, topic=topic_name, entry_count=len(entries), suggested_phrase=" ".join([topic_name, *keywords]).strip(), keywords=keywords, review_required=bool(review_reasons), review_reasons=review_reasons, ) ) suggestions.sort(key=lambda item: (item.topic.casefold(), item.slug)) if limit is not None: suggestions = suggestions[:limit] return suggestions def inspect_duplicate_clusters( self, manifest_path: str | Path, limit: int = 20, min_count: int = 2, match: str | None = None, topic_slug: str | None = None, preview_canonical: bool = False, weak_only: bool = False, ) -> list[TalkOriginsDuplicateCluster]: duplicate_groups, grouped_entries = _collect_duplicate_groups( manifest_path, match=match, topic_slug=topic_slug, ) clusters: list[TalkOriginsDuplicateCluster] = [] for group_key, items in sorted(duplicate_groups.items()): if len(items) < min_count: continue canonical_payload = None if preview_canonical: canonical = _build_canonical_preview(grouped_entries[group_key]) weak_reasons = _canonical_weaknesses(canonical) if weak_only and not weak_reasons: continue canonical_payload = { "citation_key": canonical.citation_key, "entry_type": canonical.entry_type, "field_count": len([value for value in canonical.fields.values() if value]), "fields": dict(sorted(canonical.fields.items())), "weak_reasons": weak_reasons, } elif weak_only: canonical = _build_canonical_preview(grouped_entries[group_key]) if not _canonical_weaknesses(canonical): continue clusters.append( TalkOriginsDuplicateCluster( key=group_key, count=len(items), items=sorted( items, key=lambda item: ( item.get("topic_slug", ""), item.get("year", ""), item.get("citation_key", ""), ), ), canonical=canonical_payload, ) ) return clusters[:limit] def enrich_weak_canonicals( self, manifest_path: str | Path, store: BibliographyStore, limit: int = 20, min_count: int = 2, match: str | None = None, topic_slug: str | None = None, apply: bool = False, review_status: str = "enriched", allow_unsafe_matches: bool = False, ) -> list[TalkOriginsEnrichmentResult]: duplicate_groups, grouped_entries = _collect_duplicate_groups( manifest_path, match=match, topic_slug=topic_slug, ) results: list[TalkOriginsEnrichmentResult] = [] for 
group_key, items in sorted(duplicate_groups.items()): if len(items) < min_count: continue canonical = _build_canonical_preview(grouped_entries[group_key]) weak_reasons_before = _canonical_weaknesses(canonical) if not weak_reasons_before: continue resolution = None error = "" try: resolution = self.resolver.resolve_entry(canonical) except Exception as exc: error = str(exc) result = TalkOriginsEnrichmentResult( key=group_key, citation_key=canonical.citation_key, weak_reasons_before=weak_reasons_before, resolved=resolution is not None, applied=False, source_label=resolution.source_label if resolution is not None else "", error=error, ) if resolution is not None: if not allow_unsafe_matches and not _is_safe_enrichment_match(canonical, resolution): result.resolved = False result.source_label = resolution.source_label result.error = "unsafe resolver match" results.append(result) if len(results) >= limit: break continue merged, conflicts = merge_entries_with_conflicts(canonical, resolution.entry) if canonical.entry_type == "misc" and resolution.entry.entry_type != "misc": merged = BibEntry( entry_type=resolution.entry.entry_type, citation_key=merged.citation_key, fields=merged.fields, ) result.conflicts = conflicts result.weak_reasons_after = _canonical_weaknesses(merged) if apply: store_key = _find_store_citation_key(store, canonical) if store_key: store.replace_entry( store_key, merged, source_type=resolution.source_type, source_label=resolution.source_label, review_status=review_status, ) if conflicts: store.record_conflicts( store_key, conflicts, source_type=resolution.source_type, source_label=resolution.source_label, ) result.citation_key = store_key result.applied = True results.append(result) if len(results) >= limit: break if apply: store.connection.commit() return results def build_review_export( self, manifest_path: str | Path, store: BibliographyStore, limit: int = 20, min_count: int = 2, match: str | None = None, topic_slug: str | None = None, ) -> TalkOriginsReviewExport: clusters = self.inspect_duplicate_clusters( manifest_path, limit=limit, min_count=min_count, match=match, topic_slug=topic_slug, preview_canonical=True, weak_only=True, ) enrichment_results = self.enrich_weak_canonicals( manifest_path, store, limit=limit, min_count=min_count, match=match, topic_slug=topic_slug, apply=False, ) by_key = {result.key: result for result in enrichment_results} items: list[dict[str, object]] = [] for cluster in clusters: result = by_key.get(cluster.key) payload = { "key": cluster.key, "count": cluster.count, "items": cluster.items, "canonical": cluster.canonical, "enrichment": asdict(result) if result is not None else None, } items.append(payload) return TalkOriginsReviewExport( manifest_path=str(Path(manifest_path).resolve()), item_count=len(items), items=items, ) def apply_review_corrections( self, manifest_path: str | Path, corrections_path: str | Path, store: BibliographyStore, default_review_status: str = "reviewed", ) -> list[TalkOriginsCorrectionResult]: duplicate_groups, grouped_entries = _collect_duplicate_groups(manifest_path) payload = json.loads(Path(corrections_path).read_text(encoding="utf-8")) correction_items = payload.get("corrections", []) results: list[TalkOriginsCorrectionResult] = [] for item in correction_items: key = str(item.get("key") or "") if not key: results.append(TalkOriginsCorrectionResult(key="", citation_key="", applied=False, error="missing key")) continue entries = grouped_entries.get(key) if not entries: 
results.append(TalkOriginsCorrectionResult(key=key, citation_key="", applied=False, error="unknown key")) continue canonical = _build_canonical_preview(entries) store_key = _find_store_citation_key(store, canonical) if not store_key: results.append(TalkOriginsCorrectionResult(key=key, citation_key=canonical.citation_key, applied=False, error="entry not found in store")) continue corrected = BibEntry( entry_type=str(item.get("entry_type") or canonical.entry_type), citation_key=store_key, fields=dict(canonical.fields), ) override_fields = item.get("fields", {}) if isinstance(override_fields, dict): for field_name, value in override_fields.items(): if value is None: corrected.fields.pop(str(field_name), None) else: corrected.fields[str(field_name)] = str(value) review_status = str(item.get("review_status") or default_review_status) store.replace_entry( store_key, corrected, source_type="manual_review", source_label=f"talkorigins_corrections:{Path(corrections_path).resolve()}", review_status=review_status, ) results.append(TalkOriginsCorrectionResult(key=key, citation_key=store_key, applied=True)) store.connection.commit() return results def ingest_export( self, manifest_path: str | Path, store: BibliographyStore, review_status: str = "draft", dedupe: bool = True, ) -> TalkOriginsIngestReport: manifest = json.loads(Path(manifest_path).read_text(encoding="utf-8")) seed_sets = manifest.get("seed_sets", []) topic_count = len(seed_sets) raw_entry_count = sum(int(item.get("parsed_entry_count", 0)) for item in seed_sets) grouped: dict[str, list[tuple[dict[str, object], BibEntry]]] = {} canonicalized_count = 0 duplicate_entry_count = 0 for seed_set in seed_sets: seed_bib = seed_set.get("seed_bib") if not isinstance(seed_bib, str) or not seed_bib: continue entries = parse_bib_file(seed_bib) for entry in entries: group_key = _duplicate_key(entry) if dedupe else entry.citation_key if not group_key: group_key = entry.citation_key grouped.setdefault(group_key, []).append((seed_set, entry)) stored_entry_count = 0 duplicate_cluster_count = 0 source_label = str(Path(manifest_path).resolve()) key_owners: dict[str, str] = {} existing_rows = store.connection.execute("SELECT citation_key FROM entries").fetchall() for row in existing_rows: key_owners[str(row["citation_key"])] = "__existing__" for group_key, items in grouped.items(): if len(items) > 1: duplicate_cluster_count += 1 duplicate_entry_count += len(items) canonical = _select_canonical_entry([entry for _, entry in items]) for _, duplicate in items: if duplicate.citation_key != canonical.citation_key: canonical = merge_entries(canonical, duplicate) canonicalized_count += 1 canonical = _assign_canonical_key(canonical, group_key, key_owners) store.upsert_entry( canonical, raw_bibtex=render_bibtex([canonical]), source_type="talkorigins", source_label=source_label, review_status=review_status, ) stored_entry_count += 1 seen_topics: set[str] = set() for seed_set, _ in items: topic_slug = str(seed_set.get("slug") or _slugify(str(seed_set.get("topic") or ""))) if topic_slug in seen_topics: continue seen_topics.add(topic_slug) store.add_entry_topic( canonical.citation_key, topic_slug=topic_slug, topic_name=str(seed_set.get("topic") or topic_slug), source_type="talkorigins", source_url=str(seed_set.get("url") or ""), source_label=source_label, ) store.connection.commit() return TalkOriginsIngestReport( manifest_path=str(Path(manifest_path).resolve()), topic_count=topic_count, raw_entry_count=raw_entry_count, stored_entry_count=stored_entry_count, 
duplicate_cluster_count=duplicate_cluster_count, duplicate_entry_count=duplicate_entry_count, canonicalized_count=canonicalized_count, ) def scrape_topics( self, base_url: str, snapshots_dir: Path | None = None, limit_topics: int | None = None, resume: bool = True, ) -> list[TalkOriginsTopic]: fetch_text = getattr(self.source_client, "try_get_text", self.source_client.get_text) index_html = fetch_text(base_url) if index_html is None: return [] parser = _TopicIndexParser(base_url) parser.feed(index_html) topics: list[TalkOriginsTopic] = [] for link in parser.topic_links[:limit_topics]: slug = _slugify(link["topic"]) snapshot_path = snapshots_dir / f"{slug}.json" if snapshots_dir is not None else None snapshot = _load_snapshot(snapshot_path) if resume and snapshot_path is not None else None if snapshot is not None: raw_entries = list(snapshot.get("raw_entries", [])) else: page_html = fetch_text(link["url"]) if page_html is None: continue topic_parser = _TopicPageParser() topic_parser.feed(page_html) raw_entries = normalize_topic_entries(topic_parser.preformatted_text()) if snapshot_path is not None: snapshot_payload = { "topic": link["topic"], "url": link["url"], "raw_entries": raw_entries, } snapshot_path.write_text(json.dumps(snapshot_payload, indent=2, sort_keys=True) + "\n", encoding="utf-8") topics.append(TalkOriginsTopic(topic=link["topic"], url=link["url"], raw_entries=raw_entries)) return topics def parse_reference_entry(self, raw_entry: str, ordinal: int) -> BibEntry | None: year_match = YEAR_PATTERN.search(raw_entry) if year_match is None: return None year = year_match.group(0) author_part = raw_entry[: year_match.start()].strip(" ,.;:") remainder = raw_entry[year_match.end() :].strip(" ,.;:") if not author_part or not remainder: return None title, venue = _split_title_and_venue(remainder) if not title: return None authors = _normalize_gsa_authors(author_part) citation_key = _make_citation_key(authors, year, title, ordinal) entry_type = _guess_entry_type(remainder) fields = { "author": authors, "year": year, "title": title, "note": f"talkorigins_source = {{true}}; raw_reference = {{{raw_entry}}}", } if entry_type == "book": normalized = _normalize_incollection_candidate(title, venue) if normalized is not None: title = normalized["title"] fields["title"] = title entry_type = "incollection" if normalized.get("editor"): fields["editor"] = normalized["editor"] if normalized.get("booktitle"): fields["booktitle"] = normalized["booktitle"] if normalized.get("publisher"): fields["publisher"] = normalized["publisher"] venue = "" if venue: if entry_type == "article": fields["journal"] = venue elif entry_type == "inproceedings": fields["booktitle"] = venue elif entry_type == "incollection": fields["booktitle"] = venue elif entry_type in {"book", "phdthesis", "mastersthesis"}: fields["publisher"] = venue else: fields["howpublished"] = venue return BibEntry(entry_type=entry_type, citation_key=citation_key, fields=fields) def _augment_entry(self, entry: BibEntry) -> BibEntry: try: resolution = self.resolver.resolve_entry(entry) except Exception: return entry if resolution is None: return entry return merge_entries(entry, resolution.entry) def normalize_topic_entries(text: str) -> list[str]: entries: list[str] = [] previous_authors = "" current: list[str] = [] for raw_line in text.splitlines(): line = raw_line.strip() if not line: if current: entry_text = " ".join(current) normalized = _normalize_repeated_authors(entry_text, previous_authors) entries.append(normalized) previous_authors = 
_extract_author_prefix(normalized) or previous_authors current = [] continue current.append(WHITESPACE_PATTERN.sub(" ", line)) if current: entry_text = " ".join(current) normalized = _normalize_repeated_authors(entry_text, previous_authors) entries.append(normalized) return entries def _normalize_repeated_authors(entry_text: str, previous_authors: str) -> str: if previous_authors and REPEATED_AUTHOR_PATTERN.match(entry_text): return REPEATED_AUTHOR_PATTERN.sub(f"{previous_authors}, ", entry_text, count=1) return entry_text def _extract_author_prefix(entry_text: str) -> str: year_match = YEAR_PATTERN.search(entry_text) if year_match is None: return "" return entry_text[: year_match.start()].strip(" ,;:") def _split_title_and_venue(remainder: str) -> tuple[str, str]: if ": " in remainder: title, venue = remainder.split(": ", 1) return _clean_fragment(title), _clean_fragment(venue) parts = [part.strip() for part in remainder.split(". ") if part.strip()] if not parts: return "", "" title = parts[0] venue = ". ".join(parts[1:]) if len(parts) > 1 else "" return _clean_fragment(title), _clean_fragment(venue) def _normalize_gsa_authors(author_part: str) -> str: cleaned = WHITESPACE_PATTERN.sub(" ", author_part.replace("&", " and ")).strip(" ,;:") if " and " in cleaned and "," not in cleaned: return cleaned fragments = [fragment.strip() for fragment in cleaned.split(",") if fragment.strip()] if len(fragments) < 2: return cleaned authors: list[str] = [] index = 0 while index + 1 < len(fragments): family = re.sub(r"^(and)\s+", "", fragments[index], flags=re.IGNORECASE).strip() given = re.sub(r"^(and)\s+", "", fragments[index + 1], flags=re.IGNORECASE).strip() if family and given: authors.append(f"{family}, {given}") index += 2 if index < len(fragments): trailing = re.sub(r"^(and)\s+", "", fragments[index], flags=re.IGNORECASE).strip() if trailing: authors.append(trailing) return " and ".join(authors) if authors else cleaned def _make_citation_key(authors: str, year: str, title: str, ordinal: int) -> str: first_author = authors.split(" and ")[0] family = first_author.split(",", 1)[0] if "," in first_author else first_author.split()[-1] family = re.sub(r"[^A-Za-z0-9]+", "", family).lower() or "ref" first_word = re.sub(r"[^A-Za-z0-9]+", "", title.split()[0]).lower() if title.split() else "untitled" first_word = first_word or "untitled" return f"{family}{year}{first_word}{ordinal}" def _guess_entry_type(text: str) -> str: lowered = text.lower() if "ph.d" in lowered or "dissertation" in lowered or "thesis" in lowered: return "phdthesis" if any( token in lowered for token in ( "press", "publisher", "publications", "publication", "elsevier", "springer", "wiley", "university", "books", ) ): return "book" if any(token in lowered for token in ("proceedings", "conference", "symposium", "workshop")): return "inproceedings" if any(token in lowered for token in ("journal", "review", "letters", "quarterly", "science", "nature")): return "article" return "misc" def _clean_fragment(value: str) -> str: return WHITESPACE_PATTERN.sub(" ", value.strip(" .;:,\"'")) def _slugify(value: str) -> str: slug = re.sub(r"[^A-Za-z0-9]+", "-", value.lower()).strip("-") return slug or "topic" def _normalize_incollection_candidate(title: str, venue: str) -> dict[str, str] | None: lowered = venue.lower() if ", in " not in lowered: return None split_index = lowered.find(", in ") prefix = _clean_fragment(venue[:split_index]) container = venue[split_index + len(", in ") :].strip() if not container: return None editor_match = 
re.match(r"^(?P.+?),\s+eds?\.,\s+(?P.+)$", container, flags=re.IGNORECASE) if editor_match is None: return None editor_text = _normalize_gsa_authors(editor_match.group("editors")) rest = editor_match.group("rest").strip() if ": " in rest: booktitle, publisher = rest.split(": ", 1) else: booktitle, publisher = rest, "" normalized_title = title if prefix: normalized_title = _clean_fragment(f"{title}: {prefix}") payload = { "title": normalized_title, "editor": editor_text, "booktitle": _clean_fragment(booktitle), } if publisher: payload["publisher"] = _clean_fragment(publisher) return payload def _load_snapshot(path: Path | None) -> dict[str, object] | None: if path is None or not path.exists(): return None return json.loads(path.read_text(encoding="utf-8")) def parse_bib_file(path: str | Path) -> list[BibEntry]: from .bibtex import parse_bibtex return parse_bibtex(Path(path).read_text(encoding="utf-8")) def _render_plaintext_topic(topic: str, raw_entries: list[str]) -> str: body = "\n\n".join(raw_entries) return f"{topic}\n\n{body}\n" if body else f"{topic}\n" def _render_topic_page(topic: str, entry_pairs: list[tuple[str, BibEntry | None]], seed_filename: str) -> str: entry_blocks: list[str] = [] for index, (raw_entry, parsed_entry) in enumerate(entry_pairs, start=1): bibtex_block = "" if parsed_entry is not None: bibtex_block = render_bibtex([parsed_entry]) safe_plain = _html_escape(raw_entry) safe_bibtex = _html_escape(bibtex_block) entry_blocks.append( "\n".join( [ '
', f'
{safe_plain}
', f' ', f' ', "
", ] ) ) return "\n".join( [ "", '', "", ' ', f" {_html_escape(topic)} bibliography", " ", " ", "", "", f"

{_html_escape(topic)}

", f'

Back to index | Seed BibTeX

', *entry_blocks, "", "", ] ) + "\n" def _render_site_index(seed_sets: list[TalkOriginsSeedSet], full_bib_name: str, full_plaintext_name: str) -> str: items = [ f'
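
# Illustrative sketch of the per-entry block emitted by the reconstructed
# _render_topic_page above. The class names and the "Entry N" label are
# reconstruction choices, not guaranteed to match the original site output:
#
#   <div class="entry">
#     <p class="plaintext">Smith, J. 1995. Origins of whales: ...</p>
#     <p class="entry-number">Entry 1</p>
#     <pre class="bibtex">@article{smith1995origins1, ...}</pre>
#   </div>
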
def _render_site_index(seed_sets: list[TalkOriginsSeedSet], full_bib_name: str, full_plaintext_name: str) -> str:
    # NOTE: as in _render_topic_page, the original HTML literals were garbled; this is a
    # minimal reconstruction that keeps the surviving text ("Downloads:", "Topics", the
    # per-topic entry counts) and links the index to the topic pages and the full downloads.
    items = [
        f'  <li><a href="topics/{item.slug}.html">{_html_escape(item.topic)}</a> '
        f'({item.parsed_entry_count} entries)</li>'
        for item in seed_sets
    ]
    return "\n".join(
        [
            "<!DOCTYPE html>",
            '<html lang="en">',
            "<head>",
            '  <meta charset="utf-8">',
            "  <title>TalkOrigins bibliography reconstruction</title>",
            "</head>",
            "<body>",
            "  <h1>TalkOrigins bibliography reconstruction</h1>",
            "  <h2>Downloads:</h2>",
            "  <ul>",
            f'    <li><a href="../{full_bib_name}">{full_bib_name}</a></li>',
            f'    <li><a href="../{full_plaintext_name}">{full_plaintext_name}</a></li>',
            "  </ul>",
            "  <h2>Topics</h2>",
            "  <ul>",
            *items,
            "  </ul>",
            "</body>",
            "</html>",
        ]
    ) + "\n"


def _html_escape(value: str) -> str:
    # Escape "&" first so the entities produced by the later replacements are not
    # themselves re-escaped.
    return (
        value.replace("&", "&amp;")
        .replace("<", "&lt;")
        .replace(">", "&gt;")
        .replace('"', "&quot;")
    )


def _collect_duplicate_groups( manifest_path: str | Path, match: str | None = None, topic_slug: str | None = None, ) -> tuple[dict[str, list[dict[str, str]]], dict[str, list[BibEntry]]]: manifest = json.loads(Path(manifest_path).read_text(encoding="utf-8")) seed_sets = manifest.get("seed_sets", []) match_text = match.casefold() if match else None duplicate_groups: dict[str, list[dict[str, str]]] = {} grouped_entries: dict[str, list[BibEntry]] = {} for seed_set in seed_sets: seed_bib = seed_set.get("seed_bib") if not isinstance(seed_bib, str) or not seed_bib: continue current_topic_slug = str(seed_set.get("slug") or _slugify(str(seed_set.get("topic") or ""))) if topic_slug and current_topic_slug != topic_slug: continue path = Path(seed_bib) if not path.exists(): continue for entry in parse_bib_file(path): duplicate_key = _duplicate_key(entry) if not duplicate_key: continue item = { "citation_key": entry.citation_key, "title": entry.fields.get("title", ""), "author": entry.fields.get("author", ""), "year": entry.fields.get("year", ""), "seed_bib": str(path), "topic": str(seed_set.get("topic") or ""), "topic_slug": current_topic_slug, } if match_text and not _duplicate_item_matches(item, duplicate_key, match_text): continue duplicate_groups.setdefault(duplicate_key, []).append(item) grouped_entries.setdefault(duplicate_key, []).append(entry) return duplicate_groups, grouped_entries def _duplicate_key(entry: BibEntry) -> str: author = _normalize_duplicate_text(entry.fields.get("author", "")) title = _normalize_duplicate_text(entry.fields.get("title", "")) year = entry.fields.get("year", "").strip() if not author or not title or not year: return "" first_author = author.split(" and ")[0] return f"{first_author}|{year}|{title}" def _duplicate_item_matches(item: dict[str, str], duplicate_key: str, match_text: str) -> bool: haystacks = ( duplicate_key, item.get("citation_key", ""), item.get("title", ""), item.get("author", ""), item.get("year", ""), item.get("topic", ""), item.get("topic_slug", ""), item.get("seed_bib", ""), ) return any(match_text in value.casefold() for value in haystacks if value) def _normalize_duplicate_text(value: str) -> str: normalized = value.lower() normalized = normalized.replace("&", " and ") normalized = re.sub(r"[^a-z0-9\s]+", " ", normalized) normalized = re.sub(r"\s+", " ", normalized).strip() return normalized def _topic_phrase_tokens(value: str) -> list[str]: return [ token for token in _normalize_duplicate_text(value).split() if len(token) >= 4 and token not in TOPIC_PHRASE_STOPWORDS ] def _suggest_topic_keywords(entries: list[BibEntry], topic_name: str, max_keywords: int = 4) -> list[str]: topic_terms = set(_topic_phrase_tokens(topic_name)) counts: Counter[str] = Counter() for entry in entries: for term in set(_topic_phrase_tokens(entry.fields.get("title", ""))): if term in topic_terms: continue counts[term] += 1 ranked = sorted(counts.items(), key=lambda item: (-item[1], item[0])) if len(entries) <= 1: max_keywords = min(max_keywords, 1) elif len(entries) <= 3: max_keywords = min(max_keywords, 2) filtered = [(term, count) for term, count in ranked if count >= 2] selected = filtered if filtered else ranked[:max_keywords] return [term for term, _ in selected[:max_keywords]] def _topic_phrase_review_reasons(entries: list[BibEntry], keywords: list[str]) -> list[str]: reasons: list[str] = [] if len(entries)
<= 1: reasons.append("single_entry_topic") elif len(entries) <= 3: reasons.append("small_topic") if not keywords: reasons.append("no_keyword_signal") elif len(keywords) == 1: reasons.append("thin_keyword_signal") if any(_looks_noisy_keyword(keyword) for keyword in keywords): reasons.append("noisy_keywords") return reasons def _looks_noisy_keyword(keyword: str) -> bool: if len(keyword) <= 3: return True if any(char.isdigit() for char in keyword): return True noisy_tokens = {"boundry", "colloquium", "edition", "history", "idea", "central", "bearing", "time"} return keyword in noisy_tokens def _select_canonical_entry(entries: list[BibEntry]) -> BibEntry: return max( entries, key=lambda entry: ( _entry_richness(entry), -len(entry.citation_key), entry.citation_key, ), ) def _build_canonical_preview(entries: list[BibEntry]) -> BibEntry: canonical = _select_canonical_entry(entries) for duplicate in entries: if duplicate.citation_key != canonical.citation_key: canonical = merge_entries(canonical, duplicate) return canonical def _canonical_weaknesses(entry: BibEntry) -> list[str]: reasons: list[str] = [] if entry.entry_type == "misc": reasons.append("entry_type:misc") if not entry.fields.get("doi"): reasons.append("missing:doi") if _entry_richness(entry) < 6: reasons.append("low_field_richness") if entry.entry_type in {"article", "inproceedings", "incollection"} and not ( entry.fields.get("journal") or entry.fields.get("booktitle") ): reasons.append("missing:venue") return reasons def _find_store_citation_key(store: BibliographyStore, entry: BibEntry) -> str | None: if store.get_entry(entry.citation_key) is not None: return entry.citation_key first_author = entry.fields.get("author", "").split(" and ")[0].strip() row = store.connection.execute( """ SELECT e.citation_key FROM entries e LEFT JOIN entry_creators ec ON ec.entry_id = e.id AND ec.role = 'author' AND ec.ordinal = 1 LEFT JOIN creators c ON c.id = ec.creator_id WHERE COALESCE(e.title, '') = ? AND COALESCE(e.year, '') = ? AND COALESCE(c.full_name, '') = ? 
ORDER BY e.citation_key LIMIT 1 """, ( entry.fields.get("title", ""), entry.fields.get("year", ""), first_author, ), ).fetchone() if row is None: return None return str(row["citation_key"]) def _is_safe_enrichment_match(base: BibEntry, resolution: object) -> bool: source_label = getattr(resolution, "source_label", "") resolved_entry = getattr(resolution, "entry", None) if not isinstance(source_label, str) or resolved_entry is None: return False if ":search:" not in source_label: return True base_title = _normalize_duplicate_text(base.fields.get("title", "")) resolved_title = _normalize_duplicate_text(resolved_entry.fields.get("title", "")) if not base_title or base_title != resolved_title: return False base_year = (base.fields.get("year") or "").strip() resolved_year = (resolved_entry.fields.get("year") or "").strip() if base_year and resolved_year and base_year == resolved_year: return True base_author = _normalize_duplicate_text(base.fields.get("author", "")) resolved_author = _normalize_duplicate_text(resolved_entry.fields.get("author", "")) if not base_author or not resolved_author: return False base_first = base_author.split(" and ")[0].split()[0] resolved_first = resolved_author.split(" and ")[0].split()[0] return bool(base_first and resolved_first and base_first == resolved_first) def _entry_richness(entry: BibEntry) -> int: score = 0 for field_name, value in entry.fields.items(): if value: score += 3 if field_name in {"doi", "url", "abstract", "publisher", "journal", "booktitle", "editor"} else 1 return score def _assign_canonical_key(entry: BibEntry, group_key: str, key_owners: dict[str, str]) -> BibEntry: base_key = entry.citation_key owner = key_owners.get(base_key) if owner is None or owner == group_key: key_owners[base_key] = group_key return entry suffix = hashlib.sha1(group_key.encode("utf-8")).hexdigest()[:8] candidate = f"{base_key}_{suffix}" counter = 2 while candidate in key_owners and key_owners[candidate] != group_key: candidate = f"{base_key}_{suffix}_{counter}" counter += 1 key_owners[candidate] = group_key return BibEntry(entry_type=entry.entry_type, citation_key=candidate, fields=dict(entry.fields)) def _is_suspicious_entry_type(entry: BibEntry) -> bool: journal = entry.fields.get("journal", "").lower() publisher = entry.fields.get("publisher", "").lower() howpublished = entry.fields.get("howpublished", "").lower() if entry.entry_type == "article" and any( token in journal for token in ("elsevier", "springer", "press", "publications", "publisher", "university") ): return True if entry.entry_type == "misc" and any( token in howpublished for token in ("journal", "review", "letters", "proceedings", "conference", "symposium") ): return True if entry.entry_type == "book" and any( token in publisher for token in ("journal", "review", "letters", "proceedings", "conference") ) and not any( token in publisher for token in ("press", "academic", "elsevier", "springer", "wiley", "university") ): return True if entry.entry_type == "incollection" and not entry.fields.get("booktitle"): return True return False class _TopicIndexParser(HTMLParser): def __init__(self, base_url: str) -> None: super().__init__() self.base_url = base_url self.base_prefix = base_url if base_url.endswith("/") else base_url + "/" self.topic_links: list[dict[str, str]] = [] self._current_href: str | None = None self._current_text: list[str] = [] self._seen_urls: set[str] = set() def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None: if tag != "a": return href = 
dict(attrs).get("href") if not href or href.startswith("#"): return self._current_href = urljoin(self.base_url, href) self._current_text = [] def handle_data(self, data: str) -> None: if self._current_href is not None: self._current_text.append(data) def handle_endtag(self, tag: str) -> None: if tag != "a" or self._current_href is None: return topic = WHITESPACE_PATTERN.sub(" ", "".join(self._current_text)).strip() href = self._current_href self._current_href = None self._current_text = [] if not topic or href in self._seen_urls: return parsed = urlparse(href) base_parsed = urlparse(self.base_prefix) if parsed.netloc and base_parsed.netloc and parsed.netloc != base_parsed.netloc: return if not href.startswith(self.base_prefix): return if href.rstrip("/").endswith("biblio") or href.endswith("origins.html"): return self._seen_urls.add(href) self.topic_links.append({"topic": topic, "url": href}) class _TopicPageParser(HTMLParser): def __init__(self) -> None: super().__init__() self._bibliography_depth = 0 self._in_pre = False self._in_paragraph = False self._current_paragraph: list[str] = [] self._parts: list[str] = [] def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None: attributes = dict(attrs) if tag == "div" and "bibliography" in (attributes.get("class") or "").split(): self._bibliography_depth += 1 return if tag == "pre": self._in_pre = True return if self._bibliography_depth and tag == "p": self._in_paragraph = True self._current_paragraph = [] def handle_endtag(self, tag: str) -> None: if tag == "div" and self._bibliography_depth: self._bibliography_depth -= 1 return if tag == "p" and self._in_paragraph: text = "".join(self._current_paragraph).strip() if text: self._parts.append(text) self._parts.append("\n\n") self._current_paragraph = [] self._in_paragraph = False return if tag == "pre": self._in_pre = False self._parts.append("\n") def handle_data(self, data: str) -> None: if self._bibliography_depth and self._in_paragraph: self._current_paragraph.append(data) elif self._in_pre: self._parts.append(data) def preformatted_text(self) -> str: return "".join(self._parts)