From b74582b72f09f36b63e459c26e3cc7ea3d0696c2 Mon Sep 17 00:00:00 2001 From: welsberr Date: Fri, 20 Mar 2026 07:42:49 -0400 Subject: [PATCH] Add topic review workflow and expansion tooling --- Makefile | 5 +- README.md | 102 ++- src/citegeist/__init__.py | 36 +- src/citegeist/batch.py | 78 ++ src/citegeist/bibtex.py | 60 +- src/citegeist/bootstrap.py | 145 ++++ src/citegeist/cli.py | 947 +++++++++++++++++++++- src/citegeist/expand.py | 309 +++++++ src/citegeist/harvest.py | 317 ++++++++ src/citegeist/resolve.py | 254 +++++- src/citegeist/sources.py | 12 +- src/citegeist/storage.py | 473 ++++++++++- src/citegeist/talkorigins.py | 1485 ++++++++++++++++++++++++++++++++++ tests/test_batch.py | 129 +++ tests/test_bootstrap.py | 175 ++++ tests/test_cli.py | 839 ++++++++++++++++++- tests/test_harvest.py | 293 +++++++ tests/test_resolve.py | 251 +++++- tests/test_sources.py | 11 + tests/test_storage.py | 247 ++++++ tests/test_talkorigins.py | 1024 +++++++++++++++++++++++ tests/test_topic_expand.py | 242 ++++++ 22 files changed, 7365 insertions(+), 69 deletions(-) create mode 100644 src/citegeist/batch.py create mode 100644 src/citegeist/bootstrap.py create mode 100644 src/citegeist/harvest.py create mode 100644 src/citegeist/talkorigins.py create mode 100644 tests/test_batch.py create mode 100644 tests/test_bootstrap.py create mode 100644 tests/test_harvest.py create mode 100644 tests/test_talkorigins.py create mode 100644 tests/test_topic_expand.py diff --git a/Makefile b/Makefile index 8b20c95..6456e47 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ PYTHONPATH_SRC=PYTHONPATH=src VENV_PYTHON=.venv/bin/python -.PHONY: test test-live live-smoke +.PHONY: test test-live live-smoke validate-talkorigins test: $(PYTHONPATH_SRC) $(VENV_PYTHON) -m pytest -q @@ -11,3 +11,6 @@ test-live: live-smoke: CITEGEIST_SOURCE_CACHE=.cache/citegeist $(PYTHONPATH_SRC) $(VENV_PYTHON) scripts/live_smoke.py + +validate-talkorigins: + $(PYTHONPATH_SRC) $(VENV_PYTHON) -m citegeist validate-talkorigins talkorigins-out/talkorigins_manifest.json diff --git a/README.md b/README.md index e795a94..d2294f7 100644 --- a/README.md +++ b/README.md @@ -46,12 +46,17 @@ The initial repo includes: - `pybtex`-backed BibTeX parsing and export in a repo-local virtual environment; - a SQLite-backed bibliography store; - a small CLI for ingest, search, inspection, and export; -- review-state tracking on entries and per-field ingest provenance; +- review-state tracking on entries, per-field ingest provenance, and field-level conflict review; - plaintext reference extraction into draft BibTeX for numbered, APA-like, wrapped-line, and simple book-style references; -- identifier-first metadata resolution for DOI, OpenAlex, DBLP, and arXiv-backed entries, with OpenAlex title-search fallback; +- identifier-first metadata resolution for DOI, OpenAlex, DBLP, arXiv, and DataCite-backed entries, with OpenAlex/DataCite title-search fallback; - local citation-graph traversal over stored `cites`, `cited_by`, and `crossref` edges; - Crossref- and OpenAlex-backed graph expansion that materializes draft related works and edge provenance; - a dedicated source-client layer with fixture/cache support for live-source development; +- OAI-PMH Dublin Core harvesting for institutional repositories and thesis/dissertation sources; +- OAI-PMH repository discovery via `Identify`, `ListSets`, and `ListMetadataFormats` to target harvests more precisely; +- bibliography bootstrap workflows that can start from a seed `.bib`, a topic phrase, or both; +- batch bootstrap 
orchestration from JSON job files containing seed BibTeX paths, topic phrases, or both; +- a TalkOrigins scraper that fixes repeated-author plaintext references, emits per-topic seed BibTeX files, and writes a batch JSON specification; - normalized tables for entries, creators, identifiers, and citation relations; - full-text-search-ready indexing over title, abstract, and fulltext when SQLite FTS5 is available; - tests covering parsing, ingestion, relation storage, and search. @@ -113,18 +118,107 @@ Or use the CLI directly: cd citegeist PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 ingest references.bib PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 search "semantic search" -PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 show --provenance smith2024graphs +PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 search "origin" --topic abiogenesis +PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 show --provenance --conflicts smith2024graphs PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 set-status smith2024graphs reviewed +PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 resolve-conflicts smith2024graphs title accepted +PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 apply-conflict smith2024graphs title +PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 bootstrap --seed-bib seed.bib --topic "bayesian nonparametrics" +PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 bootstrap --topic "bayesian nonparametrics" --preview --topic-commit-limit 5 +PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 scrape-talkorigins talkorigins-out --limit-topics 5 --limit-entries-per-topic 20 PYTHONPATH=src .venv/bin/python -m citegeist extract references.txt --output draft.bib PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 resolve smith2024graphs +PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 topics +PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 topic-entries abiogenesis +PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 export-topic abiogenesis --output abiogenesis.bib PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 graph smith2024graphs --relation cites --depth 2 --missing-only PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 expand smith2024graphs --source crossref PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 expand smith2024graphs --source openalex --relation cited_by --limit 10 +PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 expand-topic abiogenesis --topic-phrase "abiogenesis origin chemistry" --source openalex --relation cites --seed-key seed2024 --min-relevance 0.3 --preview +PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 set-topic-phrase abiogenesis "abiogenesis origin chemistry prebiotic" +PYTHONPATH=src .venv/bin/python -m citegeist discover-oai https://example.edu/oai +PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 harvest-oai https://example.edu/oai --metadata-prefix mods --from 2024-01-01 --until 2024-12-31 --limit 10 PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 export --output reviewed.bib ``` For live-source development, prefer fixture-backed or cache-backed source clients so resolver and expansion work can be exercised repeatedly without re-hitting upstream APIs on every run. 
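+
+Batch jobs for `bootstrap-batch` live in a plain JSON file: either a bare list of job objects or an object with a `jobs` list. A minimal sketch (the paths, phrases, and slugs are illustrative; relative `seed_bib` paths are resolved against the location of the JSON file itself):
+
+```json
+{
+  "jobs": [
+    {
+      "name": "abiogenesis_seed",
+      "seed_bib": "seeds/abiogenesis.bib",
+      "topic": "abiogenesis origin chemistry",
+      "topic_slug": "abiogenesis",
+      "topic_name": "Abiogenesis",
+      "topic_phrase": "abiogenesis origin chemistry prebiotic",
+      "topic_limit": 5,
+      "topic_commit_limit": 3,
+      "expand": true,
+      "status": "draft"
+    }
+  ]
+}
+```
+
+The TalkOrigins workflow below generates such a batch file (`talkorigins_jobs.json`) from its per-topic seeds.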
+For large legacy plaintext corpora such as the TalkOrigins bibliography, prefer a two-step workflow:
+
+1. `scrape-talkorigins` to generate cleaned per-topic `seed_bib` files plus a `talkorigins_jobs.json` batch spec.
+2. `bootstrap-batch` on that JSON file when you want to ingest, resolve, and expand from the generated seeds.
+
+The TalkOrigins scrape output now includes:
+
+- `seeds/*.bib` per-topic seed BibTeX files for `bootstrap-batch`
+- `plaintext/*.txt` per-topic cleaned GSA-style plaintext with repeated authors expanded
+- `site/topics/*.html` reconstructed topic pages with hide/show BibTeX blocks
+- `talkorigins_full.txt` and `talkorigins_full.bib` aggregate downloads
+- `snapshots/*.json` cached topic payloads so reruns can resume without re-fetching already scraped topics
+
+After a full scrape, run:
+
+```bash
+PYTHONPATH=src .venv/bin/python -m citegeist validate-talkorigins talkorigins-out/talkorigins_manifest.json
+PYTHONPATH=src .venv/bin/python -m citegeist duplicates-talkorigins talkorigins-out/talkorigins_manifest.json --limit 20
+PYTHONPATH=src .venv/bin/python -m citegeist duplicates-talkorigins talkorigins-out/talkorigins_manifest.json --limit 20 --preview --weak-only
+PYTHONPATH=src .venv/bin/python -m citegeist suggest-talkorigins-phrases talkorigins-out/talkorigins_manifest.json --output topic-phrases.json
+PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 stage-topic-phrases topic-phrases.json
+PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 review-topic-phrase abiogenesis accepted --notes "curated from local corpus"
+PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 apply-topic-phrases topic-phrases.json
+PYTHONPATH=src .venv/bin/python -m citegeist --db talkorigins.sqlite3 enrich-talkorigins talkorigins-out/talkorigins_manifest.json --limit 20
+PYTHONPATH=src .venv/bin/python -m citegeist --db talkorigins-copy.sqlite3 enrich-talkorigins talkorigins-out/talkorigins_manifest.json --limit 5 --apply --allow-unsafe-search-matches
+PYTHONPATH=src .venv/bin/python -m citegeist --db talkorigins.sqlite3 review-talkorigins talkorigins-out/talkorigins_manifest.json --output talkorigins-review.json
+PYTHONPATH=src .venv/bin/python -m citegeist --db talkorigins.sqlite3 apply-talkorigins-corrections talkorigins-out/talkorigins_manifest.json talkorigins-corrections.json
+```
+
+The `validate-talkorigins` report summarizes parse coverage and flags suspicious entry-type/venue combinations for manual cleanup.
+It also reports duplicate clusters across topic seed files so you can gauge how much deduplication to expect before ingestion.
+Use `duplicates-talkorigins` when you want to inspect specific clusters, filter by text, restrict the audit to one topic slug, or preview only weak canonicalization outcomes before importing.
+
+Use `suggest-talkorigins-phrases` to derive candidate stored expansion phrases from the existing TalkOrigins topic corpus itself. The output is a deterministic JSON list of per-topic records, each carrying the topic slug, a suggested phrase, and the extracted keywords that drove it. This is a useful first pass before setting topic phrases in the database or editing generated batch jobs.
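+
+A minimal sketch of a suggestion record (the values are illustrative, and the exact name of the keywords field is an assumption; `stage-topic-phrases` and `apply-topic-phrases` consume `slug` and `suggested_phrase`):
+
+```json
+[
+  {
+    "slug": "abiogenesis",
+    "suggested_phrase": "abiogenesis prebiotic chemistry origin of life",
+    "keywords": ["abiogenesis", "prebiotic", "chemistry", "origin", "life"]
+  }
+]
+```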
+Use `stage-topic-phrases` to load those suggestions into the database as review items. Staging stores the candidate in `suggested_phrase` and marks the topic `pending` without changing the active `expansion_phrase`.
+Use `review-topic-phrase` to accept or reject one staged suggestion in place. Accepting a suggestion copies it into `expansion_phrase`; rejecting it records the decision without changing the live phrase.
+Use `apply-topic-phrases` when you want a direct patch path instead of the staged review flow. It accepts either the raw suggestion list or an object with a `topics` list, and applies `suggested_phrase` or `phrase` to matching topic slugs immediately.
+Use `enrich-talkorigins` when you want to target weak canonical entries flagged by the duplicate audit for resolver-based metadata upgrades before retrying graph expansion on imported topic slices.
+Use `review-talkorigins` when you want one JSON review artifact that combines weak canonical clusters with dry-run enrichment outcomes for manual cleanup.
+Use `expand-topic` when you already have both a topic phrase and a curated topic seed set in the database: it expands outward from the topic’s existing entries, then assigns discovered works back to that topic only if they clear a topic-relevance threshold. Write-enabled assignment is stricter than preview ranking: a candidate must clear the score threshold and show a non-generic title anchor to the topic phrase, so broad methods papers do not get attached just because their abstracts or related terms overlap (a minimal sketch of the relevance score appears at the end of this README). On large noisy topics, prefer `--seed-key` to restrict the run to just the trusted seed entries you want to expand from, and use `--preview` first to inspect discovered candidates and relevance scores before writing anything.
+
+Use `set-topic-phrase` to store a curated expansion phrase on the topic itself. When a stored phrase exists, `expand-topic` uses it automatically if you do not pass `--topic-phrase`. Batch bootstrap jobs can also set `topic_slug`, `topic_name`, and `topic_phrase` so curated topic metadata is created as part of the run.
+Use `topics --phrase-review-status pending` when you want to audit only topics whose staged phrase suggestions still need review.
+`--allow-unsafe-search-matches` exists only for bounded experiments on copied databases, where you explicitly relax match trust to exercise downstream expansion behavior.
+
+Correction files are simple JSON:
+
+```json
+{
+  "corrections": [
+    {
+      "key": "smith jane|1999|weak duplicate",
+      "entry_type": "article",
+      "review_status": "reviewed",
+      "fields": {
+        "journal": "Journal of Better Metadata",
+        "doi": "10.1000/weak",
+        "note": null
+      }
+    }
+  ]
+}
+```
+
+`fields` values overwrite the canonical entry for that duplicate-cluster key. Set a field to `null` to remove it.
+
+To import the reconstructed corpus into SQLite while collapsing duplicate works across topics into canonical entries:
+
+```bash
+PYTHONPATH=src .venv/bin/python -m citegeist --db talkorigins.sqlite3 ingest-talkorigins talkorigins-out/talkorigins_manifest.json
+```
+
+That import preserves many-to-many topic membership through the `topics` and `entry_topics` tables.
+After import, use `topics`, `topic-entries`, `search --topic`, and `export-topic` to inspect or export topic slices from the consolidated database.
+
 Live-source workflow:
 
 ```bash
@@ -147,7 +241,7 @@ make live-smoke
 
 ## Near-Term Priorities
 
-- additional resolvers and expansion paths for non-DOI scholarly ecosystems.
+- source adapters beyond OAI-PMH for additional non-DOI scholarly ecosystems.
 
 See [ROADMAP.md](./ROADMAP.md) for the prioritized phase plan and rationale.
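+
+## Topic-Relevance Scoring Sketch
+
+The relevance gate used by topic bootstrap and `expand-topic` is a plain token-overlap score. The sketch below mirrors `_topic_relevance_score` in `citegeist.bootstrap`; the exact scaling `expand-topic` compares against `--min-relevance`, and its additional title-anchor check, live in `citegeist.expand`.
+
+```python
+import re
+
+
+def tokenize(value: str) -> set[str]:
+    # Lowercased word tokens; empty fragments from the split are dropped.
+    return {token for token in re.split(r"\W+", value.lower()) if token}
+
+
+def topic_relevance(title: str, abstract: str, topic_phrase: str) -> float:
+    # Count how many topic terms appear anywhere in the title or abstract.
+    topic_terms = tokenize(topic_phrase)
+    document_terms = tokenize(title) | tokenize(abstract)
+    return float(len(topic_terms & document_terms))
+
+
+# All four topic terms appear in the title or abstract, so this prints 4.0.
+print(topic_relevance(
+    "Prebiotic chemistry and the origin of life",
+    "We survey abiogenesis scenarios.",
+    "abiogenesis origin chemistry prebiotic",
+))
+```
+
+A candidate below the threshold is still reported with its score in preview output, but it is never assigned back to the topic.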
diff --git a/src/citegeist/__init__.py b/src/citegeist/__init__.py index 952a02c..bacb1c7 100644 --- a/src/citegeist/__init__.py +++ b/src/citegeist/__init__.py @@ -1,18 +1,52 @@ +from .batch import BatchBootstrapRunner, BatchJobResult, load_batch_jobs from .bibtex import BibEntry, parse_bibtex +from .bootstrap import BootstrapResult, Bootstrapper from .expand import CrossrefExpander, OpenAlexExpander from .extract import extract_references -from .resolve import MetadataResolver, merge_entries +from .harvest import OaiMetadataFormat, OaiPmhHarvester, OaiSet +from .resolve import MetadataResolver, merge_entries, merge_entries_with_conflicts from .sources import SourceClient from .storage import BibliographyStore +from .talkorigins import ( + TalkOriginsBatchExport, + TalkOriginsDuplicateCluster, + TalkOriginsEnrichmentResult, + TalkOriginsIngestReport, + TalkOriginsReviewExport, + TalkOriginsScraper, + TalkOriginsSeedSet, + TalkOriginsTopicPhraseSuggestion, + TalkOriginsTopic, + TalkOriginsValidationReport, +) __all__ = [ "BibEntry", + "BatchBootstrapRunner", + "BatchJobResult", "BibliographyStore", + "BootstrapResult", + "Bootstrapper", "CrossrefExpander", "MetadataResolver", "OpenAlexExpander", + "OaiPmhHarvester", + "OaiMetadataFormat", + "OaiSet", "SourceClient", + "TalkOriginsBatchExport", + "TalkOriginsDuplicateCluster", + "TalkOriginsEnrichmentResult", + "TalkOriginsIngestReport", + "TalkOriginsReviewExport", + "TalkOriginsScraper", + "TalkOriginsSeedSet", + "TalkOriginsTopicPhraseSuggestion", + "TalkOriginsTopic", + "TalkOriginsValidationReport", "extract_references", + "load_batch_jobs", "merge_entries", + "merge_entries_with_conflicts", "parse_bibtex", ] diff --git a/src/citegeist/batch.py b/src/citegeist/batch.py new file mode 100644 index 0000000..203a4a1 --- /dev/null +++ b/src/citegeist/batch.py @@ -0,0 +1,78 @@ +from __future__ import annotations + +import json +from dataclasses import dataclass +from pathlib import Path + +from .bootstrap import BootstrapResult, Bootstrapper +from .storage import BibliographyStore + + +@dataclass(slots=True) +class BatchJobResult: + job_name: str + result_count: int + results: list[BootstrapResult] + + +def load_batch_jobs(path: str | Path) -> list[dict]: + path = Path(path) + payload = json.loads(path.read_text(encoding="utf-8")) + if isinstance(payload, dict): + jobs = payload.get("jobs", []) + else: + jobs = payload + if not isinstance(jobs, list): + raise ValueError("Batch JSON must be a list of jobs or an object with a 'jobs' list") + normalized_jobs: list[dict] = [] + for job in jobs: + if not isinstance(job, dict): + raise ValueError("Each batch job must be an object") + normalized = dict(job) + seed_bib = normalized.get("seed_bib") + if isinstance(seed_bib, str) and seed_bib: + seed_path = Path(seed_bib) + if not seed_path.is_absolute(): + normalized["seed_bib"] = str((path.parent / seed_path).resolve()) + normalized_jobs.append(normalized) + return normalized_jobs + + +class BatchBootstrapRunner: + def __init__(self, bootstrapper: Bootstrapper | None = None) -> None: + self.bootstrapper = bootstrapper or Bootstrapper() + + def run(self, store: BibliographyStore, jobs: list[dict]) -> list[BatchJobResult]: + results: list[BatchJobResult] = [] + for index, job in enumerate(jobs, start=1): + seed_bib = job.get("seed_bib") + topic = job.get("topic") + topic_limit = int(job.get("topic_limit", 5)) + topic_commit_limit = job.get("topic_commit_limit") + expand = bool(job.get("expand", True)) + review_status = str(job.get("status", "draft")) 
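+            # A preview job ranks and reports candidates; Bootstrapper skips all database writes when preview_only is set.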
+ preview = bool(job.get("preview", False)) + name = str(job.get("name") or f"job_{index}") + topic_slug = job.get("topic_slug") + topic_name = job.get("topic_name") + topic_phrase = job.get("topic_phrase") + + seed_bibtex = None + if seed_bib: + seed_bibtex = Path(seed_bib).read_text(encoding="utf-8") + + job_results = self.bootstrapper.bootstrap( + store, + seed_bibtex=seed_bibtex, + topic=topic, + topic_limit=topic_limit, + topic_commit_limit=int(topic_commit_limit) if topic_commit_limit is not None else None, + expand=expand, + review_status=review_status, + preview_only=preview, + topic_slug=str(topic_slug) if topic_slug else None, + topic_name=str(topic_name) if topic_name else None, + topic_phrase=str(topic_phrase) if topic_phrase else None, + ) + results.append(BatchJobResult(name, len(job_results), job_results)) + return results diff --git a/src/citegeist/bibtex.py b/src/citegeist/bibtex.py index 41ed97d..d815b9a 100644 --- a/src/citegeist/bibtex.py +++ b/src/citegeist/bibtex.py @@ -5,8 +5,10 @@ from io import StringIO try: from pybtex.database import BibliographyData, Entry, Person, parse_string + from pybtex.bibtex.exceptions import BibTeXError from pybtex.database.output.bibtex import Writer except ImportError: # pragma: no cover - exercised only outside the configured venv + BibTeXError = None BibliographyData = Entry = Person = Writer = None parse_string = None @@ -40,7 +42,11 @@ def render_bibtex(entries: list[BibEntry]) -> str: _require_pybtex() bibliography_entries = {} for entry in entries: - fields = {key: value for key, value in entry.fields.items() if key not in {"author", "editor"}} + fields = { + key: _sanitize_bibtex_value(value) + for key, value in entry.fields.items() + if key not in {"author", "editor"} + } persons = {} for role in ("author", "editor"): raw_names = entry.fields.get(role) @@ -49,7 +55,24 @@ def render_bibtex(entries: list[BibEntry]) -> str: bibliography_entries[entry.citation_key] = Entry(entry.entry_type, fields=fields, persons=persons) buffer = StringIO() - Writer().write_stream(BibliographyData(entries=bibliography_entries), buffer) + try: + Writer().write_stream(BibliographyData(entries=bibliography_entries), buffer) + except BibTeXError: + conservative_entries = {} + for entry in entries: + fields = { + key: _flatten_bibtex_braces(value) + for key, value in entry.fields.items() + if key not in {"author", "editor"} + } + persons = {} + for role in ("author", "editor"): + raw_names = entry.fields.get(role) + if raw_names: + persons[role] = [Person(name.strip()) for name in raw_names.split(" and ") if name.strip()] + conservative_entries[entry.citation_key] = Entry(entry.entry_type, fields=fields, persons=persons) + buffer = StringIO() + Writer().write_stream(BibliographyData(entries=conservative_entries), buffer) return buffer.getvalue().strip() @@ -58,3 +81,36 @@ def _require_pybtex() -> None: raise RuntimeError( "pybtex is required. Use the repo-local virtual environment under .venv/ for citegeist commands." 
        )
+
+
+def _sanitize_bibtex_value(value: str) -> str:
+    # Balance braces without disturbing properly nested groups: an unmatched
+    # closing brace becomes ")" immediately, and any opening braces still
+    # unmatched at the end of the value are rewritten to "(" at their recorded
+    # positions, leaving matched pairs elsewhere in the value intact.
+    parts: list[str] = []
+    open_positions: list[int] = []
+    for char in value:
+        if char == "{":
+            open_positions.append(len(parts))
+            parts.append(char)
+            continue
+        if char == "}":
+            if open_positions:
+                open_positions.pop()
+                parts.append(char)
+            else:
+                parts.append(")")
+            continue
+        parts.append(char)
+    for index in open_positions:
+        parts[index] = "("
+    return "".join(parts)
+
+
+def _flatten_bibtex_braces(value: str) -> str:
+    return value.replace("{", "(").replace("}", ")")
diff --git a/src/citegeist/bootstrap.py b/src/citegeist/bootstrap.py
new file mode 100644
index 0000000..80bb4e6
--- /dev/null
+++ b/src/citegeist/bootstrap.py
@@ -0,0 +1,145 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+import re
+
+from .bibtex import BibEntry, parse_bibtex
+from .expand import CrossrefExpander, OpenAlexExpander
+from .resolve import MetadataResolver
+from .storage import BibliographyStore
+
+
+@dataclass(slots=True)
+class BootstrapResult:
+    citation_key: str
+    origin: str
+    created: bool
+    score: float = 0.0
+
+
+class Bootstrapper:
+    def __init__(
+        self,
+        resolver: MetadataResolver | None = None,
+        crossref_expander: CrossrefExpander | None = None,
+        openalex_expander: OpenAlexExpander | None = None,
+    ) -> None:
+        self.resolver = resolver or MetadataResolver()
+        self.crossref_expander = crossref_expander or CrossrefExpander(self.resolver)
+        self.openalex_expander = openalex_expander or OpenAlexExpander(self.resolver)
+
+    def bootstrap(
+        self,
+        store: BibliographyStore,
+        seed_bibtex: str | None = None,
+        topic: str | None = None,
+        topic_limit: int = 5,
+        topic_commit_limit: int | None = None,
+        expand: bool = True,
+        review_status: str = "draft",
+        preview_only: bool = False,
+        topic_slug: str | None = None,
+        topic_name: str | None = None,
+        topic_phrase: str | None = None,
+    ) -> list[BootstrapResult]:
+        results: list[BootstrapResult] = []
+        seed_keys: list[str] = []
+
+        if seed_bibtex:
+            for entry in parse_bibtex(seed_bibtex):
+                created = store.get_entry(entry.citation_key) is None
+                if not preview_only:
+                    store.upsert_entry(
+                        entry,
+                        raw_bibtex=None,
+                        source_type="bootstrap",
+                        source_label="seed_bibtex",
+                        review_status=review_status,
+                    )
+                seed_keys.append(entry.citation_key)
+                results.append(BootstrapResult(entry.citation_key, "seed_bibtex", created))
+
+        if topic:
+            if not preview_only and (topic_slug or topic_name or topic_phrase):
+                store.ensure_topic(
+                    slug=topic_slug or _slugify(topic),
+                    name=topic_name or topic,
+                    source_type="bootstrap",
+                    expansion_phrase=topic_phrase or topic,
+                )
+            ranked_candidates = self._topic_candidates(topic, seed_keys, topic_limit)
+            if topic_commit_limit is not None:
+                ranked_candidates = ranked_candidates[:topic_commit_limit]
+
+            for entry, score in ranked_candidates:
+                created = store.get_entry(entry.citation_key) is None
+                if not preview_only:
+                    store.upsert_entry(
+                        entry,
+                        raw_bibtex=None,
+                        source_type="bootstrap",
+                        source_label=f"topic:{topic}",
+                        review_status=review_status,
+                    )
+                seed_keys.append(entry.citation_key)
+                results.append(BootstrapResult(entry.citation_key, "topic", created, score=score))
+
+        if expand and not preview_only:
+            expanded_keys = list(dict.fromkeys(seed_keys))
+            for citation_key in expanded_keys:
+                for item in self.crossref_expander.expand_entry_references(store, citation_key):
+                    
results.append(BootstrapResult(item.discovered_citation_key, "crossref_expand", item.created_entry)) + for item in self.openalex_expander.expand_entry(store, citation_key, relation_type="cites", limit=topic_limit): + results.append(BootstrapResult(item.discovered_citation_key, "openalex_expand", item.created_entry)) + + store.connection.commit() + return results + + def _topic_candidates(self, topic: str, seed_keys: list[str], limit: int) -> list[tuple[BibEntry, float]]: + scored: dict[str, tuple[BibEntry, float]] = {} + + for source_name, base_score, entries in ( + ("openalex", 3.0, self.resolver.search_openalex(topic, limit=limit)), + ("crossref", 2.0, self.resolver.search_crossref(topic, limit=limit)), + ("datacite", 1.5, self.resolver.search_datacite(topic, limit=limit)), + ): + for entry in entries: + score = base_score + _topic_relevance_score(entry, topic) + _seed_overlap_score(entry, seed_keys) + existing = scored.get(entry.citation_key) + if existing is None or score > existing[1]: + scored[entry.citation_key] = (entry, score) + + ranked = sorted( + scored.values(), + key=lambda item: (-item[1], item[0].citation_key), + ) + return ranked[:limit] + + +def _topic_relevance_score(entry: BibEntry, topic: str) -> float: + topic_terms = _tokenize(topic) + title_terms = _tokenize(entry.fields.get("title", "")) + abstract_terms = _tokenize(entry.fields.get("abstract", "")) + overlap = len(topic_terms & (title_terms | abstract_terms)) + return float(overlap) + + +def _seed_overlap_score(entry: BibEntry, seed_keys: list[str]) -> float: + if not seed_keys: + return 0.0 + title_terms = _tokenize(entry.fields.get("title", "")) + score = 0.0 + for seed_key in seed_keys: + seed_terms = _tokenize(seed_key) + if seed_terms & title_terms: + score += 0.25 + return score + + +def _tokenize(value: str) -> set[str]: + return {token for token in re.split(r"\W+", value.lower()) if token} + + +def _slugify(value: str) -> str: + slug = re.sub(r"[^a-z0-9]+", "-", value.lower()).strip("-") + return slug or "topic" diff --git a/src/citegeist/cli.py b/src/citegeist/cli.py index 2af8de9..7973e87 100644 --- a/src/citegeist/cli.py +++ b/src/citegeist/cli.py @@ -6,11 +6,15 @@ import json import sys from pathlib import Path +from .batch import BatchBootstrapRunner, load_batch_jobs from .bibtex import parse_bibtex, render_bibtex -from .expand import CrossrefExpander, OpenAlexExpander +from .bootstrap import Bootstrapper +from .expand import CrossrefExpander, OpenAlexExpander, TopicExpander from .extract import extract_references -from .resolve import MetadataResolver, merge_entries +from .harvest import OaiPmhHarvester +from .resolve import MetadataResolver, merge_entries_with_conflicts from .storage import BibliographyStore +from .talkorigins import TalkOriginsScraper def build_parser() -> argparse.ArgumentParser: @@ -27,11 +31,13 @@ def build_parser() -> argparse.ArgumentParser: search_parser = subparsers.add_parser("search", help="Search titles, abstracts, and fulltext") search_parser.add_argument("query", help="Search query") search_parser.add_argument("--limit", type=int, default=10, help="Maximum number of results") + search_parser.add_argument("--topic", help="Optional topic slug to filter search results") show_parser = subparsers.add_parser("show", help="Show one entry or list entries") show_parser.add_argument("citation_key", nargs="?", help="Citation key to show") show_parser.add_argument("--limit", type=int, default=20, help="Maximum entries when listing") show_parser.add_argument("--provenance", 
action="store_true", help="Include field provenance") + show_parser.add_argument("--conflicts", action="store_true", help="Include field conflicts") export_parser = subparsers.add_parser("export", help="Export entries as BibTeX") export_parser.add_argument("citation_keys", nargs="*", help="Optional citation keys to export") @@ -41,6 +47,18 @@ def build_parser() -> argparse.ArgumentParser: status_parser.add_argument("citation_key", help="Citation key to update") status_parser.add_argument("review_status", help="New review status") + conflict_parser = subparsers.add_parser("resolve-conflicts", help="Update conflict review status for one field") + conflict_parser.add_argument("citation_key", help="Citation key to update") + conflict_parser.add_argument("field_name", help="Field name whose open conflicts should be updated") + conflict_parser.add_argument("status", choices=["accepted", "rejected"], help="New conflict status") + + apply_conflict_parser = subparsers.add_parser( + "apply-conflict", + help="Accept the proposed value for the latest open conflict on a field", + ) + apply_conflict_parser.add_argument("citation_key", help="Citation key to update") + apply_conflict_parser.add_argument("field_name", help="Field name whose proposed value should be applied") + extract_parser = subparsers.add_parser("extract", help="Extract draft BibTeX from plaintext references") extract_parser.add_argument("input", help="Plaintext file containing bibliography-style references") extract_parser.add_argument("--output", help="Write extracted BibTeX to a file instead of stdout") @@ -81,6 +99,322 @@ def build_parser() -> argparse.ArgumentParser: ) expand_parser.add_argument("--limit", type=int, default=25, help="Maximum related works to fetch per seed") + expand_topic_parser = subparsers.add_parser( + "expand-topic", + help="Expand one topic from its existing seed entries and assign only relevant discoveries back to that topic", + ) + expand_topic_parser.add_argument("topic_slug", help="Topic slug to expand from") + expand_topic_parser.add_argument( + "--topic-phrase", + help="Optional phrase used for relevance gating; defaults to the stored topic name", + ) + expand_topic_parser.add_argument( + "--source", + choices=["crossref", "openalex"], + default="openalex", + help="External source used for topic expansion", + ) + expand_topic_parser.add_argument( + "--relation", + choices=["cites", "cited_by"], + default="cites", + help="Graph direction to expand for sources that support it", + ) + expand_topic_parser.add_argument("--seed-limit", type=int, default=25, help="Maximum topic seed entries to expand from") + expand_topic_parser.add_argument("--per-seed-limit", type=int, default=25, help="Maximum discovered works to fetch per seed") + expand_topic_parser.add_argument( + "--seed-key", + action="append", + dest="seed_keys", + help="Restrict expansion to one trusted seed entry; may be passed multiple times", + ) + expand_topic_parser.add_argument( + "--min-relevance", + type=float, + default=0.2, + help="Minimum topic-term overlap score required to assign a discovered work back to the topic", + ) + expand_topic_parser.add_argument( + "--preview", + action="store_true", + help="Discover and score candidate expansions without writing entries, relations, or topic assignments", + ) + + set_topic_phrase_parser = subparsers.add_parser( + "set-topic-phrase", + help="Set or clear the stored expansion phrase for one topic", + ) + set_topic_phrase_parser.add_argument("topic_slug", help="Topic slug to update") + 
set_topic_phrase_parser.add_argument( + "phrase", + nargs="?", + help="Expansion phrase to store; omit with --clear to remove it", + ) + set_topic_phrase_parser.add_argument( + "--clear", + action="store_true", + help="Clear the stored expansion phrase for this topic", + ) + + harvest_parser = subparsers.add_parser("harvest-oai", help="Harvest draft entries from an OAI-PMH repository") + harvest_parser.add_argument("base_url", help="OAI-PMH base URL") + harvest_parser.add_argument("--metadata-prefix", default="oai_dc", help="OAI-PMH metadataPrefix to harvest") + harvest_parser.add_argument("--set", dest="set_spec", help="Optional OAI-PMH set spec") + harvest_parser.add_argument("--from", dest="date_from", help="Optional OAI-PMH lower date bound") + harvest_parser.add_argument("--until", dest="date_until", help="Optional OAI-PMH upper date bound") + harvest_parser.add_argument("--limit", type=int, default=20, help="Maximum harvested records to ingest") + harvest_parser.add_argument("--status", default="draft", help="Initial review status") + + discover_parser = subparsers.add_parser("discover-oai", help="Inspect OAI-PMH repository identity and sets") + discover_parser.add_argument("base_url", help="OAI-PMH base URL") + + bootstrap_parser = subparsers.add_parser( + "bootstrap", + help="Start bibliography expansion from a seed BibTeX file, a topic phrase, or both", + ) + bootstrap_parser.add_argument("--seed-bib", help="Optional seed BibTeX file") + bootstrap_parser.add_argument("--topic", help="Optional topic phrase") + bootstrap_parser.add_argument("--topic-slug", help="Optional stored topic slug for this bootstrap topic") + bootstrap_parser.add_argument("--topic-name", help="Optional stored topic name for this bootstrap topic") + bootstrap_parser.add_argument( + "--store-topic-phrase", + help="Optional stored expansion phrase to save with the bootstrap topic; defaults to --topic when topic metadata is provided", + ) + bootstrap_parser.add_argument("--topic-limit", type=int, default=5, help="Maximum topic-search seed candidates") + bootstrap_parser.add_argument( + "--topic-commit-limit", + type=int, + help="Maximum ranked topic candidates to actually commit and expand", + ) + bootstrap_parser.add_argument( + "--no-expand", + action="store_true", + help="Do not run immediate graph expansion after seeding", + ) + bootstrap_parser.add_argument( + "--preview", + action="store_true", + help="Preview ranked bootstrap candidates without writing to the database or expanding", + ) + bootstrap_parser.add_argument("--status", default="draft", help="Initial review status for imported entries") + + batch_parser = subparsers.add_parser( + "bootstrap-batch", + help="Run multiple bootstrap jobs from a JSON specification file", + ) + batch_parser.add_argument("input", help="Path to batch JSON file") + + talkorigins_parser = subparsers.add_parser( + "scrape-talkorigins", + help="Scrape TalkOrigins into per-topic seed BibTeX files and a bootstrap-batch JSON file", + ) + talkorigins_parser.add_argument( + "output_dir", + help="Directory where seed BibTeX files, manifest, and batch JSON should be written", + ) + talkorigins_parser.add_argument( + "--base-url", + default="https://www.talkorigins.org/origins/biblio/", + help="TalkOrigins bibliography index URL", + ) + talkorigins_parser.add_argument("--limit-topics", type=int, help="Limit the number of scraped topic pages") + talkorigins_parser.add_argument( + "--limit-entries-per-topic", + type=int, + help="Limit the number of parsed references per topic page", 
+ ) + talkorigins_parser.add_argument( + "--resolve-seeds", + action="store_true", + help="Attempt metadata resolution on parsed seed entries before writing BibTeX", + ) + talkorigins_parser.add_argument( + "--ingest", + action="store_true", + help="Also ingest the generated seed BibTeX into the configured database", + ) + talkorigins_parser.add_argument( + "--no-expand", + action="store_true", + help="Write generated batch jobs with graph expansion disabled", + ) + talkorigins_parser.add_argument( + "--no-resume", + action="store_true", + help="Do not reuse saved TalkOrigins topic snapshots from a prior run", + ) + talkorigins_parser.add_argument( + "--topic-limit", + type=int, + default=5, + help="Default bootstrap topic-search limit to include in generated jobs", + ) + talkorigins_parser.add_argument( + "--topic-commit-limit", + type=int, + help="Default bootstrap topic commit limit to include in generated jobs", + ) + talkorigins_parser.add_argument("--status", default="draft", help="Review status for generated seed jobs") + + validate_talkorigins_parser = subparsers.add_parser( + "validate-talkorigins", + help="Validate a generated TalkOrigins manifest and report parse coverage and suspicious entries", + ) + validate_talkorigins_parser.add_argument("manifest", help="Path to talkorigins_manifest.json") + + suggest_talkorigins_parser = subparsers.add_parser( + "suggest-talkorigins-phrases", + help="Suggest stored topic expansion phrases from a TalkOrigins manifest", + ) + suggest_talkorigins_parser.add_argument("manifest", help="Path to talkorigins_manifest.json") + suggest_talkorigins_parser.add_argument("--topic", help="Optional topic slug to restrict suggestions") + suggest_talkorigins_parser.add_argument("--limit", type=int, help="Maximum topics to include") + suggest_talkorigins_parser.add_argument("--output", help="Write suggestions JSON to a file instead of stdout") + + apply_topic_phrases_parser = subparsers.add_parser( + "apply-topic-phrases", + help="Apply stored topic expansion phrases from a JSON suggestion or patch file", + ) + apply_topic_phrases_parser.add_argument("input", help="Path to JSON file containing topic phrase records") + + stage_topic_phrases_parser = subparsers.add_parser( + "stage-topic-phrases", + help="Stage topic phrase suggestions from JSON for later review in the database", + ) + stage_topic_phrases_parser.add_argument("input", help="Path to JSON file containing topic phrase records") + + review_topic_phrase_parser = subparsers.add_parser( + "review-topic-phrase", + help="Accept or reject one staged topic phrase suggestion", + ) + review_topic_phrase_parser.add_argument("topic_slug", help="Topic slug to review") + review_topic_phrase_parser.add_argument("status", choices=["accepted", "rejected"], help="Review decision") + review_topic_phrase_parser.add_argument( + "--notes", + help="Optional review notes to store with the decision", + ) + review_topic_phrase_parser.add_argument( + "--phrase", + help="Optional expansion phrase override to apply with the review decision", + ) + + duplicates_talkorigins_parser = subparsers.add_parser( + "duplicates-talkorigins", + help="Inspect duplicate clusters in a generated TalkOrigins manifest", + ) + duplicates_talkorigins_parser.add_argument("manifest", help="Path to talkorigins_manifest.json") + duplicates_talkorigins_parser.add_argument("--limit", type=int, default=20, help="Maximum clusters to show") + duplicates_talkorigins_parser.add_argument( + "--min-count", + type=int, + default=2, + help="Minimum cluster 
size to include", + ) + duplicates_talkorigins_parser.add_argument("--match", help="Optional text filter for duplicate clusters") + duplicates_talkorigins_parser.add_argument("--topic", help="Optional topic slug to restrict inspection") + duplicates_talkorigins_parser.add_argument( + "--preview", + action="store_true", + help="Include the canonical merged entry that ingest-talkorigins would choose", + ) + duplicates_talkorigins_parser.add_argument( + "--weak-only", + action="store_true", + help="Show only clusters whose canonical preview still looks weak", + ) + + ingest_talkorigins_parser = subparsers.add_parser( + "ingest-talkorigins", + help="Ingest a TalkOrigins manifest into the database with duplicate consolidation and topic membership", + ) + ingest_talkorigins_parser.add_argument("manifest", help="Path to talkorigins_manifest.json") + ingest_talkorigins_parser.add_argument("--status", default="draft", help="Review status for imported entries") + ingest_talkorigins_parser.add_argument( + "--no-dedupe", + action="store_true", + help="Disable duplicate consolidation and import each parsed entry separately", + ) + + enrich_talkorigins_parser = subparsers.add_parser( + "enrich-talkorigins", + help="Attempt metadata enrichment for weak TalkOrigins canonical entries", + ) + enrich_talkorigins_parser.add_argument("manifest", help="Path to talkorigins_manifest.json") + enrich_talkorigins_parser.add_argument("--limit", type=int, default=20, help="Maximum weak clusters to inspect") + enrich_talkorigins_parser.add_argument( + "--min-count", + type=int, + default=2, + help="Minimum duplicate-cluster size to include", + ) + enrich_talkorigins_parser.add_argument("--match", help="Optional text filter for weak canonical clusters") + enrich_talkorigins_parser.add_argument("--topic", help="Optional topic slug to restrict enrichment") + enrich_talkorigins_parser.add_argument( + "--apply", + action="store_true", + help="Write successful enrichments back into the configured database", + ) + enrich_talkorigins_parser.add_argument( + "--allow-unsafe-search-matches", + action="store_true", + help="Allow low-trust title-search resolver matches for bounded experiments on copied databases", + ) + enrich_talkorigins_parser.add_argument( + "--status", + default="enriched", + help="Review status to set when applying successful enrichments", + ) + + review_talkorigins_parser = subparsers.add_parser( + "review-talkorigins", + help="Export weak TalkOrigins clusters plus dry-run enrichment outcomes for manual review", + ) + review_talkorigins_parser.add_argument("manifest", help="Path to talkorigins_manifest.json") + review_talkorigins_parser.add_argument("--limit", type=int, default=20, help="Maximum weak clusters to export") + review_talkorigins_parser.add_argument( + "--min-count", + type=int, + default=2, + help="Minimum duplicate-cluster size to include", + ) + review_talkorigins_parser.add_argument("--match", help="Optional text filter for weak canonical clusters") + review_talkorigins_parser.add_argument("--topic", help="Optional topic slug to restrict review export") + review_talkorigins_parser.add_argument("--output", help="Write review export JSON to a file instead of stdout") + + apply_review_talkorigins_parser = subparsers.add_parser( + "apply-talkorigins-corrections", + help="Apply curated TalkOrigins review corrections to the consolidated database", + ) + apply_review_talkorigins_parser.add_argument("manifest", help="Path to talkorigins_manifest.json") + 
apply_review_talkorigins_parser.add_argument("corrections", help="Path to corrections JSON") + apply_review_talkorigins_parser.add_argument( + "--status", + default="reviewed", + help="Default review status to set on corrected entries", + ) + + topics_parser = subparsers.add_parser("topics", help="List known topics in the database") + topics_parser.add_argument("--limit", type=int, default=100, help="Maximum number of topics to list") + topics_parser.add_argument( + "--phrase-review-status", + choices=["unreviewed", "pending", "accepted", "rejected"], + help="Restrict topics to one stored phrase review state", + ) + + topic_entries_parser = subparsers.add_parser( + "topic-entries", + help="List entries assigned to one topic", + ) + topic_entries_parser.add_argument("topic_slug", help="Topic slug to inspect") + topic_entries_parser.add_argument("--limit", type=int, default=100, help="Maximum entries to list") + + export_topic_parser = subparsers.add_parser( + "export-topic", + help="Export one topic slice as BibTeX", + ) + export_topic_parser.add_argument("topic_slug", help="Topic slug to export") + export_topic_parser.add_argument("--output", help="Write BibTeX to a file instead of stdout") + return parser @@ -93,13 +427,17 @@ def main(argv: list[str] | None = None) -> int: if args.command == "ingest": return _run_ingest(store, Path(args.input), args.status, args.source_label) if args.command == "search": - return _run_search(store, args.query, args.limit) + return _run_search(store, args.query, args.limit, args.topic) if args.command == "show": - return _run_show(store, args.citation_key, args.limit, args.provenance) + return _run_show(store, args.citation_key, args.limit, args.provenance, args.conflicts) if args.command == "export": return _run_export(store, args.citation_keys, args.output) if args.command == "set-status": return _run_set_status(store, args.citation_key, args.review_status) + if args.command == "resolve-conflicts": + return _run_resolve_conflicts(store, args.citation_key, args.field_name, args.status) + if args.command == "apply-conflict": + return _run_apply_conflict(store, args.citation_key, args.field_name) if args.command == "extract": return _run_extract(Path(args.input), args.output) if args.command == "resolve": @@ -115,6 +453,122 @@ def main(argv: list[str] | None = None) -> int: ) if args.command == "expand": return _run_expand(store, args.citation_keys, args.source, args.relation, args.limit) + if args.command == "expand-topic": + return _run_expand_topic( + store, + args.topic_slug, + args.topic_phrase, + args.source, + args.relation, + args.seed_limit, + args.per_seed_limit, + args.min_relevance, + args.seed_keys, + args.preview, + ) + if args.command == "set-topic-phrase": + return _run_set_topic_phrase(store, args.topic_slug, args.phrase, args.clear) + if args.command == "harvest-oai": + return _run_harvest_oai( + store, + args.base_url, + args.metadata_prefix, + args.set_spec, + args.date_from, + args.date_until, + args.limit, + args.status, + ) + if args.command == "discover-oai": + return _run_discover_oai(args.base_url) + if args.command == "bootstrap": + return _run_bootstrap( + store, + args.seed_bib, + args.topic, + args.topic_limit, + args.topic_commit_limit, + not args.no_expand, + args.status, + args.preview, + args.topic_slug, + args.topic_name, + args.store_topic_phrase, + ) + if args.command == "bootstrap-batch": + return _run_bootstrap_batch(store, Path(args.input)) + if args.command == "scrape-talkorigins": + return _run_scrape_talkorigins( + 
store, + args.base_url, + Path(args.output_dir), + args.limit_topics, + args.limit_entries_per_topic, + args.resolve_seeds, + args.ingest, + not args.no_expand, + not args.no_resume, + args.topic_limit, + args.topic_commit_limit, + args.status, + ) + if args.command == "validate-talkorigins": + return _run_validate_talkorigins(Path(args.manifest)) + if args.command == "suggest-talkorigins-phrases": + return _run_suggest_talkorigins_phrases(Path(args.manifest), args.topic, args.limit, args.output) + if args.command == "apply-topic-phrases": + return _run_apply_topic_phrases(store, Path(args.input)) + if args.command == "stage-topic-phrases": + return _run_stage_topic_phrases(store, Path(args.input)) + if args.command == "review-topic-phrase": + return _run_review_topic_phrase(store, args.topic_slug, args.status, args.notes, args.phrase) + if args.command == "duplicates-talkorigins": + return _run_duplicates_talkorigins( + Path(args.manifest), + args.limit, + args.min_count, + args.match, + args.topic, + args.preview, + args.weak_only, + ) + if args.command == "ingest-talkorigins": + return _run_ingest_talkorigins(store, Path(args.manifest), args.status, not args.no_dedupe) + if args.command == "enrich-talkorigins": + return _run_enrich_talkorigins( + store, + Path(args.manifest), + args.limit, + args.min_count, + args.match, + args.topic, + args.apply, + args.status, + args.allow_unsafe_search_matches, + ) + if args.command == "review-talkorigins": + return _run_review_talkorigins( + store, + Path(args.manifest), + args.limit, + args.min_count, + args.match, + args.topic, + args.output, + ) + if args.command == "apply-talkorigins-corrections": + return _run_apply_talkorigins_corrections( + store, + Path(args.manifest), + Path(args.corrections), + args.status, + ) + if args.command == "topics": + return _run_topics(store, args.limit, args.phrase_review_status) + if args.command == "topic-entries": + return _run_topic_entries(store, args.topic_slug, args.limit) + if args.command == "export-topic": + return _run_export_topic(store, args.topic_slug, args.output) finally: store.close() @@ -139,14 +593,20 @@ def _run_ingest( return 0 -def _run_search(store: BibliographyStore, query: str, limit: int) -> int: - for row in store.search_text(query, limit=limit): +def _run_search(store: BibliographyStore, query: str, limit: int, topic_slug: str | None) -> int: + for row in store.search_text(query, limit=limit, topic_slug=topic_slug): score = row.get("score", 0.0) print(f"{row['citation_key']}\t{row.get('year') or ''}\t{score:.3f}\t{row.get('title') or ''}") return 0 -def _run_show(store: BibliographyStore, citation_key: str | None, limit: int, provenance: bool) -> int: +def _run_show( + store: BibliographyStore, + citation_key: str | None, + limit: int, + provenance: bool, + conflicts: bool, +) -> int: if citation_key: entry = store.get_entry(citation_key) if entry is None: @@ -154,6 +614,8 @@ def _run_show(store: BibliographyStore, citation_key: str | None, limit: int, pr return 1 if provenance: entry["field_provenance"] = store.get_field_provenance(citation_key) + if conflicts: + entry["field_conflicts"] = store.get_field_conflicts(citation_key) print(json.dumps(entry, indent=2, sort_keys=True)) return 0 @@ -179,6 +641,23 @@ def _run_set_status(store: BibliographyStore, citation_key: str, review_status: return 0 +def _run_resolve_conflicts(store: BibliographyStore, citation_key: str, field_name: str, status: str) -> int: + count = store.set_conflict_status(citation_key, field_name, status) + if 
count == 0: + print(f"No open conflicts updated for {citation_key}:{field_name}", file=sys.stderr) + return 1 + print(f"{citation_key}\t{field_name}\t{status}\t{count}") + return 0 + + +def _run_apply_conflict(store: BibliographyStore, citation_key: str, field_name: str) -> int: + if not store.apply_conflict_value(citation_key, field_name): + print(f"No open conflict applied for {citation_key}:{field_name}", file=sys.stderr) + return 1 + print(f"{citation_key}\t{field_name}\tapplied") + return 0 + + def _run_extract(input_path: Path, output: str | None) -> int: text = input_path.read_text(encoding="utf-8") entries = extract_references(text) @@ -211,7 +690,7 @@ def _run_resolve(store: BibliographyStore, citation_keys: list[str]) -> int: print(f"No resolver match: {citation_key}", file=sys.stderr) exit_code = 1 continue - merged = merge_entries(current_entry, resolution.entry) + merged, conflicts = merge_entries_with_conflicts(current_entry, resolution.entry) store.replace_entry( citation_key, merged, @@ -219,6 +698,13 @@ def _run_resolve(store: BibliographyStore, citation_keys: list[str]) -> int: source_label=resolution.source_label, review_status="enriched", ) + if conflicts: + store.record_conflicts( + citation_key, + conflicts, + source_type=resolution.source_type, + source_label=resolution.source_label, + ) print(f"{citation_key}\t{resolution.source_label}") return exit_code @@ -266,3 +752,448 @@ def _run_expand( all_results.extend(expand_fn(citation_key)) print(json.dumps([asdict(result) for result in all_results], indent=2)) return 0 + + +def _run_expand_topic( + store: BibliographyStore, + topic_slug: str, + topic_phrase: str | None, + source: str, + relation: str, + seed_limit: int, + per_seed_limit: int, + min_relevance: float, + seed_keys: list[str] | None, + preview: bool, +) -> int: + expander = TopicExpander() + stored_topic = store.get_topic(topic_slug) + effective_phrase = topic_phrase + if effective_phrase is None and stored_topic is not None: + effective_phrase = str(stored_topic.get("expansion_phrase") or "") or None + results = expander.expand_topic( + store, + topic_slug, + topic_phrase=effective_phrase, + source=source, + relation_type=relation, + seed_limit=seed_limit, + per_seed_limit=per_seed_limit, + min_relevance=min_relevance, + seed_keys=seed_keys, + preview_only=preview, + ) + print(json.dumps([asdict(result) for result in results], indent=2)) + return 0 + + +def _run_set_topic_phrase( + store: BibliographyStore, + topic_slug: str, + phrase: str | None, + clear: bool, +) -> int: + if clear: + phrase = None + elif phrase is None: + print("set-topic-phrase requires a phrase or --clear", file=sys.stderr) + return 1 + if not store.set_topic_expansion_phrase(topic_slug, phrase): + print(f"Topic not found: {topic_slug}", file=sys.stderr) + return 1 + payload = store.get_topic(topic_slug) + print(json.dumps(payload, indent=2)) + return 0 + + +def _run_harvest_oai( + store: BibliographyStore, + base_url: str, + metadata_prefix: str, + set_spec: str | None, + date_from: str | None, + date_until: str | None, + limit: int, + review_status: str, +) -> int: + harvester = OaiPmhHarvester() + harvested = harvester.list_records( + base_url, + metadata_prefix=metadata_prefix, + set_spec=set_spec, + date_from=date_from, + date_until=date_until, + limit=limit, + ) + for result in harvested: + store.upsert_entry( + result.entry, + raw_bibtex=render_bibtex([result.entry]), + source_type="harvest", + source_label=f"oai:{result.base_url}", + review_status=review_status, + ) + 
print(result.entry.citation_key) + store.connection.commit() + return 0 + + +def _run_discover_oai(base_url: str) -> int: + harvester = OaiPmhHarvester() + payload = { + "identify": harvester.identify(base_url), + "metadata_formats": [asdict(result) for result in harvester.list_metadata_formats(base_url)], + "sets": [asdict(result) for result in harvester.list_sets(base_url)], + } + print(json.dumps(payload, indent=2, sort_keys=True)) + return 0 + + +def _run_bootstrap( + store: BibliographyStore, + seed_bib: str | None, + topic: str | None, + topic_limit: int, + topic_commit_limit: int | None, + expand: bool, + review_status: str, + preview: bool, + topic_slug: str | None, + topic_name: str | None, + stored_topic_phrase: str | None, +) -> int: + if not seed_bib and not topic: + print("bootstrap requires --seed-bib, --topic, or both", file=sys.stderr) + return 1 + + seed_bibtex = Path(seed_bib).read_text(encoding="utf-8") if seed_bib else None + bootstrapper = Bootstrapper() + results = bootstrapper.bootstrap( + store, + seed_bibtex=seed_bibtex, + topic=topic, + topic_limit=topic_limit, + topic_commit_limit=topic_commit_limit, + expand=expand, + review_status=review_status, + preview_only=preview, + topic_slug=topic_slug, + topic_name=topic_name, + topic_phrase=stored_topic_phrase, + ) + print(json.dumps([asdict(result) for result in results], indent=2)) + return 0 + + +def _run_bootstrap_batch(store: BibliographyStore, input_path: Path) -> int: + jobs = load_batch_jobs(input_path) + runner = BatchBootstrapRunner() + results = runner.run(store, jobs) + payload = [] + for job_result in results: + payload.append( + { + "job_name": job_result.job_name, + "result_count": job_result.result_count, + "results": [asdict(item) for item in job_result.results], + } + ) + print(json.dumps(payload, indent=2)) + return 0 + + +def _run_scrape_talkorigins( + store: BibliographyStore, + base_url: str, + output_dir: Path, + limit_topics: int | None, + limit_entries_per_topic: int | None, + resolve_seeds: bool, + ingest: bool, + expand: bool, + resume: bool, + topic_limit: int, + topic_commit_limit: int | None, + review_status: str, +) -> int: + scraper = TalkOriginsScraper() + export = scraper.scrape_to_directory( + base_url=base_url, + output_dir=output_dir, + limit_topics=limit_topics, + limit_entries_per_topic=limit_entries_per_topic, + resolve_seeds=resolve_seeds, + ingest_store=store if ingest else None, + review_status=review_status, + expand=expand, + resume=resume, + topic_limit=topic_limit, + topic_commit_limit=topic_commit_limit, + ) + print(json.dumps(asdict(export), indent=2)) + return 0 + + +def _run_validate_talkorigins(manifest_path: Path) -> int: + scraper = TalkOriginsScraper() + report = scraper.validate_export(manifest_path) + print(json.dumps(asdict(report), indent=2)) + return 0 + + +def _run_suggest_talkorigins_phrases( + manifest_path: Path, + topic_slug: str | None, + limit: int | None, + output: str | None, +) -> int: + scraper = TalkOriginsScraper() + suggestions = scraper.suggest_topic_phrases(manifest_path, limit=limit, topic_slug=topic_slug) + payload = json.dumps([asdict(item) for item in suggestions], indent=2) + if output: + Path(output).write_text(payload + "\n", encoding="utf-8") + else: + print(payload) + return 0 + + +def _run_apply_topic_phrases(store: BibliographyStore, input_path: Path) -> int: + payload = json.loads(input_path.read_text(encoding="utf-8")) + if isinstance(payload, dict): + items = payload.get("topics", []) + else: + items = payload + if not 
+    if not isinstance(items, list):
+        print("Topic phrase JSON must be a list or an object with a 'topics' list", file=sys.stderr)
+        return 1
+
+    results: list[dict[str, object]] = []
+    exit_code = 0
+    for item in items:
+        if not isinstance(item, dict):
+            continue
+        slug = str(item.get("slug") or "")
+        phrase = item.get("suggested_phrase", item.get("phrase"))
+        if not slug:
+            continue
+        if phrase is not None:
+            phrase = str(phrase)
+        applied = store.set_topic_expansion_phrase(slug, phrase)
+        if not applied:
+            exit_code = 1
+        results.append(
+            {
+                "slug": slug,
+                "expansion_phrase": phrase,
+                "applied": applied,
+            }
+        )
+    print(json.dumps(results, indent=2))
+    return exit_code
+
+
+def _run_stage_topic_phrases(store: BibliographyStore, input_path: Path) -> int:
+    payload = json.loads(input_path.read_text(encoding="utf-8"))
+    if isinstance(payload, dict):
+        items = payload.get("topics", payload.get("items", []))
+    else:
+        items = payload
+    if not isinstance(items, list):
+        print("Topic phrase JSON must be a list or an object with a 'topics' or 'items' list", file=sys.stderr)
+        return 1
+
+    results: list[dict[str, object]] = []
+    exit_code = 0
+    for item in items:
+        if not isinstance(item, dict):
+            continue
+        slug = str(item.get("slug") or "")
+        phrase = item.get("suggested_phrase", item.get("phrase"))
+        notes = item.get("review_notes")
+        if not slug:
+            continue
+        if phrase is not None:
+            phrase = str(phrase)
+        if notes is not None:
+            notes = str(notes)
+        staged = store.stage_topic_phrase_suggestion(
+            slug,
+            suggested_phrase=phrase,
+            review_status="pending",
+            review_notes=notes,
+        )
+        if not staged:
+            exit_code = 1
+        results.append(
+            {
+                "slug": slug,
+                "suggested_phrase": phrase,
+                "phrase_review_status": "pending",
+                "staged": staged,
+            }
+        )
+    print(json.dumps(results, indent=2))
+    return exit_code
+
+
+def _run_review_topic_phrase(
+    store: BibliographyStore,
+    topic_slug: str,
+    status: str,
+    notes: str | None,
+    phrase: str | None,
+) -> int:
+    if not store.review_topic_phrase_suggestion(
+        topic_slug,
+        review_status=status,
+        review_notes=notes,
+        applied_phrase=phrase,
+    ):
+        print(f"Topic not found: {topic_slug}", file=sys.stderr)
+        return 1
+    payload = store.get_topic(topic_slug)
+    print(json.dumps(payload, indent=2))
+    return 0
+
+
+def _run_duplicates_talkorigins(
+    manifest_path: Path,
+    limit: int,
+    min_count: int,
+    match: str | None,
+    topic_slug: str | None,
+    preview: bool,
+    weak_only: bool,
+) -> int:
+    scraper = TalkOriginsScraper()
+    clusters = scraper.inspect_duplicate_clusters(
+        manifest_path,
+        limit=limit,
+        min_count=min_count,
+        match=match,
+        topic_slug=topic_slug,
+        preview_canonical=preview,
+        weak_only=weak_only,
+    )
+    print(json.dumps([asdict(cluster) for cluster in clusters], indent=2))
+    return 0
+
+
+def _run_ingest_talkorigins(
+    store: BibliographyStore,
+    manifest_path: Path,
+    review_status: str,
+    dedupe: bool,
+) -> int:
+    scraper = TalkOriginsScraper()
+    report = scraper.ingest_export(
+        manifest_path,
+        store,
+        review_status=review_status,
+        dedupe=dedupe,
+    )
+    print(json.dumps(asdict(report), indent=2))
+    return 0
+
+
+def _run_enrich_talkorigins(
+    store: BibliographyStore,
+    manifest_path: Path,
+    limit: int,
+    min_count: int,
+    match: str | None,
+    topic_slug: str | None,
+    apply: bool,
+    review_status: str,
+    allow_unsafe_matches: bool,
+) -> int:
+    scraper = TalkOriginsScraper()
+    results = scraper.enrich_weak_canonicals(
+        manifest_path,
+        store,
+        limit=limit,
+        min_count=min_count,
+        match=match,
+        topic_slug=topic_slug,
+        apply=apply,
+        review_status=review_status,
+        allow_unsafe_matches=allow_unsafe_matches,
+    )
+    print(json.dumps([asdict(result) for result in results], indent=2))
+    return 0
+
+
+def _run_review_talkorigins(
+    store: BibliographyStore,
+    manifest_path: Path,
+    limit: int,
+    min_count: int,
+    match: str | None,
+    topic_slug: str | None,
+    output: str | None,
+) -> int:
+    scraper = TalkOriginsScraper()
+    review = scraper.build_review_export(
+        manifest_path,
+        store,
+        limit=limit,
+        min_count=min_count,
+        match=match,
+        topic_slug=topic_slug,
+    )
+    payload = json.dumps(asdict(review), indent=2)
+    if output:
+        Path(output).write_text(payload + "\n", encoding="utf-8")
+    else:
+        print(payload)
+    return 0
+
+
+def _run_apply_talkorigins_corrections(
+    store: BibliographyStore,
+    manifest_path: Path,
+    corrections_path: Path,
+    review_status: str,
+) -> int:
+    scraper = TalkOriginsScraper()
+    results = scraper.apply_review_corrections(
+        manifest_path,
+        corrections_path,
+        store,
+        default_review_status=review_status,
+    )
+    print(json.dumps([asdict(result) for result in results], indent=2))
+    return 0
+
+
+def _run_topics(store: BibliographyStore, limit: int, phrase_review_status: str | None) -> int:
+    print(json.dumps(store.list_topics(limit=limit, phrase_review_status=phrase_review_status), indent=2))
+    return 0
+
+
+def _run_topic_entries(store: BibliographyStore, topic_slug: str, limit: int) -> int:
+    topic = store.get_topic(topic_slug)
+    if topic is None:
+        print(f"Topic not found: {topic_slug}", file=sys.stderr)
+        return 1
+    payload = {
+        "topic": topic,
+        "entries": store.list_topic_entries(topic_slug, limit=limit),
+    }
+    print(json.dumps(payload, indent=2))
+    return 0
+
+
+def _run_export_topic(store: BibliographyStore, topic_slug: str, output: str | None) -> int:
+    topic = store.get_topic(topic_slug)
+    if topic is None:
+        print(f"Topic not found: {topic_slug}", file=sys.stderr)
+        return 1
+    citation_keys = [row["citation_key"] for row in store.list_topic_entries(topic_slug, limit=100000)]
+    rendered = store.export_bibtex(citation_keys)
+    if output:
+        Path(output).write_text(rendered + ("\n" if rendered else ""), encoding="utf-8")
+    else:
+        if rendered:
+            print(rendered)
+    return 0
diff --git a/src/citegeist/expand.py b/src/citegeist/expand.py
index a9079a1..b93943d 100644
--- a/src/citegeist/expand.py
+++ b/src/citegeist/expand.py
@@ -18,6 +18,20 @@ class ExpansionResult:
     source_label: str
 
 
+@dataclass(slots=True)
+class TopicExpansionResult:
+    topic_slug: str
+    source_citation_key: str
+    discovered_citation_key: str
+    discovered_title: str
+    created_entry: bool
+    relation_type: str
+    source_label: str
+    relevance_score: float
+    meets_relevance_threshold: bool
+    assigned_to_topic: bool
+
+
 class CrossrefExpander:
     def __init__(self, resolver: MetadataResolver | None = None) -> None:
         self.resolver = resolver or MetadataResolver()
@@ -163,6 +177,192 @@ class OpenAlexExpander:
         return _normalize_openalex_id(results[0].get("id", ""))
 
 
+class TopicExpander:
+    def __init__(
+        self,
+        crossref_expander: CrossrefExpander | None = None,
+        openalex_expander: OpenAlexExpander | None = None,
+    ) -> None:
+        self.crossref_expander = crossref_expander or CrossrefExpander()
+        self.openalex_expander = openalex_expander or OpenAlexExpander()
+
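+    # Illustrative call (slug hypothetical): preview OpenAlex-based
+    # expansion for one topic without writing topic assignments:
+    #
+    #     TopicExpander().expand_topic(
+    #         store, "human-origins", source="openalex",
+    #         relation_type="cites", preview_only=True,
+    #     )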
+    def expand_topic(
+        self,
+        store: BibliographyStore,
+        topic_slug: str,
+        topic_phrase: str | None = None,
+        source: str = "openalex",
+        relation_type: str = "cites",
+        seed_limit: int = 25,
+        per_seed_limit: int = 25,
+        min_relevance: float = 0.2,
+        seed_keys: list[str] | None = None,
+        preview_only: bool = False,
+    ) -> list[TopicExpansionResult]:
+        topic = store.get_topic(topic_slug)
+        if topic is None:
+            return []
+
+        phrase = (topic_phrase or str(topic.get("name") or topic_slug)).strip()
+        seeds = store.list_topic_entries(topic_slug, limit=seed_limit)
+        if seed_keys:
+            allowed = set(seed_keys)
+            seeds = [seed for seed in seeds if str(seed["citation_key"]) in allowed]
+        results: list[TopicExpansionResult] = []
+
+        for seed in seeds:
+            seed_key = str(seed["citation_key"])
+            if preview_only:
+                discovered_rows = self._preview_discoveries(
+                    store,
+                    seed_key,
+                    source=source,
+                    relation_type=relation_type,
+                    limit=per_seed_limit,
+                )
+            else:
+                discovered_rows = self._materialized_discoveries(
+                    store,
+                    seed_key,
+                    source=source,
+                    relation_type=relation_type,
+                    limit=per_seed_limit,
+                )
+
+            for row, target_entry in discovered_rows:
+                score = _topic_relevance_score(phrase, target_entry)
+                meets_threshold = _meets_topic_assignment_threshold(
+                    phrase,
+                    target_entry,
+                    min_relevance=min_relevance,
+                    relevance_score=score,
+                )
+                assigned = False
+                if not preview_only and meets_threshold and target_entry is not None:
+                    assigned = store.add_entry_topic(
+                        row.discovered_citation_key,
+                        topic_slug=topic_slug,
+                        topic_name=str(topic.get("name") or topic_slug),
+                        source_type="topic_expand",
+                        source_url=str(topic.get("source_url") or ""),
+                        source_label=f"{source}:{relation_type}:{seed_key}",
+                        confidence=score,
+                    )
+                results.append(
+                    TopicExpansionResult(
+                        topic_slug=topic_slug,
+                        source_citation_key=row.source_citation_key,
+                        discovered_citation_key=row.discovered_citation_key,
+                        # target_entry may be None for materialized rows.
+                        discovered_title=str((target_entry or {}).get("title") or ""),
+                        created_entry=row.created_entry,
+                        relation_type=row.relation_type,
+                        source_label=row.source_label,
+                        relevance_score=score,
+                        meets_relevance_threshold=meets_threshold,
+                        assigned_to_topic=assigned,
+                    )
+                )
+        store.connection.commit()
+        return results
+
+    def _materialized_discoveries(
+        self,
+        store: BibliographyStore,
+        citation_key: str,
+        source: str,
+        relation_type: str,
+        limit: int,
+    ) -> list[tuple[ExpansionResult, dict[str, object] | None]]:
+        if source == "crossref":
+            expansion_rows = self.crossref_expander.expand_entry_references(store, citation_key)
+        else:
+            expansion_rows = self.openalex_expander.expand_entry(
+                store,
+                citation_key,
+                relation_type=relation_type,
+                limit=limit,
+            )
+        return [(row, store.get_entry(row.discovered_citation_key)) for row in expansion_rows]
+
+    def _preview_discoveries(
+        self,
+        store: BibliographyStore,
+        citation_key: str,
+        source: str,
+        relation_type: str,
+        limit: int,
+    ) -> list[tuple[ExpansionResult, dict[str, object]]]:
+        if source == "crossref":
+            return self._preview_crossref_discoveries(store, citation_key, limit)
+        return self._preview_openalex_discoveries(store, citation_key, relation_type, limit)
+
+    def _preview_crossref_discoveries(
+        self,
+        store: BibliographyStore,
+        citation_key: str,
+        limit: int,
+    ) -> list[tuple[ExpansionResult, dict[str, object]]]:
+        entry = store.get_entry(citation_key)
+        if entry is None or not entry.get("doi"):
+            return []
+        doi = str(entry["doi"])
+        payload = self.crossref_expander.resolver.source_client.get_json(
+            f"https://api.crossref.org/works/{doi}?mailto=welsberr@gmail.com"
+        )
+        references = payload.get("message", {}).get("reference", [])[:limit]
+        rows: list[tuple[ExpansionResult, dict[str, object]]] = []
+        for index, reference in enumerate(references, start=1):
+            discovered = _crossref_reference_to_entry(reference, citation_key, index)
+            rows.append(
+                (
+                    ExpansionResult(
+                        source_citation_key=citation_key,
+                        discovered_citation_key=discovered.citation_key,
+                        created_entry=store.get_entry(discovered.citation_key) is None,
+                        relation_type="cites",
+                        source_label=f"crossref:references:{doi}",
+                    ),
+                    dict(discovered.fields),
+                )
+            )
+        return rows
+
+    def _preview_openalex_discoveries(
+        self,
+        store: BibliographyStore,
+        citation_key: str,
+        relation_type: str,
+        limit: int,
+    ) -> list[tuple[ExpansionResult, dict[str, object]]]:
+        entry = store.get_entry(citation_key)
+        if entry is None:
+            return []
+        openalex_id = entry.get("openalex") or self.openalex_expander._lookup_openalex_id(entry)
+        if not openalex_id:
+            return []
+        filter_name = "cited_by" if relation_type == "cites" else "cites"
+        query = urlencode({"filter": f"{filter_name}:{openalex_id}", "per-page": limit})
+        payload = self.openalex_expander.resolver.source_client.get_json(f"https://api.openalex.org/works?{query}")
+        works = payload.get("results", [])
+        rows: list[tuple[ExpansionResult, dict[str, object]]] = []
+        for work in works:
+            discovered = _openalex_work_to_entry(work)
+            source_key = citation_key if relation_type == "cites" else discovered.citation_key
+            rows.append(
+                (
+                    ExpansionResult(
+                        source_citation_key=source_key,
+                        discovered_citation_key=discovered.citation_key,
+                        created_entry=store.get_entry(discovered.citation_key) is None,
+                        relation_type=relation_type,
+                        source_label=f"openalex:{relation_type}:{openalex_id}",
+                    ),
+                    dict(discovered.fields),
+                )
+            )
+        return rows
+
+
 def _crossref_reference_to_entry(reference: dict, source_citation_key: str, ordinal: int) -> BibEntry:
     title = (
         reference.get("article-title")
@@ -211,6 +411,115 @@ def _normalize_text(value: str) -> str:
     return " ".join(value.split())
 
 
+def _topic_relevance_score(topic_phrase: str, entry: dict[str, object] | None) -> float:
+    if entry is None:
+        return 0.0
+    topic_terms = _expanded_keyword_terms(topic_phrase)
+    if not topic_terms:
+        return 0.0
+    title_terms = _expanded_keyword_terms(str(entry.get("title") or ""))
+    abstract_terms = _expanded_keyword_terms(str(entry.get("abstract") or ""))
+    keyword_terms = _expanded_keyword_terms(str(entry.get("keywords") or ""))
+    venue_terms = _expanded_keyword_terms(" ".join(str(entry.get(field) or "") for field in ("journal", "booktitle")))
+
+    score = 0.0
+    score += 0.6 * _term_overlap_ratio(topic_terms, title_terms)
+    score += 0.25 * _term_overlap_ratio(topic_terms, abstract_terms)
+    score += 0.1 * _term_overlap_ratio(topic_terms, keyword_terms)
+    score += 0.05 * _term_overlap_ratio(topic_terms, venue_terms)
+
+    phrase = _normalize_text(topic_phrase.casefold())
+    title = _normalize_text(str(entry.get("title") or "").casefold())
+    if phrase and title and phrase in title:
+        score = max(score, 0.75)
+
+    return min(score, 1.0)
+
+
+def _meets_topic_assignment_threshold(
+    topic_phrase: str,
+    entry: dict[str, object] | None,
+    min_relevance: float,
+    relevance_score: float | None = None,
+) -> bool:
+    if entry is None:
+        return False
+    score = relevance_score if relevance_score is not None else _topic_relevance_score(topic_phrase, entry)
+    if score < min_relevance:
+        return False
+    title_anchor = _title_topic_anchor_ratio(topic_phrase, str(entry.get("title") or ""))
+    return title_anchor >= 0.2
+
+
+def _keyword_terms(text: str) -> set[str]:
+    return {
+        _normalize_keyword(term)
+        for term in re.findall(r"[A-Za-z0-9]+", text.casefold())
+        if len(term) >= 4
+    }
+
+
+def _expanded_keyword_terms(text: str) -> set[str]:
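+    # Worked example: _keyword_terms("human evolution") gives {"human",
+    # "evolution"}; the related-term groups below then contribute
+    # {"hominid", "hominin", "homo"} and {"evolutionary", "phylogeny",
+    # "phylogen", "ancestor", "ancestral"}, so near-synonym titles can
+    # still overlap with a topic phrase.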
+    terms = _keyword_terms(text)
+    expanded = set(terms)
+    for term in terms:
+        expanded.update(_related_topic_terms(term))
+    return expanded
+
+
+def _title_topic_anchor_ratio(topic_phrase: str, title: str) -> float:
+    normalized_phrase = _normalize_text(topic_phrase.casefold())
+    normalized_title = _normalize_text(title.casefold())
+    if normalized_phrase and normalized_title and normalized_phrase in normalized_title:
+        return 1.0
+
+    topic_terms = _core_topic_terms(topic_phrase)
+    title_terms = _keyword_terms(title)
+    if not topic_terms or not title_terms:
+        return 0.0
+    overlap = topic_terms & title_terms
+    if overlap:
+        return max(0.25, len(overlap) / len(topic_terms))
+    return 0.0
+
+
+def _core_topic_terms(topic_phrase: str) -> set[str]:
+    generic_terms = {"evolution", "origin", "origins", "science", "study", "studies"}
+    return {term for term in _keyword_terms(topic_phrase) if term not in generic_terms}
+
+
+def _term_overlap_ratio(topic_terms: set[str], candidate_terms: set[str]) -> float:
+    if not topic_terms or not candidate_terms:
+        return 0.0
+    return len(topic_terms & candidate_terms) / len(topic_terms)
+
+
+def _normalize_keyword(term: str) -> str:
+    normalized = term.casefold()
+    suffixes = (
+        "isms", "ists", "ation", "ment", "ings", "ness",
+        "ism", "ist", "ing", "ers", "er", "ies", "ied", "ed", "es", "s",
+    )
+    for suffix in suffixes:
+        if len(normalized) > len(suffix) + 3 and normalized.endswith(suffix):
+            if suffix in {"ies", "ied"}:
+                return normalized[: -len(suffix)] + "y"
+            return normalized[: -len(suffix)]
+    return normalized
+
+
+def _related_topic_terms(term: str) -> set[str]:
+    related_groups = (
+        {"human", "hominid", "hominin", "homo"},
+        {"chimpanzee", "chimp", "pan", "ape", "apes", "primate"},
+        {"primate", "ape", "apes", "hominid", "hominin"},
+        {"evolution", "evolutionary", "phylogeny", "phylogen", "ancestor", "ancestral"},
+        {"origin", "origins", "abiogenesis", "prebiotic"},
+        {"morphometry", "morphology", "cranial", "dental", "skeletal", "body"},
+        {"paleolithic", "paleoanthropology", "paleoanthropological", "pleistocene", "fossil"},
+    )
+    for group in related_groups:
+        if term in group:
+            return group - {term}
+    return set()
+
+
 def _openalex_work_to_entry(work: dict) -> BibEntry:
     title = _normalize_text(work.get("display_name", "") or "Untitled work")
     year = str(work.get("publication_year") or "")
diff --git a/src/citegeist/harvest.py b/src/citegeist/harvest.py
new file mode 100644
index 0000000..1a85662
--- /dev/null
+++ b/src/citegeist/harvest.py
@@ -0,0 +1,317 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from urllib.parse import urlencode
+import xml.etree.ElementTree as ET
+
+from .bibtex import BibEntry
+from .sources import SourceClient
+
+NS = {
+    "oai": "http://www.openarchives.org/OAI/2.0/",
+    "oai_dc": "http://www.openarchives.org/OAI/2.0/oai_dc/",
+    "dc": "http://purl.org/dc/elements/1.1/",
+    "mods": "http://www.loc.gov/mods/v3",
+}
+
+
+@dataclass(slots=True)
+class HarvestResult:
+    base_url: str
+    identifier: str
+    entry: BibEntry
+
+
+@dataclass(slots=True)
+class OaiSet:
+    set_spec: str
+    set_name: str
+    set_description: str = ""
+
+
+@dataclass(slots=True)
+class OaiMetadataFormat:
+    metadata_prefix: str
+    schema: str
+    metadata_namespace: str
+
+
+class OaiPmhHarvester:
+    def __init__(self, source_client: SourceClient | None = None) -> None:
+        self.source_client = source_client or SourceClient()
+
+    def identify(self, base_url: str) -> dict[str, str]:
+        root = self.source_client.get_xml(f"{base_url}?{urlencode({'verb': 'Identify'})}")
+        identify = root.find(".//oai:Identify", NS)
+        if identify is None:
+            return {}
+        payload: dict[str, str] = {}
+        for field_name in (
+            "repositoryName",
+            "baseURL",
+            "protocolVersion",
+            "adminEmail",
+            "earliestDatestamp",
+            "deletedRecord",
+            "granularity",
+        ):
+            payload[field_name] = _node_text(identify.find(f"oai:{field_name}", NS))
+        return payload
+
+    def list_sets(self, base_url: str) -> list[OaiSet]:
+        root = self.source_client.get_xml(f"{base_url}?{urlencode({'verb': 'ListSets'})}")
+        sets = root.findall(".//oai:set", NS)
+        results: list[OaiSet] = []
+        for node in sets:
+            results.append(
+                OaiSet(
+                    set_spec=_node_text(node.find("oai:setSpec", NS)),
+                    set_name=_node_text(node.find("oai:setName", NS)),
+                    set_description=_flatten_set_description(node.find("oai:setDescription", NS)),
+                )
+            )
+        return results
+
+    def list_metadata_formats(self, base_url: str, identifier: str | None = None) -> list[OaiMetadataFormat]:
+        params = {"verb": "ListMetadataFormats"}
+        if identifier:
+            params["identifier"] = identifier
+        root = self.source_client.get_xml(f"{base_url}?{urlencode(params)}")
+        formats = root.findall(".//oai:metadataFormat", NS)
+        results: list[OaiMetadataFormat] = []
+        for node in formats:
+            results.append(
+                OaiMetadataFormat(
+                    metadata_prefix=_node_text(node.find("oai:metadataPrefix", NS)),
+                    schema=_node_text(node.find("oai:schema", NS)),
+                    metadata_namespace=_node_text(node.find("oai:metadataNamespace", NS)),
+                )
+            )
+        return results
+
+    def list_records(
+        self,
+        base_url: str,
+        metadata_prefix: str = "oai_dc",
+        set_spec: str | None = None,
+        date_from: str | None = None,
+        date_until: str | None = None,
+        limit: int | None = None,
+    ) -> list[HarvestResult]:
+        results: list[HarvestResult] = []
+        params = {"verb": "ListRecords", "metadataPrefix": metadata_prefix}
+        if set_spec:
+            params["set"] = set_spec
+        if date_from:
+            params["from"] = date_from
+        if date_until:
+            params["until"] = date_until
+
+        ordinal = 1
+        next_url = f"{base_url}?{urlencode(params)}"
+        while next_url:
+            root = self.source_client.get_xml(next_url)
+            records = root.findall(".//oai:record", NS)
+            for record in records:
+                parsed = self._record_to_result(base_url, record, ordinal)
+                ordinal += 1
+                if parsed is not None:
+                    results.append(parsed)
+                    if limit is not None and len(results) >= limit:
+                        return results
+            next_url = self._resumption_url(base_url, root)
+        return results
+
+    def get_record(
+        self,
+        base_url: str,
+        identifier: str,
+        metadata_prefix: str = "oai_dc",
+    ) -> HarvestResult | None:
+        params = {
+            "verb": "GetRecord",
+            "metadataPrefix": metadata_prefix,
+            "identifier": identifier,
+        }
+        root = self.source_client.get_xml(f"{base_url}?{urlencode(params)}")
+        record = root.find(".//oai:record", NS)
+        if record is None:
+            return None
+        return self._record_to_result(base_url, record, 1)
+
+    def _record_to_result(self, base_url: str, record: ET.Element, ordinal: int) -> HarvestResult | None:
+        identifier = _node_text(record.find("./oai:header/oai:identifier", NS))
+        metadata_node = record.find("./oai:metadata/*", NS)
+        if metadata_node is None or not identifier:
+            return None
+
+        entry = _metadata_node_to_entry(base_url, identifier, metadata_node, ordinal)
+        return HarvestResult(base_url=base_url, identifier=identifier, entry=entry)
+
+    def _resumption_url(self, base_url: str, root: ET.Element) -> str | None:
+        token = _node_text(root.find(".//oai:resumptionToken", NS))
+        if not token:
+            return None
+        return f"{base_url}?{urlencode({'verb': 'ListRecords', 'resumptionToken': token})}"
+
+
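+# Illustrative harvest (endpoint hypothetical): pull a few Dublin Core
+# records and inspect the draft entries:
+#
+#     harvester = OaiPmhHarvester()
+#     for result in harvester.list_records(
+#         "https://repository.example.edu/oai", metadata_prefix="oai_dc", limit=5
+#     ):
+#         print(result.identifier, result.entry.citation_key)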
+def _oai_dc_to_entry(base_url: str, identifier: str, metadata: ET.Element, ordinal: int) -> BibEntry:
+    titles = _all_text(metadata.findall("dc:title", NS))
+    creators = _all_text(metadata.findall("dc:creator", NS))
+    dates = _all_text(metadata.findall("dc:date", NS))
+    descriptions = _all_text(metadata.findall("dc:description", NS))
+    identifiers = _all_text(metadata.findall("dc:identifier", NS))
+    publishers = _all_text(metadata.findall("dc:publisher", NS))
+    types = [value.lower() for value in _all_text(metadata.findall("dc:type", NS))]
+
+    title = titles[0] if titles else "Untitled record"
+    year = _first_year(dates)
+    entry_type = _guess_oai_entry_type(types)
+
+    fields: dict[str, str] = {
+        "title": title,
+        "oai": identifier,
+        "url": _best_identifier_url(identifiers) or f"{base_url}?verb=GetRecord&identifier={identifier}&metadataPrefix=oai_dc",
+        "note": "harvested_from = {oai_pmh}",
+    }
+    if creators:
+        fields["author"] = " and ".join(creators)
+    if year:
+        fields["year"] = year
+    if descriptions:
+        fields["abstract"] = descriptions[0]
+    if publishers:
+        fields["publisher"] = publishers[0]
+
+    citation_key = _oai_citation_key(creators, year, title, ordinal)
+    return BibEntry(entry_type=entry_type, citation_key=citation_key, fields=fields)
+
+
+def _mods_to_entry(base_url: str, identifier: str, metadata: ET.Element, ordinal: int) -> BibEntry:
+    title = _node_text(metadata.find(".//mods:titleInfo/mods:title", NS)) or "Untitled record"
+    sub_title = _node_text(metadata.find(".//mods:titleInfo/mods:subTitle", NS))
+    if sub_title:
+        title = f"{title}: {sub_title}"
+
+    creators: list[str] = []
+    for name in metadata.findall(".//mods:name", NS):
+        role_terms = [term.text or "" for term in name.findall(".//mods:roleTerm", NS)]
+        if role_terms and not any(term.lower() == "author" for term in role_terms):
+            continue
+        parts = [_node_text(part) for part in name.findall("./mods:namePart", NS)]
+        parts = [part for part in parts if part]
+        if parts:
+            creators.append(", ".join(parts) if len(parts) == 2 else " ".join(parts))
+
+    year = ""
+    for date_node in metadata.findall(".//mods:originInfo/mods:dateIssued", NS):
+        text = _node_text(date_node)
+        if len(text) >= 4 and text[:4].isdigit():
+            year = text[:4]
+            break
+
+    publisher = _node_text(metadata.find(".//mods:originInfo/mods:publisher", NS))
+    abstract = _node_text(metadata.find(".//mods:abstract", NS))
+    genre = _node_text(metadata.find(".//mods:genre", NS)).lower()
+    related_title = _node_text(metadata.find(".//mods:relatedItem/mods:titleInfo/mods:title", NS))
+    url = _node_text(metadata.find(".//mods:location/mods:url", NS))
+
+    entry_type = "phdthesis" if "thesis" in genre or "dissertation" in genre else "misc"
+    if entry_type != "phdthesis" and related_title:
+        entry_type = "article"
+
+    fields: dict[str, str] = {
+        "title": title,
+        "oai": identifier,
+        "url": url or f"{base_url}?verb=GetRecord&identifier={identifier}&metadataPrefix=mods",
+        "note": "harvested_from = {oai_pmh_mods}",
+    }
+    if creators:
+        fields["author"] = " and ".join(creators)
+    if year:
+        fields["year"] = year
+    if publisher:
+        fields["publisher"] = publisher
+    if abstract:
+        fields["abstract"] = abstract
+    if related_title:
+        fields["journal"] = related_title
+
+    citation_key = _oai_citation_key(creators, year, title, ordinal)
+    return BibEntry(entry_type=entry_type, citation_key=citation_key, fields=fields)
+
+
+def _metadata_node_to_entry(base_url: str, identifier: str, metadata: ET.Element, ordinal: int) -> BibEntry:
+    if metadata.tag.endswith("dc"):
+        return _oai_dc_to_entry(base_url, identifier, metadata, ordinal)
+    if metadata.tag.endswith("mods"):
+        return _mods_to_entry(base_url, identifier, metadata, ordinal)
+    return BibEntry(
+        entry_type="misc",
+        citation_key=_oai_citation_key([], "", identifier, ordinal),
+        fields={
+            "title": identifier,
+            "oai": identifier,
+            "url": f"{base_url}?verb=GetRecord&identifier={identifier}",
+            "note": f"unsupported_oai_metadata = {{{metadata.tag}}}",
+        },
+    )
+
+
+def _node_text(node: ET.Element | None) -> str:
+    if node is None or node.text is None:
+        return ""
+    return " ".join(node.text.split())
+
+
+def _all_text(nodes: list[ET.Element]) -> list[str]:
+    values = []
+    for node in nodes:
+        value = _node_text(node)
+        if value:
+            values.append(value)
+    return values
+
+
+def _first_year(dates: list[str]) -> str:
+    for date in dates:
+        if len(date) >= 4 and date[:4].isdigit():
+            return date[:4]
+    return ""
+
+
+def _guess_oai_entry_type(types: list[str]) -> str:
+    joined = " ".join(types)
+    if "thesis" in joined or "dissertation" in joined:
+        return "phdthesis"
+    if "article" in joined:
+        return "article"
+    if "book" in joined:
+        return "book"
+    return "misc"
+
+
+def _best_identifier_url(identifiers: list[str]) -> str:
+    for identifier in identifiers:
+        if identifier.startswith("http://") or identifier.startswith("https://"):
+            return identifier
+    return ""
+
+
+def _oai_citation_key(creators: list[str], year: str, title: str, ordinal: int) -> str:
+    author = creators[0] if creators else "oai"
+    family = author.split(",")[0] if "," in author else author.split()[-1]
+    family = "".join(ch for ch in family.lower() if ch.isalnum()) or "oai"
+    first_word = "".join(ch for ch in title.split()[0].lower() if ch.isalnum()) if title.split() else "untitled"
+    return f"{family}{year or 'nd'}{first_word}{ordinal}"
+
+
+def _flatten_set_description(node: ET.Element | None) -> str:
+    if node is None:
+        return ""
+    parts = []
+    for child in node.iter():
+        if child.text and child.text.strip():
+            parts.append(" ".join(child.text.split()))
+    return " ".join(parts)
diff --git a/src/citegeist/resolve.py b/src/citegeist/resolve.py
index 4d3ce28..5e5a205 100644
--- a/src/citegeist/resolve.py
+++ b/src/citegeist/resolve.py
@@ -30,6 +30,9 @@ class MetadataResolver:
             resolved = self.resolve_doi(doi)
             if resolved is not None:
                 return resolved
+            resolved = self.resolve_datacite_doi(doi)
+            if resolved is not None:
+                return resolved
 
         if openalex_id := entry.fields.get("openalex"):
             resolved = self.resolve_openalex(openalex_id)
@@ -47,6 +50,20 @@ class MetadataResolver:
             return resolved
 
         if title := entry.fields.get("title"):
+            resolved = self.search_crossref_best_match(
+                title=title,
+                author_text=entry.fields.get("author", ""),
+                year=entry.fields.get("year", ""),
+            )
+            if resolved is not None:
+                return resolved
+            resolved = self.search_datacite_best_match(
+                title=title,
+                author_text=entry.fields.get("author", ""),
+                year=entry.fields.get("year", ""),
+            )
+            if resolved is not None:
+                return resolved
             resolved = self.search_openalex_best_match(
                 title=title,
                 author_text=entry.fields.get("author", ""),
@@ -75,6 +92,26 @@ class MetadataResolver:
         items = payload.get("message", {}).get("items", [])
         return [_crossref_message_to_entry(item) for item in items]
 
+    def search_crossref_best_match(
+        self,
+        title: str,
+        author_text: str = "",
+        year: str = "",
+    ) -> Resolution | None:
+        candidate = _select_best_title_match(
+            self.search_crossref(title, limit=5),
+            title=title,
+            author_text=author_text,
+            year=year,
+        )
+        if candidate is None:
+            return None
+        return Resolution(
+            entry=candidate,
+            source_type="resolver",
+            source_label=f"crossref:search:{title}",
+        )
+
     def resolve_dblp(self, dblp_key: str) -> Resolution | None:
         encoded_key = urllib.parse.quote(dblp_key, safe="/:")
         text = self.source_client.get_text(f"https://dblp.org/rec/{encoded_key}.bib")
@@ -128,6 +165,43 @@ class MetadataResolver:
             source_label=f"openalex:id:{normalized_id}",
         )
 
+    def resolve_datacite_doi(self, doi: str) -> Resolution | None:
+        encoded = urllib.parse.quote(doi, safe="")
+        payload = self.source_client.get_json(f"https://api.datacite.org/dois/{encoded}")
+        data = payload.get("data", {})
+        if not data:
+            return None
+        return Resolution(
+            entry=_datacite_work_to_entry(data),
+            source_type="resolver",
+            source_label=f"datacite:doi:{doi}",
+        )
+
+    def search_datacite(self, title: str, limit: int = 5) -> list[BibEntry]:
+        query = urllib.parse.urlencode({"query": title, "page[size]": limit})
+        payload = self.source_client.get_json(f"https://api.datacite.org/dois?{query}")
+        return [_datacite_work_to_entry(item) for item in payload.get("data", [])]
+
+    def search_datacite_best_match(
+        self,
+        title: str,
+        author_text: str = "",
+        year: str = "",
+    ) -> Resolution | None:
+        candidate = _select_best_title_match(
+            self.search_datacite(title, limit=5),
+            title=title,
+            author_text=author_text,
+            year=year,
+        )
+        if candidate is None:
+            return None
+        return Resolution(
+            entry=candidate,
+            source_type="resolver",
+            source_label=f"datacite:search:{title}",
+        )
+
     def search_openalex(self, title: str, limit: int = 5) -> list[BibEntry]:
         query = urllib.parse.urlencode({"search": title, "per-page": limit})
         payload = self.source_client.get_json(f"https://api.openalex.org/works?{query}")
@@ -139,42 +213,50 @@ class MetadataResolver:
         author_text: str = "",
         year: str = "",
     ) -> Resolution | None:
-        candidates = self.search_openalex(title, limit=5)
-        if not candidates:
+        candidate = _select_best_title_match(
+            self.search_openalex(title, limit=5),
+            title=title,
+            author_text=author_text,
+            year=year,
+        )
+        if candidate is None:
             return None
-
-        title_norm = _normalize_match_text(title)
-        author_norm = _normalize_match_text(author_text)
-        for candidate in candidates:
-            candidate_title = _normalize_match_text(candidate.fields.get("title", ""))
-            candidate_author = _normalize_match_text(candidate.fields.get("author", ""))
-            candidate_year = candidate.fields.get("year", "")
-            if candidate_title == title_norm:
-                if author_norm and candidate_author and author_norm.split(" and ")[0] not in candidate_author:
-                    continue
-                if year and candidate_year and year != candidate_year:
-                    continue
-                return Resolution(
-                    entry=candidate,
-                    source_type="resolver",
-                    source_label=f"openalex:search:{title}",
-                )
-        return Resolution(
-            entry=candidates[0],
+        return Resolution(
+            entry=candidate,
             source_type="resolver",
             source_label=f"openalex:search:{title}",
         )
 
 
 def merge_entries(base: BibEntry, resolved: BibEntry) -> BibEntry:
+    merged, _ = merge_entries_with_conflicts(base, resolved)
+    return merged
+
+
+def merge_entries_with_conflicts(base: BibEntry, resolved: BibEntry) -> tuple[BibEntry, list[dict[str, str]]]:
     merged_fields = dict(base.fields)
+    conflicts: list[dict[str, str]] = []
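+    # e.g. base year=1994 vs proposed year=1995 is recorded as an open
+    # conflict rather than silently overwriting the stored value.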
"current_value": current_value, + "proposed_value": value, + } + ) + continue + if key not in merged_fields or not merged_fields[key]: merged_fields[key] = value - return BibEntry( - entry_type=base.entry_type or resolved.entry_type, - citation_key=base.citation_key, - fields=merged_fields, + return ( + BibEntry( + entry_type=base.entry_type or resolved.entry_type, + citation_key=base.citation_key, + fields=merged_fields, + ), + conflicts, ) @@ -363,3 +445,123 @@ def _normalize_match_text(value: str) -> str: lowered = value.lower() lowered = re.sub(r"\W+", " ", lowered) return " ".join(lowered.split()) + + +def _select_best_title_match( + candidates: list[BibEntry], + title: str, + author_text: str = "", + year: str = "", +) -> BibEntry | None: + if not candidates: + return None + + title_norm = _normalize_match_text(title) + author_tokens = _author_match_tokens(author_text) + year_text = str(year or "").strip() + + for candidate in candidates: + candidate_title = _normalize_match_text(candidate.fields.get("title", "")) + if candidate_title != title_norm: + continue + candidate_year = str(candidate.fields.get("year", "") or "").strip() + if year_text and candidate_year and year_text != candidate_year: + continue + if author_tokens and not _candidate_matches_author_tokens(candidate, author_tokens): + continue + return candidate + return None + + +def _author_match_tokens(author_text: str) -> set[str]: + normalized = _normalize_match_text(author_text) + if not normalized: + return set() + tokens = { + token + for token in re.findall(r"[a-z0-9]+", normalized) + if len(token) >= 2 and token not in {"and", "et", "al"} + } + return tokens + + +def _candidate_matches_author_tokens(candidate: BibEntry, author_tokens: set[str]) -> bool: + candidate_author = _normalize_match_text(candidate.fields.get("author", "")) + if not candidate_author: + return False + candidate_tokens = set(re.findall(r"[a-z0-9]+", candidate_author)) + return bool(author_tokens & candidate_tokens) + + +def _datacite_work_to_entry(data: dict) -> BibEntry: + attributes = data.get("attributes", {}) + doi = str(attributes.get("doi") or "") + titles = attributes.get("titles") or [] + creators = attributes.get("creators") or [] + descriptions = attributes.get("descriptions") or [] + publisher = str(attributes.get("publisher") or "") + year = str(attributes.get("publicationYear") or "") + url = str(attributes.get("url") or "") + types = attributes.get("types") or {} + + title = titles[0].get("title", "") if titles else "" + author_names = " and ".join(_datacite_creator_name(creator) for creator in creators if _datacite_creator_name(creator)) + abstract = _datacite_abstract(descriptions) + entry_type = _datacite_type_to_bibtype(str(types.get("resourceTypeGeneral") or "")) + + fields: dict[str, str] = {} + if title: + fields["title"] = title + if author_names: + fields["author"] = author_names + if year: + fields["year"] = year + if doi: + fields["doi"] = doi + if url: + fields["url"] = url + elif doi: + fields["url"] = f"https://doi.org/{doi}" + if publisher: + fields["publisher"] = publisher + if abstract: + fields["abstract"] = abstract + + citation_key = _make_resolution_key(author_names or "datacite", year or "n.d.", title or doi or "untitled") + return BibEntry(entry_type=entry_type, citation_key=citation_key, fields=fields) + + +def _datacite_creator_name(creator: dict) -> str: + family = str(creator.get("familyName") or "") + given = str(creator.get("givenName") or "") + if family and given: + return f"{family}, {given}" + 
+def _datacite_work_to_entry(data: dict) -> BibEntry:
+    attributes = data.get("attributes", {})
+    doi = str(attributes.get("doi") or "")
+    titles = attributes.get("titles") or []
+    creators = attributes.get("creators") or []
+    descriptions = attributes.get("descriptions") or []
+    publisher = str(attributes.get("publisher") or "")
+    year = str(attributes.get("publicationYear") or "")
+    url = str(attributes.get("url") or "")
+    types = attributes.get("types") or {}
+
+    title = titles[0].get("title", "") if titles else ""
+    author_names = " and ".join(_datacite_creator_name(creator) for creator in creators if _datacite_creator_name(creator))
+    abstract = _datacite_abstract(descriptions)
+    entry_type = _datacite_type_to_bibtype(str(types.get("resourceTypeGeneral") or ""))
+
+    fields: dict[str, str] = {}
+    if title:
+        fields["title"] = title
+    if author_names:
+        fields["author"] = author_names
+    if year:
+        fields["year"] = year
+    if doi:
+        fields["doi"] = doi
+    if url:
+        fields["url"] = url
+    elif doi:
+        fields["url"] = f"https://doi.org/{doi}"
+    if publisher:
+        fields["publisher"] = publisher
+    if abstract:
+        fields["abstract"] = abstract
+
+    citation_key = _make_resolution_key(author_names or "datacite", year or "n.d.", title or doi or "untitled")
+    return BibEntry(entry_type=entry_type, citation_key=citation_key, fields=fields)
+
+
+def _datacite_creator_name(creator: dict) -> str:
+    family = str(creator.get("familyName") or "")
+    given = str(creator.get("givenName") or "")
+    if family and given:
+        return f"{family}, {given}"
+    return str(creator.get("name") or family or given)
+
+
+def _datacite_abstract(descriptions: list[dict]) -> str:
+    for description in descriptions:
+        if str(description.get("descriptionType") or "").lower() == "abstract":
+            return str(description.get("description") or "")
+    return ""
+
+
+def _datacite_type_to_bibtype(resource_type: str) -> str:
+    lowered = resource_type.lower()
+    mapping = {
+        "audiovisual": "misc",
+        "book": "book",
+        "bookchapter": "incollection",
+        "collection": "misc",
+        "computationalnotebook": "misc",
+        "conferencepaper": "inproceedings",
+        "dataset": "misc",
+        "dissertation": "phdthesis",
+        "image": "misc",
+        "journalarticle": "article",
+        "model": "misc",
+        "report": "techreport",
+        "software": "misc",
+        "text": "misc",
+    }
+    return mapping.get(lowered, "misc")
diff --git a/src/citegeist/sources.py b/src/citegeist/sources.py
index 63bd23d..0f453e5 100644
--- a/src/citegeist/sources.py
+++ b/src/citegeist/sources.py
@@ -30,11 +30,11 @@ class SourceClient:
     def get_text(self, url: str) -> str:
         cached = self._read_cached(url, "txt")
         if cached is not None:
-            return cached.decode("utf-8")
+            return self._decode_text(cached)
 
         payload = self._fetch_bytes(url)
         self._write_cache(url, "txt", payload)
-        return payload.decode("utf-8")
+        return self._decode_text(payload)
 
     def get_xml(self, url: str) -> ET.Element:
         cached = self._read_cached(url, "xml")
@@ -76,3 +76,11 @@ class SourceClient:
         self.cache_dir.mkdir(parents=True, exist_ok=True)
         path = self.cache_dir / self._cache_key(url, suffix)
         path.write_bytes(payload)
+
+    def _decode_text(self, payload: bytes) -> str:
+        # utf-8-sig must come before plain utf-8: utf-8 decodes a BOM as a
+        # stray U+FEFF without raising, so the -sig variant would otherwise
+        # be unreachable.
+        for encoding in ("utf-8-sig", "utf-8"):
+            try:
+                return payload.decode(encoding)
+            except UnicodeDecodeError:
+                continue
+        # latin-1 maps every byte, so this final fallback cannot fail.
+        return payload.decode("latin-1")
diff --git a/src/citegeist/storage.py b/src/citegeist/storage.py
index 57e75ee..f2578f8 100644
--- a/src/citegeist/storage.py
+++ b/src/citegeist/storage.py
@@ -95,6 +95,29 @@ class BibliographyStore:
             PRIMARY KEY (source_entry_id, target_citation_key, relation_type)
             );
 
+            CREATE TABLE IF NOT EXISTS topics (
+                id INTEGER PRIMARY KEY,
+                slug TEXT NOT NULL UNIQUE,
+                name TEXT NOT NULL,
+                source_type TEXT NOT NULL,
+                source_url TEXT,
+                expansion_phrase TEXT,
+                suggested_phrase TEXT,
+                phrase_review_status TEXT NOT NULL DEFAULT 'unreviewed',
+                phrase_review_notes TEXT,
+                created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
+                updated_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
+            );
+
+            CREATE TABLE IF NOT EXISTS entry_topics (
+                entry_id INTEGER NOT NULL REFERENCES entries(id) ON DELETE CASCADE,
+                topic_id INTEGER NOT NULL REFERENCES topics(id) ON DELETE CASCADE,
+                source_label TEXT NOT NULL,
+                confidence REAL,
+                created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
+                PRIMARY KEY (entry_id, topic_id)
+            );
+
             CREATE TABLE IF NOT EXISTS field_provenance (
                 id INTEGER PRIMARY KEY,
                 entry_id INTEGER NOT NULL REFERENCES entries(id) ON DELETE CASCADE,
@@ -117,10 +140,23 @@ class BibliographyStore:
                 confidence REAL,
                 recorded_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
             );
+
+            CREATE TABLE IF NOT EXISTS field_conflicts (
+                id INTEGER PRIMARY KEY,
+                entry_id INTEGER NOT NULL REFERENCES entries(id) ON DELETE CASCADE,
+                field_name TEXT NOT NULL,
+                current_value TEXT,
+                proposed_value TEXT,
+                source_type TEXT NOT NULL,
+                source_label TEXT NOT NULL,
+                status TEXT NOT NULL DEFAULT 'open',
+                recorded_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
+            );
             """
         )
         self._ensure_entry_columns()
+        self._ensure_topic_columns()
         if self._fts5_enabled:
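+            # Without FTS5 the store still works: search_text() falls back
+            # to LIKE scans over title/abstract/fulltext (see below).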
             self.connection.execute(
@@ -177,6 +213,7 @@ class BibliographyStore:
             ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
             ON CONFLICT(citation_key) DO UPDATE SET
                 entry_type = excluded.entry_type,
+                review_status = excluded.review_status,
                 title = excluded.title,
                 year = excluded.year,
                 journal = excluded.journal,
@@ -280,30 +317,58 @@ class BibliographyStore:
 
         return entry_id
 
-    def search_text(self, query: str, limit: int = 10) -> list[dict[str, object]]:
+    def search_text(self, query: str, limit: int = 10, topic_slug: str | None = None) -> list[dict[str, object]]:
         if self._fts5_enabled:
-            rows = self.connection.execute(
-                """
-                SELECT e.citation_key, e.title, e.year, bm25(entry_text_fts) AS score
-                FROM entry_text_fts
-                JOIN entries e ON e.citation_key = entry_text_fts.citation_key
-                WHERE entry_text_fts MATCH ?
-                ORDER BY score
-                LIMIT ?
-                """,
-                (query, limit),
-            ).fetchall()
+            if topic_slug:
+                rows = self.connection.execute(
+                    """
+                    SELECT DISTINCT e.citation_key, e.title, e.year, bm25(entry_text_fts) AS score
+                    FROM entry_text_fts
+                    JOIN entries e ON e.citation_key = entry_text_fts.citation_key
+                    JOIN entry_topics et ON et.entry_id = e.id
+                    JOIN topics t ON t.id = et.topic_id
+                    WHERE entry_text_fts MATCH ? AND t.slug = ?
+                    ORDER BY score
+                    LIMIT ?
+                    """,
+                    (query, topic_slug, limit),
+                ).fetchall()
+            else:
+                rows = self.connection.execute(
+                    """
+                    SELECT e.citation_key, e.title, e.year, bm25(entry_text_fts) AS score
+                    FROM entry_text_fts
+                    JOIN entries e ON e.citation_key = entry_text_fts.citation_key
+                    WHERE entry_text_fts MATCH ?
+                    ORDER BY score
+                    LIMIT ?
+                    """,
+                    (query, limit),
+                ).fetchall()
         else:
             pattern = f"%{query}%"
-            rows = self.connection.execute(
-                """
-                SELECT citation_key, title, year, 0.0 AS score
-                FROM entries
-                WHERE title LIKE ? OR abstract LIKE ? OR fulltext LIKE ?
-                LIMIT ?
-                """,
-                (pattern, pattern, pattern, limit),
-            ).fetchall()
+            if topic_slug:
+                rows = self.connection.execute(
+                    """
+                    SELECT DISTINCT e.citation_key, e.title, e.year, 0.0 AS score
+                    FROM entries e
+                    JOIN entry_topics et ON et.entry_id = e.id
+                    JOIN topics t ON t.id = et.topic_id
+                    WHERE t.slug = ? AND (e.title LIKE ? OR e.abstract LIKE ? OR e.fulltext LIKE ?)
+                    LIMIT ?
+                    """,
+                    (topic_slug, pattern, pattern, pattern, limit),
+                ).fetchall()
+            else:
+                rows = self.connection.execute(
+                    """
+                    SELECT citation_key, title, year, 0.0 AS score
+                    FROM entries
+                    WHERE title LIKE ? OR abstract LIKE ? OR fulltext LIKE ?
+                    LIMIT ?
+                    """,
+                    (pattern, pattern, pattern, limit),
+                ).fetchall()
 
         return [dict(row) for row in rows]
 
@@ -383,7 +448,11 @@ class BibliographyStore:
             "SELECT * FROM entries WHERE citation_key = ?",
             (citation_key,),
         ).fetchone()
-        return self._row_to_entry_dict(row) if row else None
+        if row is None:
+            return None
+        payload = self._row_to_entry_dict(row)
+        payload["topics"] = self.get_entry_topics(citation_key)
+        return payload
 
     def list_entries(self, limit: int = 50) -> list[dict[str, object]]:
         rows = self.connection.execute(
@@ -397,6 +466,227 @@ class BibliographyStore:
         ).fetchall()
         return [dict(row) for row in rows]
 
+    def ensure_topic(
+        self,
+        slug: str,
+        name: str,
+        source_type: str = "manual",
+        source_url: str | None = None,
+        expansion_phrase: str | None = None,
+        suggested_phrase: str | None = None,
+        phrase_review_status: str | None = None,
+        phrase_review_notes: str | None = None,
+    ) -> int:
+        row = self.connection.execute(
+            """
+            INSERT INTO topics (
+                slug, name, source_type, source_url, expansion_phrase,
+                suggested_phrase, phrase_review_status, phrase_review_notes
+            )
+            VALUES (?, ?, ?, ?, ?, ?, COALESCE(?, 'unreviewed'), ?)
+            ON CONFLICT(slug) DO UPDATE SET
+                name = excluded.name,
+                source_type = excluded.source_type,
+                source_url = COALESCE(excluded.source_url, topics.source_url),
+                expansion_phrase = COALESCE(excluded.expansion_phrase, topics.expansion_phrase),
+                suggested_phrase = COALESCE(excluded.suggested_phrase, topics.suggested_phrase),
+                phrase_review_status = COALESCE(excluded.phrase_review_status, topics.phrase_review_status),
+                phrase_review_notes = COALESCE(excluded.phrase_review_notes, topics.phrase_review_notes),
+                updated_at = CURRENT_TIMESTAMP
+            RETURNING id
+            """,
+            (
+                slug,
+                name,
+                source_type,
+                source_url,
+                expansion_phrase,
+                suggested_phrase,
+                phrase_review_status,
+                phrase_review_notes,
+            ),
+        ).fetchone()
+        return int(row["id"])
+
+    def add_entry_topic(
+        self,
+        citation_key: str,
+        topic_slug: str,
+        topic_name: str,
+        source_type: str = "manual",
+        source_url: str | None = None,
+        source_label: str = "manual",
+        confidence: float = 1.0,
+        expansion_phrase: str | None = None,
+    ) -> bool:
+        entry_row = self.connection.execute(
+            "SELECT id FROM entries WHERE citation_key = ?",
+            (citation_key,),
+        ).fetchone()
+        if entry_row is None:
+            return False
+
+        topic_id = self.ensure_topic(
+            topic_slug,
+            topic_name,
+            source_type=source_type,
+            source_url=source_url,
+            expansion_phrase=expansion_phrase,
+        )
+        self.connection.execute(
+            """
+            INSERT INTO entry_topics (entry_id, topic_id, source_label, confidence)
+            VALUES (?, ?, ?, ?)
+            ON CONFLICT(entry_id, topic_id) DO UPDATE SET
+                source_label = excluded.source_label,
+                confidence = excluded.confidence
+            """,
+            (int(entry_row["id"]), topic_id, source_label, confidence),
+        )
+        return True
+
+    def get_entry_topics(self, citation_key: str) -> list[dict[str, object]]:
+        rows = self.connection.execute(
+            """
+            SELECT t.slug, t.name, t.source_type, t.source_url, et.source_label, et.confidence
+            FROM entry_topics et
+            JOIN entries e ON e.id = et.entry_id
+            JOIN topics t ON t.id = et.topic_id
+            WHERE e.citation_key = ?
+            ORDER BY t.name, t.slug
+            """,
+            (citation_key,),
+        ).fetchall()
+        return [dict(row) for row in rows]
+
+    def list_topics(
+        self,
+        limit: int = 100,
+        phrase_review_status: str | None = None,
+    ) -> list[dict[str, object]]:
+        where = ""
+        params: list[object] = []
+        if phrase_review_status is not None:
+            where = "WHERE t.phrase_review_status = ?"
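+            # e.g. list_topics(phrase_review_status="pending") narrows the
+            # listing to topics whose suggested phrase awaits review.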
+            params.append(phrase_review_status)
+        params.append(limit)
+        rows = self.connection.execute(
+            f"""
+            SELECT t.slug, t.name, t.source_type, t.source_url, t.expansion_phrase,
+                   t.suggested_phrase, t.phrase_review_status, t.phrase_review_notes,
+                   COUNT(et.entry_id) AS entry_count
+            FROM topics t
+            LEFT JOIN entry_topics et ON et.topic_id = t.id
+            {where}
+            GROUP BY t.id, t.slug, t.name, t.source_type, t.source_url, t.expansion_phrase,
+                     t.suggested_phrase, t.phrase_review_status, t.phrase_review_notes
+            ORDER BY t.name, t.slug
+            LIMIT ?
+            """,
+            params,
+        ).fetchall()
+        return [dict(row) for row in rows]
+
+    def get_topic(self, slug: str) -> dict[str, object] | None:
+        row = self.connection.execute(
+            """
+            SELECT t.slug, t.name, t.source_type, t.source_url, t.expansion_phrase,
+                   t.suggested_phrase, t.phrase_review_status, t.phrase_review_notes,
+                   COUNT(et.entry_id) AS entry_count
+            FROM topics t
+            LEFT JOIN entry_topics et ON et.topic_id = t.id
+            WHERE t.slug = ?
+            GROUP BY t.id, t.slug, t.name, t.source_type, t.source_url, t.expansion_phrase,
+                     t.suggested_phrase, t.phrase_review_status, t.phrase_review_notes
+            """,
+            (slug,),
+        ).fetchone()
+        return dict(row) if row else None
+
+    def set_topic_expansion_phrase(self, slug: str, expansion_phrase: str | None) -> bool:
+        row = self.connection.execute(
+            """
+            UPDATE topics
+            SET expansion_phrase = ?, updated_at = CURRENT_TIMESTAMP
+            WHERE slug = ?
+            RETURNING id
+            """,
+            (expansion_phrase, slug),
+        ).fetchone()
+        self.connection.commit()
+        return row is not None
+
+    def stage_topic_phrase_suggestion(
+        self,
+        slug: str,
+        suggested_phrase: str | None,
+        review_status: str = "pending",
+        review_notes: str | None = None,
+    ) -> bool:
+        row = self.connection.execute(
+            """
+            UPDATE topics
+            SET suggested_phrase = ?,
+                phrase_review_status = ?,
+                phrase_review_notes = ?,
+                updated_at = CURRENT_TIMESTAMP
+            WHERE slug = ?
+            RETURNING id
+            """,
+            (suggested_phrase, review_status, review_notes, slug),
+        ).fetchone()
+        self.connection.commit()
+        return row is not None
+
+    def review_topic_phrase_suggestion(
+        self,
+        slug: str,
+        review_status: str,
+        review_notes: str | None = None,
+        applied_phrase: str | None = None,
+    ) -> bool:
+        topic = self.get_topic(slug)
+        if topic is None:
+            return False
+
+        suggested_phrase = topic.get("suggested_phrase")
+        expansion_phrase = topic.get("expansion_phrase")
+        if review_status == "accepted":
+            expansion_phrase = applied_phrase if applied_phrase is not None else suggested_phrase
+        elif applied_phrase is not None:
+            expansion_phrase = applied_phrase
+
+        row = self.connection.execute(
+            """
+            UPDATE topics
+            SET expansion_phrase = ?,
+                phrase_review_status = ?,
+                phrase_review_notes = ?,
+                updated_at = CURRENT_TIMESTAMP
+            WHERE slug = ?
+            RETURNING id
+            """,
+            (expansion_phrase, review_status, review_notes, slug),
+        ).fetchone()
+        self.connection.commit()
+        return row is not None
+
+    def list_topic_entries(self, topic_slug: str, limit: int = 100) -> list[dict[str, object]]:
+        rows = self.connection.execute(
+            """
+            SELECT e.citation_key, e.entry_type, e.review_status, e.title, e.year,
+                   t.slug AS topic_slug, t.name AS topic_name, et.source_label, et.confidence
+            FROM entry_topics et
+            JOIN topics t ON t.id = et.topic_id
+            JOIN entries e ON e.id = et.entry_id
+            WHERE t.slug = ?
+            ORDER BY COALESCE(e.year, ''), e.citation_key
+            LIMIT ?
+            """,
+            (topic_slug, limit),
+        ).fetchall()
+        return [dict(row) for row in rows]
+
     def set_entry_status(self, citation_key: str, review_status: str) -> bool:
         row = self.connection.execute(
             """
@@ -437,6 +727,114 @@ class BibliographyStore:
         self.connection.commit()
         return True
 
+    def record_conflicts(
+        self,
+        citation_key: str,
+        conflicts: list[dict[str, str]],
+        source_type: str,
+        source_label: str,
+    ) -> bool:
+        row = self.connection.execute(
+            "SELECT id FROM entries WHERE citation_key = ?",
+            (citation_key,),
+        ).fetchone()
+        if row is None:
+            return False
+
+        entry_id = int(row["id"])
+        for conflict in conflicts:
+            self.connection.execute(
+                """
+                INSERT INTO field_conflicts (
+                    entry_id, field_name, current_value, proposed_value, source_type, source_label, status
+                ) VALUES (?, ?, ?, ?, ?, ?, 'open')
+                """,
+                (
+                    entry_id,
+                    conflict["field_name"],
+                    conflict.get("current_value"),
+                    conflict.get("proposed_value"),
+                    source_type,
+                    source_label,
+                ),
+            )
+        self.connection.commit()
+        return True
+
+    def get_field_conflicts(self, citation_key: str, status: str | None = None) -> list[dict[str, object]]:
+        where = ""
+        params: list[object] = [citation_key]
+        if status is not None:
+            where = " AND fc.status = ?"
+            params.append(status)
+
+        rows = self.connection.execute(
+            f"""
+            SELECT fc.field_name, fc.current_value, fc.proposed_value, fc.source_type,
+                   fc.source_label, fc.status, fc.recorded_at
+            FROM field_conflicts fc
+            JOIN entries e ON e.id = fc.entry_id
+            WHERE e.citation_key = ?{where}
+            ORDER BY fc.recorded_at, fc.id
+            """,
+            params,
+        ).fetchall()
+        return [dict(row) for row in rows]
+
+    def set_conflict_status(self, citation_key: str, field_name: str, status: str) -> int:
+        row = self.connection.execute(
+            "SELECT id FROM entries WHERE citation_key = ?",
+            (citation_key,),
+        ).fetchone()
+        if row is None:
+            return 0
+        entry_id = int(row["id"])
+        result = self.connection.execute(
+            """
+            UPDATE field_conflicts
+            SET status = ?
+            WHERE entry_id = ? AND field_name = ? AND status = 'open'
+            """,
+            (status, entry_id, field_name),
+        )
+        self.connection.commit()
+        return result.rowcount
+
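+    # Review lifecycle (illustrative): record_conflicts() files 'open' rows,
+    # get_field_conflicts(key, status="open") lists them, and each one is
+    # either applied via apply_conflict_value() below or dismissed with
+    # set_conflict_status(key, field, "rejected").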
+    def apply_conflict_value(self, citation_key: str, field_name: str) -> bool:
+        row = self.connection.execute(
+            """
+            SELECT fc.id, fc.proposed_value, e.review_status
+            FROM field_conflicts fc
+            JOIN entries e ON e.id = fc.entry_id
+            WHERE e.citation_key = ? AND fc.field_name = ? AND fc.status = 'open'
+            ORDER BY fc.recorded_at DESC, fc.id DESC
+            LIMIT 1
+            """,
+            (citation_key, field_name),
+        ).fetchone()
+        if row is None:
+            return False
+
+        entry = self._load_bib_entry(citation_key)
+        if entry is None:
+            return False
+
+        proposed_value = str(row["proposed_value"] or "")
+        entry.fields[field_name] = proposed_value
+        self.upsert_entry(
+            entry,
+            raw_bibtex=_entry_to_bibtex(entry),
+            source_type="manual_review",
+            source_label=f"conflict_accept:{field_name}",
+            review_status=str(row["review_status"] or "draft"),
+        )
+        self.connection.execute(
+            "UPDATE field_conflicts SET status = 'accepted' WHERE id = ?",
+            (int(row["id"]),),
+        )
+        self.connection.commit()
+        return True
+
     def add_relation(
         self,
         source_citation_key: str,
@@ -651,6 +1049,37 @@ class BibliographyStore:
                 "ALTER TABLE entries ADD COLUMN review_status TEXT NOT NULL DEFAULT 'draft'"
             )
 
+    def _ensure_topic_columns(self) -> None:
+        columns = {
+            row["name"] for row in self.connection.execute("PRAGMA table_info(topics)").fetchall()
+        }
+        additions = (
+            ("expansion_phrase", "ALTER TABLE topics ADD COLUMN expansion_phrase TEXT"),
+            ("suggested_phrase", "ALTER TABLE topics ADD COLUMN suggested_phrase TEXT"),
+            (
+                "phrase_review_status",
+                "ALTER TABLE topics ADD COLUMN phrase_review_status TEXT NOT NULL DEFAULT 'unreviewed'",
+            ),
+            ("phrase_review_notes", "ALTER TABLE topics ADD COLUMN phrase_review_notes TEXT"),
+        )
+        for column_name, ddl in additions:
+            if column_name in columns:
+                continue
+            try:
+                self.connection.execute(ddl)
+            except sqlite3.OperationalError as exc:
+                if "duplicate column name" not in str(exc).lower():
+                    raise
+
     def _record_field_provenance(
         self,
         entry_id: int,
diff --git a/src/citegeist/talkorigins.py b/src/citegeist/talkorigins.py
new file mode 100644
index 0000000..45ce910
--- /dev/null
+++ b/src/citegeist/talkorigins.py
@@ -0,0 +1,1485 @@
+from __future__ import annotations
+
+from collections import Counter
+from dataclasses import asdict, dataclass
+from html.parser import HTMLParser
+import hashlib
+import json
+import re
+from pathlib import Path
+from urllib.parse import urljoin, urlparse
+
+from .bibtex import BibEntry, render_bibtex
+from .resolve import MetadataResolver, merge_entries, merge_entries_with_conflicts
+from .sources import SourceClient
+from .storage import BibliographyStore
+
+YEAR_PATTERN = re.compile(r"\b(18|19|20)\d{2}\b")
+REPEATED_AUTHOR_PATTERN = re.compile(r"^\s*[-_]{3,}\s*,?\s*")
+WHITESPACE_PATTERN = re.compile(r"\s+")
+TOPIC_PHRASE_STOPWORDS = {
+    "about",
+    "across",
+    "after",
+    "among",
+    "analysis",
+    "book",
+    "books",
+    "conference",
+    "data",
+    "edition",
+    "effects",
+    "example",
+    "first",
+    "from",
+    "human",
+    "humans",
+    "journal",
+    "method",
+    "methods",
+    "paper",
+    "papers",
+    "review",
+    "science",
+    "second",
+    "studies",
+    "study",
+    "system",
+    "their",
+    "theory",
+    "title",
+    "using",
+}
+
+
+@dataclass(slots=True)
+class TalkOriginsTopic:
+    topic: str
+    url: str
+    raw_entries: list[str]
+
+
+@dataclass(slots=True)
+class TalkOriginsSeedSet:
+    topic: str
+    slug: str
+    url: str
+    raw_entry_count: int
+    parsed_entry_count: int
+    seed_bib: str
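+    # The paths below are per-topic artifacts written by
+    # scrape_to_directory(): the plaintext reference dump, the rendered
+    # HTML topic page, and the page snapshot reused when resuming.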
+    plaintext_path: str = ""
+    page_path: str = ""
+    snapshot_path: str = ""
+
+
+@dataclass(slots=True)
+class TalkOriginsBatchExport:
+    base_url: str
+    output_dir: str
+    topic_count: int
+    entry_count: int
+    jobs_path: str
+    manifest_path: str
+    seed_sets: list[TalkOriginsSeedSet]
+    full_bib_path: str = ""
+    full_plaintext_path: str = ""
+    site_index_path: str = ""
+
+
+@dataclass(slots=True)
+class TalkOriginsValidationReport:
+    manifest_path: str
+    topic_count: int
+    entry_count: int
+    parsed_ratio: float
+    missing_author_count: int
+    missing_title_count: int
+    missing_year_count: int
+    suspicious_entry_type_count: int
+    suspicious_examples: list[dict[str, str]]
+    duplicate_cluster_count: int
+    duplicate_entry_count: int
+    duplicate_examples: list[dict[str, object]]
+
+
+@dataclass(slots=True)
+class TalkOriginsIngestReport:
+    manifest_path: str
+    topic_count: int
+    raw_entry_count: int
+    stored_entry_count: int
+    duplicate_cluster_count: int
+    duplicate_entry_count: int
+    canonicalized_count: int
+
+
+@dataclass(slots=True)
+class TalkOriginsDuplicateCluster:
+    key: str
+    count: int
+    items: list[dict[str, str]]
+    canonical: dict[str, object] | None = None
+
+
+@dataclass(slots=True)
+class TalkOriginsEnrichmentResult:
+    key: str
+    citation_key: str
+    weak_reasons_before: list[str]
+    resolved: bool
+    applied: bool
+    source_label: str = ""
+    weak_reasons_after: list[str] | None = None
+    conflicts: list[dict[str, str]] | None = None
+    error: str = ""
+
+
+@dataclass(slots=True)
+class TalkOriginsReviewExport:
+    manifest_path: str
+    item_count: int
+    items: list[dict[str, object]]
+
+
+@dataclass(slots=True)
+class TalkOriginsCorrectionResult:
+    key: str
+    citation_key: str
+    applied: bool
+    error: str = ""
+
+
+@dataclass(slots=True)
+class TalkOriginsTopicPhraseSuggestion:
+    slug: str
+    topic: str
+    entry_count: int
+    suggested_phrase: str
+    keywords: list[str]
+    review_required: bool = False
+    review_reasons: list[str] | None = None
+
+
+class TalkOriginsScraper:
+    def __init__(
+        self,
+        source_client: SourceClient | None = None,
+        resolver: MetadataResolver | None = None,
+    ) -> None:
+        self.source_client = source_client or SourceClient()
+        self.resolver = resolver or MetadataResolver(source_client=self.source_client)
+
+    def scrape_to_directory(
+        self,
+        base_url: str,
+        output_dir: str | Path,
+        limit_topics: int | None = None,
+        limit_entries_per_topic: int | None = None,
+        resolve_seeds: bool = False,
+        ingest_store: BibliographyStore | None = None,
+        review_status: str = "draft",
+        expand: bool = False,
+        topic_limit: int = 5,
+        topic_commit_limit: int | None = None,
+        resume: bool = True,
+    ) -> TalkOriginsBatchExport:
+        output_root = Path(output_dir)
+        seeds_dir = output_root / "seeds"
+        plaintext_dir = output_root / "plaintext"
+        snapshots_dir = output_root / "snapshots"
+        site_dir = output_root / "site"
+        topics_dir = site_dir / "topics"
+        seeds_dir.mkdir(parents=True, exist_ok=True)
+        plaintext_dir.mkdir(parents=True, exist_ok=True)
+        snapshots_dir.mkdir(parents=True, exist_ok=True)
+        topics_dir.mkdir(parents=True, exist_ok=True)
+
+        seed_sets: list[TalkOriginsSeedSet] = []
+        total_entries = 0
+        jobs: list[dict[str, object]] = []
+        full_entries: list[BibEntry] = []
+        full_plaintext_blocks: list[str] = []
+
+        for topic in self.scrape_topics(
+            base_url,
+            snapshots_dir=snapshots_dir,
+            limit_topics=limit_topics,
+            resume=resume,
+        ):
+            raw_entries = topic.raw_entries[:limit_entries_per_topic] if limit_entries_per_topic else topic.raw_entries
+            entry_pairs = [
+                (raw_entry, self.parse_reference_entry(raw_entry, index + 1))
+                for index, raw_entry in enumerate(raw_entries)
+            ]
+            parsed_entries = [entry for _, entry in entry_pairs if entry is not None]
+            if resolve_seeds:
+                parsed_entries = [self._augment_entry(entry) for entry in parsed_entries]
+                if parsed_entries:
+                    augmented_iter = iter(parsed_entries)
+                    entry_pairs = [
+                        (raw_entry, next(augmented_iter) if parsed_entry is not None else None)
+                        for raw_entry, parsed_entry in entry_pairs
+                    ]
+
+            slug = _slugify(topic.topic)
+            seed_path = (seeds_dir / f"{slug}.bib").resolve()
+            plaintext_path = (plaintext_dir / f"{slug}.txt").resolve()
+            page_path = (topics_dir / f"{slug}.html").resolve()
+            snapshot_path = (snapshots_dir / f"{slug}.json").resolve()
+            rendered = render_bibtex(parsed_entries) if parsed_entries else ""
+            seed_path.write_text(rendered + ("\n" if rendered else ""), encoding="utf-8")
+            plaintext_path.write_text(_render_plaintext_topic(topic.topic, raw_entries), encoding="utf-8")
+            page_path.write_text(
+                _render_topic_page(topic.topic, entry_pairs, seed_path.name),
+                encoding="utf-8",
+            )
+
+            if ingest_store is not None and parsed_entries:
+                ingest_store.ingest_bibtex(
+                    rendered,
+                    source_label=topic.url,
+                    review_status=review_status,
+                )
+                for entry in parsed_entries:
+                    ingest_store.add_entry_topic(
+                        entry.citation_key,
+                        topic_slug=slug,
+                        topic_name=topic.topic,
+                        source_type="talkorigins",
+                        source_url=topic.url,
+                        source_label=topic.url,
+                    )
+                ingest_store.connection.commit()
+
+            seed_set = TalkOriginsSeedSet(
+                topic=topic.topic,
+                slug=slug,
+                url=topic.url,
+                raw_entry_count=len(raw_entries),
+                parsed_entry_count=len(parsed_entries),
+                seed_bib=str(seed_path),
+                plaintext_path=str(plaintext_path),
+                page_path=str(page_path),
+                snapshot_path=str(snapshot_path),
+            )
+            seed_sets.append(seed_set)
+            total_entries += len(parsed_entries)
+            full_entries.extend(parsed_entries)
+            full_plaintext_blocks.append(_render_plaintext_topic(topic.topic, raw_entries).rstrip())
+            jobs.append(
+                {
+                    "name": f"talkorigins:{slug}",
+                    "topic": topic.topic,
+                    "topic_slug": slug,
+                    "topic_name": topic.topic,
+                    "topic_phrase": topic.topic,
+                    "seed_bib": str(seed_path),
+                    "expand": expand,
+                    "status": review_status,
+                    "topic_limit": topic_limit,
+                    "topic_commit_limit": topic_commit_limit,
+                }
+            )
+
+        output_root.mkdir(parents=True, exist_ok=True)
+        manifest_path = (output_root / "talkorigins_manifest.json").resolve()
+        jobs_path = (output_root / "talkorigins_jobs.json").resolve()
+        full_bib_path = (output_root / "talkorigins_full.bib").resolve()
+        full_plaintext_path = (output_root / "talkorigins_full.txt").resolve()
+        site_index_path = (site_dir / "index.html").resolve()
+        full_bib_path.write_text(render_bibtex(full_entries) + ("\n" if full_entries else ""), encoding="utf-8")
+        full_plaintext_path.write_text("\n\n".join(block for block in full_plaintext_blocks if block) + "\n", encoding="utf-8")
+        site_index_path.write_text(
+            _render_site_index(seed_sets, Path(full_bib_path).name, Path(full_plaintext_path).name),
+            encoding="utf-8",
+        )
+        manifest_payload = {
+            "base_url": base_url,
+            "resume": resume,
+            "seed_sets": [asdict(item) for item in seed_sets],
+            "full_bib_path": str(full_bib_path),
+            "full_plaintext_path": str(full_plaintext_path),
+            "site_index_path": str(site_index_path),
+        }
+        manifest_path.write_text(json.dumps(manifest_payload, indent=2, sort_keys=True) + "\n", encoding="utf-8")
+        jobs_path.write_text(json.dumps({"jobs": jobs}, indent=2, sort_keys=True) + "\n", encoding="utf-8")
+
+        return TalkOriginsBatchExport(
+            base_url=base_url,
+            output_dir=str(output_root.resolve()),
+            topic_count=len(seed_sets),
+            entry_count=total_entries,
+            jobs_path=str(jobs_path),
+            manifest_path=str(manifest_path),
+            seed_sets=seed_sets,
+            full_bib_path=str(full_bib_path),
+            full_plaintext_path=str(full_plaintext_path),
+            site_index_path=str(site_index_path),
+        )
+
+    def validate_export(self, manifest_path: str | Path) -> TalkOriginsValidationReport:
+        manifest = json.loads(Path(manifest_path).read_text(encoding="utf-8"))
+        seed_sets = manifest.get("seed_sets", [])
+
+        topic_count = len(seed_sets)
+        raw_total = sum(int(item.get("raw_entry_count", 0)) for item in seed_sets)
+        parsed_total = sum(int(item.get("parsed_entry_count", 0)) for item in seed_sets)
+        missing_author_count = 0
+        missing_title_count = 0
+        missing_year_count = 0
+        suspicious_entry_type_count = 0
+        suspicious_examples: list[dict[str, str]] = []
+        duplicate_groups: dict[str, list[dict[str, str]]] = {}
+
+        for seed_set in seed_sets:
+            seed_bib = seed_set.get("seed_bib")
+            if not isinstance(seed_bib, str) or not seed_bib:
+                continue
+            path = Path(seed_bib)
+            if not path.exists():
+                continue
+            entries = parse_bib_file(path)
+            for entry in entries:
+                if not entry.fields.get("author"):
+                    missing_author_count += 1
+                if not entry.fields.get("title"):
+                    missing_title_count += 1
+                if not entry.fields.get("year"):
+                    missing_year_count += 1
+                if _is_suspicious_entry_type(entry):
+                    suspicious_entry_type_count += 1
+                    if len(suspicious_examples) < 20:
+                        suspicious_examples.append(
+                            {
+                                "citation_key": entry.citation_key,
+                                "entry_type": entry.entry_type,
+                                "title": entry.fields.get("title", ""),
+                                "journal": entry.fields.get("journal", ""),
+                                "publisher": entry.fields.get("publisher", ""),
+                                "howpublished": entry.fields.get("howpublished", ""),
+                            }
+                        )
+                duplicate_key = _duplicate_key(entry)
+                if duplicate_key:
+                    duplicate_groups.setdefault(duplicate_key, []).append(
+                        {
+                            "citation_key": entry.citation_key,
+                            "title": entry.fields.get("title", ""),
+                            "author": entry.fields.get("author", ""),
+                            "year": entry.fields.get("year", ""),
+                            "seed_bib": str(path),
+                        }
+                    )
+
+        parsed_ratio = (parsed_total / raw_total) if raw_total else 0.0
+        duplicate_examples: list[dict[str, object]] = []
+        duplicate_cluster_count = 0
+        duplicate_entry_count = 0
+        for group_key, items in sorted(duplicate_groups.items()):
+            if len(items) < 2:
+                continue
+            duplicate_cluster_count += 1
+            duplicate_entry_count += len(items)
+            if len(duplicate_examples) < 20:
+                duplicate_examples.append(
+                    {
+                        "key": group_key,
+                        "count": len(items),
+                        "items": items[:5],
+                    }
+                )
+        return TalkOriginsValidationReport(
+            manifest_path=str(Path(manifest_path).resolve()),
+            topic_count=topic_count,
+            entry_count=parsed_total,
+            parsed_ratio=parsed_ratio,
+            missing_author_count=missing_author_count,
+            missing_title_count=missing_title_count,
+            missing_year_count=missing_year_count,
+            suspicious_entry_type_count=suspicious_entry_type_count,
+            suspicious_examples=suspicious_examples,
+            duplicate_cluster_count=duplicate_cluster_count,
+            duplicate_entry_count=duplicate_entry_count,
+            duplicate_examples=duplicate_examples,
+        )
+
seed_sets: + current_topic_slug = str(seed_set.get("slug") or _slugify(str(seed_set.get("topic") or ""))) + if topic_slug and current_topic_slug != topic_slug: + continue + seed_bib = seed_set.get("seed_bib") + if not isinstance(seed_bib, str) or not seed_bib: + continue + path = Path(seed_bib) + if not path.exists(): + continue + entries = parse_bib_file(path) + topic_name = str(seed_set.get("topic") or current_topic_slug) + keywords = _suggest_topic_keywords(entries, topic_name) + review_reasons = _topic_phrase_review_reasons(entries, keywords) + suggestions.append( + TalkOriginsTopicPhraseSuggestion( + slug=current_topic_slug, + topic=topic_name, + entry_count=len(entries), + suggested_phrase=" ".join([topic_name, *keywords]).strip(), + keywords=keywords, + review_required=bool(review_reasons), + review_reasons=review_reasons, + ) + ) + + suggestions.sort(key=lambda item: (item.topic.casefold(), item.slug)) + if limit is not None: + suggestions = suggestions[:limit] + return suggestions + + def inspect_duplicate_clusters( + self, + manifest_path: str | Path, + limit: int = 20, + min_count: int = 2, + match: str | None = None, + topic_slug: str | None = None, + preview_canonical: bool = False, + weak_only: bool = False, + ) -> list[TalkOriginsDuplicateCluster]: + duplicate_groups, grouped_entries = _collect_duplicate_groups( + manifest_path, + match=match, + topic_slug=topic_slug, + ) + + clusters: list[TalkOriginsDuplicateCluster] = [] + for group_key, items in sorted(duplicate_groups.items()): + if len(items) < min_count: + continue + canonical_payload = None + if preview_canonical: + canonical = _build_canonical_preview(grouped_entries[group_key]) + weak_reasons = _canonical_weaknesses(canonical) + if weak_only and not weak_reasons: + continue + canonical_payload = { + "citation_key": canonical.citation_key, + "entry_type": canonical.entry_type, + "field_count": len([value for value in canonical.fields.values() if value]), + "fields": dict(sorted(canonical.fields.items())), + "weak_reasons": weak_reasons, + } + elif weak_only: + canonical = _build_canonical_preview(grouped_entries[group_key]) + if not _canonical_weaknesses(canonical): + continue + clusters.append( + TalkOriginsDuplicateCluster( + key=group_key, + count=len(items), + items=sorted( + items, + key=lambda item: ( + item.get("topic_slug", ""), + item.get("year", ""), + item.get("citation_key", ""), + ), + ), + canonical=canonical_payload, + ) + ) + return clusters[:limit] + + def enrich_weak_canonicals( + self, + manifest_path: str | Path, + store: BibliographyStore, + limit: int = 20, + min_count: int = 2, + match: str | None = None, + topic_slug: str | None = None, + apply: bool = False, + review_status: str = "enriched", + allow_unsafe_matches: bool = False, + ) -> list[TalkOriginsEnrichmentResult]: + duplicate_groups, grouped_entries = _collect_duplicate_groups( + manifest_path, + match=match, + topic_slug=topic_slug, + ) + results: list[TalkOriginsEnrichmentResult] = [] + + for group_key, items in sorted(duplicate_groups.items()): + if len(items) < min_count: + continue + canonical = _build_canonical_preview(grouped_entries[group_key]) + weak_reasons_before = _canonical_weaknesses(canonical) + if not weak_reasons_before: + continue + resolution = None + error = "" + try: + resolution = self.resolver.resolve_entry(canonical) + except Exception as exc: + error = str(exc) + + result = TalkOriginsEnrichmentResult( + key=group_key, + citation_key=canonical.citation_key, + weak_reasons_before=weak_reasons_before, + 
resolved=resolution is not None, + applied=False, + source_label=resolution.source_label if resolution is not None else "", + error=error, + ) + + if resolution is not None: + if not allow_unsafe_matches and not _is_safe_enrichment_match(canonical, resolution): + result.resolved = False + result.source_label = resolution.source_label + result.error = "unsafe resolver match" + results.append(result) + if len(results) >= limit: + break + continue + merged, conflicts = merge_entries_with_conflicts(canonical, resolution.entry) + if canonical.entry_type == "misc" and resolution.entry.entry_type != "misc": + merged = BibEntry( + entry_type=resolution.entry.entry_type, + citation_key=merged.citation_key, + fields=merged.fields, + ) + result.conflicts = conflicts + result.weak_reasons_after = _canonical_weaknesses(merged) + if apply: + store_key = _find_store_citation_key(store, canonical) + if store_key: + store.replace_entry( + store_key, + merged, + source_type=resolution.source_type, + source_label=resolution.source_label, + review_status=review_status, + ) + if conflicts: + store.record_conflicts( + store_key, + conflicts, + source_type=resolution.source_type, + source_label=resolution.source_label, + ) + result.citation_key = store_key + result.applied = True + results.append(result) + if len(results) >= limit: + break + + if apply: + store.connection.commit() + return results + + def build_review_export( + self, + manifest_path: str | Path, + store: BibliographyStore, + limit: int = 20, + min_count: int = 2, + match: str | None = None, + topic_slug: str | None = None, + ) -> TalkOriginsReviewExport: + clusters = self.inspect_duplicate_clusters( + manifest_path, + limit=limit, + min_count=min_count, + match=match, + topic_slug=topic_slug, + preview_canonical=True, + weak_only=True, + ) + enrichment_results = self.enrich_weak_canonicals( + manifest_path, + store, + limit=limit, + min_count=min_count, + match=match, + topic_slug=topic_slug, + apply=False, + ) + by_key = {result.key: result for result in enrichment_results} + items: list[dict[str, object]] = [] + for cluster in clusters: + result = by_key.get(cluster.key) + payload = { + "key": cluster.key, + "count": cluster.count, + "items": cluster.items, + "canonical": cluster.canonical, + "enrichment": asdict(result) if result is not None else None, + } + items.append(payload) + return TalkOriginsReviewExport( + manifest_path=str(Path(manifest_path).resolve()), + item_count=len(items), + items=items, + ) + + def apply_review_corrections( + self, + manifest_path: str | Path, + corrections_path: str | Path, + store: BibliographyStore, + default_review_status: str = "reviewed", + ) -> list[TalkOriginsCorrectionResult]: + duplicate_groups, grouped_entries = _collect_duplicate_groups(manifest_path) + payload = json.loads(Path(corrections_path).read_text(encoding="utf-8")) + correction_items = payload.get("corrections", []) + results: list[TalkOriginsCorrectionResult] = [] + + for item in correction_items: + key = str(item.get("key") or "") + if not key: + results.append(TalkOriginsCorrectionResult(key="", citation_key="", applied=False, error="missing key")) + continue + entries = grouped_entries.get(key) + if not entries: + results.append(TalkOriginsCorrectionResult(key=key, citation_key="", applied=False, error="unknown key")) + continue + + canonical = _build_canonical_preview(entries) + store_key = _find_store_citation_key(store, canonical) + if not store_key: + results.append(TalkOriginsCorrectionResult(key=key, 
citation_key=canonical.citation_key, applied=False, error="entry not found in store")) + continue + + corrected = BibEntry( + entry_type=str(item.get("entry_type") or canonical.entry_type), + citation_key=store_key, + fields=dict(canonical.fields), + ) + override_fields = item.get("fields", {}) + if isinstance(override_fields, dict): + for field_name, value in override_fields.items(): + if value is None: + corrected.fields.pop(str(field_name), None) + else: + corrected.fields[str(field_name)] = str(value) + + review_status = str(item.get("review_status") or default_review_status) + store.replace_entry( + store_key, + corrected, + source_type="manual_review", + source_label=f"talkorigins_corrections:{Path(corrections_path).resolve()}", + review_status=review_status, + ) + results.append(TalkOriginsCorrectionResult(key=key, citation_key=store_key, applied=True)) + + store.connection.commit() + return results + + def ingest_export( + self, + manifest_path: str | Path, + store: BibliographyStore, + review_status: str = "draft", + dedupe: bool = True, + ) -> TalkOriginsIngestReport: + manifest = json.loads(Path(manifest_path).read_text(encoding="utf-8")) + seed_sets = manifest.get("seed_sets", []) + topic_count = len(seed_sets) + raw_entry_count = sum(int(item.get("parsed_entry_count", 0)) for item in seed_sets) + + grouped: dict[str, list[tuple[dict[str, object], BibEntry]]] = {} + canonicalized_count = 0 + duplicate_entry_count = 0 + + for seed_set in seed_sets: + seed_bib = seed_set.get("seed_bib") + if not isinstance(seed_bib, str) or not seed_bib: + continue + entries = parse_bib_file(seed_bib) + for entry in entries: + group_key = _duplicate_key(entry) if dedupe else entry.citation_key + if not group_key: + group_key = entry.citation_key + grouped.setdefault(group_key, []).append((seed_set, entry)) + + stored_entry_count = 0 + duplicate_cluster_count = 0 + source_label = str(Path(manifest_path).resolve()) + key_owners: dict[str, str] = {} + existing_rows = store.connection.execute("SELECT citation_key FROM entries").fetchall() + for row in existing_rows: + key_owners[str(row["citation_key"])] = "__existing__" + + for group_key, items in grouped.items(): + if len(items) > 1: + duplicate_cluster_count += 1 + duplicate_entry_count += len(items) + + canonical = _select_canonical_entry([entry for _, entry in items]) + for _, duplicate in items: + if duplicate.citation_key != canonical.citation_key: + canonical = merge_entries(canonical, duplicate) + canonicalized_count += 1 + canonical = _assign_canonical_key(canonical, group_key, key_owners) + + store.upsert_entry( + canonical, + raw_bibtex=render_bibtex([canonical]), + source_type="talkorigins", + source_label=source_label, + review_status=review_status, + ) + stored_entry_count += 1 + + seen_topics: set[str] = set() + for seed_set, _ in items: + topic_slug = str(seed_set.get("slug") or _slugify(str(seed_set.get("topic") or ""))) + if topic_slug in seen_topics: + continue + seen_topics.add(topic_slug) + store.add_entry_topic( + canonical.citation_key, + topic_slug=topic_slug, + topic_name=str(seed_set.get("topic") or topic_slug), + source_type="talkorigins", + source_url=str(seed_set.get("url") or ""), + source_label=source_label, + ) + + store.connection.commit() + return TalkOriginsIngestReport( + manifest_path=str(Path(manifest_path).resolve()), + topic_count=topic_count, + raw_entry_count=raw_entry_count, + stored_entry_count=stored_entry_count, + duplicate_cluster_count=duplicate_cluster_count, + 
duplicate_entry_count=duplicate_entry_count, + canonicalized_count=canonicalized_count, + ) + + def scrape_topics( + self, + base_url: str, + snapshots_dir: Path | None = None, + limit_topics: int | None = None, + resume: bool = True, + ) -> list[TalkOriginsTopic]: + index_html = self.source_client.get_text(base_url) + parser = _TopicIndexParser(base_url) + parser.feed(index_html) + + topics: list[TalkOriginsTopic] = [] + for link in parser.topic_links[:limit_topics]: + slug = _slugify(link["topic"]) + snapshot_path = snapshots_dir / f"{slug}.json" if snapshots_dir is not None else None + snapshot = _load_snapshot(snapshot_path) if resume and snapshot_path is not None else None + if snapshot is not None: + raw_entries = list(snapshot.get("raw_entries", [])) + else: + page_html = self.source_client.get_text(link["url"]) + topic_parser = _TopicPageParser() + topic_parser.feed(page_html) + raw_entries = normalize_topic_entries(topic_parser.preformatted_text()) + if snapshot_path is not None: + snapshot_payload = { + "topic": link["topic"], + "url": link["url"], + "raw_entries": raw_entries, + } + snapshot_path.write_text(json.dumps(snapshot_payload, indent=2, sort_keys=True) + "\n", encoding="utf-8") + topics.append(TalkOriginsTopic(topic=link["topic"], url=link["url"], raw_entries=raw_entries)) + return topics + + def parse_reference_entry(self, raw_entry: str, ordinal: int) -> BibEntry | None: + year_match = YEAR_PATTERN.search(raw_entry) + if year_match is None: + return None + + year = year_match.group(0) + author_part = raw_entry[: year_match.start()].strip(" ,.;:") + remainder = raw_entry[year_match.end() :].strip(" ,.;:") + if not author_part or not remainder: + return None + + title, venue = _split_title_and_venue(remainder) + if not title: + return None + + authors = _normalize_gsa_authors(author_part) + citation_key = _make_citation_key(authors, year, title, ordinal) + entry_type = _guess_entry_type(remainder) + fields = { + "author": authors, + "year": year, + "title": title, + "note": f"talkorigins_source = {{true}}; raw_reference = {{{raw_entry}}}", + } + if entry_type == "book": + normalized = _normalize_incollection_candidate(title, venue) + if normalized is not None: + title = normalized["title"] + fields["title"] = title + entry_type = "incollection" + if normalized.get("editor"): + fields["editor"] = normalized["editor"] + if normalized.get("booktitle"): + fields["booktitle"] = normalized["booktitle"] + if normalized.get("publisher"): + fields["publisher"] = normalized["publisher"] + venue = "" + if venue: + if entry_type == "article": + fields["journal"] = venue + elif entry_type == "inproceedings": + fields["booktitle"] = venue + elif entry_type == "incollection": + fields["booktitle"] = venue + elif entry_type in {"book", "phdthesis", "mastersthesis"}: + fields["publisher"] = venue + else: + fields["howpublished"] = venue + + return BibEntry(entry_type=entry_type, citation_key=citation_key, fields=fields) + + def _augment_entry(self, entry: BibEntry) -> BibEntry: + try: + resolution = self.resolver.resolve_entry(entry) + except Exception: + return entry + if resolution is None: + return entry + return merge_entries(entry, resolution.entry) + + +def normalize_topic_entries(text: str) -> list[str]: + entries: list[str] = [] + previous_authors = "" + current: list[str] = [] + + for raw_line in text.splitlines(): + line = raw_line.strip() + if not line: + if current: + entry_text = " ".join(current) + normalized = _normalize_repeated_authors(entry_text, previous_authors) + 
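# store the finished block and carry its author prefix forward for ditto-style entries
+                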
entries.append(normalized) + previous_authors = _extract_author_prefix(normalized) or previous_authors + current = [] + continue + current.append(WHITESPACE_PATTERN.sub(" ", line)) + + if current: + entry_text = " ".join(current) + normalized = _normalize_repeated_authors(entry_text, previous_authors) + entries.append(normalized) + + return entries + + +def _normalize_repeated_authors(entry_text: str, previous_authors: str) -> str: + if previous_authors and REPEATED_AUTHOR_PATTERN.match(entry_text): + return REPEATED_AUTHOR_PATTERN.sub(f"{previous_authors}, ", entry_text, count=1) + return entry_text + + +def _extract_author_prefix(entry_text: str) -> str: + year_match = YEAR_PATTERN.search(entry_text) + if year_match is None: + return "" + return entry_text[: year_match.start()].strip(" ,;:") + + +def _split_title_and_venue(remainder: str) -> tuple[str, str]: + if ": " in remainder: + title, venue = remainder.split(": ", 1) + return _clean_fragment(title), _clean_fragment(venue) + + parts = [part.strip() for part in remainder.split(". ") if part.strip()] + if not parts: + return "", "" + title = parts[0] + venue = ". ".join(parts[1:]) if len(parts) > 1 else "" + return _clean_fragment(title), _clean_fragment(venue) + + +def _normalize_gsa_authors(author_part: str) -> str: + cleaned = WHITESPACE_PATTERN.sub(" ", author_part.replace("&", " and ")).strip(" ,;:") + if " and " in cleaned and "," not in cleaned: + return cleaned + + fragments = [fragment.strip() for fragment in cleaned.split(",") if fragment.strip()] + if len(fragments) < 2: + return cleaned + + authors: list[str] = [] + index = 0 + while index + 1 < len(fragments): + family = re.sub(r"^(and)\s+", "", fragments[index], flags=re.IGNORECASE).strip() + given = re.sub(r"^(and)\s+", "", fragments[index + 1], flags=re.IGNORECASE).strip() + if family and given: + authors.append(f"{family}, {given}") + index += 2 + + if index < len(fragments): + trailing = re.sub(r"^(and)\s+", "", fragments[index], flags=re.IGNORECASE).strip() + if trailing: + authors.append(trailing) + + return " and ".join(authors) if authors else cleaned + + +def _make_citation_key(authors: str, year: str, title: str, ordinal: int) -> str: + first_author = authors.split(" and ")[0] + family = first_author.split(",", 1)[0] if "," in first_author else first_author.split()[-1] + family = re.sub(r"[^A-Za-z0-9]+", "", family).lower() or "ref" + first_word = re.sub(r"[^A-Za-z0-9]+", "", title.split()[0]).lower() if title.split() else "untitled" + first_word = first_word or "untitled" + return f"{family}{year}{first_word}{ordinal}" + + +def _guess_entry_type(text: str) -> str: + lowered = text.lower() + if "ph.d" in lowered or "dissertation" in lowered or "thesis" in lowered: + return "phdthesis" + if any( + token in lowered + for token in ( + "press", + "publisher", + "publications", + "publication", + "elsevier", + "springer", + "wiley", + "university", + "books", + ) + ): + return "book" + if any(token in lowered for token in ("proceedings", "conference", "symposium", "workshop")): + return "inproceedings" + if any(token in lowered for token in ("journal", "review", "letters", "quarterly", "science", "nature")): + return "article" + return "misc" + + +def _clean_fragment(value: str) -> str: + return WHITESPACE_PATTERN.sub(" ", value.strip(" .;:,\"'")) + + +def _slugify(value: str) -> str: + slug = re.sub(r"[^A-Za-z0-9]+", "-", value.lower()).strip("-") + return slug or "topic" + + +def _normalize_incollection_candidate(title: str, venue: str) -> dict[str, str] | None: + 
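"""Recognize 'Title, in Editors, eds., Book: Publisher' venue strings.
+
+    Returns the incollection fields when the pattern matches, otherwise None.
+    """
+    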
lowered = venue.lower()
+    if ", in " not in lowered:
+        return None
+
+    split_index = lowered.find(", in ")
+    prefix = _clean_fragment(venue[:split_index])
+    container = venue[split_index + len(", in ") :].strip()
+    if not container:
+        return None
+
+    editor_match = re.match(r"^(?P<editors>.+?),\s+eds?\.,\s+(?P<rest>.+)$", container, flags=re.IGNORECASE)
+    if editor_match is None:
+        return None
+
+    editor_text = _normalize_gsa_authors(editor_match.group("editors"))
+    rest = editor_match.group("rest").strip()
+    if ": " in rest:
+        booktitle, publisher = rest.split(": ", 1)
+    else:
+        booktitle, publisher = rest, ""
+
+    normalized_title = title
+    if prefix:
+        normalized_title = _clean_fragment(f"{title}: {prefix}")
+
+    payload = {
+        "title": normalized_title,
+        "editor": editor_text,
+        "booktitle": _clean_fragment(booktitle),
+    }
+    if publisher:
+        payload["publisher"] = _clean_fragment(publisher)
+    return payload
+
+
+def _load_snapshot(path: Path | None) -> dict[str, object] | None:
+    if path is None or not path.exists():
+        return None
+    return json.loads(path.read_text(encoding="utf-8"))
+
+
+def parse_bib_file(path: str | Path) -> list[BibEntry]:
+    from .bibtex import parse_bibtex
+
+    return parse_bibtex(Path(path).read_text(encoding="utf-8"))
+
+
+def _render_plaintext_topic(topic: str, raw_entries: list[str]) -> str:
+    body = "\n\n".join(raw_entries)
+    return f"{topic}\n\n{body}\n" if body else f"{topic}\n"
+
+
+def _render_topic_page(topic: str, entry_pairs: list[tuple[str, BibEntry | None]], seed_filename: str) -> str:
+    entry_blocks: list[str] = []
+    for index, (raw_entry, parsed_entry) in enumerate(entry_pairs, start=1):
+        bibtex_block = ""
+        if parsed_entry is not None:
+            bibtex_block = render_bibtex([parsed_entry])
+        safe_plain = _html_escape(raw_entry)
+        safe_bibtex = _html_escape(bibtex_block)
+        entry_blocks.append(
+            "\n".join(
+                [
+                    '  <div class="entry">',
+                    f'    <p class="raw">{safe_plain}</p>',
+                    f'    <p class="entry-number">Entry {index}</p>',
+                    f'    <pre class="bibtex">{safe_bibtex}</pre>',
+                    "  </div>",
+                ]
+            )
+        )
+
+    # topic pages live under site/topics/, so links climb out for the index and the seed file
+    return "\n".join(
+        [
+            "<!DOCTYPE html>",
+            '<html lang="en">',
+            "<head>",
+            '  <meta charset="utf-8">',
+            f"  <title>{_html_escape(topic)} bibliography</title>",
+            "</head>",
+            "<body>",
+            f"  <h1>{_html_escape(topic)}</h1>",
+            f'  <p><a href="../index.html">Back to index</a> | <a href="../../seeds/{seed_filename}">Seed BibTeX</a></p>',
+            *entry_blocks,
+            "</body>",
+            "</html>",
+        ]
+    ) + "\n"
+
+
+def _render_site_index(seed_sets: list[TalkOriginsSeedSet], full_bib_name: str, full_plaintext_name: str) -> str:
+    items = [
+        f'      <li><a href="topics/{item.slug}.html">{_html_escape(item.topic)}</a> '
+        f'({item.parsed_entry_count} entries) <a href="../seeds/{item.slug}.bib">seed</a></li>'
+        for item in seed_sets
+    ]
+    return "\n".join(
+        [
+            "<!DOCTYPE html>",
+            '<html lang="en">',
+            "<head>",
+            '  <meta charset="utf-8">',
+            "  <title>TalkOrigins bibliography reconstruction</title>",
+            "</head>",
+            "<body>",
+            "  <h1>TalkOrigins bibliography reconstruction</h1>",
+            "  <p>Downloads:",
+            f'    <a href="../{full_bib_name}">{full_bib_name}</a> |',
+            f'    <a href="../{full_plaintext_name}">{full_plaintext_name}</a>',
+            "  </p>",
+            "  <h2>Topics</h2>",
+            "  <ul>",
+            *items,
+            "  </ul>",
+            "</body>",
+            "</html>",
+        ]
+    ) + "\n"
+
+
+def _html_escape(value: str) -> str:
+    # escape the ampersand first so the entity substitutions below are not double-escaped
+    return (
+        value.replace("&", "&amp;")
+        .replace("<", "&lt;")
+        .replace(">", "&gt;")
+        .replace('"', "&quot;")
+    )
+
+
+def _collect_duplicate_groups(
+    manifest_path: str | Path,
+    match: str | None = None,
+    topic_slug: str | None = None,
+) -> tuple[dict[str, list[dict[str, str]]], dict[str, list[BibEntry]]]:
+    manifest = json.loads(Path(manifest_path).read_text(encoding="utf-8"))
+    seed_sets = manifest.get("seed_sets", [])
+    match_text = match.casefold() if match else None
+    duplicate_groups: dict[str, list[dict[str, str]]] = {}
+    grouped_entries: dict[str, list[BibEntry]] = {}
+
+    for seed_set in seed_sets:
+        seed_bib = seed_set.get("seed_bib")
+        if not isinstance(seed_bib, str) or not seed_bib:
+            continue
+        current_topic_slug = str(seed_set.get("slug") or _slugify(str(seed_set.get("topic") or "")))
+        if topic_slug and current_topic_slug != topic_slug:
+            continue
+        path = Path(seed_bib)
+        if not path.exists():
+            continue
+        for entry in parse_bib_file(path):
+            duplicate_key = _duplicate_key(entry)
+            if not duplicate_key:
+                continue
+            item = {
+                "citation_key": entry.citation_key,
+                "title": entry.fields.get("title", ""),
+                "author": entry.fields.get("author", ""),
+                "year": entry.fields.get("year", ""),
+                "seed_bib": str(path),
+                "topic": str(seed_set.get("topic") or ""),
+                "topic_slug": current_topic_slug,
+            }
+            if match_text and not _duplicate_item_matches(item, duplicate_key, match_text):
+                continue
+            duplicate_groups.setdefault(duplicate_key, []).append(item)
+            grouped_entries.setdefault(duplicate_key, []).append(entry)
+
+    return duplicate_groups, grouped_entries
+
+
+def _duplicate_key(entry: BibEntry) -> str:
+    author = _normalize_duplicate_text(entry.fields.get("author", ""))
+    title = _normalize_duplicate_text(entry.fields.get("title", ""))
+    year = entry.fields.get("year", "").strip()
+    if not author or not title or not year:
+        return ""
+    first_author = author.split(" and ")[0]
+    return f"{first_author}|{year}|{title}"
+
+
+def _duplicate_item_matches(item: dict[str, str], duplicate_key: str, match_text: str) -> bool:
+    haystacks = (
+        duplicate_key,
+        item.get("citation_key", ""),
+        item.get("title", ""),
+        item.get("author", ""),
+        item.get("year", ""),
+        item.get("topic", ""),
+        item.get("topic_slug", ""),
+        item.get("seed_bib", ""),
+    )
+    return any(match_text in value.casefold() for value in haystacks if value)
+
+
+def _normalize_duplicate_text(value: str) -> str:
+    normalized = value.lower()
+    normalized = normalized.replace("&", " and ")
+    normalized = re.sub(r"[^a-z0-9\s]+", " ", normalized)
+    normalized = re.sub(r"\s+", " ", normalized).strip()
+    return normalized
+
+
+def _topic_phrase_tokens(value: str) -> list[str]:
+    return [
+        token
+        for token in _normalize_duplicate_text(value).split()
+        if len(token) >= 4 and token not in TOPIC_PHRASE_STOPWORDS
+    ]
+
+
+def _suggest_topic_keywords(entries: list[BibEntry], topic_name: str, max_keywords: int = 4) -> list[str]:
+    topic_terms = set(_topic_phrase_tokens(topic_name))
+    counts: Counter[str] = Counter()
+    for entry in entries:
+        for term in set(_topic_phrase_tokens(entry.fields.get("title", ""))):
+            if term in topic_terms:
+                continue
+            counts[term] += 1
+    ranked = sorted(counts.items(), key=lambda item: (-item[1], item[0]))
+    if len(entries) <= 1:
+        max_keywords = min(max_keywords, 1)
+    elif len(entries) <= 3:
+        max_keywords = min(max_keywords, 2)
+    filtered = [(term, count) for term, count in ranked if count >= 2]
+    selected = filtered if filtered 
else ranked[:max_keywords] + return [term for term, _ in selected[:max_keywords]] + + +def _topic_phrase_review_reasons(entries: list[BibEntry], keywords: list[str]) -> list[str]: + reasons: list[str] = [] + if len(entries) <= 1: + reasons.append("single_entry_topic") + elif len(entries) <= 3: + reasons.append("small_topic") + if not keywords: + reasons.append("no_keyword_signal") + elif len(keywords) == 1: + reasons.append("thin_keyword_signal") + if any(_looks_noisy_keyword(keyword) for keyword in keywords): + reasons.append("noisy_keywords") + return reasons + + +def _looks_noisy_keyword(keyword: str) -> bool: + if len(keyword) <= 3: + return True + if any(char.isdigit() for char in keyword): + return True + noisy_tokens = {"boundry", "colloquium", "edition", "history", "idea", "central", "bearing", "time"} + return keyword in noisy_tokens + + +def _select_canonical_entry(entries: list[BibEntry]) -> BibEntry: + return max( + entries, + key=lambda entry: ( + _entry_richness(entry), + -len(entry.citation_key), + entry.citation_key, + ), + ) + + +def _build_canonical_preview(entries: list[BibEntry]) -> BibEntry: + canonical = _select_canonical_entry(entries) + for duplicate in entries: + if duplicate.citation_key != canonical.citation_key: + canonical = merge_entries(canonical, duplicate) + return canonical + + +def _canonical_weaknesses(entry: BibEntry) -> list[str]: + reasons: list[str] = [] + if entry.entry_type == "misc": + reasons.append("entry_type:misc") + if not entry.fields.get("doi"): + reasons.append("missing:doi") + if _entry_richness(entry) < 6: + reasons.append("low_field_richness") + if entry.entry_type in {"article", "inproceedings", "incollection"} and not ( + entry.fields.get("journal") or entry.fields.get("booktitle") + ): + reasons.append("missing:venue") + return reasons + + +def _find_store_citation_key(store: BibliographyStore, entry: BibEntry) -> str | None: + if store.get_entry(entry.citation_key) is not None: + return entry.citation_key + + first_author = entry.fields.get("author", "").split(" and ")[0].strip() + row = store.connection.execute( + """ + SELECT e.citation_key + FROM entries e + LEFT JOIN entry_creators ec + ON ec.entry_id = e.id AND ec.role = 'author' AND ec.ordinal = 1 + LEFT JOIN creators c + ON c.id = ec.creator_id + WHERE COALESCE(e.title, '') = ? + AND COALESCE(e.year, '') = ? + AND COALESCE(c.full_name, '') = ? 
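-- deterministic pick when several stored rows match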
+ ORDER BY e.citation_key + LIMIT 1 + """, + ( + entry.fields.get("title", ""), + entry.fields.get("year", ""), + first_author, + ), + ).fetchone() + if row is None: + return None + return str(row["citation_key"]) + + +def _is_safe_enrichment_match(base: BibEntry, resolution: object) -> bool: + source_label = getattr(resolution, "source_label", "") + resolved_entry = getattr(resolution, "entry", None) + if not isinstance(source_label, str) or resolved_entry is None: + return False + if ":search:" not in source_label: + return True + + base_title = _normalize_duplicate_text(base.fields.get("title", "")) + resolved_title = _normalize_duplicate_text(resolved_entry.fields.get("title", "")) + if not base_title or base_title != resolved_title: + return False + + base_year = (base.fields.get("year") or "").strip() + resolved_year = (resolved_entry.fields.get("year") or "").strip() + if base_year and resolved_year and base_year == resolved_year: + return True + + base_author = _normalize_duplicate_text(base.fields.get("author", "")) + resolved_author = _normalize_duplicate_text(resolved_entry.fields.get("author", "")) + if not base_author or not resolved_author: + return False + base_first = base_author.split(" and ")[0].split()[0] + resolved_first = resolved_author.split(" and ")[0].split()[0] + return bool(base_first and resolved_first and base_first == resolved_first) + + +def _entry_richness(entry: BibEntry) -> int: + score = 0 + for field_name, value in entry.fields.items(): + if value: + score += 3 if field_name in {"doi", "url", "abstract", "publisher", "journal", "booktitle", "editor"} else 1 + return score + + +def _assign_canonical_key(entry: BibEntry, group_key: str, key_owners: dict[str, str]) -> BibEntry: + base_key = entry.citation_key + owner = key_owners.get(base_key) + if owner is None or owner == group_key: + key_owners[base_key] = group_key + return entry + + suffix = hashlib.sha1(group_key.encode("utf-8")).hexdigest()[:8] + candidate = f"{base_key}_{suffix}" + counter = 2 + while candidate in key_owners and key_owners[candidate] != group_key: + candidate = f"{base_key}_{suffix}_{counter}" + counter += 1 + key_owners[candidate] = group_key + return BibEntry(entry_type=entry.entry_type, citation_key=candidate, fields=dict(entry.fields)) + + +def _is_suspicious_entry_type(entry: BibEntry) -> bool: + journal = entry.fields.get("journal", "").lower() + publisher = entry.fields.get("publisher", "").lower() + howpublished = entry.fields.get("howpublished", "").lower() + if entry.entry_type == "article" and any( + token in journal + for token in ("elsevier", "springer", "press", "publications", "publisher", "university") + ): + return True + if entry.entry_type == "misc" and any( + token in howpublished + for token in ("journal", "review", "letters", "proceedings", "conference", "symposium") + ): + return True + if entry.entry_type == "book" and any( + token in publisher for token in ("journal", "review", "letters", "proceedings", "conference") + ) and not any( + token in publisher for token in ("press", "academic", "elsevier", "springer", "wiley", "university") + ): + return True + if entry.entry_type == "incollection" and not entry.fields.get("booktitle"): + return True + return False + + +class _TopicIndexParser(HTMLParser): + def __init__(self, base_url: str) -> None: + super().__init__() + self.base_url = base_url + self.base_prefix = base_url if base_url.endswith("/") else base_url + "/" + self.topic_links: list[dict[str, str]] = [] + self._current_href: str | None = None + 
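# buffered text of the anchor currently being read; emptied on every closing tag
+        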
self._current_text: list[str] = [] + self._seen_urls: set[str] = set() + + def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None: + if tag != "a": + return + href = dict(attrs).get("href") + if not href or href.startswith("#"): + return + self._current_href = urljoin(self.base_url, href) + self._current_text = [] + + def handle_data(self, data: str) -> None: + if self._current_href is not None: + self._current_text.append(data) + + def handle_endtag(self, tag: str) -> None: + if tag != "a" or self._current_href is None: + return + topic = WHITESPACE_PATTERN.sub(" ", "".join(self._current_text)).strip() + href = self._current_href + self._current_href = None + self._current_text = [] + if not topic or href in self._seen_urls: + return + parsed = urlparse(href) + base_parsed = urlparse(self.base_prefix) + if parsed.netloc and base_parsed.netloc and parsed.netloc != base_parsed.netloc: + return + if not href.startswith(self.base_prefix): + return + if href.rstrip("/").endswith("biblio") or href.endswith("origins.html"): + return + self._seen_urls.add(href) + self.topic_links.append({"topic": topic, "url": href}) + + +class _TopicPageParser(HTMLParser): + def __init__(self) -> None: + super().__init__() + self._bibliography_depth = 0 + self._in_pre = False + self._in_paragraph = False + self._current_paragraph: list[str] = [] + self._parts: list[str] = [] + + def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None: + attributes = dict(attrs) + if tag == "div" and "bibliography" in (attributes.get("class") or "").split(): + self._bibliography_depth += 1 + return + if tag == "pre": + self._in_pre = True + return + if self._bibliography_depth and tag == "p": + self._in_paragraph = True + self._current_paragraph = [] + + def handle_endtag(self, tag: str) -> None: + if tag == "div" and self._bibliography_depth: + self._bibliography_depth -= 1 + return + if tag == "p" and self._in_paragraph: + text = "".join(self._current_paragraph).strip() + if text: + self._parts.append(text) + self._parts.append("\n\n") + self._current_paragraph = [] + self._in_paragraph = False + return + if tag == "pre": + self._in_pre = False + self._parts.append("\n") + + def handle_data(self, data: str) -> None: + if self._bibliography_depth and self._in_paragraph: + self._current_paragraph.append(data) + elif self._in_pre: + self._parts.append(data) + + def preformatted_text(self) -> str: + return "".join(self._parts) diff --git a/tests/test_batch.py b/tests/test_batch.py new file mode 100644 index 0000000..9fe71b6 --- /dev/null +++ b/tests/test_batch.py @@ -0,0 +1,129 @@ +from pathlib import Path + +from citegeist.batch import BatchBootstrapRunner, load_batch_jobs +from citegeist.cli import main +from citegeist.storage import BibliographyStore + + +def test_load_batch_jobs_accepts_object_with_jobs(tmp_path: Path): + path = tmp_path / "jobs.json" + path.write_text( + """ +{ + "jobs": [ + {"name": "topic-only", "topic": "graph topic"}, + {"name": "seed-only", "seed_bib": "seed.bib"} + ] +} +""", + encoding="utf-8", + ) + + jobs = load_batch_jobs(path) + + assert jobs[0]["name"] == "topic-only" + assert jobs[1]["seed_bib"] == str((tmp_path / "seed.bib").resolve()) + + +def test_batch_runner_executes_multiple_jobs(tmp_path: Path): + seed_bib = tmp_path / "seed.bib" + seed_bib.write_text( + """ +@article{seed2024, + author = {Seed, Alice}, + title = {Seed Paper}, + year = {2024} +} +""", + encoding="utf-8", + ) + jobs = [ + {"name": "seed-job", "seed_bib": str(seed_bib), 
"expand": False}, + {"name": "topic-job", "topic": "graph topic", "expand": False, "preview": True}, + ] + + runner = BatchBootstrapRunner() + from citegeist import BibEntry + + runner.bootstrapper.resolver.search_openalex = lambda topic, limit=5: [ # type: ignore[method-assign] + BibEntry(entry_type="article", citation_key="topic2024graph", fields={"title": "Graph Topic Result"}) + ] + runner.bootstrapper.resolver.search_crossref = lambda topic, limit=5: [] # type: ignore[method-assign] + runner.bootstrapper.resolver.search_datacite = lambda topic, limit=5: [] # type: ignore[method-assign] + + store = BibliographyStore() + try: + results = runner.run(store, jobs) + assert [job.job_name for job in results] == ["seed-job", "topic-job"] + assert results[0].result_count == 1 + assert results[1].results[0].citation_key == "topic2024graph" + assert store.get_entry("seed2024") is not None + assert store.get_entry("topic2024graph") is None + finally: + store.close() + + +def test_batch_runner_can_store_topic_phrase_metadata(): + jobs = [ + { + "name": "topic-job", + "topic": "graph topic", + "topic_slug": "graph-methods", + "topic_name": "Graph Methods", + "topic_phrase": "graph networks biology", + "expand": False, + "preview": False, + } + ] + + runner = BatchBootstrapRunner() + from citegeist import BibEntry + + runner.bootstrapper.resolver.search_openalex = lambda topic, limit=5: [ # type: ignore[method-assign] + BibEntry(entry_type="article", citation_key="topic2024graph", fields={"title": "Graph Topic Result"}) + ] + runner.bootstrapper.resolver.search_crossref = lambda topic, limit=5: [] # type: ignore[method-assign] + runner.bootstrapper.resolver.search_datacite = lambda topic, limit=5: [] # type: ignore[method-assign] + + store = BibliographyStore() + try: + runner.run(store, jobs) + topic = store.get_topic("graph-methods") + assert topic is not None + assert topic["name"] == "Graph Methods" + assert topic["expansion_phrase"] == "graph networks biology" + finally: + store.close() + + +def test_bootstrap_batch_cli_runs_json_jobs(tmp_path: Path): + seed_bib = tmp_path / "seed.bib" + seed_bib.write_text( + """ +@article{seed2024, + author = {Seed, Alice}, + title = {Seed Paper}, + year = {2024} +} +""", + encoding="utf-8", + ) + batch_json = tmp_path / "jobs.json" + batch_json.write_text( + f""" +[ + {{"name": "seed-job", "seed_bib": "{seed_bib}", "expand": false}}, + {{"name": "topic-job", "topic": "graph topic", "expand": false, "preview": true}} +] +""", + encoding="utf-8", + ) + + from unittest.mock import patch + + database = tmp_path / "library.sqlite3" + with patch("citegeist.cli.BatchBootstrapRunner.run") as mocked_run: + mocked_run.return_value = [] + exit_code = main(["--db", str(database), "bootstrap-batch", str(batch_json)]) + + assert exit_code == 0 diff --git a/tests/test_bootstrap.py b/tests/test_bootstrap.py new file mode 100644 index 0000000..728d8ac --- /dev/null +++ b/tests/test_bootstrap.py @@ -0,0 +1,175 @@ +from citegeist import BibliographyStore +from citegeist.bootstrap import Bootstrapper +from citegeist.cli import main + + +def test_bootstrap_from_seed_bib_only(): + store = BibliographyStore() + try: + bootstrapper = Bootstrapper() + bootstrapper.crossref_expander.expand_entry_references = lambda _store, _key: [] # type: ignore[method-assign] + bootstrapper.openalex_expander.expand_entry = lambda _store, _key, relation_type="cites", limit=5: [] # type: ignore[method-assign] + + results = bootstrapper.bootstrap( + store, + seed_bibtex=""" +@article{seed2024, + 
author = {Seed, Alice}, + title = {Seed Paper}, + year = {2024} +} +""", + expand=False, + ) + + assert [item.citation_key for item in results] == ["seed2024"] + assert store.get_entry("seed2024") is not None + finally: + store.close() + + +def test_bootstrap_from_topic_only(): + store = BibliographyStore() + try: + bootstrapper = Bootstrapper() + bootstrapper.resolver.search_openalex = lambda topic, limit=5: [] # type: ignore[method-assign] + bootstrapper.resolver.search_crossref = lambda topic, limit=5: [] # type: ignore[method-assign] + bootstrapper.resolver.search_datacite = lambda topic, limit=5: [ # type: ignore[method-assign] + __import__("citegeist").BibEntry( + entry_type="article", + citation_key="topic2024graph", + fields={"title": "Graph Topic Result", "year": "2024"}, + ) + ] + bootstrapper.crossref_expander.expand_entry_references = lambda _store, _key: [] # type: ignore[method-assign] + bootstrapper.openalex_expander.expand_entry = lambda _store, _key, relation_type="cites", limit=5: [] # type: ignore[method-assign] + + results = bootstrapper.bootstrap(store, topic="graph topic", expand=False) + + assert [item.citation_key for item in results] == ["topic2024graph"] + assert store.get_entry("topic2024graph") is not None + assert results[0].score > 0 + finally: + store.close() + + +def test_bootstrap_cli_accepts_seed_and_topic(tmp_path): + seed_bib = tmp_path / "seed.bib" + seed_bib.write_text( + """ +@article{seed2024, + author = {Seed, Alice}, + title = {Seed Paper}, + year = {2024} +} +""", + encoding="utf-8", + ) + + from unittest.mock import patch + + database = tmp_path / "library.sqlite3" + with patch("citegeist.cli.Bootstrapper.bootstrap") as mocked_bootstrap: + mocked_bootstrap.return_value = [] + exit_code = main( + [ + "--db", + str(database), + "bootstrap", + "--seed-bib", + str(seed_bib), + "--topic", + "graph topic", + "--no-expand", + ] + ) + + assert exit_code == 0 + + +def test_bootstrap_ranks_and_deduplicates_topic_candidates(): + store = BibliographyStore() + try: + bootstrapper = Bootstrapper() + from citegeist import BibEntry + + bootstrapper.resolver.search_openalex = lambda topic, limit=5: [ # type: ignore[method-assign] + BibEntry( + entry_type="article", + citation_key="shared2024graph", + fields={"title": "Graph Topic Ranking", "abstract": "graph topic graph"}, + ) + ] + bootstrapper.resolver.search_crossref = lambda topic, limit=5: [ # type: ignore[method-assign] + BibEntry( + entry_type="article", + citation_key="shared2024graph", + fields={"title": "Graph Topic Ranking", "abstract": "graph"}, + ), + BibEntry( + entry_type="article", + citation_key="crossref2024other", + fields={"title": "Less relevant paper"}, + ), + ] + bootstrapper.resolver.search_datacite = lambda topic, limit=5: [] # type: ignore[method-assign] + bootstrapper.crossref_expander.expand_entry_references = lambda _store, _key: [] # type: ignore[method-assign] + bootstrapper.openalex_expander.expand_entry = lambda _store, _key, relation_type="cites", limit=5: [] # type: ignore[method-assign] + + results = bootstrapper.bootstrap(store, topic="graph topic", expand=False, topic_limit=5) + + topic_results = [item for item in results if item.origin == "topic"] + assert [item.citation_key for item in topic_results] == ["shared2024graph", "crossref2024other"] + assert topic_results[0].score > topic_results[1].score + finally: + store.close() + + +def test_bootstrap_preview_does_not_write_to_database(): + store = BibliographyStore() + try: + bootstrapper = Bootstrapper() + from citegeist 
import BibEntry + + bootstrapper.resolver.search_openalex = lambda topic, limit=5: [ # type: ignore[method-assign] + BibEntry(entry_type="article", citation_key="preview2024graph", fields={"title": "Preview Graph Topic"}) + ] + bootstrapper.resolver.search_crossref = lambda topic, limit=5: [] # type: ignore[method-assign] + bootstrapper.resolver.search_datacite = lambda topic, limit=5: [] # type: ignore[method-assign] + + results = bootstrapper.bootstrap(store, topic="graph topic", expand=False, preview_only=True) + + assert [item.citation_key for item in results] == ["preview2024graph"] + assert store.get_entry("preview2024graph") is None + finally: + store.close() + + +def test_bootstrap_topic_commit_limit_restricts_persisted_candidates(): + store = BibliographyStore() + try: + bootstrapper = Bootstrapper() + from citegeist import BibEntry + + bootstrapper.resolver.search_openalex = lambda topic, limit=5: [ # type: ignore[method-assign] + BibEntry(entry_type="article", citation_key="rank1", fields={"title": "Graph Topic One"}), + BibEntry(entry_type="article", citation_key="rank2", fields={"title": "Graph Topic Two"}), + ] + bootstrapper.resolver.search_crossref = lambda topic, limit=5: [] # type: ignore[method-assign] + bootstrapper.resolver.search_datacite = lambda topic, limit=5: [] # type: ignore[method-assign] + bootstrapper.crossref_expander.expand_entry_references = lambda _store, _key: [] # type: ignore[method-assign] + bootstrapper.openalex_expander.expand_entry = lambda _store, _key, relation_type="cites", limit=5: [] # type: ignore[method-assign] + + results = bootstrapper.bootstrap( + store, + topic="graph topic", + expand=False, + topic_limit=5, + topic_commit_limit=1, + ) + + assert [item.citation_key for item in results if item.origin == "topic"] == ["rank1"] + assert store.get_entry("rank1") is not None + assert store.get_entry("rank2") is None + finally: + store.close() diff --git a/tests/test_cli.py b/tests/test_cli.py index 4fed32c..7ab29c9 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -119,7 +119,7 @@ def test_cli_resolve_updates_entry(tmp_path: Path): citation_key="resolvedkey", fields={ "author": "Smith, Jane", - "title": "Graph-first bibliography augmentation", + "title": "Resolved Graph-first bibliography augmentation", "year": "2024", "doi": "10.1000/example-doi", "journal": "Journal of Graph Studies", @@ -138,6 +138,803 @@ def test_cli_resolve_updates_entry(tmp_path: Path): ) assert exit_code == 0 + show = run_cli(tmp_path, "show", "--conflicts", "smith2024graphs") + assert show.returncode == 0 + payload = json.loads(show.stdout) + assert payload["field_conflicts"][0]["field_name"] == "title" + + +def test_cli_resolve_conflicts_updates_status(tmp_path: Path): + bib_path = tmp_path / "input.bib" + bib_path.write_text( + """ +@article{smith2024graphs, + author = {Smith, Jane}, + title = {Graph-first bibliography augmentation}, + year = {2024} +} +""", + encoding="utf-8", + ) + ingest = run_cli(tmp_path, "ingest", str(bib_path)) + assert ingest.returncode == 0 + + from citegeist.storage import BibliographyStore + + database = tmp_path / "library.sqlite3" + store = BibliographyStore(database) + try: + store.record_conflicts( + "smith2024graphs", + [ + { + "field_name": "title", + "current_value": "Graph-first bibliography augmentation", + "proposed_value": "Resolved title", + } + ], + source_type="resolver", + source_label="openalex:search:Graph-first bibliography augmentation", + ) + finally: + store.close() + + result = run_cli(tmp_path, 
"resolve-conflicts", "smith2024graphs", "title", "accepted") + assert result.returncode == 0 + assert "accepted" in result.stdout + + +def test_cli_apply_conflict_updates_entry_value(tmp_path: Path): + bib_path = tmp_path / "input.bib" + bib_path.write_text( + """ +@article{smith2024graphs, + author = {Smith, Jane}, + title = {Graph-first bibliography augmentation}, + year = {2024} +} +""", + encoding="utf-8", + ) + ingest = run_cli(tmp_path, "ingest", str(bib_path)) + assert ingest.returncode == 0 + + from citegeist.storage import BibliographyStore + + database = tmp_path / "library.sqlite3" + store = BibliographyStore(database) + try: + store.record_conflicts( + "smith2024graphs", + [ + { + "field_name": "title", + "current_value": "Graph-first bibliography augmentation", + "proposed_value": "Resolved Graph-first bibliography augmentation", + } + ], + source_type="resolver", + source_label="openalex:search:Graph-first bibliography augmentation", + ) + finally: + store.close() + + result = run_cli(tmp_path, "apply-conflict", "smith2024graphs", "title") + assert result.returncode == 0 + assert "applied" in result.stdout + + show = run_cli(tmp_path, "show", "smith2024graphs") + payload = json.loads(show.stdout) + assert payload["title"] == "Resolved Graph-first bibliography augmentation" + + +def test_cli_discover_oai_outputs_identity_and_sets(): + from unittest.mock import patch + from citegeist.harvest import OaiMetadataFormat, OaiSet + + with patch("citegeist.cli.OaiPmhHarvester.identify") as mocked_identify, patch( + "citegeist.cli.OaiPmhHarvester.list_sets" + ) as mocked_sets, patch("citegeist.cli.OaiPmhHarvester.list_metadata_formats") as mocked_formats: + mocked_identify.return_value = { + "repositoryName": "Example Repository", + "granularity": "YYYY-MM-DD", + } + mocked_formats.return_value = [ + OaiMetadataFormat( + metadata_prefix="oai_dc", + schema="http://www.openarchives.org/OAI/2.0/oai_dc.xsd", + metadata_namespace="http://www.openarchives.org/OAI/2.0/oai_dc/", + ) + ] + mocked_sets.return_value = [ + OaiSet(set_spec="theses", set_name="Theses", set_description="Graduate theses") + ] + exit_code = main(["discover-oai", "https://example.edu/oai"]) + + assert exit_code == 0 + + +def test_cli_bootstrap_preview_mode(tmp_path): + from unittest.mock import patch + + database = tmp_path / "library.sqlite3" + with patch("citegeist.cli.Bootstrapper.bootstrap") as mocked_bootstrap: + mocked_bootstrap.return_value = [] + exit_code = main( + [ + "--db", + str(database), + "bootstrap", + "--topic", + "graph topic", + "--preview", + "--topic-commit-limit", + "2", + ] + ) + + assert exit_code == 0 + _, kwargs = mocked_bootstrap.call_args + assert kwargs["preview_only"] is True + assert kwargs["topic_commit_limit"] == 2 + + +def test_cli_bootstrap_accepts_stored_topic_metadata(tmp_path): + from unittest.mock import patch + + database = tmp_path / "library.sqlite3" + with patch("citegeist.cli.Bootstrapper.bootstrap") as mocked_bootstrap: + mocked_bootstrap.return_value = [] + exit_code = main( + [ + "--db", + str(database), + "bootstrap", + "--topic", + "graph topic", + "--topic-slug", + "graph-methods", + "--topic-name", + "Graph Methods", + "--store-topic-phrase", + "graph networks biology", + ] + ) + + assert exit_code == 0 + _, kwargs = mocked_bootstrap.call_args + assert kwargs["topic_slug"] == "graph-methods" + assert kwargs["topic_name"] == "Graph Methods" + assert kwargs["topic_phrase"] == "graph networks biology" + + +def test_cli_scrape_talkorigins_accepts_output_dir(tmp_path): + 
from unittest.mock import patch + + database = tmp_path / "library.sqlite3" + with patch("citegeist.cli.TalkOriginsScraper.scrape_to_directory") as mocked_scrape: + mocked_scrape.return_value = __import__("citegeist").TalkOriginsBatchExport( + base_url="https://www.talkorigins.org/origins/biblio/", + output_dir=str(tmp_path), + topic_count=1, + entry_count=2, + jobs_path=str(tmp_path / "jobs.json"), + manifest_path=str(tmp_path / "manifest.json"), + seed_sets=[], + ) + exit_code = main( + [ + "--db", + str(database), + "scrape-talkorigins", + str(tmp_path / "talkorigins-out"), + "--limit-topics", + "3", + "--limit-entries-per-topic", + "10", + "--no-resume", + "--no-expand", + ] + ) + + assert exit_code == 0 + + +def test_cli_validate_talkorigins_accepts_manifest(tmp_path): + from unittest.mock import patch + + manifest = tmp_path / "talkorigins_manifest.json" + manifest.write_text("{}", encoding="utf-8") + with patch("citegeist.cli.TalkOriginsScraper.validate_export") as mocked_validate: + mocked_validate.return_value = __import__("citegeist").TalkOriginsValidationReport( + manifest_path=str(manifest), + topic_count=1, + entry_count=2, + parsed_ratio=1.0, + missing_author_count=0, + missing_title_count=0, + missing_year_count=0, + suspicious_entry_type_count=0, + suspicious_examples=[], + duplicate_cluster_count=0, + duplicate_entry_count=0, + duplicate_examples=[], + ) + exit_code = main(["validate-talkorigins", str(manifest)]) + + assert exit_code == 0 + + +def test_cli_suggest_talkorigins_phrases_writes_output(tmp_path): + from unittest.mock import patch + + manifest = tmp_path / "talkorigins_manifest.json" + manifest.write_text("{}", encoding="utf-8") + output = tmp_path / "phrases.json" + with patch("citegeist.cli.TalkOriginsScraper.suggest_topic_phrases") as mocked_suggest: + mocked_suggest.return_value = [ + __import__("citegeist", fromlist=["TalkOriginsTopicPhraseSuggestion"]).TalkOriginsTopicPhraseSuggestion( + slug="abiogenesis", + topic="Abiogenesis", + entry_count=2, + suggested_phrase="Abiogenesis prebiotic chemistry ribozyme", + keywords=["prebiotic", "chemistry", "ribozyme"], + review_required=True, + review_reasons=["small_topic"], + ) + ] + exit_code = main( + [ + "suggest-talkorigins-phrases", + str(manifest), + "--topic", + "abiogenesis", + "--output", + str(output), + ] + ) + + assert exit_code == 0 + payload = json.loads(output.read_text(encoding="utf-8")) + assert payload[0]["slug"] == "abiogenesis" + + +def test_cli_duplicates_talkorigins_accepts_manifest(tmp_path): + from unittest.mock import patch + + manifest = tmp_path / "talkorigins_manifest.json" + manifest.write_text("{}", encoding="utf-8") + with patch("citegeist.cli.TalkOriginsScraper.inspect_duplicate_clusters") as mocked_duplicates: + mocked_duplicates.return_value = [ + __import__("citegeist.talkorigins", fromlist=["TalkOriginsDuplicateCluster"]).TalkOriginsDuplicateCluster( + key="smith|1999|duplicate paper", + count=2, + items=[ + { + "citation_key": "dup1", + "title": "Duplicate Paper", + "author": "Smith, Jane", + "year": "1999", + "seed_bib": "a.bib", + "topic": "Abiogenesis", + "topic_slug": "abiogenesis", + } + ], + canonical={ + "citation_key": "dup1", + "entry_type": "article", + "field_count": 3, + "fields": {"title": "Duplicate Paper"}, + "weak_reasons": [], + }, + ) + ] + exit_code = main( + [ + "duplicates-talkorigins", + str(manifest), + "--topic", + "abiogenesis", + "--match", + "duplicate", + "--preview", + "--weak-only", + ] + ) + + assert exit_code == 0 + + +def 
test_cli_ingest_talkorigins_accepts_manifest(tmp_path): + from unittest.mock import patch + + database = tmp_path / "library.sqlite3" + manifest = tmp_path / "talkorigins_manifest.json" + manifest.write_text("{}", encoding="utf-8") + with patch("citegeist.cli.TalkOriginsScraper.ingest_export") as mocked_ingest: + mocked_ingest.return_value = __import__("citegeist").TalkOriginsIngestReport( + manifest_path=str(manifest), + topic_count=1, + raw_entry_count=2, + stored_entry_count=1, + duplicate_cluster_count=1, + duplicate_entry_count=2, + canonicalized_count=1, + ) + exit_code = main(["--db", str(database), "ingest-talkorigins", str(manifest)]) + + assert exit_code == 0 + + +def test_cli_enrich_talkorigins_accepts_manifest(tmp_path): + from unittest.mock import patch + + database = tmp_path / "library.sqlite3" + manifest = tmp_path / "talkorigins_manifest.json" + manifest.write_text("{}", encoding="utf-8") + with patch("citegeist.cli.TalkOriginsScraper.enrich_weak_canonicals") as mocked_enrich: + mocked_enrich.return_value = [ + __import__("citegeist.talkorigins", fromlist=["TalkOriginsEnrichmentResult"]).TalkOriginsEnrichmentResult( + key="smith|1999|duplicate paper", + citation_key="dup1", + weak_reasons_before=["missing:doi"], + resolved=True, + applied=False, + source_label="crossref:search:Duplicate Paper", + weak_reasons_after=[], + conflicts=[], + error="", + ) + ] + exit_code = main( + [ + "--db", + str(database), + "enrich-talkorigins", + str(manifest), + "--limit", + "5", + "--apply", + "--allow-unsafe-search-matches", + ] + ) + + assert exit_code == 0 + + +def test_cli_review_talkorigins_writes_output(tmp_path): + from unittest.mock import patch + + database = tmp_path / "library.sqlite3" + manifest = tmp_path / "talkorigins_manifest.json" + manifest.write_text("{}", encoding="utf-8") + output = tmp_path / "review.json" + with patch("citegeist.cli.TalkOriginsScraper.build_review_export") as mocked_review: + mocked_review.return_value = __import__("citegeist.talkorigins", fromlist=["TalkOriginsReviewExport"]).TalkOriginsReviewExport( + manifest_path=str(manifest), + item_count=1, + items=[{"key": "smith|1999|duplicate paper", "canonical": {}, "enrichment": {}}], + ) + exit_code = main( + [ + "--db", + str(database), + "review-talkorigins", + str(manifest), + "--output", + str(output), + ] + ) + + assert exit_code == 0 + assert output.exists() + + +def test_cli_apply_talkorigins_corrections_accepts_files(tmp_path): + from unittest.mock import patch + + database = tmp_path / "library.sqlite3" + manifest = tmp_path / "talkorigins_manifest.json" + corrections = tmp_path / "corrections.json" + manifest.write_text("{}", encoding="utf-8") + corrections.write_text('{"corrections": []}', encoding="utf-8") + with patch("citegeist.cli.TalkOriginsScraper.apply_review_corrections") as mocked_apply: + mocked_apply.return_value = [ + __import__("citegeist.talkorigins", fromlist=["TalkOriginsCorrectionResult"]).TalkOriginsCorrectionResult( + key="smith|1999|duplicate paper", + citation_key="dup1", + applied=True, + error="", + ) + ] + exit_code = main( + [ + "--db", + str(database), + "apply-talkorigins-corrections", + str(manifest), + str(corrections), + ] + ) + + assert exit_code == 0 + + +def test_cli_topics_and_topic_entries(tmp_path: Path): + bib_path = tmp_path / "input.bib" + bib_path.write_text( + """ +@article{seed2024, + author = {Seed, Alice}, + title = {Seed Paper}, + year = {2024} +} +""", + encoding="utf-8", + ) + ingest = run_cli(tmp_path, "ingest", str(bib_path)) + assert 
ingest.returncode == 0 + + from citegeist.storage import BibliographyStore + + database = tmp_path / "library.sqlite3" + store = BibliographyStore(database) + try: + store.add_entry_topic( + "seed2024", + topic_slug="graph-methods", + topic_name="Graph Methods", + source_type="talkorigins", + source_url="https://example.org/topics/graph-methods", + source_label="topic-seed", + ) + store.connection.commit() + finally: + store.close() + + topics = run_cli(tmp_path, "topics") + assert topics.returncode == 0 + topics_payload = json.loads(topics.stdout) + assert topics_payload[0]["slug"] == "graph-methods" + + topic_entries = run_cli(tmp_path, "topic-entries", "graph-methods") + assert topic_entries.returncode == 0 + topic_payload = json.loads(topic_entries.stdout) + assert topic_payload["topic"]["slug"] == "graph-methods" + assert topic_payload["entries"][0]["citation_key"] == "seed2024" + + +def test_cli_can_set_topic_phrase(tmp_path: Path): + bib_path = tmp_path / "input.bib" + bib_path.write_text( + """ +@article{seed2024, + author = {Seed, Alice}, + title = {Seed Paper}, + year = {2024} +} +""", + encoding="utf-8", + ) + ingest = run_cli(tmp_path, "ingest", str(bib_path)) + assert ingest.returncode == 0 + + from citegeist.storage import BibliographyStore + + database = tmp_path / "library.sqlite3" + store = BibliographyStore(database) + try: + store.add_entry_topic( + "seed2024", + topic_slug="graph-methods", + topic_name="Graph Methods", + source_type="talkorigins", + source_url="https://example.org/topics/graph-methods", + source_label="topic-seed", + ) + store.connection.commit() + finally: + store.close() + + result = run_cli(tmp_path, "set-topic-phrase", "graph-methods", "graph networks biology") + assert result.returncode == 0 + payload = json.loads(result.stdout) + assert payload["expansion_phrase"] == "graph networks biology" + + +def test_cli_can_apply_topic_phrases_from_json(tmp_path: Path): + bib_path = tmp_path / "input.bib" + bib_path.write_text( + """ +@article{seed2024, + author = {Seed, Alice}, + title = {Seed Paper}, + year = {2024} +} +""", + encoding="utf-8", + ) + ingest = run_cli(tmp_path, "ingest", str(bib_path)) + assert ingest.returncode == 0 + + from citegeist.storage import BibliographyStore + + database = tmp_path / "library.sqlite3" + store = BibliographyStore(database) + try: + store.add_entry_topic( + "seed2024", + topic_slug="graph-methods", + topic_name="Graph Methods", + source_type="talkorigins", + source_url="https://example.org/topics/graph-methods", + source_label="topic-seed", + ) + store.connection.commit() + finally: + store.close() + + phrases_path = tmp_path / "phrases.json" + phrases_path.write_text( + json.dumps( + [ + { + "slug": "graph-methods", + "suggested_phrase": "graph networks biology", + } + ] + ), + encoding="utf-8", + ) + + result = run_cli(tmp_path, "apply-topic-phrases", str(phrases_path)) + assert result.returncode == 0 + payload = json.loads(result.stdout) + assert payload[0]["applied"] is True + + check = run_cli(tmp_path, "topics") + topics_payload = json.loads(check.stdout) + assert topics_payload[0]["expansion_phrase"] == "graph networks biology" + + +def test_cli_can_stage_topic_phrases_from_json(tmp_path: Path): + bib_path = tmp_path / "input.bib" + bib_path.write_text( + """ +@article{seed2024, + author = {Seed, Alice}, + title = {Seed Paper}, + year = {2024} +} +""", + encoding="utf-8", + ) + ingest = run_cli(tmp_path, "ingest", str(bib_path)) + assert ingest.returncode == 0 + + from citegeist.storage import 
BibliographyStore + + database = tmp_path / "library.sqlite3" + store = BibliographyStore(database) + try: + store.add_entry_topic( + "seed2024", + topic_slug="graph-methods", + topic_name="Graph Methods", + source_type="talkorigins", + source_url="https://example.org/topics/graph-methods", + source_label="topic-seed", + ) + store.connection.commit() + finally: + store.close() + + phrases_path = tmp_path / "phrases.json" + phrases_path.write_text( + json.dumps( + [ + { + "slug": "graph-methods", + "suggested_phrase": "graph networks biology", + } + ] + ), + encoding="utf-8", + ) + + result = run_cli(tmp_path, "stage-topic-phrases", str(phrases_path)) + assert result.returncode == 0 + payload = json.loads(result.stdout) + assert payload[0]["staged"] is True + assert payload[0]["phrase_review_status"] == "pending" + + check = run_cli(tmp_path, "topics") + topics_payload = json.loads(check.stdout) + assert topics_payload[0]["suggested_phrase"] == "graph networks biology" + assert topics_payload[0]["expansion_phrase"] is None + assert topics_payload[0]["phrase_review_status"] == "pending" + + +def test_cli_can_review_topic_phrase(tmp_path: Path): + bib_path = tmp_path / "input.bib" + bib_path.write_text( + """ +@article{seed2024, + author = {Seed, Alice}, + title = {Seed Paper}, + year = {2024} +} +""", + encoding="utf-8", + ) + ingest = run_cli(tmp_path, "ingest", str(bib_path)) + assert ingest.returncode == 0 + + from citegeist.storage import BibliographyStore + + database = tmp_path / "library.sqlite3" + store = BibliographyStore(database) + try: + store.add_entry_topic( + "seed2024", + topic_slug="graph-methods", + topic_name="Graph Methods", + source_type="talkorigins", + source_url="https://example.org/topics/graph-methods", + source_label="topic-seed", + ) + store.stage_topic_phrase_suggestion("graph-methods", "graph networks biology") + finally: + store.close() + + result = run_cli( + tmp_path, + "review-topic-phrase", + "graph-methods", + "accepted", + "--notes", + "curated and approved", + ) + assert result.returncode == 0 + payload = json.loads(result.stdout) + assert payload["suggested_phrase"] == "graph networks biology" + assert payload["expansion_phrase"] == "graph networks biology" + assert payload["phrase_review_status"] == "accepted" + assert payload["phrase_review_notes"] == "curated and approved" + + +def test_cli_topics_can_filter_by_phrase_review_status(tmp_path: Path): + bib_path = tmp_path / "input.bib" + bib_path.write_text( + """ +@article{seed2024, + author = {Seed, Alice}, + title = {Seed Paper}, + year = {2024} +} +""", + encoding="utf-8", + ) + ingest = run_cli(tmp_path, "ingest", str(bib_path)) + assert ingest.returncode == 0 + + from citegeist.storage import BibliographyStore + + database = tmp_path / "library.sqlite3" + store = BibliographyStore(database) + try: + store.add_entry_topic( + "seed2024", + topic_slug="graph-methods", + topic_name="Graph Methods", + source_type="talkorigins", + source_url="https://example.org/topics/graph-methods", + source_label="topic-seed", + ) + store.ensure_topic("abiogenesis", "Abiogenesis") + store.stage_topic_phrase_suggestion("graph-methods", "graph networks biology") + store.stage_topic_phrase_suggestion("abiogenesis", "abiogenesis life origin") + store.review_topic_phrase_suggestion("abiogenesis", "accepted") + finally: + store.close() + + result = run_cli(tmp_path, "topics", "--phrase-review-status", "pending") + assert result.returncode == 0 + payload = json.loads(result.stdout) + assert [topic["slug"] for topic in 
payload] == ["graph-methods"] + + +def test_cli_export_topic(tmp_path: Path): + bib_path = tmp_path / "input.bib" + bib_path.write_text( + """ +@article{seed2024, + author = {Seed, Alice}, + title = {Seed Paper}, + year = {2024} +} +""", + encoding="utf-8", + ) + ingest = run_cli(tmp_path, "ingest", str(bib_path)) + assert ingest.returncode == 0 + + from citegeist.storage import BibliographyStore + + database = tmp_path / "library.sqlite3" + store = BibliographyStore(database) + try: + store.add_entry_topic( + "seed2024", + topic_slug="graph-methods", + topic_name="Graph Methods", + source_type="talkorigins", + source_url="https://example.org/topics/graph-methods", + source_label="topic-seed", + ) + store.connection.commit() + finally: + store.close() + + export_path = tmp_path / "graph-methods.bib" + result = run_cli(tmp_path, "export-topic", "graph-methods", "--output", str(export_path)) + assert result.returncode == 0 + exported = export_path.read_text(encoding="utf-8") + assert "@article{seed2024," in exported + + +def test_cli_search_can_filter_by_topic(tmp_path: Path): + bib_path = tmp_path / "input.bib" + bib_path.write_text( + """ +@article{seed2024, + author = {Seed, Alice}, + title = {Graph Methods for Biology}, + year = {2024}, + abstract = {A graph methods paper.} +} + +@article{other2023, + author = {Other, Bob}, + title = {Graph Methods for Chemistry}, + year = {2023}, + abstract = {Another graph methods paper.} +} +""", + encoding="utf-8", + ) + ingest = run_cli(tmp_path, "ingest", str(bib_path)) + assert ingest.returncode == 0 + + from citegeist.storage import BibliographyStore + + database = tmp_path / "library.sqlite3" + store = BibliographyStore(database) + try: + store.add_entry_topic( + "seed2024", + topic_slug="biology", + topic_name="Biology", + source_type="talkorigins", + source_url="https://example.org/topics/biology", + source_label="topic-seed", + ) + store.add_entry_topic( + "other2023", + topic_slug="chemistry", + topic_name="Chemistry", + source_type="talkorigins", + source_url="https://example.org/topics/chemistry", + source_label="topic-seed", + ) + store.connection.commit() + finally: + store.close() + + search = run_cli(tmp_path, "search", "graph", "--topic", "biology") + assert search.returncode == 0 + assert "seed2024" in search.stdout + assert "other2023" not in search.stdout def test_cli_graph_outputs_missing_targets(tmp_path: Path): @@ -239,3 +1036,43 @@ def test_cli_expand_with_mocked_openalex(tmp_path: Path): ) assert exit_code == 0 + + +def test_cli_expand_topic_with_mocked_expander(tmp_path: Path): + from citegeist.expand import TopicExpansionResult + + with patch("citegeist.cli.TopicExpander.expand_topic") as mocked_expand: + mocked_expand.return_value = [ + TopicExpansionResult( + topic_slug="abiogenesis", + source_citation_key="seed2024", + discovered_citation_key="discovered1", + discovered_title="Abiogenesis origin chemistry", + created_entry=True, + relation_type="cites", + source_label="openalex:cites:seed2024", + relevance_score=0.67, + meets_relevance_threshold=True, + assigned_to_topic=True, + ) + ] + database = tmp_path / "library.sqlite3" + exit_code = main( + [ + "--db", + str(database), + "expand-topic", + "abiogenesis", + "--topic-phrase", + "abiogenesis origin chemistry", + "--seed-key", + "seed2024", + "--min-relevance", + "0.3", + "--preview", + ] + ) + + assert exit_code == 0 + _, kwargs = mocked_expand.call_args + assert kwargs["preview_only"] is True diff --git a/tests/test_harvest.py b/tests/test_harvest.py new file mode 
100644
index 0000000..49da298
--- /dev/null
+++ b/tests/test_harvest.py
@@ -0,0 +1,293 @@
+from citegeist import OaiPmhHarvester, parse_bibtex
+from citegeist.cli import main
+
+
+OAI_XML = """
+<OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/">
+  <ListRecords>
+    <record>
+      <header>
+        <identifier>oai:example.edu:123</identifier>
+      </header>
+      <metadata>
+        <oai_dc:dc xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/"
+                   xmlns:dc="http://purl.org/dc/elements/1.1/">
+          <dc:title>Thesis Metadata Harvesting</dc:title>
+          <dc:creator>Doe, Jane</dc:creator>
+          <dc:date>2023-05-01</dc:date>
+          <dc:description>A dissertation about repository harvesting.</dc:description>
+          <dc:identifier>https://example.edu/items/123</dc:identifier>
+          <dc:publisher>Example University</dc:publisher>
+          <dc:type>Text</dc:type>
+          <dc:type>Dissertation</dc:type>
+        </oai_dc:dc>
+      </metadata>
+    </record>
+  </ListRecords>
+</OAI-PMH>
+"""
+
+OAI_XML_PAGE_1 = """
+<OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/">
+  <ListRecords>
+    <record>
+      <header>
+        <identifier>oai:example.edu:123</identifier>
+      </header>
+      <metadata>
+        <oai_dc:dc xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/"
+                   xmlns:dc="http://purl.org/dc/elements/1.1/">
+          <dc:title>First Harvested Thesis</dc:title>
+          <dc:creator>Doe, Jane</dc:creator>
+          <dc:date>2023-05-01</dc:date>
+          <dc:type>Dissertation</dc:type>
+        </oai_dc:dc>
+      </metadata>
+    </record>
+    <resumptionToken>TOKEN123</resumptionToken>
+  </ListRecords>
+</OAI-PMH>
+"""
+
+OAI_XML_PAGE_2 = """
+<OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/">
+  <ListRecords>
+    <record>
+      <header>
+        <identifier>oai:example.edu:456</identifier>
+      </header>
+      <metadata>
+        <oai_dc:dc xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/"
+                   xmlns:dc="http://purl.org/dc/elements/1.1/">
+          <dc:title>Second Harvested Thesis</dc:title>
+          <dc:creator>Smith, John</dc:creator>
+          <dc:date>2022-05-01</dc:date>
+          <dc:type>Dissertation</dc:type>
+        </oai_dc:dc>
+      </metadata>
+    </record>
+  </ListRecords>
+</OAI-PMH>
+"""
+
+OAI_IDENTIFY_XML = """
+<OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/">
+  <Identify>
+    <repositoryName>Example Repository</repositoryName>
+    <baseURL>https://example.edu/oai</baseURL>
+    <protocolVersion>2.0</protocolVersion>
+    <adminEmail>repo@example.edu</adminEmail>
+    <earliestDatestamp>2001-01-01</earliestDatestamp>
+    <deletedRecord>persistent</deletedRecord>
+    <granularity>YYYY-MM-DD</granularity>
+  </Identify>
+</OAI-PMH>
+"""
+
+OAI_LISTSETS_XML = """
+<OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/">
+  <ListSets>
+    <set>
+      <setSpec>theses</setSpec>
+      <setName>Theses and Dissertations</setName>
+      <setDescription>
+        <oai_dc:dc xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/"
+                   xmlns:dc="http://purl.org/dc/elements/1.1/">
+          <dc:description>This set contains graduate theses.</dc:description>
+        </oai_dc:dc>
+      </setDescription>
+    </set>
+  </ListSets>
+</OAI-PMH>
+"""
+
+OAI_METADATA_FORMATS_XML = """
+<OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/">
+  <ListMetadataFormats>
+    <metadataFormat>
+      <metadataPrefix>oai_dc</metadataPrefix>
+      <schema>http://www.openarchives.org/OAI/2.0/oai_dc.xsd</schema>
+      <metadataNamespace>http://www.openarchives.org/OAI/2.0/oai_dc/</metadataNamespace>
+    </metadataFormat>
+    <metadataFormat>
+      <metadataPrefix>mods</metadataPrefix>
+      <schema>http://www.loc.gov/standards/mods/v3/mods-3-7.xsd</schema>
+      <metadataNamespace>http://www.loc.gov/mods/v3</metadataNamespace>
+    </metadataFormat>
+  </ListMetadataFormats>
+</OAI-PMH>
+"""
+
+OAI_MODS_XML = """
+<OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/">
+  <ListRecords>
+    <record>
+      <header>
+        <identifier>oai:example.edu:mods123</identifier>
+      </header>
+      <metadata>
+        <mods xmlns="http://www.loc.gov/mods/v3">
+          <titleInfo>
+            <title>MODS Thesis Title</title>
+          </titleInfo>
+          <name type="personal">
+            <namePart type="family">Doe</namePart>
+            <namePart type="given">Jane</namePart>
+            <role>
+              <roleTerm type="text">author</roleTerm>
+            </role>
+          </name>
+          <originInfo>
+            <publisher>Example University</publisher>
+            <dateIssued>2022</dateIssued>
+          </originInfo>
+          <genre>dissertation</genre>
+          <abstract>MODS abstract text.</abstract>
+          <location>
+            <url>https://example.edu/mods123</url>
+          </location>
+        </mods>
+      </metadata>
+    </record>
+  </ListRecords>
+</OAI-PMH>
    +""" + + +def test_oai_harvester_maps_dublin_core_to_bibentry(): + harvester = OaiPmhHarvester() + harvester.source_client.get_xml = lambda _url: __import__("xml.etree.ElementTree").etree.ElementTree.fromstring(OAI_XML) # type: ignore[method-assign] + + results = harvester.list_records("https://example.edu/oai") + + assert len(results) == 1 + entry = results[0].entry + assert entry.entry_type == "phdthesis" + assert entry.fields["title"] == "Thesis Metadata Harvesting" + assert entry.fields["author"] == "Doe, Jane" + assert entry.fields["oai"] == "oai:example.edu:123" + + +def test_oai_harvester_follows_resumption_tokens(): + harvester = OaiPmhHarvester() + from xml.etree import ElementTree as ET + + payloads = iter([ET.fromstring(OAI_XML_PAGE_1), ET.fromstring(OAI_XML_PAGE_2)]) + harvester.source_client.get_xml = lambda _url: next(payloads) # type: ignore[method-assign] + + results = harvester.list_records("https://example.edu/oai") + + assert [result.identifier for result in results] == [ + "oai:example.edu:123", + "oai:example.edu:456", + ] + assert [result.entry.citation_key for result in results] == [ + "doe2023first1", + "smith2022second2", + ] + + +def test_oai_harvester_passes_date_filters(): + harvester = OaiPmhHarvester() + seen_urls: list[str] = [] + from xml.etree import ElementTree as ET + + def fake_get_xml(url: str): + seen_urls.append(url) + return ET.fromstring(OAI_XML) + + harvester.source_client.get_xml = fake_get_xml # type: ignore[method-assign] + + harvester.list_records( + "https://example.edu/oai", + date_from="2023-01-01", + date_until="2023-12-31", + limit=1, + ) + + assert "from=2023-01-01" in seen_urls[0] + assert "until=2023-12-31" in seen_urls[0] + + +def test_oai_harvester_maps_mods_records(): + harvester = OaiPmhHarvester() + from xml.etree import ElementTree as ET + + harvester.source_client.get_xml = lambda _url: ET.fromstring(OAI_MODS_XML) # type: ignore[method-assign] + + results = harvester.list_records("https://example.edu/oai", metadata_prefix="mods") + + assert len(results) == 1 + entry = results[0].entry + assert entry.entry_type == "phdthesis" + assert entry.fields["title"] == "MODS Thesis Title" + assert entry.fields["author"] == "Doe, Jane" + assert entry.fields["publisher"] == "Example University" + assert entry.fields["abstract"] == "MODS abstract text." 
+ + +def test_oai_harvester_can_identify_repository_and_list_sets(): + harvester = OaiPmhHarvester() + from xml.etree import ElementTree as ET + + payloads = iter( + [ET.fromstring(OAI_IDENTIFY_XML), ET.fromstring(OAI_LISTSETS_XML), ET.fromstring(OAI_METADATA_FORMATS_XML)] + ) + harvester.source_client.get_xml = lambda _url: next(payloads) # type: ignore[method-assign] + + identify = harvester.identify("https://example.edu/oai") + sets = harvester.list_sets("https://example.edu/oai") + formats = harvester.list_metadata_formats("https://example.edu/oai") + + assert identify["repositoryName"] == "Example Repository" + assert identify["granularity"] == "YYYY-MM-DD" + assert sets[0].set_spec == "theses" + assert sets[0].set_name == "Theses and Dissertations" + assert "graduate theses" in sets[0].set_description + assert [item.metadata_prefix for item in formats] == ["oai_dc", "mods"] + + +def test_harvest_oai_cli_ingests_records(tmp_path): + from unittest.mock import patch + + database = tmp_path / "library.sqlite3" + harvester = OaiPmhHarvester() + from xml.etree import ElementTree as ET + + harvester.source_client.get_xml = lambda _url: ET.fromstring(OAI_XML) # type: ignore[method-assign] + harvested = harvester.list_records("https://example.edu/oai") + + with patch("citegeist.cli.OaiPmhHarvester.list_records") as mocked_list: + mocked_list.return_value = harvested + + exit_code = main( + [ + "--db", + str(database), + "harvest-oai", + "https://example.edu/oai", + "--metadata-prefix", + "oai_dc", + "--from", + "2023-01-01", + "--until", + "2023-12-31", + "--limit", + "5", + ] + ) + + assert exit_code == 0 + + from citegeist.storage import BibliographyStore + + store = BibliographyStore(database) + try: + entry = store.list_entries(limit=10)[0] + assert entry["citation_key"] == "doe2023thesis1" + bibtex = store.get_entry_bibtex("doe2023thesis1") + parsed = parse_bibtex(bibtex or "") + assert parsed[0].fields["oai"] == "oai:example.edu:123" + finally: + store.close() diff --git a/tests/test_resolve.py b/tests/test_resolve.py index f4c22b0..226ee10 100644 --- a/tests/test_resolve.py +++ b/tests/test_resolve.py @@ -1,11 +1,13 @@ from xml.etree import ElementTree as ET -from citegeist.bibtex import BibEntry +from citegeist.bibtex import BibEntry, render_bibtex from citegeist.resolve import ( MetadataResolver, _arxiv_atom_entry_to_bib, _crossref_message_to_entry, + _datacite_work_to_entry, _openalex_work_to_entry, + merge_entries_with_conflicts, merge_entries, ) @@ -65,6 +67,31 @@ def test_merge_entries_prefers_existing_values_and_adds_missing_fields(): assert merged.fields["journal"] == "Journal of Graph Studies" +def test_merge_entries_with_conflicts_records_disagreements(): + base = BibEntry( + entry_type="article", + citation_key="smith2024graphs", + fields={"title": "Existing Title", "journal": "Current Journal"}, + ) + resolved = BibEntry( + entry_type="article", + citation_key="resolved", + fields={"title": "Resolved Title", "journal": "Current Journal", "year": "2024"}, + ) + + merged, conflicts = merge_entries_with_conflicts(base, resolved) + + assert merged.fields["title"] == "Existing Title" + assert merged.fields["year"] == "2024" + assert conflicts == [ + { + "field_name": "title", + "current_value": "Existing Title", + "proposed_value": "Resolved Title", + } + ] + + def test_resolver_tries_doi_before_dblp(): resolver = MetadataResolver() calls: list[tuple[str, str]] = [] @@ -77,7 +104,12 @@ def test_resolver_tries_doi_before_dblp(): calls.append(("dblp", value)) return None + def 
fake_datacite(value: str): + calls.append(("datacite", value)) + return None + resolver.resolve_doi = fake_doi # type: ignore[method-assign] + resolver.resolve_datacite_doi = fake_datacite # type: ignore[method-assign] resolver.resolve_dblp = fake_dblp # type: ignore[method-assign] resolver.resolve_entry( @@ -88,7 +120,11 @@ def test_resolver_tries_doi_before_dblp(): ) ) - assert calls == [("doi", "10.1000/example-doi"), ("dblp", "conf/test/Smith24")] + assert calls == [ + ("doi", "10.1000/example-doi"), + ("datacite", "10.1000/example-doi"), + ("dblp", "conf/test/Smith24"), + ] def test_openalex_work_to_entry_maps_basic_fields(): @@ -131,6 +167,8 @@ def test_resolver_can_resolve_openalex_id(): def test_resolver_falls_back_to_openalex_title_search(): resolver = MetadataResolver() + resolver.search_datacite = lambda title, limit=5: [] # type: ignore[method-assign] + resolver.search_crossref = lambda title, limit=5: [] # type: ignore[method-assign] resolver.search_openalex = lambda title, limit=5: [ # type: ignore[method-assign] _openalex_work_to_entry( { @@ -154,3 +192,212 @@ def test_resolver_falls_back_to_openalex_title_search(): assert resolution is not None assert resolution.source_label == "openalex:search:OpenAlex Resolved Work" assert resolution.entry.fields["openalex"] == "W12345" + + +def test_resolver_prefers_exact_crossref_title_match_before_datacite(): + resolver = MetadataResolver() + resolver.search_crossref = lambda title, limit=5: [ # type: ignore[method-assign] + _crossref_message_to_entry( + { + "type": "journal-article", + "title": [title], + "DOI": "10.1126/science.1090005", + "container-title": ["Science"], + "author": [ + {"family": "King", "given": "Mary-Claire"}, + {"family": "Wilson", "given": "A. C."}, + ], + "issued": {"date-parts": [[1975, 4, 11]]}, + } + ) + ] + resolver.search_datacite = lambda title, limit=5: [ # type: ignore[method-assign] + _datacite_work_to_entry( + { + "attributes": { + "doi": "10.5061/dryad.v6wwpzh17", + "titles": [ + { + "title": "Conserved patterns and locomotor-related evolutionary constraints in the hominoid vertebral column" + } + ], + "creators": [ + {"familyName": "Villamil", "givenName": "Catalina I."}, + {"familyName": "Middleton", "givenName": "Emily R."}, + ], + "publicationYear": 2024, + "types": {"resourceTypeGeneral": "Dataset"}, + } + } + ) + ] + + resolution = resolver.resolve_entry( + BibEntry( + entry_type="article", + citation_key="king1975evolution2", + fields={ + "title": "Evolution at two levels in humans and chimpanzees", + "author": "King, M. C. and Wilson, A. 
C.", + "year": "1975", + }, + ) + ) + + assert resolution is not None + assert resolution.source_label == "crossref:search:Evolution at two levels in humans and chimpanzees" + assert resolution.entry.fields["doi"] == "10.1126/science.1090005" + + +def test_resolver_rejects_mismatched_title_search_candidates(): + resolver = MetadataResolver() + resolver.search_crossref = lambda title, limit=5: [] # type: ignore[method-assign] + resolver.search_datacite = lambda title, limit=5: [ # type: ignore[method-assign] + _datacite_work_to_entry( + { + "attributes": { + "doi": "10.5061/dryad.v6wwpzh17", + "titles": [ + { + "title": "Conserved patterns and locomotor-related evolutionary constraints in the hominoid vertebral column" + } + ], + "creators": [ + {"familyName": "Villamil", "givenName": "Catalina I."}, + ], + "publicationYear": 2024, + "types": {"resourceTypeGeneral": "Dataset"}, + } + } + ) + ] + resolver.search_openalex = lambda title, limit=5: [ # type: ignore[method-assign] + _openalex_work_to_entry( + { + "id": "https://openalex.org/W2033360601", + "display_name": "Immunological relatedness of glucose 6-phosphate dehydrogenases from vertebrate and invertebrate species.", + "publication_year": 1978, + "type": "article", + "authorships": [ + {"author": {"display_name": "Yoshikazu Sado"}}, + {"author": {"display_name": "Samuel H. Hori"}}, + ], + "doi": "https://doi.org/10.1266/jjg.53.91", + } + ) + ] + + resolution = resolver.resolve_entry( + BibEntry( + entry_type="article", + citation_key="sarich1967immunological1", + fields={ + "title": "Immunological Time Scale for Homonid Evolution", + "author": "Sarich, V. and Wilson, A.", + "year": "1967", + }, + ) + ) + + assert resolution is None + + +def test_datacite_work_to_entry_maps_basic_fields(): + entry = _datacite_work_to_entry( + { + "attributes": { + "doi": "10.1000/datacite-example", + "titles": [{"title": "Repository Dissertation Record"}], + "creators": [{"familyName": "Doe", "givenName": "Jane"}], + "publicationYear": 2021, + "publisher": "Example University", + "url": "https://example.edu/record/123", + "types": {"resourceTypeGeneral": "Dissertation"}, + "descriptions": [ + { + "descriptionType": "Abstract", + "description": "An abstract from DataCite.", + } + ], + } + } + ) + + assert entry.entry_type == "phdthesis" + assert entry.fields["doi"] == "10.1000/datacite-example" + assert entry.fields["author"] == "Doe, Jane" + assert entry.fields["publisher"] == "Example University" + assert entry.fields["abstract"] == "An abstract from DataCite." 
+ + +def test_resolver_can_resolve_datacite_doi(): + resolver = MetadataResolver() + resolver.source_client.get_json = lambda _url: { # type: ignore[method-assign] + "data": { + "attributes": { + "doi": "10.1000/datacite-example", + "titles": [{"title": "Repository Dissertation Record"}], + "creators": [{"familyName": "Doe", "givenName": "Jane"}], + "publicationYear": 2021, + "types": {"resourceTypeGeneral": "Dissertation"}, + } + } + } + + resolution = resolver.resolve_datacite_doi("10.1000/datacite-example") + + assert resolution is not None + assert resolution.source_label == "datacite:doi:10.1000/datacite-example" + assert resolution.entry.entry_type == "phdthesis" + + +def test_resolver_can_fall_back_to_datacite_title_search(): + resolver = MetadataResolver() + resolver.search_crossref = lambda title, limit=5: [] # type: ignore[method-assign] + resolver.search_datacite = lambda title, limit=5: [ # type: ignore[method-assign] + _datacite_work_to_entry( + { + "attributes": { + "doi": "10.1000/datacite-example", + "titles": [{"title": title}], + "creators": [{"familyName": "Doe", "givenName": "Jane"}], + "publicationYear": 2021, + "types": {"resourceTypeGeneral": "Dissertation"}, + } + } + ) + ] + resolver.search_openalex = lambda title, limit=5: [] # type: ignore[method-assign] + + resolution = resolver.resolve_entry( + BibEntry( + entry_type="misc", + citation_key="draft1", + fields={"title": "Repository Dissertation Record", "author": "Doe, Jane", "year": "2021"}, + ) + ) + + assert resolution is not None + assert resolution.source_label == "datacite:search:Repository Dissertation Record" + assert resolution.entry.fields["doi"] == "10.1000/datacite-example" + + +def test_render_bibtex_tolerates_unmatched_braces_in_field_values(): + rendered = render_bibtex( + [ + BibEntry( + entry_type="misc", + citation_key="broken2026", + fields={ + "author": "Broken, Example", + "title": "Unmatched { braces } example } tail", + "year": "2026", + "note": "Open { brace only", + }, + ) + ] + ) + + assert "@misc{broken2026," in rendered + assert "Unmatched { braces } example ) tail" in rendered + assert "Open ( brace only" in rendered diff --git a/tests/test_sources.py b/tests/test_sources.py index fea995a..f9f0bf2 100644 --- a/tests/test_sources.py +++ b/tests/test_sources.py @@ -28,3 +28,14 @@ def test_source_client_writes_cache_after_fetch(tmp_path: Path): assert payload["ok"] is True assert any(cache_dir.iterdir()) + + +def test_source_client_falls_back_to_latin1_for_text(tmp_path: Path): + client = SourceClient(cache_dir=tmp_path / "cache") + url = "https://example.org/latin1" + + client._fetch_bytes = lambda _url: "café".encode("iso-8859-1") # type: ignore[method-assign] + + payload = client.get_text(url) + + assert payload == "café" diff --git a/tests/test_storage.py b/tests/test_storage.py index 3458f52..f432bfd 100644 --- a/tests/test_storage.py +++ b/tests/test_storage.py @@ -130,3 +130,250 @@ def test_store_traverses_graph_and_surfaces_missing_targets(): assert rows[2]["depth"] == 2 finally: store.close() + + +def test_store_records_and_updates_field_conflicts(): + store = BibliographyStore() + try: + store.ingest_bibtex( + """ +@article{seed2024, + author = {Seed, Alice}, + title = {Seed Paper}, + year = {2024} +} +""" + ) + ok = store.record_conflicts( + "seed2024", + [ + { + "field_name": "title", + "current_value": "Seed Paper", + "proposed_value": "Resolved Seed Paper", + } + ], + source_type="resolver", + source_label="crossref:doi:10.1000/seed", + ) + assert ok is True + conflicts = 
store.get_field_conflicts("seed2024") + assert conflicts[0]["field_name"] == "title" + assert conflicts[0]["status"] == "open" + assert store.set_conflict_status("seed2024", "title", "accepted") == 1 + updated = store.get_field_conflicts("seed2024", status="accepted") + assert len(updated) == 1 + finally: + store.close() + + +def test_store_can_apply_latest_conflict_value(): + store = BibliographyStore() + try: + store.ingest_bibtex( + """ +@article{seed2024, + author = {Seed, Alice}, + title = {Seed Paper}, + year = {2024} +} +""" + ) + store.record_conflicts( + "seed2024", + [ + { + "field_name": "title", + "current_value": "Seed Paper", + "proposed_value": "Resolved Seed Paper", + } + ], + source_type="resolver", + source_label="crossref:doi:10.1000/seed", + ) + + assert store.apply_conflict_value("seed2024", "title") is True + entry = store.get_entry("seed2024") + assert entry is not None + assert entry["title"] == "Resolved Seed Paper" + accepted = store.get_field_conflicts("seed2024", status="accepted") + assert len(accepted) == 1 + finally: + store.close() + + +def test_store_supports_entry_topic_membership(): + store = BibliographyStore() + try: + store.ingest_bibtex( + """ +@article{seed2024, + author = {Seed, Alice}, + title = {Seed Paper}, + year = {2024} +} +""" + ) + + assert store.add_entry_topic( + "seed2024", + topic_slug="graph-methods", + topic_name="Graph Methods", + source_type="talkorigins", + source_url="https://example.org/topics/graph-methods", + source_label="topic-seed", + ) is True + assert store.add_entry_topic( + "seed2024", + topic_slug="semantic-search", + topic_name="Semantic Search", + source_type="talkorigins", + source_url="https://example.org/topics/semantic-search", + source_label="topic-seed", + ) is True + + entry = store.get_entry("seed2024") + assert entry is not None + assert [topic["slug"] for topic in entry["topics"]] == ["graph-methods", "semantic-search"] + + topics = store.list_topics() + assert [topic["slug"] for topic in topics] == ["graph-methods", "semantic-search"] + assert topics[0]["entry_count"] == 1 + topic = store.get_topic("graph-methods") + assert topic is not None + assert topic["name"] == "Graph Methods" + assert topic["expansion_phrase"] is None + topic_entries = store.list_topic_entries("graph-methods") + assert topic_entries[0]["citation_key"] == "seed2024" + finally: + store.close() + + +def test_store_can_set_topic_expansion_phrase(): + store = BibliographyStore() + try: + store.ingest_bibtex( + """ +@article{seed2024, + author = {Seed, Alice}, + title = {Seed Paper}, + year = {2024} +} +""" + ) + store.add_entry_topic( + "seed2024", + topic_slug="graph-methods", + topic_name="Graph Methods", + source_type="talkorigins", + source_url="https://example.org/topics/graph-methods", + source_label="topic-seed", + ) + assert store.set_topic_expansion_phrase("graph-methods", "graph networks biology") is True + + topic = store.get_topic("graph-methods") + assert topic is not None + assert topic["expansion_phrase"] == "graph networks biology" + assert topic["phrase_review_status"] == "unreviewed" + topics = store.list_topics() + assert topics[0]["expansion_phrase"] == "graph networks biology" + finally: + store.close() + + +def test_store_can_stage_and_review_topic_phrase_suggestion(): + store = BibliographyStore() + try: + store.ensure_topic("graph-methods", "Graph Methods") + + assert store.stage_topic_phrase_suggestion( + "graph-methods", + "graph networks biology", + review_notes="generated from local titles", + ) is True + + 
staged = store.get_topic("graph-methods") + assert staged is not None + assert staged["suggested_phrase"] == "graph networks biology" + assert staged["expansion_phrase"] is None + assert staged["phrase_review_status"] == "pending" + assert staged["phrase_review_notes"] == "generated from local titles" + + assert store.review_topic_phrase_suggestion( + "graph-methods", + "accepted", + review_notes="looks good", + ) is True + + reviewed = store.get_topic("graph-methods") + assert reviewed is not None + assert reviewed["suggested_phrase"] == "graph networks biology" + assert reviewed["expansion_phrase"] == "graph networks biology" + assert reviewed["phrase_review_status"] == "accepted" + assert reviewed["phrase_review_notes"] == "looks good" + finally: + store.close() + + +def test_store_can_filter_topics_by_phrase_review_status(): + store = BibliographyStore() + try: + store.ensure_topic("graph-methods", "Graph Methods") + store.ensure_topic("abiogenesis", "Abiogenesis") + store.stage_topic_phrase_suggestion("graph-methods", "graph networks biology") + store.stage_topic_phrase_suggestion("abiogenesis", "abiogenesis life origin") + store.review_topic_phrase_suggestion("abiogenesis", "accepted") + + pending_topics = store.list_topics(phrase_review_status="pending") + accepted_topics = store.list_topics(phrase_review_status="accepted") + + assert [topic["slug"] for topic in pending_topics] == ["graph-methods"] + assert [topic["slug"] for topic in accepted_topics] == ["abiogenesis"] + finally: + store.close() + + +def test_store_search_text_can_filter_by_topic(): + store = BibliographyStore() + try: + store.ingest_bibtex( + """ +@article{seed2024, + author = {Seed, Alice}, + title = {Graph Methods for Biology}, + year = {2024}, + abstract = {A graph methods paper.} +} + +@article{other2023, + author = {Other, Bob}, + title = {Graph Methods for Chemistry}, + year = {2023}, + abstract = {Another graph methods paper.} +} +""" + ) + + store.add_entry_topic( + "seed2024", + topic_slug="biology", + topic_name="Biology", + source_type="talkorigins", + source_url="https://example.org/topics/biology", + source_label="topic-seed", + ) + store.add_entry_topic( + "other2023", + topic_slug="chemistry", + topic_name="Chemistry", + source_type="talkorigins", + source_url="https://example.org/topics/chemistry", + source_label="topic-seed", + ) + store.connection.commit() + + results = store.search_text("graph", topic_slug="biology") + + assert [row["citation_key"] for row in results] == ["seed2024"] + finally: + store.close() diff --git a/tests/test_talkorigins.py b/tests/test_talkorigins.py new file mode 100644 index 0000000..9ca9943 --- /dev/null +++ b/tests/test_talkorigins.py @@ -0,0 +1,1024 @@ +from __future__ import annotations + +import json +from pathlib import Path + +from citegeist.batch import load_batch_jobs +from citegeist.bibtex import BibEntry +from citegeist.storage import BibliographyStore +from citegeist.talkorigins import TalkOriginsScraper, normalize_topic_entries + + +INDEX_HTML = """ + +Abiogenesis +Evolution +Browse + +""" + +ABIOGENESIS_HTML = """ +
    +Smith, J., 1998, First paper title: Journal of Origins, v. 10, p. 1-10.
    +
    +---, 2001, Second paper title: Journal of Origins, v. 12, p. 20-30.
    +
    +""" + +EVOLUTION_HTML = """ +
    +Jones, A., and Roe, B.,
    +2003, Wrapped title across lines:
    +Proceedings of the Example Conference, p. 40-55.
    +
    +""" + + +class FakeSourceClient: + def __init__(self, payloads: dict[str, str]) -> None: + self.payloads = payloads + + def get_text(self, url: str) -> str: + return self.payloads[url] + + +def test_normalize_topic_entries_carries_forward_repeated_authors(): + text = """ +Smith, J., 1998, First paper title: Journal of Origins. + +---, 2001, Second paper title: Journal of Origins. +""" + + entries = normalize_topic_entries(text) + + assert entries[1].startswith("Smith, J., 2001") + + +def test_talkorigins_scraper_writes_seed_bibs_and_jobs(tmp_path: Path): + base_url = "https://www.talkorigins.org/origins/biblio/" + scraper = TalkOriginsScraper( + source_client=FakeSourceClient( + { + base_url: INDEX_HTML, + f"{base_url}abiogenesis.html": ABIOGENESIS_HTML, + f"{base_url}evolution.html": EVOLUTION_HTML, + } + ) + ) + + export = scraper.scrape_to_directory(base_url=base_url, output_dir=tmp_path) + + assert export.topic_count == 2 + assert export.entry_count == 3 + + jobs = json.loads(Path(export.jobs_path).read_text(encoding="utf-8")) + assert jobs["jobs"][0]["name"] == "talkorigins:abiogenesis" + assert Path(jobs["jobs"][0]["seed_bib"]).exists() + + manifest = json.loads(Path(export.manifest_path).read_text(encoding="utf-8")) + assert manifest["seed_sets"][0]["parsed_entry_count"] == 2 + + abiogenesis_bib = Path(export.seed_sets[0].seed_bib).read_text(encoding="utf-8") + abiogenesis_plain = Path(export.seed_sets[0].plaintext_path).read_text(encoding="utf-8") + abiogenesis_page = Path(export.seed_sets[0].page_path).read_text(encoding="utf-8") + full_bib = Path(export.full_bib_path).read_text(encoding="utf-8") + full_plain = Path(export.full_plaintext_path).read_text(encoding="utf-8") + site_index = Path(export.site_index_path).read_text(encoding="utf-8") + assert "@article{smith1998first1," in abiogenesis_bib + assert 'author = "Smith, J"' in abiogenesis_bib + assert "@article{smith2001second2," in abiogenesis_bib + assert "Abiogenesis" in abiogenesis_plain + assert "Show BibTeX" in abiogenesis_page + assert "toggleBibtex" in abiogenesis_page + assert "@article{smith1998first1," in full_bib + assert "Evolution" in full_plain + assert "Full BibTeX bibliography" in site_index + + +def test_talkorigins_parser_prefers_book_for_publisher_like_venues(): + scraper = TalkOriginsScraper(source_client=FakeSourceClient({})) + + entry = scraper.parse_reference_entry( + "Rutten, M. G., 1971, The Origin of Life by Natural Causes: Amsterdam, London, New York, Elsevier.", + 1, + ) + + assert entry is not None + assert entry.entry_type == "book" + assert entry.fields["publisher"] == "Amsterdam, London, New York, Elsevier" + + +def test_talkorigins_parser_promotes_edited_volume_chapter_to_incollection(): + scraper = TalkOriginsScraper(source_client=FakeSourceClient({})) + + entry = scraper.parse_reference_entry( + "Carpenter, C. R., 1958, Territoriality: A Review of Concepts and Problems, in Roe, A., and Simpson, G. G., eds., Behavior and Evolution: New Haven, Yale University Press, p. 224-250.", + 1, + ) + + assert entry is not None + assert entry.entry_type == "incollection" + assert entry.fields["title"] == "Territoriality: A Review of Concepts and Problems" + assert entry.fields["editor"] == "Roe, A. and Simpson, G. G." 
+ assert entry.fields["booktitle"] == "Behavior and Evolution" + assert "Yale University Press" in entry.fields["publisher"] + + +def test_talkorigins_scraper_resume_uses_saved_snapshot(tmp_path: Path): + base_url = "https://www.talkorigins.org/origins/biblio/" + scraper = TalkOriginsScraper( + source_client=FakeSourceClient( + { + base_url: INDEX_HTML, + f"{base_url}abiogenesis.html": ABIOGENESIS_HTML, + f"{base_url}evolution.html": EVOLUTION_HTML, + } + ) + ) + + first_export = scraper.scrape_to_directory(base_url=base_url, output_dir=tmp_path, limit_topics=1) + snapshot_path = Path(first_export.seed_sets[0].snapshot_path) + snapshot = json.loads(snapshot_path.read_text(encoding="utf-8")) + assert snapshot["raw_entries"][0].startswith("Smith, J.") + + scraper_with_broken_page = TalkOriginsScraper( + source_client=FakeSourceClient( + { + base_url: INDEX_HTML, + f"{base_url}abiogenesis.html": "broken", + } + ) + ) + resumed_export = scraper_with_broken_page.scrape_to_directory(base_url=base_url, output_dir=tmp_path, limit_topics=1) + + assert resumed_export.entry_count == 2 + + +def test_talkorigins_validation_reports_suspicious_entries(tmp_path: Path): + base_url = "https://www.talkorigins.org/origins/biblio/" + scraper = TalkOriginsScraper( + source_client=FakeSourceClient( + { + base_url: INDEX_HTML, + f"{base_url}abiogenesis.html": ABIOGENESIS_HTML, + } + ) + ) + + export = scraper.scrape_to_directory(base_url=base_url, output_dir=tmp_path, limit_topics=1) + seed_bib_path = Path(export.seed_sets[0].seed_bib) + seed_bib_path.write_text( + """ +@article{bad1, + author = "Example, A", + year = "1999", + title = "Bad Venue Classification", + journal = "Elsevier" +} +""", + encoding="utf-8", + ) + + report = scraper.validate_export(export.manifest_path) + + assert report.topic_count == 1 + assert report.entry_count == 2 + assert report.suspicious_entry_type_count >= 1 + assert report.suspicious_examples[0]["citation_key"] == "bad1" + + +def test_talkorigins_validation_does_not_flag_legitimate_incollection(tmp_path: Path): + base_url = "https://www.talkorigins.org/origins/biblio/" + scraper = TalkOriginsScraper( + source_client=FakeSourceClient( + { + base_url: INDEX_HTML, + f"{base_url}abiogenesis.html": ABIOGENESIS_HTML, + } + ) + ) + + export = scraper.scrape_to_directory(base_url=base_url, output_dir=tmp_path, limit_topics=1) + seed_bib_path = Path(export.seed_sets[0].seed_bib) + seed_bib_path.write_text( + """ +@incollection{good1, + author = "Example, A", + editor = "Editor, E", + year = "1999", + title = "Good Chapter", + booktitle = "Collected Essays", + publisher = "New Haven, Yale University Press" +} +""", + encoding="utf-8", + ) + + report = scraper.validate_export(export.manifest_path) + + assert all(item["citation_key"] != "good1" for item in report.suspicious_examples) + + +def test_talkorigins_validation_reports_duplicate_clusters(tmp_path: Path): + base_url = "https://www.talkorigins.org/origins/biblio/" + scraper = TalkOriginsScraper( + source_client=FakeSourceClient( + { + base_url: INDEX_HTML, + f"{base_url}abiogenesis.html": ABIOGENESIS_HTML, + f"{base_url}evolution.html": EVOLUTION_HTML, + } + ) + ) + + export = scraper.scrape_to_directory(base_url=base_url, output_dir=tmp_path) + Path(export.seed_sets[0].seed_bib).write_text( + """ +@article{dup1, + author = "Smith, Jane", + year = "1999", + title = "Duplicate Paper", + journal = "Journal A" +} +""", + encoding="utf-8", + ) + Path(export.seed_sets[1].seed_bib).write_text( + """ +@article{dup2, + author = "Smith, Jane", 
+ year = "1999", + title = "Duplicate Paper", + journal = "Journal B" +} +""", + encoding="utf-8", + ) + + report = scraper.validate_export(export.manifest_path) + + assert report.duplicate_cluster_count >= 1 + assert report.duplicate_entry_count >= 2 + assert report.duplicate_examples[0]["items"][0]["citation_key"] in {"dup1", "dup2"} + + +def test_talkorigins_can_suggest_topic_phrases(tmp_path: Path): + base_url = "https://www.talkorigins.org/origins/biblio/" + scraper = TalkOriginsScraper( + source_client=FakeSourceClient( + { + base_url: INDEX_HTML, + f"{base_url}abiogenesis.html": ABIOGENESIS_HTML, + } + ) + ) + + export = scraper.scrape_to_directory(base_url=base_url, output_dir=tmp_path, limit_topics=1) + Path(export.seed_sets[0].seed_bib).write_text( + """ +@article{bio1, + author = "Smith, Jane", + year = "1999", + title = "Prebiotic chemistry and ribozyme catalysis", + journal = "Origins" +} + +@article{bio2, + author = "Smith, Jane", + year = "2001", + title = "Ribozyme networks in prebiotic chemistry", + journal = "Origins" +} +""", + encoding="utf-8", + ) + + suggestions = scraper.suggest_topic_phrases(export.manifest_path) + + assert len(suggestions) == 1 + assert suggestions[0].slug == "abiogenesis" + assert suggestions[0].suggested_phrase.startswith("Abiogenesis ") + assert "chemistry" in suggestions[0].keywords + assert "prebiotic" in suggestions[0].keywords + assert suggestions[0].review_required is True + assert "small_topic" in (suggestions[0].review_reasons or []) + assert "noisy_keywords" not in (suggestions[0].review_reasons or []) + + +def test_talkorigins_duplicate_inspection_filters_by_topic_and_match(tmp_path: Path): + base_url = "https://www.talkorigins.org/origins/biblio/" + scraper = TalkOriginsScraper( + source_client=FakeSourceClient( + { + base_url: INDEX_HTML, + f"{base_url}abiogenesis.html": ABIOGENESIS_HTML, + f"{base_url}evolution.html": EVOLUTION_HTML, + } + ) + ) + + export = scraper.scrape_to_directory(base_url=base_url, output_dir=tmp_path) + Path(export.seed_sets[0].seed_bib).write_text( + """ +@article{dup1, + author = "Smith, Jane", + year = "1999", + title = "Duplicate Paper", + journal = "Journal A" +} + +@article{dup2, + author = "Smith, Jane", + year = "1999", + title = "Duplicate Paper", + journal = "Journal B" +} +""", + encoding="utf-8", + ) + Path(export.seed_sets[1].seed_bib).write_text( + """ +@article{other1, + author = "Jones, Alex", + year = "2001", + title = "Other Topic Paper", + journal = "Journal C" +} +""", + encoding="utf-8", + ) + + clusters = scraper.inspect_duplicate_clusters( + export.manifest_path, + topic_slug="abiogenesis", + match="duplicate", + ) + + assert len(clusters) == 1 + assert clusters[0].count == 2 + assert all(item["topic_slug"] == "abiogenesis" for item in clusters[0].items) + + +def test_talkorigins_duplicate_inspection_can_preview_canonical_choice(tmp_path: Path): + base_url = "https://www.talkorigins.org/origins/biblio/" + scraper = TalkOriginsScraper( + source_client=FakeSourceClient( + { + base_url: INDEX_HTML, + f"{base_url}abiogenesis.html": ABIOGENESIS_HTML, + f"{base_url}evolution.html": EVOLUTION_HTML, + } + ) + ) + + export = scraper.scrape_to_directory(base_url=base_url, output_dir=tmp_path) + Path(export.seed_sets[0].seed_bib).write_text( + """ +@article{dup1, + author = "Smith, Jane", + year = "1999", + title = "Duplicate Paper", + journal = "Journal A" +} +""", + encoding="utf-8", + ) + Path(export.seed_sets[1].seed_bib).write_text( + """ +@article{dup2, + author = "Smith, Jane", + year = 
"1999", + title = "Duplicate Paper", + journal = "Journal B", + doi = "10.1000/dup" +} +""", + encoding="utf-8", + ) + + clusters = scraper.inspect_duplicate_clusters(export.manifest_path, preview_canonical=True) + + assert len(clusters) == 1 + assert clusters[0].canonical is not None + assert clusters[0].canonical["citation_key"] == "dup2" + assert clusters[0].canonical["fields"]["doi"] == "10.1000/dup" + assert clusters[0].canonical["weak_reasons"] == [] + + +def test_talkorigins_duplicate_inspection_can_filter_to_weak_canonicals(tmp_path: Path): + base_url = "https://www.talkorigins.org/origins/biblio/" + scraper = TalkOriginsScraper( + source_client=FakeSourceClient( + { + base_url: INDEX_HTML, + f"{base_url}abiogenesis.html": ABIOGENESIS_HTML, + f"{base_url}evolution.html": EVOLUTION_HTML, + } + ) + ) + + export = scraper.scrape_to_directory(base_url=base_url, output_dir=tmp_path) + Path(export.seed_sets[0].seed_bib).write_text( + """ +@misc{weak1, + author = "Smith, Jane", + year = "1999", + title = "Weak Duplicate" +} + +@misc{weak2, + author = "Smith, Jane", + year = "1999", + title = "Weak Duplicate", + note = "Copied from legacy source" +} +""", + encoding="utf-8", + ) + Path(export.seed_sets[1].seed_bib).write_text( + """ +@article{strong1, + author = "Jones, Alex", + year = "2001", + title = "Strong Duplicate", + journal = "Journal B", + doi = "10.1000/strong" +} + +@article{strong2, + author = "Jones, Alex", + year = "2001", + title = "Strong Duplicate", + journal = "Journal B" +} +""", + encoding="utf-8", + ) + + clusters = scraper.inspect_duplicate_clusters( + export.manifest_path, + preview_canonical=True, + weak_only=True, + ) + + assert len(clusters) == 1 + assert clusters[0].canonical is not None + assert clusters[0].canonical["citation_key"] == "weak2" + assert "entry_type:misc" in clusters[0].canonical["weak_reasons"] + assert "missing:doi" in clusters[0].canonical["weak_reasons"] + + +def test_talkorigins_enrich_weak_canonicals_can_preview_resolution(tmp_path: Path): + base_url = "https://www.talkorigins.org/origins/biblio/" + scraper = TalkOriginsScraper( + source_client=FakeSourceClient( + { + base_url: INDEX_HTML, + f"{base_url}abiogenesis.html": ABIOGENESIS_HTML, + f"{base_url}evolution.html": EVOLUTION_HTML, + } + ) + ) + + export = scraper.scrape_to_directory(base_url=base_url, output_dir=tmp_path) + Path(export.seed_sets[0].seed_bib).write_text( + """ +@misc{weak1, + author = "Smith, Jane", + year = "1999", + title = "Weak Duplicate" +} + +@misc{weak2, + author = "Smith, Jane", + year = "1999", + title = "Weak Duplicate", + note = "Copied from legacy source" +} +""", + encoding="utf-8", + ) + Path(export.seed_sets[1].seed_bib).write_text("", encoding="utf-8") + + from citegeist.resolve import Resolution + + scraper.resolver.resolve_entry = lambda entry: Resolution( # type: ignore[method-assign] + entry=BibEntry( + entry_type="article", + citation_key="resolved", + fields={ + "author": entry.fields["author"], + "title": entry.fields["title"], + "year": entry.fields["year"], + "doi": "10.1000/weak", + "journal": "Journal of Better Metadata", + }, + ), + source_type="resolver", + source_label="crossref:search:Weak Duplicate", + ) + + store = BibliographyStore() + try: + results = scraper.enrich_weak_canonicals(export.manifest_path, store, apply=False) + finally: + store.close() + + assert len(results) == 1 + assert results[0].resolved is True + assert results[0].applied is False + assert results[0].weak_reasons_after == [] + + +def 
test_talkorigins_enrich_weak_canonicals_can_apply_to_store(tmp_path: Path): + base_url = "https://www.talkorigins.org/origins/biblio/" + scraper = TalkOriginsScraper( + source_client=FakeSourceClient( + { + base_url: INDEX_HTML, + f"{base_url}abiogenesis.html": ABIOGENESIS_HTML, + f"{base_url}evolution.html": EVOLUTION_HTML, + } + ) + ) + + export = scraper.scrape_to_directory(base_url=base_url, output_dir=tmp_path) + Path(export.seed_sets[0].seed_bib).write_text( + """ +@misc{weak2, + author = "Smith, Jane", + year = "1999", + title = "Weak Duplicate" +} +""", + encoding="utf-8", + ) + Path(export.seed_sets[1].seed_bib).write_text( + """ +@misc{weak1, + author = "Smith, Jane", + year = "1999", + title = "Weak Duplicate", + note = "Copied from legacy source" +} +""", + encoding="utf-8", + ) + + from citegeist.resolve import Resolution + + scraper.resolver.resolve_entry = lambda entry: Resolution( # type: ignore[method-assign] + entry=BibEntry( + entry_type="article", + citation_key="resolved", + fields={ + "author": entry.fields["author"], + "title": entry.fields["title"], + "year": entry.fields["year"], + "doi": "10.1000/weak", + "journal": "Journal of Better Metadata", + }, + ), + source_type="resolver", + source_label="crossref:search:Weak Duplicate", + ) + + store = BibliographyStore() + try: + scraper.ingest_export(export.manifest_path, store, dedupe=False) + results = scraper.enrich_weak_canonicals(export.manifest_path, store, apply=True) + + assert len(results) == 1 + assert results[0].applied is True + entry = store.get_entry(results[0].citation_key) + assert entry is not None + assert entry["doi"] == "10.1000/weak" + assert entry["journal"] == "Journal of Better Metadata" + assert entry["review_status"] == "enriched" + finally: + store.close() + + +def test_talkorigins_enrich_weak_canonicals_rejects_unsafe_search_match(tmp_path: Path): + base_url = "https://www.talkorigins.org/origins/biblio/" + scraper = TalkOriginsScraper( + source_client=FakeSourceClient( + { + base_url: INDEX_HTML, + f"{base_url}abiogenesis.html": ABIOGENESIS_HTML, + f"{base_url}evolution.html": EVOLUTION_HTML, + } + ) + ) + + export = scraper.scrape_to_directory(base_url=base_url, output_dir=tmp_path) + Path(export.seed_sets[0].seed_bib).write_text( + """ +@misc{weak2, + author = "Adams, D", + year = "1987", + title = "The bigger they are, the harder they fall" +} +""", + encoding="utf-8", + ) + Path(export.seed_sets[1].seed_bib).write_text( + """ +@misc{weak1, + author = "Adams, D", + year = "1987", + title = "The bigger they are, the harder they fall", + note = "Copied from legacy source" +} +""", + encoding="utf-8", + ) + + from citegeist.resolve import Resolution + + scraper.resolver.resolve_entry = lambda entry: Resolution( # type: ignore[method-assign] + entry=BibEntry( + entry_type="misc", + citation_key="resolved", + fields={ + "author": "Kulik, Dean", + "title": "The Nexus Recursive Harmonic Framework: Reality as Unbounded, Observerless Computation (SILR / RHA / CST) Ver 2", + "year": "2026", + "doi": "10.9999/not-a-match", + }, + ), + source_type="resolver", + source_label="datacite:search:The bigger they are, the harder they fall", + ) + + store = BibliographyStore() + try: + scraper.ingest_export(export.manifest_path, store, dedupe=False) + results = scraper.enrich_weak_canonicals(export.manifest_path, store, apply=True) + + assert len(results) == 1 + assert results[0].resolved is False + assert results[0].applied is False + assert results[0].error == "unsafe resolver match" + entry = 
store.get_entry("weak2") or store.get_entry("weak1") + assert entry is not None + assert entry["doi"] is None + finally: + store.close() + + +def test_talkorigins_enrich_weak_canonicals_can_allow_unsafe_search_match(tmp_path: Path): + base_url = "https://www.talkorigins.org/origins/biblio/" + scraper = TalkOriginsScraper( + source_client=FakeSourceClient( + { + base_url: INDEX_HTML, + f"{base_url}abiogenesis.html": ABIOGENESIS_HTML, + f"{base_url}evolution.html": EVOLUTION_HTML, + } + ) + ) + + export = scraper.scrape_to_directory(base_url=base_url, output_dir=tmp_path) + Path(export.seed_sets[0].seed_bib).write_text( + """ +@misc{weak2, + author = "Adams, D", + year = "1987", + title = "The bigger they are, the harder they fall" +} +""", + encoding="utf-8", + ) + Path(export.seed_sets[1].seed_bib).write_text( + """ +@misc{weak1, + author = "Adams, D", + year = "1987", + title = "The bigger they are, the harder they fall", + note = "Copied from legacy source" +} +""", + encoding="utf-8", + ) + + from citegeist.resolve import Resolution + + scraper.resolver.resolve_entry = lambda entry: Resolution( # type: ignore[method-assign] + entry=BibEntry( + entry_type="misc", + citation_key="resolved", + fields={ + "author": "Kulik, Dean", + "title": "The Nexus Recursive Harmonic Framework: Reality as Unbounded, Observerless Computation (SILR / RHA / CST) Ver 2", + "year": "2026", + "doi": "10.9999/not-a-match", + }, + ), + source_type="resolver", + source_label="datacite:search:The bigger they are, the harder they fall", + ) + + store = BibliographyStore() + try: + scraper.ingest_export(export.manifest_path, store, dedupe=False) + results = scraper.enrich_weak_canonicals( + export.manifest_path, + store, + apply=True, + allow_unsafe_matches=True, + ) + + assert len(results) == 1 + assert results[0].resolved is True + assert results[0].applied is True + entry = store.get_entry(results[0].citation_key) + assert entry is not None + assert entry["doi"] == "10.9999/not-a-match" + finally: + store.close() + + +def test_talkorigins_build_review_export_combines_cluster_and_enrichment(tmp_path: Path): + base_url = "https://www.talkorigins.org/origins/biblio/" + scraper = TalkOriginsScraper( + source_client=FakeSourceClient( + { + base_url: INDEX_HTML, + f"{base_url}abiogenesis.html": ABIOGENESIS_HTML, + f"{base_url}evolution.html": EVOLUTION_HTML, + } + ) + ) + + export = scraper.scrape_to_directory(base_url=base_url, output_dir=tmp_path) + Path(export.seed_sets[0].seed_bib).write_text( + """ +@misc{weak1, + author = "Smith, Jane", + year = "1999", + title = "Weak Duplicate" +} + +@misc{weak2, + author = "Smith, Jane", + year = "1999", + title = "Weak Duplicate", + note = "Copied from legacy source" +} +""", + encoding="utf-8", + ) + Path(export.seed_sets[1].seed_bib).write_text("", encoding="utf-8") + + from citegeist.resolve import Resolution + + scraper.resolver.resolve_entry = lambda entry: Resolution( # type: ignore[method-assign] + entry=BibEntry( + entry_type="article", + citation_key="resolved", + fields={ + "author": entry.fields["author"], + "title": entry.fields["title"], + "year": entry.fields["year"], + "doi": "10.1000/weak", + "journal": "Journal of Better Metadata", + }, + ), + source_type="resolver", + source_label="crossref:search:Weak Duplicate", + ) + + store = BibliographyStore() + try: + review = scraper.build_review_export(export.manifest_path, store) + finally: + store.close() + + assert review.item_count == 1 + assert review.items[0]["canonical"]["citation_key"] == "weak2" + assert 
review.items[0]["enrichment"]["resolved"] is True + assert review.items[0]["enrichment"]["applied"] is False + + +def test_talkorigins_apply_review_corrections_updates_store_entry(tmp_path: Path): + base_url = "https://www.talkorigins.org/origins/biblio/" + scraper = TalkOriginsScraper( + source_client=FakeSourceClient( + { + base_url: INDEX_HTML, + f"{base_url}abiogenesis.html": ABIOGENESIS_HTML, + f"{base_url}evolution.html": EVOLUTION_HTML, + } + ) + ) + + export = scraper.scrape_to_directory(base_url=base_url, output_dir=tmp_path) + Path(export.seed_sets[0].seed_bib).write_text( + """ +@misc{weak1, + author = "Smith, Jane", + year = "1999", + title = "Weak Duplicate" +} + +@misc{weak2, + author = "Smith, Jane", + year = "1999", + title = "Weak Duplicate", + note = "Copied from legacy source" +} +""", + encoding="utf-8", + ) + Path(export.seed_sets[1].seed_bib).write_text("", encoding="utf-8") + + corrections_path = tmp_path / "corrections.json" + corrections_path.write_text( + json.dumps( + { + "corrections": [ + { + "key": "smith jane|1999|weak duplicate", + "entry_type": "article", + "review_status": "reviewed", + "fields": { + "journal": "Journal of Better Metadata", + "doi": "10.1000/weak", + "note": None, + }, + } + ] + } + ), + encoding="utf-8", + ) + + store = BibliographyStore() + try: + scraper.ingest_export(export.manifest_path, store, dedupe=True) + results = scraper.apply_review_corrections(export.manifest_path, corrections_path, store) + + assert len(results) == 1 + assert results[0].applied is True + entry = store.get_entry(results[0].citation_key) + assert entry is not None + assert entry["entry_type"] == "article" + assert entry["journal"] == "Journal of Better Metadata" + assert entry["doi"] == "10.1000/weak" + assert entry["review_status"] == "reviewed" + assert entry.get("note") is None + finally: + store.close() + + +def test_talkorigins_scraper_assigns_topics_when_ingesting(tmp_path: Path): + base_url = "https://www.talkorigins.org/origins/biblio/" + scraper = TalkOriginsScraper( + source_client=FakeSourceClient( + { + base_url: INDEX_HTML, + f"{base_url}abiogenesis.html": ABIOGENESIS_HTML, + } + ) + ) + + store = BibliographyStore() + try: + export = scraper.scrape_to_directory( + base_url=base_url, + output_dir=tmp_path, + limit_topics=1, + ingest_store=store, + ) + + assert export.entry_count == 2 + entry = store.get_entry("smith1998first1") + assert entry is not None + assert entry["topics"][0]["slug"] == "abiogenesis" + assert entry["topics"][0]["name"] == "Abiogenesis" + assert store.list_topics()[0]["slug"] == "abiogenesis" + finally: + store.close() + + +def test_talkorigins_ingest_export_consolidates_duplicates_into_one_entry(tmp_path: Path): + base_url = "https://www.talkorigins.org/origins/biblio/" + scraper = TalkOriginsScraper( + source_client=FakeSourceClient( + { + base_url: INDEX_HTML, + f"{base_url}abiogenesis.html": ABIOGENESIS_HTML, + f"{base_url}evolution.html": EVOLUTION_HTML, + } + ) + ) + + export = scraper.scrape_to_directory(base_url=base_url, output_dir=tmp_path) + Path(export.seed_sets[0].seed_bib).write_text( + """ +@article{dup1, + author = "Smith, Jane", + year = "1999", + title = "Duplicate Paper", + journal = "Journal A" +} +""", + encoding="utf-8", + ) + Path(export.seed_sets[1].seed_bib).write_text( + """ +@article{dup2, + author = "Smith, Jane", + year = "1999", + title = "Duplicate Paper", + journal = "Journal B", + doi = "10.1000/dup" +} +""", + encoding="utf-8", + ) + + store = BibliographyStore() + try: + report = 
+        report = scraper.ingest_export(export.manifest_path, store)
+
+        assert report.duplicate_cluster_count >= 1
+        assert report.stored_entry_count == 1
+        assert report.canonicalized_count >= 1
+        entry = store.get_entry("dup2")
+        assert entry is not None
+        assert entry["doi"] == "10.1000/dup"
+        assert [topic["slug"] for topic in entry["topics"]] == ["abiogenesis", "evolution"]
+    finally:
+        store.close()
+
+
+def test_talkorigins_ingest_export_avoids_canonical_key_collisions(tmp_path: Path):
+    base_url = "https://www.talkorigins.org/origins/biblio/"
+    scraper = TalkOriginsScraper(
+        source_client=FakeSourceClient(
+            {
+                base_url: INDEX_HTML,
+                f"{base_url}abiogenesis.html": ABIOGENESIS_HTML,
+                f"{base_url}evolution.html": EVOLUTION_HTML,
+            }
+        )
+    )
+
+    export = scraper.scrape_to_directory(base_url=base_url, output_dir=tmp_path)
+    Path(export.seed_sets[0].seed_bib).write_text(
+        """
+@article{sharedkey,
+    author = "Smith, Jane",
+    year = "1999",
+    title = "First Paper",
+    journal = "Journal A"
+}
+""",
+        encoding="utf-8",
+    )
+    Path(export.seed_sets[1].seed_bib).write_text(
+        """
+@article{sharedkey,
+    author = "Jones, Alex",
+    year = "2001",
+    title = "Second Paper",
+    journal = "Journal B"
+}
+""",
+        encoding="utf-8",
+    )
+
+    store = BibliographyStore()
+    try:
+        report = scraper.ingest_export(export.manifest_path, store)
+
+        assert report.stored_entry_count == 2
+        entries = store.list_entries(limit=10)
+        assert len(entries) == 2
+        assert len({entry["citation_key"] for entry in entries}) == 2
+    finally:
+        store.close()
+
+
+def test_load_batch_jobs_resolves_relative_seed_paths(tmp_path: Path):
+    seed_bib = tmp_path / "seeds" / "topic.bib"
+    seed_bib.parent.mkdir(parents=True)
+    seed_bib.write_text("", encoding="utf-8")
+
+    jobs_json = tmp_path / "jobs.json"
+    jobs_json.write_text(
+        """
+{
+    "jobs": [
+        {"name": "relative-job", "seed_bib": "seeds/topic.bib", "topic": "Abiogenesis"}
+    ]
+}
+""",
+        encoding="utf-8",
+    )
+
+    jobs = load_batch_jobs(jobs_json)
+
+    assert jobs[0]["seed_bib"] == str(seed_bib.resolve())
diff --git a/tests/test_topic_expand.py b/tests/test_topic_expand.py
new file mode 100644
index 0000000..f7cd4a7
--- /dev/null
+++ b/tests/test_topic_expand.py
@@ -0,0 +1,242 @@
+from citegeist.bibtex import BibEntry
+from citegeist.expand import (
+    ExpansionResult,
+    TopicExpander,
+    _meets_topic_assignment_threshold,
+    _topic_relevance_score,
+)
+from citegeist.storage import BibliographyStore
+
+
+class FakeOpenAlexExpander:
+    def __init__(self, results: list[ExpansionResult] | dict[str, list[ExpansionResult]]) -> None:
+        self.results = results
+
+    def expand_entry(self, store, citation_key, relation_type="cites", limit=25):
+        if isinstance(self.results, dict):
+            return list(self.results.get(citation_key, []))
+        return list(self.results)
+
+
+def test_topic_expander_assigns_relevant_discoveries_back_to_topic():
+    store = BibliographyStore()
+    try:
+        store.ingest_bibtex(
+            """
+@article{seed2024,
+    author = {Seed, Alice},
+    title = {Abiogenesis Seed Paper},
+    year = {2024}
+}
+"""
+        )
+        store.add_entry_topic(
+            "seed2024",
+            topic_slug="abiogenesis",
+            topic_name="Abiogenesis",
+            source_type="talkorigins",
+            source_url="https://example.org/topics/abiogenesis",
+            source_label="seed",
+        )
+        store.upsert_entry(
+            BibEntry(
+                entry_type="article",
+                citation_key="discovered1",
+                fields={
+                    "title": "Abiogenesis and origin chemistry",
+                    "abstract": "A study of abiogenesis pathways.",
+                    "year": "2025",
+                },
+            ),
+            source_type="graph_expand",
+            source_label="test",
+            review_status="draft",
+        )
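+        # Descriptive comment (editorial addition): a second discovery whose
+        # metadata does not match the topic phrase and should stay unassigned.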
+        store.upsert_entry(
+            BibEntry(
+                entry_type="article",
+                citation_key="discovered2",
+                fields={
+                    "title": "Galaxy formation dynamics",
+                    "abstract": "Nothing about the topic.",
+                    "year": "2025",
+                },
+            ),
+            source_type="graph_expand",
+            source_label="test",
+            review_status="draft",
+        )
+        store.connection.commit()
+
+        expander = TopicExpander(
+            openalex_expander=FakeOpenAlexExpander(
+                [
+                    ExpansionResult("seed2024", "discovered1", False, "cites", "openalex:cites:seed2024"),
+                    ExpansionResult("seed2024", "discovered2", False, "cites", "openalex:cites:seed2024"),
+                ]
+            )
+        )
+
+        results = expander.expand_topic(
+            store,
+            "abiogenesis",
+            topic_phrase="abiogenesis origin chemistry",
+            min_relevance=0.34,
+        )
+
+        assert len(results) == 2
+        assigned = {item.discovered_citation_key: item.assigned_to_topic for item in results}
+        assert assigned["discovered1"] is True
+        assert assigned["discovered2"] is False
+        topics = store.get_entry_topics("discovered1")
+        assert topics[0]["slug"] == "abiogenesis"
+        assert store.get_entry_topics("discovered2") == []
+    finally:
+        store.close()
+
+
+def test_topic_expander_can_restrict_to_allowed_seed_keys():
+    store = BibliographyStore()
+    try:
+        store.ingest_bibtex(
+            """
+@article{seed2024,
+    author = {Seed, Alice},
+    title = {Abiogenesis Seed Paper},
+    year = {2024}
+}
+
+@article{seed2023,
+    author = {Seed, Bob},
+    title = {Abiogenesis Historical Seed},
+    year = {2023}
+}
+"""
+        )
+        for citation_key in ("seed2024", "seed2023"):
+            store.add_entry_topic(
+                citation_key,
+                topic_slug="abiogenesis",
+                topic_name="Abiogenesis",
+                source_type="talkorigins",
+                source_url="https://example.org/topics/abiogenesis",
+                source_label="seed",
+            )
+        store.upsert_entry(
+            BibEntry(
+                entry_type="article",
+                citation_key="discovered1",
+                fields={
+                    "title": "Abiogenesis origin chemistry",
+                    "abstract": "A study of abiogenesis chemistry.",
+                    "year": "2025",
+                },
+            ),
+            source_type="graph_expand",
+            source_label="test",
+            review_status="draft",
+        )
+        store.connection.commit()
+
+        expander = TopicExpander(
+            openalex_expander=FakeOpenAlexExpander(
+                {"seed2023": [ExpansionResult("seed2023", "discovered1", False, "cites", "openalex:cites:seed2023")]}
+            )
+        )
+
+        results = expander.expand_topic(
+            store,
+            "abiogenesis",
+            topic_phrase="abiogenesis origin chemistry",
+            seed_keys=["seed2024"],
+        )
+
+        assert results == []
+        assert store.get_entry_topics("discovered1") == []
+    finally:
+        store.close()
+
+
+def test_topic_expander_preview_discovers_without_writing():
+    store = BibliographyStore()
+    try:
+        store.ingest_bibtex(
+            """
+@article{seed2024,
+    author = {Seed, Alice},
+    title = {Abiogenesis Seed Paper},
+    year = {2024}
+}
+"""
+        )
+        store.add_entry_topic(
+            "seed2024",
+            topic_slug="abiogenesis",
+            topic_name="Abiogenesis",
+            source_type="talkorigins",
+            source_url="https://example.org/topics/abiogenesis",
+            source_label="seed",
+        )
+        store.connection.commit()
+
+        expander = TopicExpander()
+        expander._preview_discoveries = lambda *_args, **_kwargs: [  # type: ignore[method-assign]
+            (
+                ExpansionResult(
+                    "seed2024",
+                    "preview1",
+                    True,
+                    "cites",
+                    "openalex:cites:seed2024",
+                ),
+                {
+                    "title": "Abiogenesis origin chemistry",
+                    "abstract": "A study of abiogenesis chemistry.",
+                    "year": "2025",
+                },
+            )
+        ]
+
+        results = expander.expand_topic(
+            store,
+            "abiogenesis",
+            topic_phrase="abiogenesis origin chemistry",
+            min_relevance=0.3,
+            preview_only=True,
+        )
+
+        assert len(results) == 1
+        assert results[0].discovered_citation_key == "preview1"
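+        # Descriptive comment (editorial addition): preview mode reports what
+        # would change without persisting entries or topic links to the store.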
+        assert results[0].meets_relevance_threshold is True
+        assert results[0].assigned_to_topic is False
+        assert results[0].created_entry is True
+        assert store.get_entry("preview1") is None
+        assert store.get_entry_topics("preview1") == []
+    finally:
+        store.close()
+
+
+def test_topic_relevance_score_expands_human_evolution_terms():
+    score = _topic_relevance_score(
+        "human evolution",
+        {
+            "title": "Body size and proportions in early hominids",
+            "abstract": "A fossil and paleolithic perspective on primate ancestry.",
+            "journal": "Science",
+        },
+    )
+
+    assert score >= 0.15
+
+
+def test_topic_assignment_requires_title_anchor():
+    entry = {
+        "title": "Phylogenies and the Comparative Method",
+        "abstract": "A comparative framework for primate and hominid evolution.",
+        "journal": "Systematic Zoology",
+    }
+
+    score = _topic_relevance_score("human evolution", entry)
+
+    assert score >= 0.15
+    assert _meets_topic_assignment_threshold("human evolution", entry, min_relevance=0.15, relevance_score=score) is False