diff --git a/README.md b/README.md index 609ff29..cef32e0 100644 --- a/README.md +++ b/README.md @@ -56,11 +56,12 @@ The initial repo includes: - OAI-PMH repository discovery via `Identify`, `ListSets`, and `ListMetadataFormats` to target harvests more precisely; - bibliography bootstrap workflows that can start from a seed `.bib`, a topic phrase, or both; - batch bootstrap orchestration from JSON job files containing seed BibTeX paths, topic phrases, or both; -- a TalkOrigins scraper that fixes repeated-author plaintext references, emits per-topic seed BibTeX files, and writes a batch JSON specification; - normalized tables for entries, creators, identifiers, and citation relations; - full-text-search-ready indexing over title, abstract, and fulltext when SQLite FTS5 is available; - tests covering parsing, ingestion, relation storage, and search. +Example applications live alongside the core package rather than defining it. The current example corpus pipeline is the TalkOrigins bibliography workflow under [`citegeist.examples.talkorigins`](./src/citegeist/examples/talkorigins.py) with a usage guide in [examples/talkorigins/README.md](./examples/talkorigins/README.md). + The prioritized execution plan lives in [ROADMAP.md](./ROADMAP.md). ## Layout @@ -69,6 +70,7 @@ The prioritized execution plan lives in [ROADMAP.md](./ROADMAP.md). 
citegeist/ src/citegeist/ bibtex.py + examples/ storage.py tests/ test_storage.py @@ -125,7 +127,6 @@ PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 resolve-confli PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 apply-conflict smith2024graphs title PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 bootstrap --seed-bib seed.bib --topic "bayesian nonparametrics" PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 bootstrap --topic "bayesian nonparametrics" --preview --topic-commit-limit 5 -PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 scrape-talkorigins talkorigins-out --limit-topics 5 --limit-entries-per-topic 20 PYTHONPATH=src .venv/bin/python -m citegeist extract references.txt --output draft.bib PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 resolve smith2024graphs PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 topics @@ -143,42 +144,7 @@ PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 export --outpu For live-source development, prefer fixture-backed or cache-backed source clients so resolver and expansion work can be exercised repeatedly without re-hitting upstream APIs on every run. -For large legacy plaintext corpora such as the TalkOrigins bibliography, prefer a two-step workflow: - -1. `scrape-talkorigins` to generate cleaned per-topic `seed_bib` files plus a `talkorigins_jobs.json` batch spec. -2. `bootstrap-batch` on that JSON file when you want to ingest, resolve, and expand from the generated seeds. 
- -The TalkOrigins scrape output now includes: - -- `seeds/*.bib` per-topic seed BibTeX files for `bootstrap-batch` -- `plaintext/*.txt` per-topic cleaned GSA-style plaintext with repeated authors expanded -- `site/topics/*.html` reconstructed topic pages with hide/show BibTeX blocks -- `talkorigins_full.txt` and `talkorigins_full.bib` aggregate downloads -- `snapshots/*.json` cached topic payloads so reruns can resume without re-fetching already scraped topics - -After a full scrape, run: - -```bash -PYTHONPATH=src .venv/bin/python -m citegeist validate-talkorigins talkorigins-out/talkorigins_manifest.json -PYTHONPATH=src .venv/bin/python -m citegeist duplicates-talkorigins talkorigins-out/talkorigins_manifest.json --limit 20 -PYTHONPATH=src .venv/bin/python -m citegeist duplicates-talkorigins talkorigins-out/talkorigins_manifest.json --limit 20 --preview --weak-only -PYTHONPATH=src .venv/bin/python -m citegeist suggest-talkorigins-phrases talkorigins-out/talkorigins_manifest.json --output topic-phrases.json -PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 stage-topic-phrases topic-phrases.json -PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 export-topic-phrase-reviews --output topic-phrase-review.json -PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 review-topic-phrase abiogenesis accepted --notes "curated from local corpus" -PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 review-topic-phrases topic-phrase-review.json -PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 apply-topic-phrases topic-phrases.json -PYTHONPATH=src .venv/bin/python -m citegeist --db talkorigins.sqlite3 enrich-talkorigins talkorigins-out/talkorigins_manifest.json --limit 20 -PYTHONPATH=src .venv/bin/python -m citegeist --db talkorigins-copy.sqlite3 enrich-talkorigins talkorigins-out/talkorigins_manifest.json --limit 5 --apply --allow-unsafe-search-matches -PYTHONPATH=src .venv/bin/python -m citegeist 
--db talkorigins.sqlite3 review-talkorigins talkorigins-out/talkorigins_manifest.json --output talkorigins-review.json -PYTHONPATH=src .venv/bin/python -m citegeist --db talkorigins.sqlite3 apply-talkorigins-corrections talkorigins-out/talkorigins_manifest.json talkorigins-corrections.json -``` - -That report summarizes parse coverage and flags suspicious entry-type / venue combinations for manual cleanup. -It also reports duplicate clusters across topic seed files so you can gauge how much deduplication pressure to expect before ingestion. -Use `duplicates-talkorigins` when you want to inspect specific clusters, filter by text, restrict the audit to one topic slug, or preview only weak canonicalization outcomes before importing. - -Use `suggest-talkorigins-phrases` to derive candidate stored expansion phrases from the existing TalkOrigins topic corpus itself. The output is deterministic JSON keyed by topic slug, with a suggested phrase plus the extracted keywords that drove it. This is a useful first pass before setting topic phrases in the database or editing generated batch jobs. +## Example Application Use `stage-topic-phrases` to load those suggestions into the database as review items. Staging stores the candidate in `suggested_phrase` and marks the topic `pending` without changing the active `expansion_phrase`. Use `export-topic-phrase-reviews` to write an editable JSON template directly from the database for the currently staged suggestions. That gives you a round-trip path from DB review queue to file edits and back into `review-topic-phrases`. @@ -194,6 +160,22 @@ Use `set-topic-phrase` to store a curated expansion phrase on the topic itself. Use `topics --phrase-review-status pending` when you want to audit only topics whose staged phrase suggestions still need review. `--allow-unsafe-search-matches` exists only for bounded experiments on copied databases when you explicitly want to relax trust to exercise downstream expansion behavior. 
+The TalkOrigins corpus pipeline remains in the repository as an example application rather than a core package surface. Use the example-scoped Python namespace: + +```python +from citegeist.examples.talkorigins import TalkOriginsScraper +``` + +and the example-scoped CLI commands: + +```bash +PYTHONPATH=src .venv/bin/python -m citegeist example-talkorigins-scrape talkorigins-out --limit-topics 5 --limit-entries-per-topic 20 +PYTHONPATH=src .venv/bin/python -m citegeist example-talkorigins-validate talkorigins-out/talkorigins_manifest.json +PYTHONPATH=src .venv/bin/python -m citegeist example-talkorigins-duplicates talkorigins-out/talkorigins_manifest.json --limit 20 --preview --weak-only +``` + +The older `scrape-talkorigins`-style command names remain available as compatibility aliases. The full example workflow and reconstruction notes live in [examples/talkorigins/README.md](./examples/talkorigins/README.md). + Correction files are simple JSON: ```json @@ -215,15 +197,6 @@ Correction files are simple JSON: `fields` values overwrite the canonical entry for that duplicate-cluster key. Set a field to `null` to remove it. -To import the reconstructed corpus into SQLite while collapsing duplicate works across topics into canonical entries: - -```bash -PYTHONPATH=src .venv/bin/python -m citegeist --db talkorigins.sqlite3 ingest-talkorigins talkorigins-out/talkorigins_manifest.json -``` - -That import preserves many-to-many topic membership through the `topics` and `entry_topics` tables. -After import, use `topics`, `topic-entries`, `search --topic`, and `export-topic` to inspect or export topic slices from the consolidated database. - Live-source workflow: ```bash diff --git a/examples/talkorigins/README.md b/examples/talkorigins/README.md new file mode 100644 index 0000000..26481c0 --- /dev/null +++ b/examples/talkorigins/README.md @@ -0,0 +1,52 @@ +# TalkOrigins Example + +This example shows how to use `citegeist` on a large legacy plaintext bibliography corpus. 
+ +It is intentionally positioned as an application of the core library, not as the main product surface. + +## What It Demonstrates + +- scraping a legacy bibliography index; +- normalizing repeated-author plaintext references; +- converting topic pages into per-topic seed BibTeX; +- generating batch bootstrap specs for downstream ingest and expansion; +- reconstructing cleaned plaintext and BibTeX topic pages for review; +- validating parse quality, duplicate clusters, and weak canonical entries; +- curating topic phrases and correction files before broader enrichment. + +The example implementation lives under the Python namespace: + +```python +from citegeist.examples.talkorigins import TalkOriginsScraper +``` + +The preferred CLI commands are example-scoped: + +```bash +PYTHONPATH=src .venv/bin/python -m citegeist example-talkorigins-scrape talkorigins-out --limit-topics 5 --limit-entries-per-topic 20 +PYTHONPATH=src .venv/bin/python -m citegeist example-talkorigins-validate talkorigins-out/talkorigins_manifest.json +PYTHONPATH=src .venv/bin/python -m citegeist example-talkorigins-duplicates talkorigins-out/talkorigins_manifest.json --limit 20 --preview --weak-only +PYTHONPATH=src .venv/bin/python -m citegeist example-talkorigins-suggest-phrases talkorigins-out/talkorigins_manifest.json --output topic-phrases.json +PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 stage-topic-phrases topic-phrases.json +PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 export-topic-phrase-reviews --output topic-phrase-review.json +PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 review-topic-phrases topic-phrase-review.json +PYTHONPATH=src .venv/bin/python -m citegeist --db talkorigins.sqlite3 example-talkorigins-enrich talkorigins-out/talkorigins_manifest.json --limit 20 +PYTHONPATH=src .venv/bin/python -m citegeist --db talkorigins.sqlite3 example-talkorigins-review talkorigins-out/talkorigins_manifest.json --output 
talkorigins-review.json +PYTHONPATH=src .venv/bin/python -m citegeist --db talkorigins.sqlite3 example-talkorigins-apply-corrections talkorigins-out/talkorigins_manifest.json talkorigins-corrections.json +PYTHONPATH=src .venv/bin/python -m citegeist --db talkorigins.sqlite3 example-talkorigins-ingest talkorigins-out/talkorigins_manifest.json +``` + +## Output Artifacts + +The example scrape writes: + +- `seeds/*.bib` per-topic seed BibTeX files; +- `plaintext/*.txt` cleaned GSA-style plaintext with repeated authors expanded; +- `site/topics/*.html` reconstructed topic pages with hide/show BibTeX blocks; +- `talkorigins_full.txt` and `talkorigins_full.bib` aggregate downloads; +- `snapshots/*.json` cached topic payloads so reruns can resume. + +## Notes + +- The example-specific CLI names have compatibility aliases matching the older `scrape-talkorigins`-style command names. +- Topic phrase staging, review, and export commands are generic `citegeist` functionality and are not specific to TalkOrigins.
diff --git a/src/citegeist/__init__.py b/src/citegeist/__init__.py index bacb1c7..022568c 100644 --- a/src/citegeist/__init__.py +++ b/src/citegeist/__init__.py @@ -7,18 +7,6 @@ from .harvest import OaiMetadataFormat, OaiPmhHarvester, OaiSet from .resolve import MetadataResolver, merge_entries, merge_entries_with_conflicts from .sources import SourceClient from .storage import BibliographyStore -from .talkorigins import ( - TalkOriginsBatchExport, - TalkOriginsDuplicateCluster, - TalkOriginsEnrichmentResult, - TalkOriginsIngestReport, - TalkOriginsReviewExport, - TalkOriginsScraper, - TalkOriginsSeedSet, - TalkOriginsTopicPhraseSuggestion, - TalkOriginsTopic, - TalkOriginsValidationReport, -) __all__ = [ "BibEntry", @@ -34,16 +22,6 @@ __all__ = [ "OaiMetadataFormat", "OaiSet", "SourceClient", - "TalkOriginsBatchExport", - "TalkOriginsDuplicateCluster", - "TalkOriginsEnrichmentResult", - "TalkOriginsIngestReport", - "TalkOriginsReviewExport", - "TalkOriginsScraper", - "TalkOriginsSeedSet", - "TalkOriginsTopicPhraseSuggestion", - "TalkOriginsTopic", - "TalkOriginsValidationReport", "extract_references", "load_batch_jobs", "merge_entries", diff --git a/src/citegeist/cli.py b/src/citegeist/cli.py index 801f272..7d50f22 100644 --- a/src/citegeist/cli.py +++ b/src/citegeist/cli.py @@ -9,12 +9,12 @@ from pathlib import Path from .batch import BatchBootstrapRunner, load_batch_jobs from .bibtex import parse_bibtex, render_bibtex from .bootstrap import Bootstrapper +from .examples.talkorigins import TalkOriginsScraper from .expand import CrossrefExpander, OpenAlexExpander, TopicExpander from .extract import extract_references from .harvest import OaiPmhHarvester from .resolve import MetadataResolver, merge_entries_with_conflicts from .storage import BibliographyStore -from .talkorigins import TalkOriginsScraper def build_parser() -> argparse.ArgumentParser: @@ -205,8 +205,9 @@ def build_parser() -> argparse.ArgumentParser: batch_parser.add_argument("input", help="Path to 
batch JSON file") talkorigins_parser = subparsers.add_parser( - "scrape-talkorigins", - help="Scrape TalkOrigins into per-topic seed BibTeX files and a bootstrap-batch JSON file", + "example-talkorigins-scrape", + aliases=["scrape-talkorigins"], + help="Example workflow: scrape TalkOrigins into per-topic seed BibTeX files and a bootstrap-batch JSON file", ) talkorigins_parser.add_argument( "output_dir", @@ -257,14 +258,16 @@ def build_parser() -> argparse.ArgumentParser: talkorigins_parser.add_argument("--status", default="draft", help="Review status for generated seed jobs") validate_talkorigins_parser = subparsers.add_parser( - "validate-talkorigins", - help="Validate a generated TalkOrigins manifest and report parse coverage and suspicious entries", + "example-talkorigins-validate", + aliases=["validate-talkorigins"], + help="Example workflow: validate a generated TalkOrigins manifest and report parse coverage and suspicious entries", ) validate_talkorigins_parser.add_argument("manifest", help="Path to talkorigins_manifest.json") suggest_talkorigins_parser = subparsers.add_parser( - "suggest-talkorigins-phrases", - help="Suggest stored topic expansion phrases from a TalkOrigins manifest", + "example-talkorigins-suggest-phrases", + aliases=["suggest-talkorigins-phrases"], + help="Example workflow: suggest stored topic expansion phrases from a TalkOrigins manifest", ) suggest_talkorigins_parser.add_argument("manifest", help="Path to talkorigins_manifest.json") suggest_talkorigins_parser.add_argument("--topic", help="Optional topic slug to restrict suggestions") @@ -305,8 +308,9 @@ def build_parser() -> argparse.ArgumentParser: review_topic_phrases_parser.add_argument("input", help="Path to JSON file containing topic phrase review records") duplicates_talkorigins_parser = subparsers.add_parser( - "duplicates-talkorigins", - help="Inspect duplicate clusters in a generated TalkOrigins manifest", + "example-talkorigins-duplicates", + 
aliases=["duplicates-talkorigins"], + help="Example workflow: inspect duplicate clusters in a generated TalkOrigins manifest", ) duplicates_talkorigins_parser.add_argument("manifest", help="Path to talkorigins_manifest.json") duplicates_talkorigins_parser.add_argument("--limit", type=int, default=20, help="Maximum clusters to show") @@ -330,8 +334,9 @@ def build_parser() -> argparse.ArgumentParser: ) ingest_talkorigins_parser = subparsers.add_parser( - "ingest-talkorigins", - help="Ingest a TalkOrigins manifest into the database with duplicate consolidation and topic membership", + "example-talkorigins-ingest", + aliases=["ingest-talkorigins"], + help="Example workflow: ingest a TalkOrigins manifest into the database with duplicate consolidation and topic membership", ) ingest_talkorigins_parser.add_argument("manifest", help="Path to talkorigins_manifest.json") ingest_talkorigins_parser.add_argument("--status", default="draft", help="Review status for imported entries") @@ -342,8 +347,9 @@ def build_parser() -> argparse.ArgumentParser: ) enrich_talkorigins_parser = subparsers.add_parser( - "enrich-talkorigins", - help="Attempt metadata enrichment for weak TalkOrigins canonical entries", + "example-talkorigins-enrich", + aliases=["enrich-talkorigins"], + help="Example workflow: attempt metadata enrichment for weak TalkOrigins canonical entries", ) enrich_talkorigins_parser.add_argument("manifest", help="Path to talkorigins_manifest.json") enrich_talkorigins_parser.add_argument("--limit", type=int, default=20, help="Maximum weak clusters to inspect") @@ -372,8 +378,9 @@ def build_parser() -> argparse.ArgumentParser: ) review_talkorigins_parser = subparsers.add_parser( - "review-talkorigins", - help="Export weak TalkOrigins clusters plus dry-run enrichment outcomes for manual review", + "example-talkorigins-review", + aliases=["review-talkorigins"], + help="Example workflow: export weak TalkOrigins clusters plus dry-run enrichment outcomes for manual review", ) 
review_talkorigins_parser.add_argument("manifest", help="Path to talkorigins_manifest.json") review_talkorigins_parser.add_argument("--limit", type=int, default=20, help="Maximum weak clusters to export") @@ -388,8 +395,9 @@ def build_parser() -> argparse.ArgumentParser: review_talkorigins_parser.add_argument("--output", help="Write review export JSON to a file instead of stdout") apply_review_talkorigins_parser = subparsers.add_parser( - "apply-talkorigins-corrections", - help="Apply curated TalkOrigins review corrections to the consolidated database", + "example-talkorigins-apply-corrections", + aliases=["apply-talkorigins-corrections"], + help="Example workflow: apply curated TalkOrigins review corrections to the consolidated database", ) apply_review_talkorigins_parser.add_argument("manifest", help="Path to talkorigins_manifest.json") apply_review_talkorigins_parser.add_argument("corrections", help="Path to corrections JSON") @@ -530,7 +538,7 @@ def main(argv: list[str] | None = None) -> int: ) if args.command == "bootstrap-batch": return _run_bootstrap_batch(store, Path(args.input)) - if args.command == "scrape-talkorigins": + if args.command in {"example-talkorigins-scrape", "scrape-talkorigins"}: return _run_scrape_talkorigins( store, args.base_url, @@ -545,9 +553,9 @@ def main(argv: list[str] | None = None) -> int: args.topic_commit_limit, args.status, ) - if args.command == "validate-talkorigins": + if args.command in {"example-talkorigins-validate", "validate-talkorigins"}: return _run_validate_talkorigins(Path(args.manifest)) - if args.command == "suggest-talkorigins-phrases": + if args.command in {"example-talkorigins-suggest-phrases", "suggest-talkorigins-phrases"}: return _run_suggest_talkorigins_phrases(Path(args.manifest), args.topic, args.limit, args.output) if args.command == "apply-topic-phrases": return _run_apply_topic_phrases(store, Path(args.input)) @@ -557,7 +565,7 @@ def main(argv: list[str] | None = None) -> int: return 
_run_review_topic_phrase(store, args.topic_slug, args.status, args.notes, args.phrase) if args.command == "review-topic-phrases": return _run_review_topic_phrases(store, Path(args.input)) - if args.command == "duplicates-talkorigins": + if args.command in {"example-talkorigins-duplicates", "duplicates-talkorigins"}: return _run_duplicates_talkorigins( Path(args.manifest), args.limit, @@ -567,9 +575,9 @@ def main(argv: list[str] | None = None) -> int: args.preview, args.weak_only, ) - if args.command == "ingest-talkorigins": + if args.command in {"example-talkorigins-ingest", "ingest-talkorigins"}: return _run_ingest_talkorigins(store, Path(args.manifest), args.status, not args.no_dedupe) - if args.command == "enrich-talkorigins": + if args.command in {"example-talkorigins-enrich", "enrich-talkorigins"}: return _run_enrich_talkorigins( store, Path(args.manifest), @@ -581,7 +589,7 @@ def main(argv: list[str] | None = None) -> int: args.status, args.allow_unsafe_search_matches, ) - if args.command == "review-talkorigins": + if args.command in {"example-talkorigins-review", "review-talkorigins"}: return _run_review_talkorigins( store, Path(args.manifest), @@ -591,7 +599,7 @@ def main(argv: list[str] | None = None) -> int: args.topic, args.output, ) - if args.command == "apply-talkorigins-corrections": + if args.command in {"example-talkorigins-apply-corrections", "apply-talkorigins-corrections"}: return _run_apply_talkorigins_corrections( store, Path(args.manifest), diff --git a/src/citegeist/examples/__init__.py b/src/citegeist/examples/__init__.py new file mode 100644 index 0000000..0fe26e4 --- /dev/null +++ b/src/citegeist/examples/__init__.py @@ -0,0 +1,29 @@ +from .talkorigins import ( + TalkOriginsBatchExport, + TalkOriginsCorrectionResult, + TalkOriginsDuplicateCluster, + TalkOriginsEnrichmentResult, + TalkOriginsIngestReport, + TalkOriginsReviewExport, + TalkOriginsScraper, + TalkOriginsSeedSet, + TalkOriginsTopic, + TalkOriginsTopicPhraseSuggestion, + 
TalkOriginsValidationReport, + normalize_topic_entries, +) + +__all__ = [ + "TalkOriginsBatchExport", + "TalkOriginsCorrectionResult", + "TalkOriginsDuplicateCluster", + "TalkOriginsEnrichmentResult", + "TalkOriginsIngestReport", + "TalkOriginsReviewExport", + "TalkOriginsScraper", + "TalkOriginsSeedSet", + "TalkOriginsTopic", + "TalkOriginsTopicPhraseSuggestion", + "TalkOriginsValidationReport", + "normalize_topic_entries", +] diff --git a/src/citegeist/examples/talkorigins.py b/src/citegeist/examples/talkorigins.py new file mode 100644 index 0000000..46c6803 --- /dev/null +++ b/src/citegeist/examples/talkorigins.py @@ -0,0 +1,29 @@ +from ..talkorigins import ( + TalkOriginsBatchExport, + TalkOriginsCorrectionResult, + TalkOriginsDuplicateCluster, + TalkOriginsEnrichmentResult, + TalkOriginsIngestReport, + TalkOriginsReviewExport, + TalkOriginsScraper, + TalkOriginsSeedSet, + TalkOriginsTopic, + TalkOriginsTopicPhraseSuggestion, + TalkOriginsValidationReport, + normalize_topic_entries, +) + +__all__ = [ + "TalkOriginsBatchExport", + "TalkOriginsCorrectionResult", + "TalkOriginsDuplicateCluster", + "TalkOriginsEnrichmentResult", + "TalkOriginsIngestReport", + "TalkOriginsReviewExport", + "TalkOriginsScraper", + "TalkOriginsSeedSet", + "TalkOriginsTopic", + "TalkOriginsTopicPhraseSuggestion", + "TalkOriginsValidationReport", + "normalize_topic_entries", +] diff --git a/src/citegeist/talkorigins.py b/src/citegeist/talkorigins.py index 45ce910..e98d239 100644 --- a/src/citegeist/talkorigins.py +++ b/src/citegeist/talkorigins.py @@ -1,3 +1,10 @@ +"""TalkOrigins example implementation. + +This module backs the example-facing namespace at ``citegeist.examples.talkorigins``. +New code should prefer importing from the examples namespace rather than treating +TalkOrigins support as part of the core top-level package surface. 
+""" + from __future__ import annotations from collections import Counter diff --git a/tests/test_cli.py b/tests/test_cli.py index 368571b..38b567d 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -7,6 +7,16 @@ from pathlib import Path from unittest.mock import patch from citegeist.cli import main +from citegeist.examples.talkorigins import ( + TalkOriginsBatchExport, + TalkOriginsCorrectionResult, + TalkOriginsDuplicateCluster, + TalkOriginsEnrichmentResult, + TalkOriginsIngestReport, + TalkOriginsReviewExport, + TalkOriginsTopicPhraseSuggestion, + TalkOriginsValidationReport, +) SAMPLE_BIB = """ @@ -313,7 +323,7 @@ def test_cli_scrape_talkorigins_accepts_output_dir(tmp_path): database = tmp_path / "library.sqlite3" with patch("citegeist.cli.TalkOriginsScraper.scrape_to_directory") as mocked_scrape: - mocked_scrape.return_value = __import__("citegeist").TalkOriginsBatchExport( + mocked_scrape.return_value = TalkOriginsBatchExport( base_url="https://www.talkorigins.org/origins/biblio/", output_dir=str(tmp_path), topic_count=1, @@ -326,7 +336,7 @@ def test_cli_scrape_talkorigins_accepts_output_dir(tmp_path): [ "--db", str(database), - "scrape-talkorigins", + "example-talkorigins-scrape", str(tmp_path / "talkorigins-out"), "--limit-topics", "3", @@ -346,7 +356,7 @@ def test_cli_validate_talkorigins_accepts_manifest(tmp_path): manifest = tmp_path / "talkorigins_manifest.json" manifest.write_text("{}", encoding="utf-8") with patch("citegeist.cli.TalkOriginsScraper.validate_export") as mocked_validate: - mocked_validate.return_value = __import__("citegeist").TalkOriginsValidationReport( + mocked_validate.return_value = TalkOriginsValidationReport( manifest_path=str(manifest), topic_count=1, entry_count=2, @@ -360,7 +370,7 @@ def test_cli_validate_talkorigins_accepts_manifest(tmp_path): duplicate_entry_count=0, duplicate_examples=[], ) - exit_code = main(["validate-talkorigins", str(manifest)]) + exit_code = main(["example-talkorigins-validate", str(manifest)]) 
assert exit_code == 0 @@ -373,7 +383,7 @@ def test_cli_suggest_talkorigins_phrases_writes_output(tmp_path): output = tmp_path / "phrases.json" with patch("citegeist.cli.TalkOriginsScraper.suggest_topic_phrases") as mocked_suggest: mocked_suggest.return_value = [ - __import__("citegeist", fromlist=["TalkOriginsTopicPhraseSuggestion"]).TalkOriginsTopicPhraseSuggestion( + TalkOriginsTopicPhraseSuggestion( slug="abiogenesis", topic="Abiogenesis", entry_count=2, @@ -385,7 +395,7 @@ def test_cli_suggest_talkorigins_phrases_writes_output(tmp_path): ] exit_code = main( [ - "suggest-talkorigins-phrases", + "example-talkorigins-suggest-phrases", str(manifest), "--topic", "abiogenesis", @@ -406,7 +416,7 @@ def test_cli_duplicates_talkorigins_accepts_manifest(tmp_path): manifest.write_text("{}", encoding="utf-8") with patch("citegeist.cli.TalkOriginsScraper.inspect_duplicate_clusters") as mocked_duplicates: mocked_duplicates.return_value = [ - __import__("citegeist.talkorigins", fromlist=["TalkOriginsDuplicateCluster"]).TalkOriginsDuplicateCluster( + TalkOriginsDuplicateCluster( key="smith|1999|duplicate paper", count=2, items=[ @@ -431,7 +441,7 @@ def test_cli_duplicates_talkorigins_accepts_manifest(tmp_path): ] exit_code = main( [ - "duplicates-talkorigins", + "example-talkorigins-duplicates", str(manifest), "--topic", "abiogenesis", @@ -452,7 +462,7 @@ def test_cli_ingest_talkorigins_accepts_manifest(tmp_path): manifest = tmp_path / "talkorigins_manifest.json" manifest.write_text("{}", encoding="utf-8") with patch("citegeist.cli.TalkOriginsScraper.ingest_export") as mocked_ingest: - mocked_ingest.return_value = __import__("citegeist").TalkOriginsIngestReport( + mocked_ingest.return_value = TalkOriginsIngestReport( manifest_path=str(manifest), topic_count=1, raw_entry_count=2, @@ -461,7 +471,7 @@ def test_cli_ingest_talkorigins_accepts_manifest(tmp_path): duplicate_entry_count=2, canonicalized_count=1, ) - exit_code = main(["--db", str(database), "ingest-talkorigins", 
str(manifest)]) + exit_code = main(["--db", str(database), "example-talkorigins-ingest", str(manifest)]) assert exit_code == 0 @@ -474,7 +484,7 @@ def test_cli_enrich_talkorigins_accepts_manifest(tmp_path): manifest.write_text("{}", encoding="utf-8") with patch("citegeist.cli.TalkOriginsScraper.enrich_weak_canonicals") as mocked_enrich: mocked_enrich.return_value = [ - __import__("citegeist.talkorigins", fromlist=["TalkOriginsEnrichmentResult"]).TalkOriginsEnrichmentResult( + TalkOriginsEnrichmentResult( key="smith|1999|duplicate paper", citation_key="dup1", weak_reasons_before=["missing:doi"], @@ -490,7 +500,7 @@ def test_cli_enrich_talkorigins_accepts_manifest(tmp_path): [ "--db", str(database), - "enrich-talkorigins", + "example-talkorigins-enrich", str(manifest), "--limit", "5", @@ -510,7 +520,7 @@ def test_cli_review_talkorigins_writes_output(tmp_path): manifest.write_text("{}", encoding="utf-8") output = tmp_path / "review.json" with patch("citegeist.cli.TalkOriginsScraper.build_review_export") as mocked_review: - mocked_review.return_value = __import__("citegeist.talkorigins", fromlist=["TalkOriginsReviewExport"]).TalkOriginsReviewExport( + mocked_review.return_value = TalkOriginsReviewExport( manifest_path=str(manifest), item_count=1, items=[{"key": "smith|1999|duplicate paper", "canonical": {}, "enrichment": {}}], @@ -519,7 +529,7 @@ def test_cli_review_talkorigins_writes_output(tmp_path): [ "--db", str(database), - "review-talkorigins", + "example-talkorigins-review", str(manifest), "--output", str(output), @@ -540,7 +550,7 @@ def test_cli_apply_talkorigins_corrections_accepts_files(tmp_path): corrections.write_text('{"corrections": []}', encoding="utf-8") with patch("citegeist.cli.TalkOriginsScraper.apply_review_corrections") as mocked_apply: mocked_apply.return_value = [ - __import__("citegeist.talkorigins", fromlist=["TalkOriginsCorrectionResult"]).TalkOriginsCorrectionResult( + TalkOriginsCorrectionResult( key="smith|1999|duplicate paper", 
citation_key="dup1", applied=True, @@ -551,7 +561,7 @@ def test_cli_apply_talkorigins_corrections_accepts_files(tmp_path): [ "--db", str(database), - "apply-talkorigins-corrections", + "example-talkorigins-apply-corrections", str(manifest), str(corrections), ] diff --git a/tests/test_talkorigins.py b/tests/test_talkorigins.py index 9ca9943..e9a8a1d 100644 --- a/tests/test_talkorigins.py +++ b/tests/test_talkorigins.py @@ -5,8 +5,8 @@ from pathlib import Path from citegeist.batch import load_batch_jobs from citegeist.bibtex import BibEntry +from citegeist.examples.talkorigins import TalkOriginsScraper, normalize_topic_entries from citegeist.storage import BibliographyStore -from citegeist.talkorigins import TalkOriginsScraper, normalize_topic_entries INDEX_HTML = """