From 2459830b70b56396441f2cbdfc67479d10015652 Mon Sep 17 00:00:00 2001 From: welsberr Date: Sun, 29 Mar 2026 09:37:55 -0400 Subject: [PATCH] Added lightweight demo web app. --- README.md | 83 ++ ROADMAP.md | 57 +- examples/literature-explorer/README.md | 122 ++ examples/literature-explorer/index.html | 1082 +++++++++++++++++ .../literature-explorer.js | 54 + pyproject.toml | 1 + src/citegeist/__init__.py | 18 +- src/citegeist/app_api.py | 270 ++++ src/citegeist/app_server.py | 194 +++ src/citegeist/bootstrap.py | 56 +- src/citegeist/cli.py | 95 +- src/citegeist/expand.py | 66 +- src/citegeist/extract.py | 778 +++++++++++- src/citegeist/resolve.py | 45 +- src/citegeist/talkorigins.py | 55 +- tests/fixtures/extract_backend_fixture.txt | 5 + tests/test_app_api.py | 179 +++ tests/test_app_server.py | 45 + tests/test_bootstrap.py | 89 +- tests/test_extract.py | 459 ++++++- tests/test_openalex_expand.py | 31 + tests/test_resolve.py | 39 + 22 files changed, 3663 insertions(+), 160 deletions(-) create mode 100644 examples/literature-explorer/README.md create mode 100644 examples/literature-explorer/index.html create mode 100644 examples/literature-explorer/literature-explorer.js create mode 100644 src/citegeist/app_api.py create mode 100644 src/citegeist/app_server.py create mode 100644 tests/fixtures/extract_backend_fixture.txt create mode 100644 tests/test_app_api.py create mode 100644 tests/test_app_server.py diff --git a/README.md b/README.md index 14e53d5..28ee89f 100644 --- a/README.md +++ b/README.md @@ -48,6 +48,8 @@ The initial repo includes: - a small CLI for ingest, search, inspection, and export; - review-state tracking on entries, per-field ingest provenance, and field-level conflict review; - plaintext reference extraction into draft BibTeX for numbered, APA-like, wrapped-line, and simple book-style references; +- staged plaintext reference extraction that now preserves more structured metadata from legacy references, including year suffixes, identifiers, volume/issue/pages, and thesis/report/web-style venue hints; +- a reference-extraction backend seam with the local `heuristic` parser as the default implementation, so optional external backends can be added later without changing the core extract workflow; - standalone verification and disambiguation of free-text references or partial BibTeX into auditable BibTeX/JSON results with `x_status`, `x_confidence`, `x_source`, `x_query`, and alternate-candidate traces; - identifier-first metadata resolution for DOI, OpenAlex, DBLP, arXiv, and DataCite-backed entries, with OpenAlex/DataCite title-search fallback; - local citation-graph traversal over stored `cites`, `cited_by`, and `crossref` edges; @@ -65,10 +67,42 @@ Example applications live alongside the core package rather than defining it. Cu - a comprehensive CLI cookbook in [examples/cli/README.md](./examples/cli/README.md); - a topic-only bootstrap workflow for `artificial life` in [examples/artificial-life/README.md](./examples/artificial-life/README.md); +- a browser-oriented literature explorer demo with a small HTTP bridge, static HTML/JS shell, and lightweight graph view in [examples/literature-explorer/README.md](./examples/literature-explorer/README.md); - the TalkOrigins bibliography pipeline under [`citegeist.examples.talkorigins`](./src/citegeist/examples/talkorigins.py) with a usage guide in [examples/talkorigins/README.md](./examples/talkorigins/README.md). The prioritized execution plan lives in [ROADMAP.md](./ROADMAP.md). 
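+As a quick orientation, the extraction path is also callable directly from Python. A minimal sketch, assuming the packaged defaults (the sample reference line is invented for illustration):
+
+```python
+from citegeist import extract_references
+from citegeist.bibtex import render_bibtex
+
+# One numbered, journal-style reference; extract_references defaults to the
+# local heuristic backend.
+text = "[1] Smith, J. (1998). An example reference. Journal of Examples, 12(3): 45-67."
+entries = extract_references(text)
+for entry in entries:
+    print(entry.entry_type, entry.citation_key, sorted(entry.fields))
+print(render_bibtex(entries))
+```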
+
+## Status Assessment
+
+`citegeist` is no longer just a storage-and-export skeleton. It now covers the main early pipeline the project set out to make usable on one local machine:
+
+1. ingest or extract rough references,
+2. verify and improve them before they are trusted,
+3. normalize and store them with provenance,
+4. expand outward through citation links,
+5. review and export BibTeX again.
+
+In practical terms, the strongest implemented areas are now:
+
+- BibTeX-native local storage, review state, provenance tracking, and export;
+- rough-reference handling through both heuristic extraction and standalone verification/disambiguation;
+- conservative metadata enrichment and citation-graph expansion from scholarly APIs;
+- fixture-backed parser and source-client workflows that keep improvement work auditable;
+- a lightweight local demo surface for topic discovery, topic expansion, extraction, verification, and graph inspection in the browser without introducing a full web framework dependency.
+
+The main gaps are no longer in the basic pipeline; they are in evaluation depth and researcher ergonomics: broader regression fixtures, clearer comparative quality measurement across parsers and resolvers, stronger review workflows for larger corpora, and a richer review UI than the current demonstration shell.
+
+## Positioning
+
+Compared with other bibliographic tooling, `citegeist` is strongest when bibliography work starts messy and needs to become structured:
+
+- Against reference managers such as Zotero or JabRef, `citegeist` is currently weaker as a polished day-to-day library manager or sync-oriented desktop app, but stronger as a BibTeX-first command-line workbench for extraction, repair, provenance, and graph-oriented discovery.
+- Against parser-focused tools such as AnyStyle, GROBID, or ParsCit-style systems, `citegeist` is not trying to outcompete them as a dedicated citation parser. Instead, it borrows their staged-parsing ideas and can optionally call external parsers while keeping a local default parser and a normalized BibTeX-oriented downstream workflow.
+- Against verifier/disambiguator workflows like `VeriBib`, `citegeist` now covers the same high-value pre-ingest verification pattern, but places it inside a broader local pipeline that also stores results, resolves identifiers, expands citation graphs, and exports reviewed BibTeX.
+- Against process-heavy corpus updaters like `TOA-Bib-Updater`, `citegeist` adopts the useful operational pattern of staged artifacts and reviewable outputs, but keeps the core centered on reusable library/database primitives rather than one source-specific acquisition script.
+
+The clearest differentiator at this point is integration. `citegeist` is becoming a local bibliography workbench that combines extraction, verification, enrichment, graph expansion, topic-aware review, and BibTeX export in one toolchain rather than treating those as unrelated utilities.
+ ## Layout ```text @@ -133,6 +167,10 @@ PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 apply-conflict PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 bootstrap --seed-bib seed.bib --topic "bayesian nonparametrics" PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 bootstrap --topic "bayesian nonparametrics" --preview --topic-commit-limit 5 PYTHONPATH=src .venv/bin/python -m citegeist extract references.txt --output draft.bib +PYTHONPATH=src .venv/bin/python -m citegeist extract references.txt --backend heuristic --output draft.bib +PYTHONPATH=src .venv/bin/python -m citegeist compare-extract references.txt --backend heuristic --backend anystyle --output compare.json +PYTHONPATH=src .venv/bin/python -m citegeist compare-extract references.txt --backend heuristic --backend grobid --summary --output compare-summary.json +PYTHONPATH=src .venv/bin/python -m citegeist compare-extract references.txt --backend heuristic --backend grobid --summary --max-rows-with-differences 0 --output compare-check.json PYTHONPATH=src .venv/bin/python -m citegeist verify --string '"Graph-first bibliography augmentation" Smith 2024' --context "citation graphs" --format json PYTHONPATH=src .venv/bin/python -m citegeist verify --bib draft.bib --output verified.bib PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 resolve smith2024graphs @@ -177,6 +215,51 @@ For live-source development, prefer fixture-backed or cache-backed source client - From `VeriBib`: a standalone `verify` workflow for ambiguous strings or rough BibTeX, with explicit confidence/status audit fields and alternate-candidate traces before you commit changes to the main library. - From `TOA-Bib-Updater`: resumable, artifact-oriented corpus processing remains the preferred process model for large imports. In practice this already appears in the TalkOrigins example pipeline through saved manifests, review exports, duplicate reports, and staged topic-phrase review flows. +Those ideas are now implemented enough to matter operationally, not just directionally: `VeriBib`'s main contribution has become a core `verify` command, while `TOA-Bib-Updater`'s main contribution remains process shape and review artifacts rather than parser or storage internals. + +## Parsing Sources + +The plaintext-reference parser is still local and heuristic-first, but its current direction explicitly borrows ideas from earlier citation-parsing work: + +- Conceptual influence: GROBID's staged parsing model and bibliographical-reference annotation guidance informed the split between reference-block segmentation, field extraction, and metadata recovery, especially for identifiers, year variants, and venue/page structure. +- Conceptual influence: AnyStyle and ParsCit informed the emphasis on treating reference parsing as a separable stage with gold-fixture-driven improvement rather than a one-pass punctuation split. +- In-repo code prior art: some of the newer heuristics were adapted from existing CiteGeist code in [`src/citegeist/talkorigins.py`](./src/citegeist/talkorigins.py) and [`src/citegeist/expand.py`](./src/citegeist/expand.py), particularly around entry-type guessing, fragment cleanup, and thesis/report handling for citation-like blobs. 
+
+External references used for the current parser direction:
+
+- GROBID principles and parsing architecture:
+- GROBID bibliographical-reference annotation notes:
+- AnyStyle project:
+- ParsCit paper:
+
+The built-in extraction backends are:
+
+- `heuristic`: the default local parser, always available
+- `anystyle`: an optional adapter around the AnyStyle CLI when `anystyle` is installed locally
+- `grobid`: an optional adapter around a running GROBID service using `/api/processCitationList`
+
+The backend interface exists so additional parser adapters can be registered later without replacing the local parser or changing the CLI contract.
+
+To compare backend output on the same plaintext references, use `compare-extract`. It aligns entries by ordinal/reference block and emits JSON with per-backend payloads plus a `differing_fields` summary for each row. Add `--summary` when you want a compact evaluation artifact with disagreement counts by field and backend presence counts instead of the full row-by-row payload. Add `--max-rows-with-differences` and/or `--max-field-difference-count` when you want CI-style failure thresholds; the command will emit the summary JSON and return a nonzero exit code if the limits are exceeded.
+
+For regression-oriented parser work, keep a small curated plaintext fixture set and run `compare-extract` against multiple backends before changing heuristics. That makes backend disagreement explicit and gives you a stable review artifact for parser changes.
+
+For the optional AnyStyle backend, install the CLI separately and then run:
+
+```bash
+PYTHONPATH=src .venv/bin/python -m citegeist extract references.txt --backend anystyle --output draft.bib
+```
+
+If the binary is not on `PATH`, set `CITEGEIST_ANYSTYLE_BIN=/path/to/anystyle`. If you want a custom AnyStyle parser model, set `CITEGEIST_ANYSTYLE_PARSER_MODEL=/path/to/model.mod`.
+
+For the optional GROBID backend, start a GROBID service and then run:
+
+```bash
+PYTHONPATH=src .venv/bin/python -m citegeist extract references.txt --backend grobid --output draft.bib
+```
+
+By default CiteGeist targets `http://127.0.0.1:8070`. Override that with `CITEGEIST_GROBID_URL=http://host:port` if your service is elsewhere.
+

 ## Example Application

 - Use `stage-topic-phrases` to load those suggestions into the database as review items. Staging stores the candidate in `suggested_phrase` and marks the topic `pending` without changing the active `expansion_phrase`.
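+The same comparison workflow is callable from Python for scripted regression gates. A minimal sketch using the exported helpers (the input file name and backend pair are illustrative, and the `grobid` backend assumes a running GROBID service as described above):
+
+```python
+from pathlib import Path
+
+from citegeist import (
+    check_extraction_comparison_summary,
+    compare_extraction_backends,
+    summarize_extraction_comparison,
+)
+
+text = Path("references.txt").read_text(encoding="utf-8")
+
+# Align the same reference blocks across two backends and inspect disagreement.
+rows = compare_extraction_backends(text, backends=["heuristic", "grobid"])
+summary = summarize_extraction_comparison(rows)
+print(summary.rows_with_differences, summary.differing_field_counts)
+
+# CI-style gate: any row-level disagreement fails the check.
+check = check_extraction_comparison_summary(summary, max_rows_with_differences=0)
+if not check.passed:
+    raise SystemExit("\n".join(check.failures))
+```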
diff --git a/ROADMAP.md b/ROADMAP.md index 9f6dd82..8f54dbc 100644 --- a/ROADMAP.md +++ b/ROADMAP.md @@ -25,9 +25,19 @@ Completed: - lightweight BibTeX parsing; - SQLite storage for entries, creators, identifiers, and relations; - local text search using SQLite FTS5 when available; +- CLI workflows for ingest, inspect, search, export, conflict review, bootstrap, graph traversal, expansion, OAI discovery/harvest, extraction, verification, and extraction-backend comparison; +- entry review-state tracking plus field-level provenance and conflict handling; +- plaintext reference extraction with a staged heuristic parser that preserves identifiers, year suffixes, volume/issue/pages, and thesis/report/web-style hints; +- optional extraction backends for AnyStyle and GROBID behind explicit backend selection, with shared normalization back into CiteGeist draft-entry conventions; +- backend comparison, summary, and threshold-check workflows for parser regression/evaluation; - standalone verification/disambiguation output for free-text references and partial BibTeX with auditable match metadata; +- identifier-first metadata resolution plus title-search fallback across DOI, OpenAlex, DBLP, arXiv, and DataCite-backed flows; +- citation-graph expansion and topic-oriented bootstrap/expansion workflows; +- OAI-PMH repository discovery and harvesting for external corpus acquisition; - tests for ingest, relation storage, and search. +In effect, Phases 1 and 2 are largely in place, and substantial parts of Phases 3, 4, and 6 already exist in usable form. The roadmap is now less about creating the first end-to-end path and more about improving quality, evaluation, and larger-corpus review discipline. + ## Comparison Notes From Related Repos The adjacent `TOA-Bib-Updater` and `VeriBib` repositories are useful prior art, but they contribute different things: @@ -41,6 +51,15 @@ The adjacent `TOA-Bib-Updater` and `VeriBib` repositories are useful prior art, 2. keep resumable manifests and review exports for large acquisition workflows, especially example pipelines and batch imports; 3. avoid coupling the core model to brittle source-specific scraping logic. +## Source Notes + +Reference-extraction planning in this repository currently draws on both external and internal prior art: + +- External conceptual sources: GROBID, AnyStyle, and ParsCit are the main references for staged citation parsing, token/field separation, and gold-fixture-driven improvement. +- Internal code sources: the plaintext extractor should continue to reuse and consolidate heuristics already present in `citegeist.talkorigins` and `citegeist.expand` where those routines solve overlapping problems such as thesis/report classification, fragment cleanup, or citation-blob handling. + +This project should acknowledge those influences in code comments and docs when parser behavior is intentionally adapted from them. + ## Phase 1: Core Ingestion And Export Priority: P0 @@ -67,6 +86,9 @@ Exit criteria: - a user can ingest a `.bib` file, inspect entries, search locally, and export a reviewed `.bib`; - round-trip tests show no unexpected field loss for supported entry types. +Status: +Largely complete. Remaining work here is mostly refinement: export fidelity on edge cases, review ergonomics, and better audit/report surfaces rather than missing core capability. 
+ ## Phase 2: Reference Extraction Priority: P0 @@ -80,6 +102,10 @@ Tasks: - define a draft-entry schema for incomplete references with confidence markers; - support ingestion of OCR- or PDF-derived plaintext bibliography sections; - add normalization for author names, years, title casing, and page ranges; +- keep the parser staged internally so segmentation, field parsing, and later optional external backends remain separable; +- keep the local heuristic parser as the default path even if optional external backends are added later; +- support optional external parser adapters only behind explicit backend selection, so local workflows still work without Ruby/Java services; +- when adding external backends, normalize their outputs back into the same draft-entry conventions used by the local parser; - prefer sentence-boundary venue detection over naive keyword splits so title text containing words like `report` is not truncated; - repair partially extracted venue stubs such as `Occas.` or `Proc.` by reparsing the full raw reference line when the structured fields are obviously incomplete; @@ -96,6 +122,9 @@ Exit criteria: - a user can pass a plaintext bibliography section and receive draft BibTeX entries with unresolved fields clearly marked; - tests cover common article, book, chapter, proceedings, report, and abbreviation-heavy legacy references. +Status: +Substantially complete for the current heuristic-first strategy. The remaining work is quality-focused: larger curated fixtures, sharper benchmark discipline, and continued parser refinement rather than creating the extraction path from scratch. + ## Phase 3: Metadata Enrichment Priority: P1 @@ -121,6 +150,9 @@ Exit criteria: - an incomplete entry can be enriched from at least one authoritative source; - conflicting fields remain visible for review instead of being lost. +Status: +Partially complete. Resolver and merge behavior are already useful, especially for identifier-first flows, but provenance-rich resolution logs, comparative resolver evaluation, and more deliberate review tooling still need attention. + ## Phase 4: Citation Graph Expansion Priority: P1 @@ -145,6 +177,9 @@ Exit criteria: - starting from one or more seed entries, a user can expand outward through citation edges and persist newly discovered nodes; - graph traversal results can be exported as BibTeX candidates for review. +Status: +Partially complete and already usable. The main next-step work is better scoring, filtering, and review surfaces for large discovery sets rather than basic graph traversal. + ## Phase 5: Search And Ranking Priority: P2 @@ -170,6 +205,17 @@ Exit criteria: - local search is useful on realistic corpora without requiring external services; - semantic indexing is optional and does not displace the simpler local search path. +Status: +Early but serviceable. SQLite FTS covers the basic local-search path, but retrieval benchmarking, saved search workflows, and optional semantic ranking remain future work. + +Note: +The repository now has a small app-facing JSON adapter surface, a lightweight local HTTP bridge, and a static literature-explorer demo shell. That is enough for a browser or desktop-web shell to drive topic discovery, topic expansion, extraction, verification, entry inspection, and lightweight graph exploration against one local database. It is still a demo boundary rather than a full multi-user application or long-running service architecture. 
+ +Near-term follow-up for this demo surface: +- add stronger candidate-review interactions for bootstrap and expansion results; +- improve graph review beyond the current lightweight SVG overview; +- keep payload contracts stable enough that the demo can double as an evaluation harness for parser and discovery changes. + ## Phase 6: Corpus Acquisition Pipelines Priority: P2 @@ -194,10 +240,13 @@ Exit criteria: - new public corpora can be imported through adapters without changing the storage core; - imported entries retain their source provenance and can be reviewed like any other entry. +Status: +Partially complete. OAI acquisition and the TalkOrigins example already demonstrate the pattern, but the general adapter surface and review/report discipline across more sources still need expansion. + ## Suggested Next Three Tasks -1. Add a CLI module with `ingest`, `search`, `show`, and `export`. -2. Implement BibTeX export from the normalized store. -3. Add provenance tables and entry review status fields. +1. Expand evaluation fixtures and benchmarking for extraction and verification so backend disagreement, parser regressions, and resolver quality can be measured on a broader real-world corpus. +2. Strengthen review and audit workflows for enrichment/graph expansion, especially around provenance logs, candidate summaries, and larger batch-review artifacts. +3. Improve discovery quality inside topic and corpus workflows through better ranking/filtering, more deliberate topic assignment criteria, and retrieval benchmarks that compare lexical and future semantic approaches. -These three tasks complete the first usable local workflow and should be treated as the immediate sprint. +These three tasks should be treated as the immediate sprint because the basic workflow now exists; the bottleneck has shifted to quality measurement, reviewability, and discovery precision. diff --git a/examples/literature-explorer/README.md b/examples/literature-explorer/README.md new file mode 100644 index 0000000..039d7de --- /dev/null +++ b/examples/literature-explorer/README.md @@ -0,0 +1,122 @@ +# Literature Explorer Demo + +This example describes a thin UI-facing API layer for a browser-based `literature explorer` demo. + +The intended split is: + +- Python owns SQLite, BibTeX parsing, resolver calls, topic bootstrap, topic expansion, and provenance-aware storage. +- A small app-facing adapter returns stable JSON payloads. +- A browser-side JavaScript object calls that adapter and renders HTML5/CSS views. 
+ +The Python-side adapter now exists as: + +```python +from citegeist import BibliographyStore, LiteratureExplorerApi + +store = BibliographyStore("library.sqlite3") +api = LiteratureExplorerApi(store) +``` + +The lightest runnable bridge is now the bundled standard-library HTTP server: + +```bash +PYTHONPATH=src .venv/bin/python -m citegeist.app_server --db library.sqlite3 --host 127.0.0.1 --port 8765 +``` + +or, after installation: + +```bash +citegeist-explorer-server --db library.sqlite3 --host 127.0.0.1 --port 8765 +``` + +## UI-Facing Operations + +The adapter exposes JSON-serializable methods suitable for a local web bridge: + +- `capabilities()` +- `search(query, limit=20, topic_slug=None)` +- `show_entry(citation_key, include_provenance=False, include_conflicts=False, include_bibtex=False)` +- `list_topics(limit=100, phrase_review_status=None)` +- `get_topic(topic_slug, entry_limit=100)` +- `bootstrap(...)` +- `expand_topic(...)` +- `extract_text(text, backend="heuristic")` +- `verify_strings(values, context="", limit=5)` +- `verify_bibtex(bibtex_text, context="", limit=5)` +- `graph(seed_keys, relation_types=None, depth=1, review_status=None, missing_only=False)` + +## Browser Contract + +A demo app can expose a browser object like: + +```js +window.citegeist = { + search(query, opts) {}, + showEntry(citationKey, opts) {}, + listTopics(opts) {}, + getTopic(topicSlug, opts) {}, + bootstrap(opts) {}, + expandTopic(topicSlug, opts) {}, + extractText(text, opts) {}, + verifyStrings(values, opts) {}, + verifyBibtex(bibtexText, opts) {}, + graph(seedKeys, opts) {} +} +``` + +The bundled JS helper now supports a minimal fetch bridge: + +```js +import { + createHttpBridge, + createLiteratureExplorerClient, +} from "./literature-explorer.js"; + +const bridge = createHttpBridge("http://127.0.0.1:8765"); +const citegeist = createLiteratureExplorerClient(bridge); +``` + +That browser object can be backed by: + +- a local HTTP bridge; +- an Electron/Tauri preload bridge; +- Pyodide-style in-process Python, if the dependency/runtime tradeoffs are acceptable. + +The bundled lightweight choice is the local HTTP bridge because it adds no new runtime dependency beyond the Python standard library. + +## Minimal Run Path + +1. Start the local explorer bridge: + +```bash +PYTHONPATH=src .venv/bin/python -m citegeist.app_server --db library.sqlite3 --host 127.0.0.1 --port 8765 +``` + +2. Serve the example directory statically from another terminal: + +```bash +cd examples/literature-explorer +python3 -m http.server 8000 +``` + +3. Open: + +```text +http://127.0.0.1:8000/index.html +``` + +The demo shell in `index.html` is intentionally narrow. It is meant to prove that the app-facing API is sufficient to drive topic bootstrap, topic expansion, search, extraction, verification, and entry inspection from a browser without introducing a frontend framework. + +## Demo Scope + +This is sufficient to drive a demonstration app that can: + +- start from a topic phrase and preview bootstrap candidates; +- commit a bounded topic corpus into the local store; +- inspect topic members, confidence scores, and entry metadata; +- preview or apply topic expansion; +- inspect one entry with BibTeX/provenance details; +- run rough-reference extraction and verification; +- render local citation neighborhoods from `graph()` JSON payloads. + +For a first demo, the strongest path is topic exploration rather than generic reference-manager behavior. 
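+Because the wire contract is plain JSON over `POST /call`, any HTTP client can drive the bridge, not only the bundled JS helper. A minimal Python sketch (the search query is illustrative; note that error responses use non-2xx status codes, which `urlopen` surfaces as `HTTPError`):
+
+```python
+import json
+import urllib.request
+
+BASE_URL = "http://127.0.0.1:8765"
+
+
+def call(method: str, params: dict) -> object:
+    """POST a {"method", "params"} envelope to the bridge's /call endpoint."""
+    body = json.dumps({"method": method, "params": params}).encode("utf-8")
+    request = urllib.request.Request(
+        f"{BASE_URL}/call",
+        data=body,
+        headers={"Content-Type": "application/json"},
+        method="POST",
+    )
+    with urllib.request.urlopen(request) as response:
+        payload = json.loads(response.read().decode("utf-8"))
+    if not payload.get("ok"):
+        raise RuntimeError(payload.get("error", "request failed"))
+    return payload["result"]
+
+
+print(call("capabilities", {}))
+print(call("search", {"query": "artificial life", "limit": 5}))
+```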
diff --git a/examples/literature-explorer/index.html b/examples/literature-explorer/index.html
new file mode 100644
index 0000000..e0e8433
--- /dev/null
+++ b/examples/literature-explorer/index.html
@@ -0,0 +1,1082 @@
+[index.html markup was stripped to bare text in this patch view; only the panel
+ copy survives. The static shell, titled "CiteGeist Literature Explorer",
+ defines a Session panel with Topics / Topic Entries / Last Operation counters,
+ plus the following panels, each showing a pre-connection placeholder message:
+ Topics ("Connect to the server to load topics."), Entry Detail ("Select a
+ topic entry or search result to inspect one record."), Topic View ("No topic
+ loaded yet."), Activity ("Waiting for requests…"), Search Results ("Run a
+ search to inspect matching entries."), Extract / Verify Output ("Extraction
+ and verification results will appear here."), and Graph View ("Load a topic
+ to view a small local network around its first few entries.").]
+ + + + diff --git a/examples/literature-explorer/literature-explorer.js b/examples/literature-explorer/literature-explorer.js new file mode 100644 index 0000000..d6212ff --- /dev/null +++ b/examples/literature-explorer/literature-explorer.js @@ -0,0 +1,54 @@ +export function createLiteratureExplorerClient(bridge) { + return { + capabilities() { + return bridge.call("capabilities", {}); + }, + search(query, options = {}) { + return bridge.call("search", { query, ...options }); + }, + showEntry(citationKey, options = {}) { + return bridge.call("show_entry", { citation_key: citationKey, ...options }); + }, + listTopics(options = {}) { + return bridge.call("list_topics", options); + }, + getTopic(topicSlug, options = {}) { + return bridge.call("get_topic", { topic_slug: topicSlug, ...options }); + }, + bootstrap(options = {}) { + return bridge.call("bootstrap", options); + }, + expandTopic(topicSlug, options = {}) { + return bridge.call("expand_topic", { topic_slug: topicSlug, ...options }); + }, + extractText(text, options = {}) { + return bridge.call("extract_text", { text, ...options }); + }, + verifyStrings(values, options = {}) { + return bridge.call("verify_strings", { values, ...options }); + }, + verifyBibtex(bibtexText, options = {}) { + return bridge.call("verify_bibtex", { bibtex_text: bibtexText, ...options }); + }, + graph(seedKeys, options = {}) { + return bridge.call("graph", { seed_keys: seedKeys, ...options }); + }, + }; +} + +export function createHttpBridge(baseUrl = "http://127.0.0.1:8765") { + return { + async call(method, params = {}) { + const response = await fetch(`${baseUrl}/call`, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ method, params }), + }); + const payload = await response.json(); + if (!response.ok || payload.ok === false) { + throw new Error(payload.error || `Request failed: ${response.status}`); + } + return payload.result; + }, + }; +} diff --git a/pyproject.toml b/pyproject.toml index 880b039..07bf6ee 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,6 +11,7 @@ dependencies = ["pybtex==0.25.1"] [project.scripts] citegeist = "citegeist.cli:main" +citegeist-explorer-server = "citegeist.app_server:main" [tool.pytest.ini_options] pythonpath = ["src"] diff --git a/src/citegeist/__init__.py b/src/citegeist/__init__.py index 612ae62..a906e56 100644 --- a/src/citegeist/__init__.py +++ b/src/citegeist/__init__.py @@ -1,8 +1,17 @@ +from .app_api import LiteratureExplorerApi from .batch import BatchBootstrapRunner, BatchJobResult, load_batch_jobs from .bibtex import BibEntry, parse_bibtex from .bootstrap import BootstrapResult, Bootstrapper from .expand import CrossrefExpander, OpenAlexExpander -from .extract import extract_references +from .extract import ( + available_extraction_backends, + check_extraction_comparison_summary, + compare_extraction_backends, + extract_references, + get_extraction_backend, + register_extraction_backend, + summarize_extraction_comparison, +) from .harvest import OaiMetadataFormat, OaiPmhHarvester, OaiSet from .resolve import MetadataResolver, merge_entries, merge_entries_with_conflicts from .sources import SourceClient @@ -18,6 +27,7 @@ __all__ = [ "BootstrapResult", "Bootstrapper", "CrossrefExpander", + "LiteratureExplorerApi", "MetadataResolver", "OpenAlexExpander", "OaiPmhHarvester", @@ -26,9 +36,15 @@ __all__ = [ "SourceClient", "VerificationMatch", "VerificationResult", + "available_extraction_backends", + "check_extraction_comparison_summary", + 
"compare_extraction_backends", "extract_references", + "get_extraction_backend", "load_batch_jobs", "merge_entries", "merge_entries_with_conflicts", "parse_bibtex", + "register_extraction_backend", + "summarize_extraction_comparison", ] diff --git a/src/citegeist/app_api.py b/src/citegeist/app_api.py new file mode 100644 index 0000000..7cf660d --- /dev/null +++ b/src/citegeist/app_api.py @@ -0,0 +1,270 @@ +from __future__ import annotations + +from dataclasses import asdict + +from .bibtex import BibEntry, parse_bibtex, render_bibtex +from .bootstrap import Bootstrapper +from .expand import TopicExpander +from .extract import extract_references +from .storage import BibliographyStore +from .verify import BibliographyVerifier + + +class LiteratureExplorerApi: + """JSON-serializable adapter layer for browser or local UI bridges.""" + + def __init__( + self, + store: BibliographyStore, + *, + bootstrapper: Bootstrapper | None = None, + topic_expander: TopicExpander | None = None, + verifier: BibliographyVerifier | None = None, + ) -> None: + self.store = store + self.bootstrapper = bootstrapper or Bootstrapper() + self.topic_expander = topic_expander or TopicExpander() + self.verifier = verifier or BibliographyVerifier() + + def capabilities(self) -> dict[str, object]: + return { + "operations": [ + "search", + "show_entry", + "list_topics", + "get_topic", + "bootstrap", + "expand_topic", + "extract_text", + "verify_strings", + "graph", + ], + "preview_operations": ["bootstrap", "expand_topic"], + } + + def search(self, query: str, *, limit: int = 20, topic_slug: str | None = None) -> dict[str, object]: + return { + "query": query, + "topic_slug": topic_slug, + "results": self.store.search_text(query, limit=limit, topic_slug=topic_slug), + } + + def show_entry( + self, + citation_key: str, + *, + include_provenance: bool = False, + include_conflicts: bool = False, + include_bibtex: bool = False, + ) -> dict[str, object] | None: + entry = self.store.get_entry(citation_key) + if entry is None: + return None + payload = dict(entry) + if include_provenance: + payload["provenance"] = self.store.get_field_provenance(citation_key) + if include_conflicts: + payload["conflicts"] = self.store.get_conflicts(citation_key) + if include_bibtex: + payload["bibtex"] = self.store.get_entry_bibtex(citation_key) + return payload + + def list_topics(self, *, limit: int = 100, phrase_review_status: str | None = None) -> dict[str, object]: + return {"topics": self.store.list_topics(limit=limit, phrase_review_status=phrase_review_status)} + + def get_topic(self, topic_slug: str, *, entry_limit: int = 100) -> dict[str, object] | None: + topic = self.store.get_topic(topic_slug) + if topic is None: + return None + return { + "topic": topic, + "entries": self.store.list_topic_entries(topic_slug, limit=entry_limit), + } + + def bootstrap( + self, + *, + seed_bibtex: str | None = None, + topic: str | None = None, + topic_slug: str | None = None, + topic_name: str | None = None, + topic_phrase: str | None = None, + topic_limit: int = 5, + topic_commit_limit: int | None = None, + expand: bool = True, + preview_only: bool = False, + review_status: str = "draft", + ) -> dict[str, object]: + results = self.bootstrapper.bootstrap( + self.store, + seed_bibtex=seed_bibtex, + topic=topic, + topic_limit=topic_limit, + topic_commit_limit=topic_commit_limit, + expand=expand, + review_status=review_status, + preview_only=preview_only, + topic_slug=topic_slug, + topic_name=topic_name, + topic_phrase=topic_phrase, + ) + effective_slug 
= topic_slug + if effective_slug is None and topic: + effective_slug = _slugify(topic) + payload: dict[str, object] = { + "preview": preview_only, + "results": [asdict(result) for result in results], + } + if effective_slug is not None: + payload["topic"] = self.store.get_topic(effective_slug) + payload["entries"] = self.store.list_topic_entries(effective_slug, limit=200) + return payload + + def expand_topic( + self, + topic_slug: str, + *, + topic_phrase: str | None = None, + source: str = "openalex", + relation_type: str = "cites", + seed_limit: int = 25, + per_seed_limit: int = 25, + min_relevance: float = 0.2, + seed_keys: list[str] | None = None, + preview_only: bool = False, + ) -> dict[str, object] | None: + topic = self.store.get_topic(topic_slug) + if topic is None: + return None + results = self.topic_expander.expand_topic( + self.store, + topic_slug, + topic_phrase=topic_phrase, + source=source, + relation_type=relation_type, + seed_limit=seed_limit, + per_seed_limit=per_seed_limit, + min_relevance=min_relevance, + seed_keys=seed_keys, + preview_only=preview_only, + ) + return { + "topic": self.store.get_topic(topic_slug), + "preview": preview_only, + "results": [asdict(result) for result in results], + "entries": self.store.list_topic_entries(topic_slug, limit=200), + } + + def extract_text(self, text: str, *, backend: str = "heuristic") -> dict[str, object]: + entries = extract_references(text, backend=backend) + return { + "backend": backend, + "entries": [_entry_payload(entry) for entry in entries], + "bibtex": render_bibtex(entries), + } + + def verify_strings(self, values: list[str], *, context: str = "", limit: int = 5) -> dict[str, object]: + results = self.verifier.verify_strings(values, context=context, limit=limit) + return { + "context": context, + "results": [_verification_payload(result) for result in results], + } + + def verify_bibtex(self, bibtex_text: str, *, context: str = "", limit: int = 5) -> dict[str, object]: + entries = parse_bibtex(bibtex_text) + results = [self.verifier.verify_bib_entry(entry, context=context, limit=limit) for entry in entries] + return { + "context": context, + "results": [_verification_payload(result) for result in results], + } + + def graph( + self, + seed_keys: list[str], + *, + relation_types: list[str] | None = None, + depth: int = 1, + review_status: str | None = None, + missing_only: bool = False, + ) -> dict[str, object]: + rows = self.store.traverse_graph( + seed_keys, + relation_types=relation_types or ["cites"], + max_depth=depth, + review_status=review_status, + include_missing=True, + ) + if missing_only: + rows = [row for row in rows if not row["target_exists"]] + return _graph_payload(self.store, seed_keys, rows) + + +def _entry_payload(entry: BibEntry) -> dict[str, object]: + return { + "citation_key": entry.citation_key, + "entry_type": entry.entry_type, + "fields": dict(entry.fields), + } + + +def _verification_payload(result: object) -> dict[str, object]: + payload = asdict(result) + payload["entry"] = _entry_payload(result.entry) # type: ignore[attr-defined] + payload["alternates"] = [ + { + **asdict(match), + "entry": _entry_payload(match.entry), + } + for match in result.alternates # type: ignore[attr-defined] + ] + return payload + + +def _graph_payload(store: BibliographyStore, seed_keys: list[str], rows: list[dict[str, object]]) -> dict[str, object]: + nodes: dict[str, dict[str, object]] = {} + + def ensure_node(citation_key: str, *, fallback_title: str | None = None, target_exists: bool = True) -> None: + 
if citation_key in nodes: + return + entry = store.get_entry(citation_key) + nodes[citation_key] = { + "id": citation_key, + "label": citation_key, + "title": (entry or {}).get("title") or fallback_title, + "review_status": (entry or {}).get("review_status"), + "target_exists": entry is not None if entry is not None else target_exists, + "is_seed": citation_key in seed_keys, + } + + for seed_key in seed_keys: + ensure_node(seed_key) + + edges = [] + for index, row in enumerate(rows, start=1): + source_key = str(row["source_citation_key"]) + target_key = str(row["target_citation_key"]) + ensure_node(source_key) + ensure_node( + target_key, + fallback_title=str(row.get("target_title") or "") or None, + target_exists=bool(row.get("target_exists")), + ) + edges.append( + { + "id": f"edge-{index}", + "source": source_key, + "target": target_key, + "relation_type": str(row["relation_type"]), + "depth": int(row["depth"]), + "target_exists": bool(row["target_exists"]), + } + ) + + return { + "nodes": sorted(nodes.values(), key=lambda item: str(item["id"])), + "edges": edges, + } + + +def _slugify(value: str) -> str: + return "-".join(part for part in "".join(ch.lower() if ch.isalnum() else "-" for ch in value).split("-") if part) or "topic" diff --git a/src/citegeist/app_server.py b/src/citegeist/app_server.py new file mode 100644 index 0000000..16547e0 --- /dev/null +++ b/src/citegeist/app_server.py @@ -0,0 +1,194 @@ +from __future__ import annotations + +from http import HTTPStatus +from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer +import argparse +import json +from pathlib import Path +from typing import Any + +from .app_api import LiteratureExplorerApi +from .storage import BibliographyStore + + +class LiteratureExplorerAppServer: + def __init__(self, api: LiteratureExplorerApi) -> None: + self.api = api + + def dispatch(self, method: str, params: dict[str, Any] | None = None) -> Any: + params = params or {} + if method == "capabilities": + return self.api.capabilities() + if method == "search": + return self.api.search( + str(params.get("query") or ""), + limit=int(params.get("limit", 20)), + topic_slug=_optional_str(params.get("topic_slug")), + ) + if method == "show_entry": + return self.api.show_entry( + str(params.get("citation_key") or ""), + include_provenance=bool(params.get("include_provenance", False)), + include_conflicts=bool(params.get("include_conflicts", False)), + include_bibtex=bool(params.get("include_bibtex", False)), + ) + if method == "list_topics": + return self.api.list_topics( + limit=int(params.get("limit", 100)), + phrase_review_status=_optional_str(params.get("phrase_review_status")), + ) + if method == "get_topic": + return self.api.get_topic( + str(params.get("topic_slug") or ""), + entry_limit=int(params.get("entry_limit", 100)), + ) + if method == "bootstrap": + return self.api.bootstrap( + seed_bibtex=_optional_str(params.get("seed_bibtex")), + topic=_optional_str(params.get("topic")), + topic_slug=_optional_str(params.get("topic_slug")), + topic_name=_optional_str(params.get("topic_name")), + topic_phrase=_optional_str(params.get("topic_phrase")), + topic_limit=int(params.get("topic_limit", 5)), + topic_commit_limit=_optional_int(params.get("topic_commit_limit")), + expand=bool(params.get("expand", True)), + preview_only=bool(params.get("preview_only", False)), + review_status=str(params.get("review_status") or "draft"), + ) + if method == "expand_topic": + return self.api.expand_topic( + str(params.get("topic_slug") or ""), + 
topic_phrase=_optional_str(params.get("topic_phrase")), + source=str(params.get("source") or "openalex"), + relation_type=str(params.get("relation_type") or "cites"), + seed_limit=int(params.get("seed_limit", 25)), + per_seed_limit=int(params.get("per_seed_limit", 25)), + min_relevance=float(params.get("min_relevance", 0.2)), + seed_keys=_string_list(params.get("seed_keys")), + preview_only=bool(params.get("preview_only", False)), + ) + if method == "extract_text": + return self.api.extract_text( + str(params.get("text") or ""), + backend=str(params.get("backend") or "heuristic"), + ) + if method == "verify_strings": + return self.api.verify_strings( + _string_list(params.get("values")), + context=str(params.get("context") or ""), + limit=int(params.get("limit", 5)), + ) + if method == "verify_bibtex": + return self.api.verify_bibtex( + str(params.get("bibtex_text") or ""), + context=str(params.get("context") or ""), + limit=int(params.get("limit", 5)), + ) + if method == "graph": + return self.api.graph( + _string_list(params.get("seed_keys")), + relation_types=_string_list(params.get("relation_types")), + depth=int(params.get("depth", 1)), + review_status=_optional_str(params.get("review_status")), + missing_only=bool(params.get("missing_only", False)), + ) + raise KeyError(f"Unknown method: {method}") + + +def create_request_handler(server: LiteratureExplorerAppServer): + class Handler(BaseHTTPRequestHandler): + def do_OPTIONS(self) -> None: + self.send_response(HTTPStatus.NO_CONTENT) + self._write_cors_headers() + self.end_headers() + + def do_POST(self) -> None: + if self.path != "/call": + self._write_json({"error": "not_found"}, status=HTTPStatus.NOT_FOUND) + return + try: + body = self.rfile.read(int(self.headers.get("Content-Length", "0") or "0")) + payload = json.loads(body.decode("utf-8") or "{}") + method = str(payload.get("method") or "") + params = payload.get("params") or {} + if not isinstance(params, dict): + raise ValueError("params must be an object") + result = server.dispatch(method, params) + self._write_json({"ok": True, "result": result}) + except KeyError as exc: + self._write_json({"ok": False, "error": str(exc)}, status=HTTPStatus.NOT_FOUND) + except Exception as exc: # pragma: no cover - defensive fallback + self._write_json({"ok": False, "error": str(exc)}, status=HTTPStatus.BAD_REQUEST) + + def do_GET(self) -> None: + if self.path == "/healthz": + self._write_json({"ok": True}) + return + if self.path == "/capabilities": + self._write_json({"ok": True, "result": server.dispatch("capabilities", {})}) + return + self._write_json({"error": "not_found"}, status=HTTPStatus.NOT_FOUND) + + def log_message(self, format: str, *args: object) -> None: + return + + def _write_json(self, payload: dict[str, Any], *, status: HTTPStatus = HTTPStatus.OK) -> None: + body = json.dumps(payload, indent=2).encode("utf-8") + self.send_response(status) + self._write_cors_headers() + self.send_header("Content-Type", "application/json; charset=utf-8") + self.send_header("Content-Length", str(len(body))) + self.end_headers() + self.wfile.write(body) + + def _write_cors_headers(self) -> None: + self.send_header("Access-Control-Allow-Origin", "*") + self.send_header("Access-Control-Allow-Headers", "Content-Type") + self.send_header("Access-Control-Allow-Methods", "GET, POST, OPTIONS") + + return Handler + + +def main(argv: list[str] | None = None) -> int: + parser = argparse.ArgumentParser(description="Run a lightweight local HTTP bridge for the CiteGeist literature explorer demo") + 
parser.add_argument("--db", default="library.sqlite3", help="SQLite database path") + parser.add_argument("--host", default="127.0.0.1", help="Bind host") + parser.add_argument("--port", type=int, default=8765, help="Bind port") + args = parser.parse_args(argv) + + store = BibliographyStore(Path(args.db)) + api = LiteratureExplorerApi(store) + server = LiteratureExplorerAppServer(api) + httpd = ThreadingHTTPServer((args.host, args.port), create_request_handler(server)) + try: + print(f"CiteGeist explorer server listening on http://{args.host}:{args.port}") + httpd.serve_forever() + finally: + httpd.server_close() + store.close() + return 0 + + +def _optional_str(value: object) -> str | None: + if value is None: + return None + text = str(value).strip() + return text or None + + +def _optional_int(value: object) -> int | None: + if value is None or value == "": + return None + return int(value) + + +def _string_list(value: object) -> list[str]: + if value is None: + return [] + if isinstance(value, list): + return [str(item) for item in value if str(item)] + return [str(value)] if str(value) else [] + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/src/citegeist/bootstrap.py b/src/citegeist/bootstrap.py index 4abdb0b..be2b38c 100644 --- a/src/citegeist/bootstrap.py +++ b/src/citegeist/bootstrap.py @@ -48,6 +48,8 @@ class Bootstrapper: ) -> list[BootstrapResult]: results: list[BootstrapResult] = [] seed_keys: list[str] = [] + effective_topic_slug = topic_slug or (_slugify(topic) if topic else None) + effective_topic_name = topic_name or topic if seed_bibtex: for entry in parse_bibtex(seed_bibtex): @@ -61,6 +63,16 @@ class Bootstrapper: review_status=review_status, ) seed_keys.append(entry.citation_key) + if effective_topic_slug and effective_topic_name: + store.add_entry_topic( + entry.citation_key, + topic_slug=effective_topic_slug, + topic_name=effective_topic_name, + source_type="bootstrap", + source_label="seed_bibtex", + confidence=1.0, + expansion_phrase=topic_phrase or topic, + ) results.append( BootstrapResult( entry.citation_key, @@ -76,13 +88,19 @@ class Bootstrapper: if topic: if not preview_only and (topic_slug or topic_name or topic_phrase): store.ensure_topic( - slug=topic_slug or _slugify(topic), - name=topic_name or topic, + slug=effective_topic_slug or _slugify(topic), + name=effective_topic_name or topic, source_type="bootstrap", expansion_phrase=topic_phrase or topic, ) candidate_limit = max(topic_limit, topic_commit_limit or 0) ranked_candidates = self._topic_candidates(topic, seed_keys, candidate_limit) + if not preview_only: + ranked_candidates = [ + (entry, score) + for entry, score in ranked_candidates + if _meets_topic_commit_threshold(entry, topic) + ] if topic_commit_limit is not None: ranked_candidates = ranked_candidates[:topic_commit_limit] @@ -97,6 +115,16 @@ class Bootstrapper: review_status=review_status, ) seed_keys.append(entry.citation_key) + if effective_topic_slug and effective_topic_name: + store.add_entry_topic( + entry.citation_key, + topic_slug=effective_topic_slug, + topic_name=effective_topic_name, + source_type="bootstrap", + source_label=f"topic:{topic}", + confidence=score, + expansion_phrase=topic_phrase or topic, + ) results.append( BootstrapResult( entry.citation_key, @@ -166,6 +194,30 @@ def _tokenize(value: str) -> set[str]: return {token for token in re.split(r"\W+", value.lower()) if token} +def _core_topic_terms(value: str) -> set[str]: + generic_terms = {"evolution", "origin", "origins", "science", "study", 
"studies"} + return {token for token in _tokenize(value) if token not in generic_terms} + + +def _meets_topic_commit_threshold(entry: BibEntry, topic: str) -> bool: + title = entry.fields.get("title", "") + if not title: + return False + normalized_topic = " ".join(topic.casefold().split()) + normalized_title = " ".join(title.casefold().split()) + if normalized_topic and normalized_topic in normalized_title: + return True + + topic_terms = _core_topic_terms(topic) + if not topic_terms: + return False + title_terms = _tokenize(title) + overlap = topic_terms & title_terms + if not overlap: + return False + return max(0.25, len(overlap) / len(topic_terms)) >= 0.2 + + def _slugify(value: str) -> str: slug = re.sub(r"[^a-z0-9]+", "-", value.lower()).strip("-") return slug or "topic" diff --git a/src/citegeist/cli.py b/src/citegeist/cli.py index 7d9caf3..352b616 100644 --- a/src/citegeist/cli.py +++ b/src/citegeist/cli.py @@ -12,7 +12,13 @@ from .bibtex import parse_bibtex, render_bibtex from .bootstrap import Bootstrapper from .examples.talkorigins import TalkOriginsScraper from .expand import CrossrefExpander, OpenAlexExpander, TopicExpander -from .extract import extract_references +from .extract import ( + available_extraction_backends, + check_extraction_comparison_summary, + compare_extraction_backends, + extract_references, + summarize_extraction_comparison, +) from .harvest import OaiPmhHarvester from .resolve import MetadataResolver, merge_entries_with_conflicts from .storage import BibliographyStore @@ -68,8 +74,43 @@ def build_parser() -> argparse.ArgumentParser: extract_parser = subparsers.add_parser("extract", help="Extract draft BibTeX from plaintext references") extract_parser.add_argument("input", help="Plaintext file containing bibliography-style references") + extract_parser.add_argument( + "--backend", + choices=available_extraction_backends(), + default="heuristic", + help="Reference extraction backend to use", + ) extract_parser.add_argument("--output", help="Write extracted BibTeX to a file instead of stdout") + compare_extract_parser = subparsers.add_parser( + "compare-extract", + help="Run multiple extraction backends on the same plaintext references and emit a JSON comparison", + ) + compare_extract_parser.add_argument("input", help="Plaintext file containing bibliography-style references") + compare_extract_parser.add_argument( + "--backend", + action="append", + dest="backends", + choices=available_extraction_backends(), + help="Backend to include in the comparison; may be passed multiple times", + ) + compare_extract_parser.add_argument( + "--summary", + action="store_true", + help="Emit a compact JSON summary instead of row-by-row comparison output", + ) + compare_extract_parser.add_argument( + "--max-rows-with-differences", + type=int, + help="Fail with a nonzero exit code if rows_with_differences exceeds this value", + ) + compare_extract_parser.add_argument( + "--max-field-difference-count", + type=int, + help="Fail with a nonzero exit code if any field disagreement count exceeds this value", + ) + compare_extract_parser.add_argument("--output", help="Write JSON comparison to a file instead of stdout") + verify_parser = subparsers.add_parser( "verify", help="Verify or disambiguate free-text references or BibTeX entries without modifying the database", @@ -553,7 +594,16 @@ def main(argv: list[str] | None = None) -> int: if args.command == "apply-conflict": return _run_apply_conflict(store, args.citation_key, args.field_name) if args.command == "extract": - return 
_run_extract(Path(args.input), args.output) + return _run_extract(Path(args.input), args.backend, args.output) + if args.command == "compare-extract": + return _run_compare_extract( + Path(args.input), + args.backends, + args.summary, + args.max_rows_with_differences, + args.max_field_difference_count, + args.output, + ) if args.command == "verify": return _run_verify(args.string, args.list_input, args.bib, args.context, args.limit, args.format, args.output) if args.command == "resolve": @@ -792,9 +842,9 @@ def _run_apply_conflict(store: BibliographyStore, citation_key: str, field_name: return 0 -def _run_extract(input_path: Path, output: str | None) -> int: +def _run_extract(input_path: Path, backend: str, output: str | None) -> int: text = input_path.read_text(encoding="utf-8") - entries = extract_references(text) + entries = extract_references(text, backend=backend) rendered = render_bibtex(entries) if output: Path(output).write_text(rendered + ("\n" if rendered else ""), encoding="utf-8") @@ -804,6 +854,43 @@ def _run_extract(input_path: Path, output: str | None) -> int: return 0 +def _run_compare_extract( + input_path: Path, + backends: list[str] | None, + summary: bool, + max_rows_with_differences: int | None, + max_field_difference_count: int | None, + output: str | None, +) -> int: + text = input_path.read_text(encoding="utf-8") + rows = compare_extraction_backends(text, backends=backends) + payload: object + exit_code = 0 + if summary: + summary_payload = summarize_extraction_comparison(rows) + payload = summary_payload.to_dict() + if max_rows_with_differences is not None or max_field_difference_count is not None: + check = check_extraction_comparison_summary( + summary_payload, + max_rows_with_differences=max_rows_with_differences, + max_field_difference_count=max_field_difference_count, + ) + payload = { + "summary": payload, + "check": check.to_dict(), + } + if not check.passed: + exit_code = 1 + else: + payload = [row.to_dict() for row in rows] + rendered = json.dumps(payload, indent=2) + if output: + Path(output).write_text(rendered + "\n", encoding="utf-8") + else: + print(rendered) + return exit_code + + def _run_verify( string_input: str | None, list_input: str | None, diff --git a/src/citegeist/expand.py b/src/citegeist/expand.py index 3eb617e..72f641a 100644 --- a/src/citegeist/expand.py +++ b/src/citegeist/expand.py @@ -6,6 +6,7 @@ from dataclasses import dataclass from urllib.parse import quote, urlencode from .bibtex import BibEntry, parse_bibtex +from .extract import _extract_thesis_like_title, _looks_like_citation_blob as _shared_looks_like_citation_blob from .resolve import MetadataResolver, merge_entries from .storage import BibliographyStore @@ -455,25 +456,8 @@ def _extract_crossref_unstructured_title(text: str) -> str: normalized = _normalize_text(text) if not normalized: return "" - - thesis_markers = ( - "(Master", - "(Doctoral", - "PhD dissertation", - "Master's thesis", - "Master’s thesis", - "Doctoral dissertation", - ) - for marker in thesis_markers: - if marker in normalized: - normalized = normalized.split(marker, 1)[0].strip(" .") - break - for marker in (" ProQuest", " UMI No.", " Dissertation Abstracts", " University Microfilms"): - if marker in normalized: - normalized = normalized.split(marker, 1)[0].strip(" .") - if any(marker in text for marker in thesis_markers) and ". " in normalized: - normalized = normalized.split(". 
", 1)[1].strip() - return normalized.strip(" .") + thesis_title = _extract_thesis_like_title(normalized) + return thesis_title or normalized.strip(" .") def _skip_crossref_reference(reference: dict, entry: BibEntry) -> bool: @@ -500,18 +484,7 @@ def _skip_crossref_reference(reference: dict, entry: BibEntry) -> bool: def _looks_like_citation_blob(text: str) -> bool: - lowered = text.casefold() - if any(token in lowered for token in ("http://", "https://", "www.", " accessed ", " url ")): - return True - if any(token in lowered for token in ("supporting material", "grant", "poll results", "prescribing information")): - return True - if text.count(",") >= 3 or text.count(";") >= 2: - return True - if re.search(r"\(\d{4}\)", text): - return True - if re.search(r"\b[A-Z]\.[ ]?[A-Z]?\.", text): - return True - return False + return _shared_looks_like_citation_blob(text) def _reference_citation_key(reference: dict, title: str, year: str, ordinal: int) -> str: @@ -527,8 +500,11 @@ def _reference_citation_key(reference: dict, title: str, year: str, ordinal: int def _normalize_text(value: str) -> str: - without_tags = re.sub(r"<[^>]+>", "", html.unescape(value)) - return " ".join(without_tags.split()) + without_tags = re.sub(r"<[^>]+>", " ", html.unescape(value)) + normalized = " ".join(without_tags.split()) + normalized = re.sub(r"([\(\[\{])\s+", r"\1", normalized) + normalized = re.sub(r"\s+([\)\]\},.;:!?])", r"\1", normalized) + return normalized def _normalize_person_display_name(value: str) -> str: @@ -699,7 +675,9 @@ def _openalex_work_to_entry(work: dict) -> BibEntry: doi = _normalize_openalex_doi(work.get("doi")) openalex_id = _normalize_openalex_id(work.get("id", "")) authors = " and ".join(_openalex_author_name(item) for item in work.get("authorships", [])) - source = ((work.get("primary_location") or {}).get("source") or {}).get("display_name", "") + source_info = (work.get("primary_location") or {}).get("source") or {} + source = source_info.get("display_name", "") + source_type = _normalize_text(str(source_info.get("type") or "")).casefold() work_type = work.get("type", "") fields: dict[str, str] = {"title": title} @@ -717,13 +695,13 @@ def _openalex_work_to_entry(work: dict) -> BibEntry: if abstract_text: fields["abstract"] = abstract_text if source: - if work_type == "article": + if _openalex_should_use_journal_field(work_type, source_type): fields["journal"] = source else: fields["booktitle"] = source citation_key = _openalex_citation_key(doi, openalex_id, authors, year, title) - entry_type = _openalex_type_to_bibtype(work_type) + entry_type = _openalex_type_to_bibtype(work_type, source_type) return BibEntry(entry_type=entry_type, citation_key=citation_key, fields=fields) @@ -742,7 +720,13 @@ def _openalex_abstract_text(inverted_index: dict) -> str: return "" if _looks_like_openalex_page_blob(text) else text -def _openalex_type_to_bibtype(work_type: str) -> str: +def _openalex_should_use_journal_field(work_type: str, source_type: str) -> bool: + if work_type == "article": + return True + return source_type == "journal" + + +def _openalex_type_to_bibtype(work_type: str, source_type: str = "") -> str: mapping = { "article": "article", "book": "book", @@ -750,7 +734,13 @@ def _openalex_type_to_bibtype(work_type: str) -> str: "dissertation": "phdthesis", "proceedings-article": "inproceedings", } - return mapping.get(work_type, "misc") + if work_type in mapping: + return mapping[work_type] + if source_type == "journal": + return "article" + if source_type == "conference": + return 
"inproceedings" + return "misc" def _openalex_citation_key(doi: str, openalex_id: str, authors: str, year: str, title: str) -> str: diff --git a/src/citegeist/extract.py b/src/citegeist/extract.py index 782f984..47768f8 100644 --- a/src/citegeist/extract.py +++ b/src/citegeist/extract.py @@ -1,15 +1,337 @@ from __future__ import annotations +import json +import os import re +import shutil +import subprocess +import tempfile +import urllib.error +import urllib.parse +import urllib.request +from dataclasses import dataclass +from typing import Protocol -from .bibtex import BibEntry +from .bibtex import BibEntry, parse_bibtex -YEAR_PATTERN = re.compile(r"\b(19|20)\d{2}\b") -YEAR_PAREN_PATTERN = re.compile(r"\((19|20)\d{2}\)") +YEAR_PATTERN = re.compile(r"\b(?:1[6-9]|20|21)\d{2}[a-z]?\b", re.IGNORECASE) +YEAR_PAREN_PATTERN = re.compile(r"\((?:1[6-9]|20|21)\d{2}[a-z]?\)", re.IGNORECASE) REF_START_PATTERN = re.compile(r"^(?:\[\d+\]|\d+\.|\(\d+\))\s*") +DOI_PATTERN = re.compile(r"\b10\.\d{4,9}/[-._;()/:A-Z0-9]+\b", re.IGNORECASE) +URL_PATTERN = re.compile(r"https?://\S+", re.IGNORECASE) +ARXIV_PATTERN = re.compile(r"\barXiv:\s*([A-Za-z0-9.\-]+)", re.IGNORECASE) +ISBN_PATTERN = re.compile(r"\bISBN(?:-1[03])?:?\s*([0-9Xx\-]{10,20})\b") +ISSN_PATTERN = re.compile(r"\bISSN:?\s*([0-9Xx\-]{8,12})\b", re.IGNORECASE) +VOLUME_ISSUE_PAGES_PATTERN = re.compile( + r"(?P\d+)\s*(?:\((?P[^)]+)\))?\s*[:;,]\s*(?P\d+\s*[-\u2013]\s*\d+)\b" +) +PAGES_PATTERN = re.compile(r"\bpp?\.\s*(?P\d+\s*[-\u2013]\s*\d+)\b", re.IGNORECASE) +TRAILING_PAGE_PATTERN = re.compile(r"[,;]\s*(?P\d+\s*[-\u2013]\s*\d+)\.?$") +REPORT_NUMBER_PATTERN = re.compile(r"\b(?:technical\s+report|report|working\s+paper|bulletin)\s+(?:no\.?|number)?\s*(?P[A-Za-z0-9.\-]+)\b", re.IGNORECASE) +THESIS_MARKER_PATTERN = re.compile( + r"\((?:master|doctoral).*?\)|phd dissertation|master'?s thesis|master’s thesis|doctoral dissertation", + re.IGNORECASE, +) -def extract_references(text: str) -> list[BibEntry]: +@dataclass(slots=True) +class ParsedReferenceParts: + raw_line: str + authors: str + year: str + title: str + venue: str + + +@dataclass(slots=True) +class ExtractionComparisonRow: + ordinal: int + raw_reference: str + entries: dict[str, dict[str, object]] + differing_fields: list[str] + + def to_dict(self) -> dict[str, object]: + return { + "ordinal": self.ordinal, + "raw_reference": self.raw_reference, + "entries": self.entries, + "differing_fields": self.differing_fields, + } + + +@dataclass(slots=True) +class ExtractionComparisonSummary: + backends: list[str] + row_count: int + rows_with_differences: int + differing_field_counts: dict[str, int] + backend_presence_counts: dict[str, int] + + def to_dict(self) -> dict[str, object]: + return { + "backends": self.backends, + "row_count": self.row_count, + "rows_with_differences": self.rows_with_differences, + "differing_field_counts": self.differing_field_counts, + "backend_presence_counts": self.backend_presence_counts, + } + + +@dataclass(slots=True) +class ExtractionComparisonCheckResult: + passed: bool + failures: list[str] + + def to_dict(self) -> dict[str, object]: + return { + "passed": self.passed, + "failures": self.failures, + } + + +class ReferenceExtractionBackend(Protocol): + name: str + + def extract_references(self, text: str) -> list[BibEntry]: + ... 
+ + +@dataclass(slots=True) +class HeuristicReferenceExtractionBackend: + name: str = "heuristic" + + def extract_references(self, text: str) -> list[BibEntry]: + return _extract_references_heuristic(text) + + +@dataclass(slots=True) +class AnystyleCliReferenceExtractionBackend: + name: str = "anystyle" + command: str | None = None + parser_model: str | None = None + + def extract_references(self, text: str) -> list[BibEntry]: + command = self.command or os.getenv("CITEGEIST_ANYSTYLE_BIN", "anystyle") + parser_model = self.parser_model or os.getenv("CITEGEIST_ANYSTYLE_PARSER_MODEL") + if shutil.which(command) is None: + raise RuntimeError( + "The 'anystyle' extraction backend requires the AnyStyle CLI to be installed and on PATH. " + "Set CITEGEIST_ANYSTYLE_BIN if the binary is elsewhere." + ) + + blocks = _iter_reference_blocks(text) + if not blocks: + return [] + + with tempfile.NamedTemporaryFile("w", encoding="utf-8", suffix=".txt", delete=False) as handle: + handle.write("\n".join(blocks) + "\n") + input_path = handle.name + + args = [command, "--stdout", "-f", "json"] + if parser_model: + args.extend(["-P", parser_model]) + args.extend(["parse", input_path]) + + try: + result = subprocess.run(args, capture_output=True, text=True, check=False) + finally: + try: + os.unlink(input_path) + except OSError: + pass + + if result.returncode != 0: + message = result.stderr.strip() or result.stdout.strip() or "unknown AnyStyle error" + raise RuntimeError(f"AnyStyle extraction failed: {message}") + + payload = json.loads(result.stdout or "[]") + if not isinstance(payload, list): + raise RuntimeError("AnyStyle extraction returned an unexpected payload") + return [_anystyle_item_to_entry(item, index) for index, item in enumerate(payload, start=1)] + + +@dataclass(slots=True) +class GrobidReferenceExtractionBackend: + name: str = "grobid" + base_url: str | None = None + consolidate_citations: int = 0 + include_raw_citations: int = 1 + + def extract_references(self, text: str) -> list[BibEntry]: + blocks = _iter_reference_blocks(text) + if not blocks: + return [] + + base_url = (self.base_url or os.getenv("CITEGEIST_GROBID_URL", "http://127.0.0.1:8070")).rstrip("/") + payload = urllib.parse.urlencode( + { + "citations": blocks, + "consolidateCitations": str(self.consolidate_citations), + "includeRawCitations": str(self.include_raw_citations), + }, + doseq=True, + ).encode("utf-8") + request = urllib.request.Request( + f"{base_url}/api/processCitationList", + data=payload, + headers={ + "Accept": "application/x-bibtex", + "Content-Type": "application/x-www-form-urlencoded", + }, + method="POST", + ) + + try: + with urllib.request.urlopen(request, timeout=30) as response: + body = response.read().decode("utf-8") + except urllib.error.HTTPError as exc: + error_body = exc.read() + if isinstance(error_body, bytes): + detail = error_body.decode("utf-8", errors="replace").strip() + else: + detail = str(error_body or "").strip() + raise RuntimeError(f"GROBID extraction failed with HTTP {exc.code}: {detail or exc.reason}") from exc + except urllib.error.URLError as exc: + raise RuntimeError(f"GROBID extraction failed: {exc.reason}") from exc + + if not body.strip(): + return [] + + try: + entries = parse_bibtex(body) + except Exception as exc: + raise RuntimeError("GROBID extraction returned invalid BibTeX output") from exc + + for index, entry in enumerate(entries, start=1): + if entry.citation_key in {"-1", "1", ""}: + entry.citation_key = _make_citation_key( + entry.fields.get("author", "ref"), + 
entry.fields.get("year", "nd"), + entry.fields.get("title", "untitled"), + index, + ) + return entries + + +_EXTRACTION_BACKENDS: dict[str, ReferenceExtractionBackend] = { + "heuristic": HeuristicReferenceExtractionBackend(), + "anystyle": AnystyleCliReferenceExtractionBackend(), + "grobid": GrobidReferenceExtractionBackend(), +} + + +def available_extraction_backends() -> list[str]: + return sorted(_EXTRACTION_BACKENDS) + + +def get_extraction_backend(name: str = "heuristic") -> ReferenceExtractionBackend: + try: + return _EXTRACTION_BACKENDS[name] + except KeyError as exc: + choices = ", ".join(available_extraction_backends()) + raise ValueError(f"Unknown extraction backend: {name}. Available backends: {choices}") from exc + + +def register_extraction_backend(backend: ReferenceExtractionBackend) -> None: + _EXTRACTION_BACKENDS[backend.name] = backend + + +def extract_references(text: str, backend: str = "heuristic") -> list[BibEntry]: + backend_impl = get_extraction_backend(backend) + entries = backend_impl.extract_references(text) + raw_references = _iter_reference_blocks(text) + return _normalize_extracted_entries(entries, raw_references, backend_impl.name) + + +def render_extracted_bibtex(text: str, backend: str = "heuristic") -> str: + from .bibtex import render_bibtex + + return render_bibtex(extract_references(text, backend=backend)) + + +def compare_extraction_backends(text: str, backends: list[str] | None = None) -> list[ExtractionComparisonRow]: + selected = backends or available_extraction_backends() + raw_references = _iter_reference_blocks(text) + extracted_by_backend = {backend: extract_references(text, backend=backend) for backend in selected} + + rows: list[ExtractionComparisonRow] = [] + max_count = max([len(raw_references), *(len(entries) for entries in extracted_by_backend.values())], default=0) + for index in range(max_count): + entries_payload: dict[str, dict[str, object]] = {} + all_field_names: set[str] = set() + for backend in selected: + entry = extracted_by_backend[backend][index] if index < len(extracted_by_backend[backend]) else None + payload = _entry_to_comparison_payload(entry) + entries_payload[backend] = payload + all_field_names.update(str(field_name) for field_name in payload.get("fields", {})) + + differing_fields: list[str] = [] + entry_type_values = {str(entries_payload[backend].get("entry_type") or "") for backend in selected} + if len(entry_type_values) > 1: + differing_fields.append("entry_type") + for field_name in sorted(all_field_names): + values = { + str(entries_payload[backend].get("fields", {}).get(field_name, "")) + for backend in selected + } + if len(values) > 1: + differing_fields.append(field_name) + rows.append( + ExtractionComparisonRow( + ordinal=index + 1, + raw_reference=raw_references[index] if index < len(raw_references) else "", + entries=entries_payload, + differing_fields=differing_fields, + ) + ) + return rows + + +def summarize_extraction_comparison(rows: list[ExtractionComparisonRow]) -> ExtractionComparisonSummary: + backend_names = sorted({backend for row in rows for backend in row.entries}) + differing_field_counts: dict[str, int] = {} + backend_presence_counts: dict[str, int] = {backend: 0 for backend in backend_names} + rows_with_differences = 0 + + for row in rows: + if row.differing_fields: + rows_with_differences += 1 + for field_name in row.differing_fields: + differing_field_counts[field_name] = differing_field_counts.get(field_name, 0) + 1 + for backend, payload in row.entries.items(): + if 
payload.get("present"): + backend_presence_counts[backend] = backend_presence_counts.get(backend, 0) + 1 + + return ExtractionComparisonSummary( + backends=backend_names, + row_count=len(rows), + rows_with_differences=rows_with_differences, + differing_field_counts=dict(sorted(differing_field_counts.items())), + backend_presence_counts=dict(sorted(backend_presence_counts.items())), + ) + + +def check_extraction_comparison_summary( + summary: ExtractionComparisonSummary, + *, + max_rows_with_differences: int | None = None, + max_field_difference_count: int | None = None, +) -> ExtractionComparisonCheckResult: + failures: list[str] = [] + if max_rows_with_differences is not None and summary.rows_with_differences > max_rows_with_differences: + failures.append( + f"rows_with_differences {summary.rows_with_differences} exceeds limit {max_rows_with_differences}" + ) + if max_field_difference_count is not None: + for field_name, count in summary.differing_field_counts.items(): + if count > max_field_difference_count: + failures.append( + f"field '{field_name}' difference count {count} exceeds limit {max_field_difference_count}" + ) + return ExtractionComparisonCheckResult(passed=not failures, failures=failures) + + +def _extract_references_heuristic(text: str) -> list[BibEntry]: entries: list[BibEntry] = [] for index, line in enumerate(_iter_reference_blocks(text), start=1): parsed = _parse_reference_line(line, index) @@ -18,10 +340,135 @@ def extract_references(text: str) -> list[BibEntry]: return entries -def render_extracted_bibtex(text: str) -> str: - from .bibtex import render_bibtex +def _entry_to_comparison_payload(entry: BibEntry | None) -> dict[str, object]: + if entry is None: + return {"present": False, "citation_key": None, "entry_type": None, "fields": {}} + return { + "present": True, + "citation_key": entry.citation_key, + "entry_type": entry.entry_type, + "fields": dict(entry.fields), + } - return render_bibtex(extract_references(text)) + +def _normalize_extracted_entries( + entries: list[BibEntry], + raw_references: list[str], + backend_name: str, +) -> list[BibEntry]: + normalized_entries: list[BibEntry] = [] + for index, entry in enumerate(entries): + raw_reference = raw_references[index] if index < len(raw_references) else "" + normalized_entries.append(_normalize_extracted_entry(entry, backend_name, raw_reference)) + return normalized_entries + + +def _normalize_extracted_entry(entry: BibEntry, backend_name: str, raw_reference: str) -> BibEntry: + fields = dict(entry.fields) + + for key in ( + "title", + "journal", + "booktitle", + "publisher", + "school", + "institution", + "howpublished", + "address", + ): + if fields.get(key): + fields[key] = _clean_title(fields[key]) + + if year := fields.get("year"): + if match := YEAR_PATTERN.search(year): + fields["year"] = match.group(0) + + if pages := fields.get("pages"): + fields["pages"] = _normalize_pages(pages) + + if doi := fields.get("doi"): + normalized_doi = doi.strip().rstrip(".,;)") + fields["doi"] = normalized_doi + fields["url"] = f"https://doi.org/{normalized_doi}" + elif url := fields.get("url"): + fields["url"] = url.strip().rstrip(".,;)") + + fields["note"] = _merge_extraction_note(fields.get("note", ""), backend_name, raw_reference) + return BibEntry(entry_type=entry.entry_type, citation_key=entry.citation_key, fields=fields) + + +def _merge_extraction_note(existing: str, backend_name: str, raw_reference: str) -> str: + parts: list[str] = [] + existing_clean = existing.strip() + if existing_clean: + 
parts.append(existing_clean) + lowered = existing_clean.casefold() + if "extracted_reference" not in lowered: + parts.append("extracted_reference = {true}") + if "extracted_by" not in lowered: + parts.append(f"extracted_by = {{{backend_name}}}") + if raw_reference and "raw_reference" not in lowered: + parts.append(f"raw_reference = {{{raw_reference}}}") + return "; ".join(parts) + + +def _anystyle_item_to_entry(item: object, ordinal: int) -> BibEntry: + if not isinstance(item, dict): + raise RuntimeError("AnyStyle extraction item is not an object") + + title = _clean_title(_first_text(item.get("title"))) + authors = _anystyle_people_to_names(item.get("author")) + year = _extract_year_from_values(item.get("date")) + entry_type = _map_anystyle_type(_first_text(item.get("type"))) + citation_key = _make_citation_key(authors or "ref", year or "nd", title or "untitled", ordinal) + + fields: dict[str, str] = { + "note": "extracted_reference = {true}; extracted_by = {anystyle}", + } + if authors: + fields["author"] = authors + if year: + fields["year"] = year + if title: + fields["title"] = title + + if editors := _anystyle_people_to_names(item.get("editor")): + fields["editor"] = editors + if publisher := _first_text(item.get("publisher")): + fields["publisher"] = publisher + if location := _first_text(item.get("location")): + fields["address"] = location + if pages := _first_text(item.get("pages")): + fields["pages"] = _normalize_pages(pages) + if volume := _first_text(item.get("volume")): + fields["volume"] = volume + if number := _first_text(item.get("issue")) or _first_text(item.get("number")): + fields["number"] = number + if doi := _first_text(item.get("doi")): + fields["doi"] = doi + fields["url"] = f"https://doi.org/{doi}" + elif url := _first_text(item.get("url")): + fields["url"] = url + + container = _first_text(item.get("journal")) or _first_text(item.get("container-title")) + if not container and entry_type in {"book", "phdthesis", "mastersthesis", "techreport"}: + container = _first_text(item.get("organization")) or _first_text(item.get("institution")) or _first_text(item.get("school")) + + if container: + if entry_type == "article": + fields["journal"] = container + elif entry_type in {"inproceedings", "incollection"}: + fields["booktitle"] = container + elif entry_type == "techreport": + fields["institution"] = container + elif entry_type in {"phdthesis", "mastersthesis"}: + fields["school"] = container + elif entry_type == "book" and "publisher" not in fields: + fields["publisher"] = container + else: + fields["howpublished"] = container + + return BibEntry(entry_type=entry_type, citation_key=citation_key, fields=fields) def _iter_reference_blocks(text: str) -> list[str]: @@ -72,10 +519,10 @@ def _parse_apa_style_reference(line: str, ordinal: int) -> BibEntry | None: if not segments: return None - title = _clean_title(segments[0]) - venue = segments[1] if len(segments) > 1 else "" - authors = _normalize_authors(author_part) - return _build_entry(line, ordinal, authors, year, title, venue) + parts = _make_reference_parts(line, author_part, year, remainder) + if parts is None: + return None + return _build_entry(parts, ordinal) def _parse_publisher_style_reference(line: str, ordinal: int) -> BibEntry | None: @@ -98,17 +545,34 @@ def _parse_publisher_style_reference(line: str, ordinal: int) -> BibEntry | None if not authors or not title or not publisher: return None - citation_key = _make_citation_key(authors, year_match.group(0), title, ordinal) + year = year_match.group(0) + 
citation_key = _make_citation_key(authors, year, title, ordinal) + identifiers = _extract_identifier_fields(line) + metadata = _parse_venue_metadata(publisher) + entry_type = str(metadata.get("entry_type") or _guess_entry_type(publisher)) + if entry_type not in {"book", "phdthesis", "mastersthesis", "techreport"}: + entry_type = "book" + fields: dict[str, str] = { + "author": authors, + "year": year, + "title": title, + "note": f"extracted_reference = {{true}}; raw_reference = {{{line}}}", + **identifiers, + } + if entry_type == "book": + fields["publisher"] = str(metadata.get("venue") or publisher) + elif entry_type in {"phdthesis", "mastersthesis"}: + fields["school"] = str(metadata.get("venue") or publisher) + else: + fields["institution"] = str(metadata.get("venue") or publisher) + for key in ("number", "type", "series"): + value = metadata.get(key) + if value: + fields[key] = str(value) return BibEntry( - entry_type="book", + entry_type=entry_type, citation_key=citation_key, - fields={ - "author": authors, - "year": year_match.group(0), - "title": title, - "publisher": publisher, - "note": f"extracted_reference = {{true}}; raw_reference = {{{line}}}", - }, + fields=fields, ) @@ -123,14 +587,10 @@ def _parse_plain_year_reference(line: str, ordinal: int) -> BibEntry | None: if not author_part or not remainder: return None - segments = _segments_after_year(remainder) - if not segments: + parts = _make_reference_parts(line, author_part, year, remainder) + if parts is None: return None - - title = _clean_title(segments[0]) - venue = segments[1] if len(segments) > 1 else "" - authors = _normalize_authors(author_part) - return _build_entry(line, ordinal, authors, year, title, venue) + return _build_entry(parts, ordinal) def _normalize_authors(author_part: str) -> str: @@ -145,36 +605,71 @@ def _segments_after_year(remainder: str) -> list[str]: return [segment.strip(" .") for segment in remainder.split(". ") if segment.strip(" .")] +def _split_title_and_venue(remainder: str, *, prefer_colon: bool = False) -> tuple[str, str]: + if prefer_colon and ": " in remainder: + title, venue = remainder.split(": ", 1) + return _clean_title(title), _clean_title(venue) + + segments = _segments_after_year(remainder) + if not segments: + return "", "" + title = _clean_title(segments[0]) + venue = ". 
".join(segments[1:]) if len(segments) > 1 else "" + return title, _clean_title(venue) if venue else "" + + def _clean_title(title: str) -> str: - cleaned = title.strip(" .\"'") + cleaned = title.strip(" .,;:\"'") cleaned = re.sub(r"\s+", " ", cleaned) return cleaned -def _build_entry( - raw_line: str, - ordinal: int, - authors: str, - year: str, - title: str, - venue: str, -) -> BibEntry: - citation_key = _make_citation_key(authors, year, title, ordinal) - entry_type = _guess_entry_type(venue) +def _make_reference_parts(raw_line: str, author_part: str, year: str, remainder: str) -> ParsedReferenceParts | None: + title, venue = _split_title_and_venue(remainder) + authors = _normalize_authors(author_part) + if not authors or not title: + return None + return ParsedReferenceParts( + raw_line=raw_line, + authors=authors, + year=year, + title=title, + venue=venue, + ) + + +def _build_entry(parts: ParsedReferenceParts, ordinal: int) -> BibEntry: + citation_key = _make_citation_key(parts.authors, parts.year, parts.title, ordinal) + entry_type = _guess_entry_type(parts.venue) + metadata = _parse_venue_metadata(parts.venue) + if metadata.get("entry_type"): + entry_type = str(metadata["entry_type"]) fields: dict[str, str] = { - "author": authors, - "year": year, - "title": title, - "note": f"extracted_reference = {{true}}; raw_reference = {{{raw_line}}}", + "author": parts.authors, + "year": parts.year, + "title": parts.title, + "note": f"extracted_reference = {{true}}; raw_reference = {{{parts.raw_line}}}", } - if venue: + fields.update(_extract_identifier_fields(parts.raw_line)) + if metadata.get("venue"): + venue_value = str(metadata["venue"]) if entry_type == "article": - fields["journal"] = venue - elif entry_type == "inproceedings": - fields["booktitle"] = venue + fields["journal"] = venue_value + elif entry_type in {"inproceedings", "incollection"}: + fields["booktitle"] = venue_value + elif entry_type == "book": + fields["publisher"] = venue_value + elif entry_type in {"phdthesis", "mastersthesis"}: + fields["school"] = venue_value + elif entry_type == "techreport": + fields["institution"] = venue_value else: - fields["howpublished"] = venue + fields["howpublished"] = venue_value + for key in ("volume", "number", "pages", "publisher", "institution", "school", "type", "series"): + value = metadata.get(key) + if value: + fields[key] = str(value) return BibEntry(entry_type=entry_type, citation_key=citation_key, fields=fields) @@ -192,10 +687,197 @@ def _make_citation_key(authors: str, year: str, title: str, ordinal: int) -> str def _guess_entry_type(venue: str) -> str: lowered = venue.lower() + if "master" in lowered and "thesis" in lowered: + return "mastersthesis" + if any(token in lowered for token in ("ph.d", "phd", "doctoral dissertation", "doctor's thesis", "thesis", "dissertation")): + return "phdthesis" + if any(token in lowered for token in ("technical report", "tech report", "report no", "working paper", "bulletin")): + return "techreport" + if any(token in lowered for token in ("retrieved from", "available at", "accessed", "http://", "https://", "www.")): + return "misc" if any(token in lowered for token in ("journal", "transactions", "review", "letters")): return "article" if any(token in lowered for token in ("proceedings", "conference", "workshop", "symposium")): return "inproceedings" - if any(token in lowered for token in ("press", "publisher", "university")): + if any(token in lowered for token in ("press", "publisher", "publications", "springer", "wiley", "elsevier", 
"university")): + return "book" + return "misc" + + +def _extract_identifier_fields(text: str) -> dict[str, str]: + fields: dict[str, str] = {} + if doi_match := DOI_PATTERN.search(text): + doi = doi_match.group(0).rstrip(".,;)") + fields["doi"] = doi + fields["url"] = f"https://doi.org/{doi}" + elif url_match := URL_PATTERN.search(text): + fields["url"] = url_match.group(0).rstrip(".,;)") + if arxiv_match := ARXIV_PATTERN.search(text): + fields["arxiv"] = arxiv_match.group(1).rstrip(".,;)") + if isbn_match := ISBN_PATTERN.search(text): + fields["isbn"] = isbn_match.group(1).strip() + if issn_match := ISSN_PATTERN.search(text): + fields["issn"] = issn_match.group(1).strip() + return fields + + +def _looks_like_citation_blob(text: str) -> bool: + lowered = text.casefold() + if any(token in lowered for token in ("http://", "https://", "www.", " accessed ", " url ")): + return True + if any(token in lowered for token in ("supporting material", "grant", "poll results", "prescribing information")): + return True + if text.count(",") >= 3 or text.count(";") >= 2: + return True + if re.search(r"\(\d{4}[a-z]?\)", text, flags=re.IGNORECASE): + return True + if re.search(r"\b[A-Z]\.[ ]?[A-Z]?\.", text): + return True + return False + + +def _extract_thesis_like_title(text: str) -> str: + normalized = _clean_title(" ".join(text.split())) + if not normalized: + return "" + + match = THESIS_MARKER_PATTERN.search(normalized) + if match is not None: + normalized = normalized[: match.start()].strip(" .") + for marker in (" ProQuest", " UMI No.", " Dissertation Abstracts", " University Microfilms"): + if marker in normalized: + normalized = normalized.split(marker, 1)[0].strip(" .") + if match is not None and ". " in normalized: + normalized = normalized.split(". ", 1)[1].strip() + return normalized.strip(" .") + + +def _parse_venue_metadata(venue: str) -> dict[str, str]: + if not venue: + return {} + + # These recovery heuristics intentionally mirror patterns already used in + # citegeist.talkorigins / citegeist.expand and were scoped using GROBID-like + # staged parsing concerns: preserve identifiers, venue fragments, and page structure. 
+ normalized = venue.strip(" .") + metadata: dict[str, str] = {"venue": normalized} + entry_type = _guess_entry_type(normalized) + metadata["entry_type"] = entry_type + + lowered = normalized.lower() + if entry_type == "misc" and ("retrieved from" in lowered or "available at" in lowered): + metadata["venue"] = _clean_title(normalized) + + if volume_match := VOLUME_ISSUE_PAGES_PATTERN.search(normalized): + metadata["volume"] = volume_match.group("volume").strip() + if volume_match.group("number"): + metadata["number"] = volume_match.group("number").strip() + metadata["pages"] = _normalize_pages(volume_match.group("pages")) + venue_prefix = normalized[: volume_match.start()].strip(" ,;:.") + if venue_prefix: + metadata["venue"] = venue_prefix + elif pages_match := PAGES_PATTERN.search(normalized): + metadata["pages"] = _normalize_pages(pages_match.group("pages")) + venue_prefix = normalized[: pages_match.start()].strip(" ,;:.") + if venue_prefix: + metadata["venue"] = venue_prefix + elif trailing_pages_match := TRAILING_PAGE_PATTERN.search(normalized): + metadata["pages"] = _normalize_pages(trailing_pages_match.group("pages")) + venue_prefix = normalized[: trailing_pages_match.start()].strip(" ,;:.") + if venue_prefix: + metadata["venue"] = venue_prefix + + if entry_type == "techreport": + if report_match := REPORT_NUMBER_PATTERN.search(normalized): + metadata["number"] = report_match.group("number").strip() + metadata["type"] = "Technical Report" + institution = _strip_report_prefix(metadata.get("venue", normalized)) + if institution: + metadata["venue"] = institution + elif entry_type in {"phdthesis", "mastersthesis"}: + school = _strip_thesis_prefix(metadata.get("venue", normalized)) + if school: + metadata["venue"] = school + return metadata + + +def _normalize_pages(value: str) -> str: + compact = re.sub(r"\s*[\u2013-]+\s*", "--", value.strip()) + return re.sub(r"-{3,}", "--", compact) + + +def _strip_report_prefix(value: str) -> str: + cleaned = re.sub(r"\b(?:technical\s+report|tech report|report|working\s+paper|bulletin)\b", "", value, flags=re.IGNORECASE) + cleaned = re.sub(r"\b(?:no\.?|number)\s*[A-Za-z0-9.\-]+\b", "", cleaned, flags=re.IGNORECASE) + return _clean_title(cleaned) + + +def _strip_thesis_prefix(value: str) -> str: + cleaned = re.sub(r"\b(?:ph\.?d\.?|doctoral|doctor's|master'?s)\s+(?:dissertation|thesis)\b", "", value, flags=re.IGNORECASE) + cleaned = re.sub(r"^\((?:master|doctoral).*?\)\s*", "", cleaned, flags=re.IGNORECASE) + return _clean_title(cleaned) + + +def _first_text(value: object) -> str: + if isinstance(value, list): + for item in value: + text = _first_text(item) + if text: + return text + return "" + if isinstance(value, dict): + for key in ("literal", "value", "text", "name"): + text = _first_text(value.get(key)) + if text: + return text + return "" + if value is None: + return "" + return _clean_title(str(value)) + + +def _extract_year_from_values(value: object) -> str: + text = _first_text(value) + match = YEAR_PATTERN.search(text) + return match.group(0) if match is not None else "" + + +def _anystyle_people_to_names(value: object) -> str: + if not isinstance(value, list): + return "" + names: list[str] = [] + for item in value: + if isinstance(item, dict): + family = _first_text(item.get("family")) + given = _first_text(item.get("given")) + literal = _first_text(item.get("literal")) + if family and given: + names.append(f"{family}, {given}") + elif literal: + names.append(literal) + elif family: + names.append(family) + else: + text = 
_first_text(item) + if text: + names.append(text) + return " and ".join(name for name in names if name) + + +def _map_anystyle_type(value: str) -> str: + lowered = value.casefold() + if lowered in {"article", "journal_article", "article-journal"}: + return "article" + if lowered in {"chapter", "incollection"}: + return "incollection" + if lowered in {"paper-conference", "inproceedings", "proceedings"}: + return "inproceedings" + if lowered in {"thesis", "phdthesis", "dissertation"}: + return "phdthesis" + if lowered in {"mastersthesis", "master-thesis"}: + return "mastersthesis" + if lowered in {"report", "techreport"}: + return "techreport" + if lowered == "book": return "book" return "misc" diff --git a/src/citegeist/resolve.py b/src/citegeist/resolve.py index 89f8e52..5771a34 100644 --- a/src/citegeist/resolve.py +++ b/src/citegeist/resolve.py @@ -320,12 +320,12 @@ def _merged_entry_type(base_entry_type: str, resolved_entry_type: str) -> str: def _crossref_message_to_entry(message: dict) -> BibEntry: entry_type = _crossref_type_to_bibtype(message.get("type", "article")) title_values = message.get("title", []) - title = title_values[0] if title_values else "" + title = _normalize_text(title_values[0] if title_values else "") year = _extract_crossref_year(message) authors = " and ".join(_crossref_person_to_name(person) for person in message.get("author", [])) venue = "" if container_title := message.get("container-title", []): - venue = container_title[0] + venue = _normalize_text(container_title[0]) fields: dict[str, str] = {} if authors: @@ -339,7 +339,9 @@ def _crossref_message_to_entry(message: dict) -> BibEntry: if url := message.get("URL"): fields["url"] = url if abstract := message.get("abstract"): - fields["abstract"] = abstract + normalized_abstract = _normalize_abstract_text(str(abstract)) + if normalized_abstract: + fields["abstract"] = normalized_abstract if venue: if entry_type == "article": fields["journal"] = venue @@ -439,7 +441,9 @@ def _openalex_work_to_entry(work: dict) -> BibEntry: doi = _normalize_openalex_doi(work.get("doi")) openalex_id = _normalize_openalex_id(work.get("id", "")) authors = " and ".join(_openalex_author_name(item) for item in work.get("authorships", [])) - source = ((work.get("primary_location") or {}).get("source") or {}).get("display_name", "") + source_info = (work.get("primary_location") or {}).get("source") or {} + source = source_info.get("display_name", "") + source_type = _normalize_text(str(source_info.get("type") or "")).casefold() work_type = work.get("type", "") fields: dict[str, str] = {} @@ -460,13 +464,13 @@ def _openalex_work_to_entry(work: dict) -> BibEntry: if abstract_text: fields["abstract"] = abstract_text if source: - if work_type == "article": + if _openalex_should_use_journal_field(work_type, source_type): fields["journal"] = source else: fields["booktitle"] = source citation_key = _openalex_citation_key(doi, openalex_id, authors, year, title) - return BibEntry(entry_type=_openalex_type_to_bibtype(work_type), citation_key=citation_key, fields=fields) + return BibEntry(entry_type=_openalex_type_to_bibtype(work_type, source_type), citation_key=citation_key, fields=fields) def _openalex_author_name(authorship: dict) -> str: @@ -483,7 +487,13 @@ def _openalex_abstract_text(inverted_index: dict) -> str: return "" if _looks_like_openalex_page_blob(text) else text -def _openalex_type_to_bibtype(work_type: str) -> str: +def _openalex_should_use_journal_field(work_type: str, source_type: str) -> bool: + if work_type == "article": + 
return True + return source_type == "journal" + + +def _openalex_type_to_bibtype(work_type: str, source_type: str = "") -> str: mapping = { "article": "article", "book": "book", @@ -491,7 +501,13 @@ def _openalex_type_to_bibtype(work_type: str) -> str: "dissertation": "phdthesis", "proceedings-article": "inproceedings", } - return mapping.get(work_type, "misc") + if work_type in mapping: + return mapping[work_type] + if source_type == "journal": + return "article" + if source_type == "conference": + return "inproceedings" + return "misc" def _normalize_openalex_id(value: str) -> str: @@ -509,8 +525,17 @@ def _normalize_openalex_doi(value: str | None) -> str: def _normalize_text(value: str) -> str: - without_tags = re.sub(r"<[^>]+>", "", html.unescape(value)) - return " ".join(without_tags.split()) + without_tags = re.sub(r"<[^>]+>", " ", html.unescape(value)) + normalized = " ".join(without_tags.split()) + normalized = re.sub(r"([\(\[\{])\s+", r"\1", normalized) + normalized = re.sub(r"\s+([\)\]\},.;:!?])", r"\1", normalized) + return normalized + + +def _normalize_abstract_text(value: str) -> str: + normalized = _normalize_text(value) + normalized = re.sub(r"^abstract\s*[:.\-]?\s*", "", normalized, flags=re.IGNORECASE) + return normalized def _normalize_person_display_name(value: str) -> str: diff --git a/src/citegeist/talkorigins.py b/src/citegeist/talkorigins.py index c69e79a..38fa6e6 100644 --- a/src/citegeist/talkorigins.py +++ b/src/citegeist/talkorigins.py @@ -17,6 +17,7 @@ from pathlib import Path from urllib.parse import urljoin, urlparse from .bibtex import BibEntry, render_bibtex +from .extract import _clean_title, _guess_entry_type, _make_citation_key, _split_title_and_venue from .resolve import MetadataResolver, merge_entries, merge_entries_with_conflicts from .sources import SourceClient from .storage import BibliographyStore @@ -823,13 +824,15 @@ class TalkOriginsScraper: if not author_part or not remainder: return None - title, venue = _split_title_and_venue(remainder) + title, venue = _split_title_and_venue(remainder, prefer_colon=True) if not title: return None authors = _normalize_gsa_authors(author_part) citation_key = _make_citation_key(authors, year, title, ordinal) entry_type = _guess_entry_type(remainder) + if ", in " in venue.lower() and " eds." in venue.lower(): + entry_type = "book" fields = { "author": authors, "year": year, @@ -911,19 +914,6 @@ def _extract_author_prefix(entry_text: str) -> str: return entry_text[: year_match.start()].strip(" ,;:") -def _split_title_and_venue(remainder: str) -> tuple[str, str]: - if ": " in remainder: - title, venue = remainder.split(": ", 1) - return _clean_fragment(title), _clean_fragment(venue) - - parts = [part.strip() for part in remainder.split(". ") if part.strip()] - if not parts: - return "", "" - title = parts[0] - venue = ". 
".join(parts[1:]) if len(parts) > 1 else "" - return _clean_fragment(title), _clean_fragment(venue) - - def _normalize_gsa_authors(author_part: str) -> str: cleaned = WHITESPACE_PATTERN.sub(" ", author_part.replace("&", " and ")).strip(" ,;:") if " and " in cleaned and "," not in cleaned: @@ -950,43 +940,8 @@ def _normalize_gsa_authors(author_part: str) -> str: return " and ".join(authors) if authors else cleaned -def _make_citation_key(authors: str, year: str, title: str, ordinal: int) -> str: - first_author = authors.split(" and ")[0] - family = first_author.split(",", 1)[0] if "," in first_author else first_author.split()[-1] - family = re.sub(r"[^A-Za-z0-9]+", "", family).lower() or "ref" - first_word = re.sub(r"[^A-Za-z0-9]+", "", title.split()[0]).lower() if title.split() else "untitled" - first_word = first_word or "untitled" - return f"{family}{year}{first_word}{ordinal}" - - -def _guess_entry_type(text: str) -> str: - lowered = text.lower() - if "ph.d" in lowered or "dissertation" in lowered or "thesis" in lowered: - return "phdthesis" - if any( - token in lowered - for token in ( - "press", - "publisher", - "publications", - "publication", - "elsevier", - "springer", - "wiley", - "university", - "books", - ) - ): - return "book" - if any(token in lowered for token in ("proceedings", "conference", "symposium", "workshop")): - return "inproceedings" - if any(token in lowered for token in ("journal", "review", "letters", "quarterly", "science", "nature")): - return "article" - return "misc" - - def _clean_fragment(value: str) -> str: - return WHITESPACE_PATTERN.sub(" ", value.strip(" .;:,\"'")) + return _clean_title(WHITESPACE_PATTERN.sub(" ", value)) def _slugify(value: str) -> str: diff --git a/tests/fixtures/extract_backend_fixture.txt b/tests/fixtures/extract_backend_fixture.txt new file mode 100644 index 0000000..5b4b1df --- /dev/null +++ b/tests/fixtures/extract_backend_fixture.txt @@ -0,0 +1,5 @@ +[1] Smith, Jane. 2024a. Graph-first bibliography augmentation. Journal of Research Systems 12(3): 45-67. doi:10.1000/example-doi. +[2] Miller, Sam. 2023. Semantic search for research corpora. Proceedings of the Retrieval Workshop. +[3] Doe, Alex. 2019. Evolutionary archives. PhD dissertation, Example University. +[4] Chen, Bo. 2018. Field methods update. Technical Report No. TR-2018-05, Example Research Lab. +[5] Nguyen, An. 2022. Project page. Retrieved from https://example.org/project-page. 
diff --git a/tests/test_app_api.py b/tests/test_app_api.py new file mode 100644 index 0000000..74db4d4 --- /dev/null +++ b/tests/test_app_api.py @@ -0,0 +1,179 @@ +from citegeist import BibliographyStore +from citegeist.app_api import LiteratureExplorerApi +from citegeist.bibtex import BibEntry +from citegeist.bootstrap import BootstrapResult +from citegeist.expand import ExpansionResult + + +class FakeBootstrapper: + def bootstrap(self, store, **kwargs): + if not kwargs.get("preview_only"): + store.ensure_topic("graph-topic", "Graph Topic", source_type="bootstrap", expansion_phrase="graph topic") + store.upsert_entry( + BibEntry( + entry_type="article", + citation_key="topic2024graph", + fields={"title": "Graph Topic Result", "year": "2024"}, + ), + source_type="bootstrap", + source_label="topic:graph topic", + ) + store.add_entry_topic( + "topic2024graph", + topic_slug="graph-topic", + topic_name="Graph Topic", + source_type="bootstrap", + source_label="topic:graph topic", + confidence=4.0, + ) + store.connection.commit() + return [ + BootstrapResult( + citation_key="topic2024graph", + origin="topic", + created=True, + score=4.0, + title="Graph Topic Result", + year="2024", + ) + ] + + +class FakeTopicExpander: + def expand_topic(self, store, topic_slug, **kwargs): + preview_only = kwargs.get("preview_only", False) + if not preview_only: + store.upsert_entry( + BibEntry( + entry_type="article", + citation_key="discovered2025graph", + fields={"title": "Graph Exploration Result", "year": "2025"}, + ), + source_type="graph_expand", + source_label="openalex:cites:seed2024", + ) + store.add_entry_topic( + "discovered2025graph", + topic_slug=topic_slug, + topic_name="Graph Topic", + source_type="topic_expand", + source_label="openalex:cites:seed2024", + confidence=0.8, + ) + store.connection.commit() + return [ + ExpansionResult( + source_citation_key="seed2024", + discovered_citation_key="discovered2025graph", + created_entry=True, + relation_type="cites", + source_label="openalex:cites:seed2024", + ) + ] + + +def test_literature_explorer_api_search_and_show_entry(): + store = BibliographyStore() + try: + store.ingest_bibtex( + """ +@article{seed2024, + author = {Seed, Alice}, + title = {Graph Topic Result}, + year = {2024} +} +""" + ) + store.add_entry_topic("seed2024", topic_slug="graph-topic", topic_name="Graph Topic", source_label="seed") + api = LiteratureExplorerApi(store) + + search_payload = api.search("graph") + assert search_payload["results"][0]["citation_key"] == "seed2024" + + entry_payload = api.show_entry("seed2024", include_bibtex=True) + assert entry_payload is not None + assert entry_payload["citation_key"] == "seed2024" + assert entry_payload["topics"][0]["slug"] == "graph-topic" + assert "@article{seed2024," in entry_payload["bibtex"] + finally: + store.close() + + +def test_literature_explorer_api_bootstrap_returns_topic_payload(): + store = BibliographyStore() + try: + api = LiteratureExplorerApi(store, bootstrapper=FakeBootstrapper()) + payload = api.bootstrap( + topic="graph topic", + topic_slug="graph-topic", + topic_name="Graph Topic", + preview_only=False, + expand=False, + ) + + assert payload["topic"]["slug"] == "graph-topic" + assert payload["entries"][0]["citation_key"] == "topic2024graph" + assert payload["results"][0]["citation_key"] == "topic2024graph" + finally: + store.close() + + +def test_literature_explorer_api_expand_topic_returns_updated_entries(): + store = BibliographyStore() + try: + store.ingest_bibtex( + """ +@article{seed2024, + author = {Seed, 
Alice}, + title = {Graph Seed}, + year = {2024} +} +""" + ) + store.add_entry_topic("seed2024", topic_slug="graph-topic", topic_name="Graph Topic", source_label="seed") + api = LiteratureExplorerApi(store, topic_expander=FakeTopicExpander()) + + payload = api.expand_topic("graph-topic", preview_only=False) + + assert payload is not None + assert payload["results"][0]["discovered_citation_key"] == "discovered2025graph" + assert any(item["citation_key"] == "discovered2025graph" for item in payload["entries"]) + finally: + store.close() + + +def test_literature_explorer_api_extract_verify_and_graph_payloads(): + store = BibliographyStore() + try: + store.ingest_bibtex( + """ +@article{seed2024, + author = {Seed, Alice}, + title = {Graph Seed}, + year = {2024}, + references = {child2025} +} + +@article{child2025, + author = {Child, Bob}, + title = {Graph Child}, + year = {2025} +} +""" + ) + api = LiteratureExplorerApi(store) + + extract_payload = api.extract_text("Smith, J., 2024, Graph Topic Result: Journal of Graph Studies, v. 1, p. 1-10.") + assert extract_payload["entries"] + assert extract_payload["entries"][0]["citation_key"] + + verify_payload = api.verify_strings(["\"Graph Topic Result\" Smith 2024"], limit=1) + assert "results" in verify_payload + assert verify_payload["results"][0]["query"] + + graph_payload = api.graph(["seed2024"], depth=1) + assert [node["id"] for node in graph_payload["nodes"]] == ["child2025", "seed2024"] + assert graph_payload["edges"][0]["source"] == "seed2024" + assert graph_payload["edges"][0]["target"] == "child2025" + finally: + store.close() diff --git a/tests/test_app_server.py b/tests/test_app_server.py new file mode 100644 index 0000000..1524413 --- /dev/null +++ b/tests/test_app_server.py @@ -0,0 +1,45 @@ +from citegeist import BibliographyStore +from citegeist.app_api import LiteratureExplorerApi +from citegeist.app_server import LiteratureExplorerAppServer, create_request_handler + + +def test_literature_explorer_app_server_dispatch_search(): + store = BibliographyStore() + try: + store.ingest_bibtex( + """ +@article{seed2024, + author = {Seed, Alice}, + title = {Graph Topic Result}, + year = {2024} +} +""" + ) + server = LiteratureExplorerAppServer(LiteratureExplorerApi(store)) + + payload = server.dispatch("search", {"query": "graph", "limit": 5}) + + assert payload["results"][0]["citation_key"] == "seed2024" + finally: + store.close() + + +def test_literature_explorer_http_handler_class_can_be_created(): + store = BibliographyStore() + try: + store.ingest_bibtex( + """ +@article{seed2024, + author = {Seed, Alice}, + title = {Graph Topic Result}, + year = {2024} +} +""" + ) + app_server = LiteratureExplorerAppServer(LiteratureExplorerApi(store)) + handler = create_request_handler(app_server) + + assert handler is not None + assert issubclass(handler, object) + finally: + store.close() diff --git a/tests/test_bootstrap.py b/tests/test_bootstrap.py index 34379ed..bbd3110 100644 --- a/tests/test_bootstrap.py +++ b/tests/test_bootstrap.py @@ -157,8 +157,7 @@ def test_bootstrap_ranks_and_deduplicates_topic_candidates(): results = bootstrapper.bootstrap(store, topic="graph topic", expand=False, topic_limit=5) topic_results = [item for item in results if item.origin == "topic"] - assert [item.citation_key for item in topic_results] == ["shared2024graph", "crossref2024other"] - assert topic_results[0].score > topic_results[1].score + assert [item.citation_key for item in topic_results] == ["shared2024graph"] finally: store.close() @@ -214,6 +213,92 @@ 
def test_bootstrap_topic_commit_limit_restricts_persisted_candidates(): store.close() +def test_bootstrap_topic_candidates_are_attached_to_topic(): + store = BibliographyStore() + try: + bootstrapper = Bootstrapper() + from citegeist import BibEntry + + bootstrapper.resolver.search_openalex = lambda topic, limit=5: [ # type: ignore[method-assign] + BibEntry( + entry_type="article", + citation_key="topic2024graph", + fields={"title": "Graph Topic Result", "year": "2024"}, + ) + ] + bootstrapper.resolver.search_crossref = lambda topic, limit=5: [] # type: ignore[method-assign] + bootstrapper.resolver.search_datacite = lambda topic, limit=5: [] # type: ignore[method-assign] + bootstrapper.crossref_expander.expand_entry_references = lambda _store, _key: [] # type: ignore[method-assign] + bootstrapper.openalex_expander.expand_entry = lambda _store, _key, relation_type="cites", limit=5: [] # type: ignore[method-assign] + + bootstrapper.bootstrap( + store, + topic="graph topic", + topic_slug="graph-topic", + topic_name="Graph Topic", + topic_phrase="graph topic methods", + expand=False, + topic_commit_limit=1, + ) + + topic = store.get_topic("graph-topic") + assert topic is not None + assert topic["entry_count"] == 1 + topic_entries = store.list_topic_entries("graph-topic") + assert [item["citation_key"] for item in topic_entries] == ["topic2024graph"] + assert topic_entries[0]["source_label"] == "topic:graph topic" + assert topic_entries[0]["confidence"] > 0 + finally: + store.close() + + +def test_bootstrap_topic_commit_requires_title_anchor(): + store = BibliographyStore() + try: + bootstrapper = Bootstrapper() + from citegeist import BibEntry + + bootstrapper.resolver.search_openalex = lambda topic, limit=5: [ # type: ignore[method-assign] + BibEntry( + entry_type="article", + citation_key="broad2024", + fields={ + "title": "The phylum Vertebrata: a case for zoological recognition", + "abstract": "Chordata includes Cephalochordata and Urochordata.", + "year": "2024", + }, + ), + BibEntry( + entry_type="article", + citation_key="anchored2024", + fields={ + "title": "Acraniates and amphioxus in comparative development", + "year": "2024", + }, + ), + ] + bootstrapper.resolver.search_crossref = lambda topic, limit=5: [] # type: ignore[method-assign] + bootstrapper.resolver.search_datacite = lambda topic, limit=5: [] # type: ignore[method-assign] + bootstrapper.crossref_expander.expand_entry_references = lambda _store, _key: [] # type: ignore[method-assign] + bootstrapper.openalex_expander.expand_entry = lambda _store, _key, relation_type="cites", limit=5: [] # type: ignore[method-assign] + + results = bootstrapper.bootstrap( + store, + topic="acraniates cephalochordata amphioxus lancelet", + topic_slug="acraniates", + topic_name="Acraniates", + expand=False, + topic_commit_limit=5, + ) + + assert [item.citation_key for item in results] == ["anchored2024"] + topic_entries = store.list_topic_entries("acraniates") + assert [item["citation_key"] for item in topic_entries] == ["anchored2024"] + assert store.get_entry("broad2024") is None + finally: + store.close() + + def test_bootstrap_preview_uses_topic_commit_limit_when_larger_than_topic_limit(): store = BibliographyStore() try: diff --git a/tests/test_extract.py b/tests/test_extract.py index e29987d..afcf411 100644 --- a/tests/test_extract.py +++ b/tests/test_extract.py @@ -1,4 +1,16 @@ -from citegeist import extract_references, parse_bibtex +import json +from pathlib import Path + +from citegeist import ( + available_extraction_backends, + 
check_extraction_comparison_summary, + compare_extraction_backends, + extract_references, + parse_bibtex, + register_extraction_backend, + summarize_extraction_comparison, +) +from citegeist.bibtex import BibEntry from citegeist.cli import main @@ -19,6 +31,74 @@ for bibliography pipelines. Journal of Parsing Systems. [2] Chen, Bo. 2021. Another entry. Proceedings of the Mining Workshop. """ +RICH_REFERENCES = """ +[1] Smith, Jane. 2024a. Graph-first bibliography augmentation. Journal of Research Systems 12(3): 45-67. doi:10.1000/example-doi. +[2] Doe, Alex. 2019. Evolutionary archives. PhD dissertation, Example University. +[3] Chen, Bo. 2018. Field methods update. Technical Report No. TR-2018-05, Example Research Lab. +[4] Nguyen, An. 2022. Project page. Retrieved from https://example.org/project-page. +""" + +FIXTURE_REFERENCES = Path(__file__).with_name("fixtures").joinpath("extract_backend_fixture.txt").read_text(encoding="utf-8") + + +def register_fixture_alt_backend() -> None: + class FixtureAltBackend: + name = "fixture-alt" + + def extract_references(self, text: str) -> list[BibEntry]: + return [ + BibEntry( + entry_type="article", + citation_key="smith2024graphfirst1", + fields={ + "title": "Graph-first bibliography augmentation", + "year": "2024a", + "journal": "Journal of Research Systems", + "pages": "45--67", + "doi": "10.1000/example-doi", + }, + ), + BibEntry( + entry_type="article", + citation_key="miller2023semantic2", + fields={ + "title": "Semantic search for research corpora", + "year": "2023", + "journal": "Retrieval Workshop Journal", + }, + ), + BibEntry( + entry_type="phdthesis", + citation_key="doe2019evolutionary3", + fields={ + "title": "Evolutionary archives", + "year": "2019", + "school": "Example University", + }, + ), + BibEntry( + entry_type="techreport", + citation_key="chen2018field4", + fields={ + "title": "Field methods update", + "year": "2018", + "institution": "Example Research Lab", + "number": "TR-2018-05", + }, + ), + BibEntry( + entry_type="misc", + citation_key="nguyen2022project5", + fields={ + "title": "Project page", + "year": "2022", + "url": "https://example.org/project-page", + }, + ), + ] + + register_extraction_backend(FixtureAltBackend()) + def test_extract_references_builds_draft_entries(): entries = extract_references(SAMPLE_REFERENCES) @@ -29,6 +109,7 @@ def test_extract_references_builds_draft_entries(): ] assert entries[0].entry_type == "article" assert entries[0].fields["journal"] == "Journal of Research Systems" + assert "extracted_by = {heuristic}" in entries[0].fields["note"] assert entries[1].entry_type == "inproceedings" assert entries[1].fields["booktitle"] == "Proceedings of the Retrieval Workshop" @@ -63,3 +144,379 @@ def test_extract_references_joins_wrapped_reference_lines(): assert len(entries) == 2 assert entries[0].fields["title"] == "Multi-line reference extraction for bibliography pipelines" assert entries[0].fields["journal"] == "Journal of Parsing Systems" + + +def test_extract_references_preserves_year_suffix_ids_and_pages(): + entries = extract_references(RICH_REFERENCES) + + article = entries[0] + assert article.fields["year"] == "2024a" + assert article.fields["doi"] == "10.1000/example-doi" + assert article.fields["url"] == "https://doi.org/10.1000/example-doi" + assert article.fields["journal"] == "Journal of Research Systems" + assert article.fields["volume"] == "12" + assert article.fields["number"] == "3" + assert article.fields["pages"] == "45--67" + + +def 
test_extract_references_supports_thesis_report_and_web_entries(): + entries = extract_references(RICH_REFERENCES) + + thesis = entries[1] + report = entries[2] + webpage = entries[3] + + assert thesis.entry_type == "phdthesis" + assert thesis.fields["school"] == "Example University" + + assert report.entry_type == "techreport" + assert report.fields["institution"] == "Example Research Lab" + assert report.fields["number"] == "TR-2018-05" + assert report.fields["type"] == "Technical Report" + + assert webpage.entry_type == "misc" + assert webpage.fields["url"] == "https://example.org/project-page" + assert webpage.fields["howpublished"] == "Retrieved from https://example.org/project-page" + + +def test_extract_references_supports_registered_backend(): + class StaticBackend: + name = "static-test" + + def extract_references(self, text: str) -> list[BibEntry]: + return [ + BibEntry( + entry_type="misc", + citation_key="static2024example1", + fields={"title": text.strip(), "year": "2024"}, + ) + ] + + register_extraction_backend(StaticBackend()) + + entries = extract_references("Custom backend input", backend="static-test") + + assert entries[0].citation_key == "static2024example1" + assert "static-test" in available_extraction_backends() + + +def test_extract_references_rejects_unknown_backend(): + try: + extract_references("anything", backend="missing-backend") + except ValueError as exc: + assert "Unknown extraction backend" in str(exc) + else: + raise AssertionError("expected ValueError for unknown backend") + + +def test_extract_cli_accepts_backend_flag(tmp_path): + input_path = tmp_path / "references.txt" + output_path = tmp_path / "draft.bib" + input_path.write_text(SAMPLE_REFERENCES, encoding="utf-8") + + exit_code = main(["extract", str(input_path), "--backend", "heuristic", "--output", str(output_path)]) + + assert exit_code == 0 + exported = output_path.read_text(encoding="utf-8") + assert "@article{smith2024graphfirst1," in exported + + +def test_extract_references_anystyle_backend_maps_json(monkeypatch): + import citegeist.extract as extract_module + + monkeypatch.setattr(extract_module.shutil, "which", lambda command: "/usr/bin/anystyle") + + class Result: + returncode = 0 + stdout = """ +[ + { + "author": [{"family": "Smith", "given": "Jane"}], + "date": ["2024"], + "title": ["Graph-first bibliography augmentation"], + "journal": ["Journal of Research Systems"], + "volume": ["12"], + "issue": ["3"], + "pages": ["45-67"], + "doi": ["10.1000/example-doi"], + "type": "article" + } +] +""" + stderr = "" + + monkeypatch.setattr(extract_module.subprocess, "run", lambda *args, **kwargs: Result()) + + entries = extract_references(SAMPLE_REFERENCES, backend="anystyle") + + assert len(entries) == 1 + assert entries[0].entry_type == "article" + assert entries[0].fields["author"] == "Smith, Jane" + assert entries[0].fields["journal"] == "Journal of Research Systems" + assert entries[0].fields["pages"] == "45--67" + assert entries[0].fields["doi"] == "10.1000/example-doi" + assert entries[0].fields["url"] == "https://doi.org/10.1000/example-doi" + assert "extracted_by = {anystyle}" in entries[0].fields["note"] + assert "raw_reference = {Smith, Jane and Doe, Alex. 2024. Graph-first bibliography augmentation. 
Journal of Research Systems.}" in entries[0].fields["note"] + + +def test_extract_references_anystyle_backend_reports_missing_binary(monkeypatch): + import citegeist.extract as extract_module + + monkeypatch.setattr(extract_module.shutil, "which", lambda command: None) + + try: + extract_references(SAMPLE_REFERENCES, backend="anystyle") + except RuntimeError as exc: + assert "requires the AnyStyle CLI" in str(exc) + else: + raise AssertionError("expected RuntimeError when anystyle is unavailable") + + +def test_extract_references_grobid_backend_maps_bibtex(monkeypatch): + import citegeist.extract as extract_module + + class FakeResponse: + def __enter__(self): + return self + + def __exit__(self, exc_type, exc, tb): + return False + + def read(self) -> bytes: + return b""" +@article{-1, + author = {Smith, Jane}, + title = {Graph-first bibliography augmentation}, + journal = {Journal of Research Systems}, + year = {2024}, + pages = {45--67}, + volume = {12}, + number = {3}, + doi = {10.1000/example-doi} +} +""" + + monkeypatch.setattr(extract_module.urllib.request, "urlopen", lambda request, timeout=30: FakeResponse()) + + entries = extract_references(SAMPLE_REFERENCES, backend="grobid") + + assert len(entries) == 1 + assert entries[0].citation_key == "smith2024graphfirst1" + assert entries[0].fields["doi"] == "10.1000/example-doi" + assert entries[0].fields["url"] == "https://doi.org/10.1000/example-doi" + assert "extracted_by = {grobid}" in entries[0].fields["note"] + + +def test_extract_references_grobid_backend_reports_http_errors(monkeypatch): + import citegeist.extract as extract_module + + def raise_http(request, timeout=30): + raise extract_module.urllib.error.HTTPError( + url=request.full_url, + code=503, + msg="Busy", + hdrs=None, + fp=None, + ) + + monkeypatch.setattr(extract_module.urllib.request, "urlopen", raise_http) + + try: + extract_references(SAMPLE_REFERENCES, backend="grobid") + except RuntimeError as exc: + assert "GROBID extraction failed with HTTP 503" in str(exc) + else: + raise AssertionError("expected RuntimeError when grobid returns HTTP error") + + +def test_compare_extraction_backends_reports_field_differences(): + class CompareA: + name = "compare-a" + + def extract_references(self, text: str) -> list[BibEntry]: + return [ + BibEntry( + entry_type="article", + citation_key="a1", + fields={"title": "Shared Title", "year": "2024", "journal": "Journal A"}, + ) + ] + + class CompareB: + name = "compare-b" + + def extract_references(self, text: str) -> list[BibEntry]: + return [ + BibEntry( + entry_type="inproceedings", + citation_key="b1", + fields={"title": "Shared Title", "year": "2024", "booktitle": "Proceedings B"}, + ) + ] + + register_extraction_backend(CompareA()) + register_extraction_backend(CompareB()) + + rows = compare_extraction_backends(SAMPLE_REFERENCES, backends=["compare-a", "compare-b"]) + + assert rows[0].ordinal == 1 + assert "entry_type" in rows[0].differing_fields + assert "journal" in rows[0].differing_fields + assert "booktitle" in rows[0].differing_fields + assert rows[0].entries["compare-a"]["fields"]["journal"] == "Journal A" + assert rows[0].entries["compare-b"]["fields"]["booktitle"] == "Proceedings B" + + +def test_compare_extract_cli_writes_json(tmp_path): + input_path = tmp_path / "references.txt" + output_path = tmp_path / "compare.json" + input_path.write_text(SAMPLE_REFERENCES, encoding="utf-8") + + exit_code = main( + [ + "compare-extract", + str(input_path), + "--backend", + "heuristic", + "--backend", + "heuristic", + 
"--output", + str(output_path), + ] + ) + + assert exit_code == 0 + payload = json.loads(output_path.read_text(encoding="utf-8")) + assert payload[0]["ordinal"] == 1 + assert payload[0]["entries"]["heuristic"]["present"] is True + + +def test_compare_extraction_backends_fixture_reports_expected_disagreement(): + register_fixture_alt_backend() + + rows = compare_extraction_backends(FIXTURE_REFERENCES, backends=["heuristic", "fixture-alt"]) + + assert len(rows) == 5 + assert "author" in rows[0].differing_fields + assert "volume" in rows[0].differing_fields + assert "number" in rows[0].differing_fields + assert "entry_type" in rows[1].differing_fields + assert "journal" in rows[1].differing_fields + assert "booktitle" in rows[1].differing_fields + assert "howpublished" in rows[4].differing_fields + + +def test_compare_extract_cli_fixture_json_contains_all_rows(tmp_path): + input_path = tmp_path / "fixture.txt" + output_path = tmp_path / "compare.json" + input_path.write_text(FIXTURE_REFERENCES, encoding="utf-8") + + exit_code = main(["compare-extract", str(input_path), "--backend", "heuristic", "--output", str(output_path)]) + + assert exit_code == 0 + payload = json.loads(output_path.read_text(encoding="utf-8")) + assert len(payload) == 5 + assert payload[2]["entries"]["heuristic"]["entry_type"] == "phdthesis" + + +def test_summarize_extraction_comparison_counts_differences(): + rows = compare_extraction_backends(FIXTURE_REFERENCES, backends=["heuristic", "heuristic"]) + + summary = summarize_extraction_comparison(rows) + + assert summary.row_count == 5 + assert summary.rows_with_differences == 0 + assert summary.backend_presence_counts["heuristic"] == 5 + assert summary.differing_field_counts == {} + + +def test_compare_extract_cli_summary_writes_counts(tmp_path): + input_path = tmp_path / "fixture.txt" + output_path = tmp_path / "summary.json" + input_path.write_text(FIXTURE_REFERENCES, encoding="utf-8") + + exit_code = main( + [ + "compare-extract", + str(input_path), + "--backend", + "heuristic", + "--backend", + "heuristic", + "--summary", + "--output", + str(output_path), + ] + ) + + assert exit_code == 0 + payload = json.loads(output_path.read_text(encoding="utf-8")) + assert payload["row_count"] == 5 + assert payload["rows_with_differences"] == 0 + assert payload["backend_presence_counts"]["heuristic"] == 5 + + +def test_check_extraction_comparison_summary_reports_failure(): + register_fixture_alt_backend() + rows = compare_extraction_backends(FIXTURE_REFERENCES, backends=["heuristic", "fixture-alt"]) + summary = summarize_extraction_comparison(rows) + + check = check_extraction_comparison_summary(summary, max_rows_with_differences=0) + + assert check.passed is False + assert "rows_with_differences" in check.failures[0] + + +def test_compare_extract_cli_summary_threshold_passes(tmp_path): + input_path = tmp_path / "fixture.txt" + output_path = tmp_path / "summary-pass.json" + input_path.write_text(FIXTURE_REFERENCES, encoding="utf-8") + + exit_code = main( + [ + "compare-extract", + str(input_path), + "--backend", + "heuristic", + "--backend", + "heuristic", + "--summary", + "--max-rows-with-differences", + "0", + "--output", + str(output_path), + ] + ) + + assert exit_code == 0 + payload = json.loads(output_path.read_text(encoding="utf-8")) + assert payload["check"]["passed"] is True + + +def test_compare_extract_cli_summary_threshold_fails(tmp_path): + register_fixture_alt_backend() + input_path = tmp_path / "fixture.txt" + output_path = tmp_path / "summary-fail.json" + 
input_path.write_text(FIXTURE_REFERENCES, encoding="utf-8")
+
+    exit_code = main(
+        [
+            "compare-extract",
+            str(input_path),
+            "--backend",
+            "heuristic",
+            "--backend",
+            "fixture-alt",
+            "--summary",
+            "--max-rows-with-differences",
+            "0",
+            "--output",
+            str(output_path),
+        ]
+    )
+
+    assert exit_code == 1
+    payload = json.loads(output_path.read_text(encoding="utf-8"))
+    assert payload["check"]["passed"] is False
+    assert payload["summary"]["rows_with_differences"] > 0
diff --git a/tests/test_openalex_expand.py b/tests/test_openalex_expand.py
index 53da1ec..7c228da 100644
--- a/tests/test_openalex_expand.py
+++ b/tests/test_openalex_expand.py
@@ -23,6 +23,37 @@ def test_openalex_work_to_entry_maps_basic_fields():
     assert entry.fields["abstract"] == "Graph discovery"
 
 
+def test_openalex_work_to_entry_uses_journal_metadata_for_non_article_work_type():
+    entry = _openalex_work_to_entry(
+        {
+            "id": "https://openalex.org/W12345",
+            "doi": "https://doi.org/10.1000/example-openalex",
+            "display_name": "OpenAlex Journal-hosted Work",
+            "publication_year": 2022,
+            "type": "reference-entry",
+            "authorships": [{"author": {"display_name": "Jane Smith"}}],
+            "primary_location": {"source": {"display_name": "Journal of Graph Discovery", "type": "journal"}},
+        }
+    )
+
+    assert entry.entry_type == "article"
+    assert entry.fields["journal"] == "Journal of Graph Discovery"
+    assert "booktitle" not in entry.fields
+
+
+def test_openalex_work_to_entry_preserves_spacing_when_stripping_markup():
+    entry = _openalex_work_to_entry(
+        {
+            "id": "https://openalex.org/W12345",
+            "display_name": "The Oral Papilla of the Lancelet Larva (<i>Branchiostoma lanceolatum</i>)",
+            "publication_year": 2022,
+            "type": "article",
+        }
+    )
+
+    assert entry.fields["title"] == "The Oral Papilla of the Lancelet Larva (Branchiostoma lanceolatum)"
+
+
 def test_openalex_expander_adds_outgoing_and_incoming_edges():
     store = BibliographyStore()
     try:
diff --git a/tests/test_resolve.py b/tests/test_resolve.py
index 3c34d19..ee5c633 100644
--- a/tests/test_resolve.py
+++ b/tests/test_resolve.py
@@ -48,6 +48,28 @@ def test_crossref_message_to_entry_handles_missing_author_without_crashing():
     assert entry.fields["year"] == "2003"
 
 
+def test_crossref_message_to_entry_strips_markup_from_title_and_abstract():
+    entry = _crossref_message_to_entry(
+        {
+            "type": "journal-article",
+            "title": [
+                "The Fine Structure of the Testis of a Lancelet (=<i>Amphioxus</i>), <i>Branchiostoma floridae</i>"
+            ],
+            "container-title": ["Acta Zoologica"],
+            "abstract": "<jats:title>Abstract</jats:title><jats:p>Tagged abstract text.</jats:p>",
+            "author": [{"family": "Holland", "given": "Nicholas D."}],
+            "issued": {"date-parts": [[1989]]},
+        }
+    )
+
+    assert entry.fields["title"] == (
+        "The Fine Structure of the Testis of a Lancelet (=Amphioxus), Branchiostoma floridae"
+    )
+    assert entry.fields["journal"] == "Acta Zoologica"
+    assert entry.fields["abstract"] == "Tagged abstract text."
+ assert "), Branchiostoma" in entry.fields["title"] + + def test_arxiv_atom_entry_to_bib_maps_basic_fields(): xml = ET.fromstring( """ @@ -208,6 +230,23 @@ def test_openalex_work_to_entry_maps_basic_fields(): assert entry.fields["abstract"] == "OpenAlex resolved" +def test_openalex_work_to_entry_uses_journal_metadata_for_non_article_work_type(): + entry = _openalex_work_to_entry( + { + "id": "https://openalex.org/W12345", + "display_name": "OpenAlex Resolved Work", + "publication_year": 2022, + "type": "reference-entry", + "authorships": [{"author": {"display_name": "Jane Smith"}}], + "primary_location": {"source": {"display_name": "Journal of Open Graphs", "type": "journal"}}, + } + ) + + assert entry.entry_type == "article" + assert entry.fields["journal"] == "Journal of Open Graphs" + assert "booktitle" not in entry.fields + + def test_openalex_work_to_entry_normalizes_reversed_initial_author_name(): entry = _openalex_work_to_entry( {