diff --git a/.gitignore b/.gitignore index 947495c..e41471e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ __pycache__/ .pytest_cache/ +.venv/ *.pyc library.sqlite3 diff --git a/README.md b/README.md index 6850f89..ef4ece4 100644 --- a/README.md +++ b/README.md @@ -43,12 +43,15 @@ But it is not the right long-term base: The initial repo includes: -- a lightweight BibTeX parser for structured ingestion; +- `pybtex`-backed BibTeX parsing and export in a repo-local virtual environment; - a SQLite-backed bibliography store; +- a small CLI for ingest, search, inspection, and export; - normalized tables for entries, creators, identifiers, and citation relations; - full-text-search-ready indexing over title, abstract, and fulltext when SQLite FTS5 is available; - tests covering parsing, ingestion, relation storage, and search. +The prioritized execution plan lives in [ROADMAP.md](./ROADMAP.md). + ## Layout ```text @@ -65,7 +68,10 @@ citegeist/ ```bash cd citegeist -PYTHONPATH=src python3 - <<'PY' +python3 -m virtualenv --always-copy .venv +.venv/bin/pip install -e . +.venv/bin/pip install pytest +PYTHONPATH=src .venv/bin/python - <<'PY' from citegeist import BibliographyStore bib = """ @@ -91,17 +97,26 @@ print(store.get_relations("smith2024graphs")) print(store.search_text("semantic")) store.close() PY -pytest -q +.venv/bin/python -m pytest -q ``` -## Planned Work +Or use the CLI directly: -- parse references from raw prose, OCR, PDF text, and bibliography sections into draft BibTeX; -- add modern metadata resolvers for DOI, Crossref, DBLP, arXiv, and similar sources; -- track provenance and confidence for enriched fields; -- add graph expansion workflows over `cites` and `cited_by` edges; -- support acquisition pipelines for open-access theses, dissertations, preprints, and publisher metadata pages; -- add embeddings or pluggable semantic indexing beyond SQLite FTS. 
+```bash +cd citegeist +PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 ingest references.bib +PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 search "semantic search" +PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 show smith2024graphs +PYTHONPATH=src .venv/bin/python -m citegeist --db library.sqlite3 export --output reviewed.bib +``` + +## Near-Term Priorities + +- provenance tracking and entry review states; +- plaintext reference extraction into draft BibTeX; +- metadata resolvers for DOI, Crossref, DBLP, and arXiv. + +See [ROADMAP.md](./ROADMAP.md) for the prioritized phase plan and rationale. ## Naming diff --git a/ROADMAP.md b/ROADMAP.md new file mode 100644 index 0000000..aea0822 --- /dev/null +++ b/ROADMAP.md @@ -0,0 +1,187 @@ +# Roadmap + +This roadmap prioritizes a usable local research workflow over breadth of integrations. + +The first objective is not to support every metadata source. The first objective is to make one end-to-end path work reliably: + +1. ingest draft references, +2. normalize and store them, +3. enrich them, +4. traverse citation links, +5. export reviewed BibTeX. + +## Prioritization Principles + +- prioritize steps that make the system usable by a single researcher on a local machine; +- prioritize deterministic infrastructure before network integrations; +- keep every stage inspectable and auditable; +- treat verification and provenance as core features, not cleanup work; +- defer heavy semantic infrastructure until the local corpus model is stable. + +## Current Baseline + +Completed: + +- lightweight BibTeX parsing; +- SQLite storage for entries, creators, identifiers, and relations; +- local text search using SQLite FTS5 when available; +- tests for ingest, relation storage, and search. + +## Phase 1: Core Ingestion And Export + +Priority: P0 + +Goal: +Make `citegeist` useful as a local BibTeX workbench even before online enrichment is added. 
+ +Tasks: + +- add BibTeX export from the normalized database back into stable, readable BibTeX; +- add a small CLI for `ingest`, `show`, `search`, and `export`; +- store field provenance metadata alongside imported and edited fields; +- add schema support for entry status such as `draft`, `enriched`, `reviewed`, and `exported`; +- add fixture-driven tests for round-tripping BibTeX through ingest and export. + +Why this comes first: + +- without export, the project is not yet useful in a LaTeX workflow; +- without a CLI, the package is a library demo rather than a tool; +- without provenance and state, later enrichment work becomes hard to audit. + +Exit criteria: + +- a user can ingest a `.bib` file, inspect entries, search locally, and export a reviewed `.bib`; +- round-trip tests show no unexpected field loss for supported entry types. + +## Phase 2: Reference Extraction + +Priority: P0 + +Goal: +Turn raw reference text into draft entries that can enter the main pipeline. + +Tasks: + +- add parsers for bibliography-section lines and plain-text reference lists; +- define a draft-entry schema for incomplete references with confidence markers; +- support ingestion of OCR- or PDF-derived plaintext bibliography sections; +- add normalization for author names, years, title casing, and page ranges; +- build gold-test fixtures from real, messy reference examples. + +Why this is next: + +- this addresses the project’s first unique bottleneck: getting rough references into structured form; +- enrichment is much more effective once draft references are normalized. + +Exit criteria: + +- a user can pass a plaintext bibliography section and receive draft BibTeX entries with unresolved fields clearly marked; +- tests cover common article, book, chapter, and proceedings references. + +## Phase 3: Metadata Enrichment + +Priority: P1 + +Goal: +Resolve draft or partial entries against external scholarly sources and merge improved metadata safely. 
+ +Tasks: + +- define a resolver interface with deterministic merge rules; +- implement first-party resolvers for DOI/Crossref, DBLP, and arXiv; +- add identifier-first resolution, then title/author/year fallback search; +- store merge provenance per field and resolution attempt logs; +- flag conflicts rather than silently overwriting disputed values. + +Why this is P1 rather than the first phase: + +- enrichment quality depends on the ingestion and provenance model being correct first; +- it is easier to test deterministic merge behavior once local workflows already exist. + +Exit criteria: + +- an incomplete entry can be enriched from at least one authoritative source; +- conflicting fields remain visible for review instead of being lost. + +## Phase 4: Citation Graph Expansion + +Priority: P1 + +Goal: +Use citation edges as a discovery engine rather than just metadata storage. + +Tasks: + +- support explicit `cites` and `cited_by` edge ingestion with source provenance; +- add graph expansion commands starting from one or more seed entries; +- track edge discovery source, timestamp, and confidence; +- add filters for depth, source type, year range, and reviewed status; +- expose unresolved nodes so the user can decide what to enrich next. + +Why this matters: + +- this is central to literature discovery rather than mere bibliography cleanup; +- it turns the database into a research navigation tool. + +Exit criteria: + +- starting from one or more seed entries, a user can expand outward through citation edges and persist newly discovered nodes; +- graph traversal results can be exported as BibTeX candidates for review. + +## Phase 5: Search And Ranking + +Priority: P2 + +Goal: +Improve discovery quality inside the local corpus. 
+ +Tasks: + +- refine FTS ranking across title, abstract, keywords, and fulltext; +- add saved search queries and result filters; +- add optional embedding-backed semantic search behind a pluggable interface; +- support hybrid ranking that combines lexical matching, identifiers, and citation proximity; +- add benchmarking fixtures for retrieval quality on a few research topics. + +Why this is later: + +- FTS is already enough to support early workflows; +- embedding infrastructure is expensive and should wait until the corpus schema stabilizes. + +Exit criteria: + +- local search is useful on realistic corpora without requiring external services; +- semantic indexing is optional and does not displace the simpler local search path. + +## Phase 6: Corpus Acquisition Pipelines + +Priority: P2 + +Goal: +Broaden source acquisition without mixing that complexity into the core model. + +Tasks: + +- add source adapters for open-access theses and dissertation repositories; +- add support for harvesting publisher citation pages and preprint metadata pages; +- define per-source import provenance and rate-limit behavior; +- separate source-specific scraping logic from normalized entry storage; +- add regression fixtures for representative public sources. + +Why this is later: + +- acquisition breadth is useful, but only after the core ingest/enrich/review loop is solid; +- source adapters are brittle and should sit on top of a stable model. + +Exit criteria: + +- new public corpora can be imported through adapters without changing the storage core; +- imported entries retain their source provenance and can be reviewed like any other entry. + +## Suggested Next Three Tasks + +1. Add a CLI module with `ingest`, `search`, `show`, and `export`. +2. Implement BibTeX export from the normalized store. +3. Add provenance tables and entry review status fields. + +These three tasks complete the first usable local workflow and should be treated as the immediate sprint. 
diff --git a/pyproject.toml b/pyproject.toml index 8207788..5357fce 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,6 +7,10 @@ name = "citegeist" version = "0.1.0" description = "BibTeX-native tooling for bibliography augmentation, citation graphs, and search" requires-python = ">=3.10" +dependencies = ["pybtex>=0.24"] + +[project.scripts] +citegeist = "citegeist.cli:main" [tool.pytest.ini_options] pythonpath = ["src"] diff --git a/src/citegeist/__main__.py b/src/citegeist/__main__.py new file mode 100644 index 0000000..faaa63b --- /dev/null +++ b/src/citegeist/__main__.py @@ -0,0 +1,5 @@ +from .cli import main + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/src/citegeist/bibtex.py b/src/citegeist/bibtex.py index 3e5a247..41ed97d 100644 --- a/src/citegeist/bibtex.py +++ b/src/citegeist/bibtex.py @@ -1,6 +1,14 @@ from __future__ import annotations from dataclasses import dataclass +from io import StringIO + +try: + from pybtex.database import BibliographyData, Entry, Person, parse_string + from pybtex.database.output.bibtex import Writer +except ImportError: # pragma: no cover - exercised only outside the configured venv + BibliographyData = Entry = Person = Writer = None + parse_string = None @dataclass(slots=True) @@ -11,148 +19,42 @@ class BibEntry: def parse_bibtex(text: str) -> list[BibEntry]: + _require_pybtex() + bibliography = parse_string(text, bib_format="bibtex") entries: list[BibEntry] = [] - index = 0 - size = len(text) - - while index < size: - at = text.find("@", index) - if at == -1: - break - entry_type_start = at + 1 - brace = text.find("{", entry_type_start) - if brace == -1: - raise ValueError("Malformed BibTeX: missing opening brace") - entry_type = text[entry_type_start:brace].strip().lower() - body, index = _read_balanced_block(text, brace) - citation_key, fields_blob = _split_key_and_fields(body) + for citation_key, entry in bibliography.entries.items(): + fields = dict(entry.fields.items()) + for role, persons in entry.persons.items(): + 
fields[role] = " and ".join(str(person) for person in persons) entries.append( BibEntry( - entry_type=entry_type, + entry_type=entry.type, citation_key=citation_key, - fields=_parse_fields(fields_blob), + fields=fields, ) ) - return entries -def _read_balanced_block(text: str, brace_index: int) -> tuple[str, int]: - depth = 0 - in_quotes = False - escaped = False +def render_bibtex(entries: list[BibEntry]) -> str: + _require_pybtex() + bibliography_entries = {} + for entry in entries: + fields = {key: value for key, value in entry.fields.items() if key not in {"author", "editor"}} + persons = {} + for role in ("author", "editor"): + raw_names = entry.fields.get(role) + if raw_names: + persons[role] = [Person(name.strip()) for name in raw_names.split(" and ") if name.strip()] + bibliography_entries[entry.citation_key] = Entry(entry.entry_type, fields=fields, persons=persons) - for index in range(brace_index, len(text)): - char = text[index] - if in_quotes: - if escaped: - escaped = False - elif char == "\\": - escaped = True - elif char == '"': - in_quotes = False - continue - - if char == '"': - in_quotes = True - elif char == "{": - depth += 1 - elif char == "}": - depth -= 1 - if depth == 0: - return text[brace_index + 1:index], index + 1 - - raise ValueError("Malformed BibTeX: unbalanced braces") + buffer = StringIO() + Writer().write_stream(BibliographyData(entries=bibliography_entries), buffer) + return buffer.getvalue().strip() -def _split_key_and_fields(body: str) -> tuple[str, str]: - depth = 0 - in_quotes = False - escaped = False - - for index, char in enumerate(body): - if in_quotes: - if escaped: - escaped = False - elif char == "\\": - escaped = True - elif char == '"': - in_quotes = False - continue - - if char == '"': - in_quotes = True - elif char == "{": - depth += 1 - elif char == "}": - depth -= 1 - elif char == "," and depth == 0: - return body[:index].strip(), body[index + 1 :] - - return body.strip(), "" - - -def _parse_fields(blob: str) -> 
dict[str, str]: - fields: dict[str, str] = {} - index = 0 - size = len(blob) - - while index < size: - while index < size and blob[index] in " \t\r\n,": - index += 1 - if index >= size: - break - - name_start = index - while index < size and (blob[index].isalnum() or blob[index] in "-_"): - index += 1 - name = blob[name_start:index].strip().lower() - - while index < size and blob[index].isspace(): - index += 1 - if index >= size or blob[index] != "=": - raise ValueError(f"Malformed BibTeX field near: {blob[name_start:]!r}") - index += 1 - - while index < size and blob[index].isspace(): - index += 1 - value, index = _parse_value(blob, index) - fields[name] = " ".join(value.split()) - - while index < size and blob[index] in " \t\r\n,": - index += 1 - - return fields - - -def _parse_value(blob: str, index: int) -> tuple[str, int]: - if index >= len(blob): - return "", index - - if blob[index] == "{": - value, next_index = _read_balanced_block(blob, index) - return value.strip(), next_index - - if blob[index] == '"': - index += 1 - chars: list[str] = [] - escaped = False - while index < len(blob): - char = blob[index] - if escaped: - chars.append(char) - escaped = False - elif char == "\\": - chars.append(char) - escaped = True - elif char == '"': - return "".join(chars).strip(), index + 1 - else: - chars.append(char) - index += 1 - raise ValueError("Malformed BibTeX: unterminated quoted string") - - end = index - while end < len(blob) and blob[end] not in ",\r\n": - end += 1 - return blob[index:end].strip(), end +def _require_pybtex() -> None: + if parse_string is None or Writer is None: + raise RuntimeError( + "pybtex is required. Use the repo-local virtual environment under .venv/ for citegeist commands." 
+ ) diff --git a/src/citegeist/cli.py b/src/citegeist/cli.py new file mode 100644 index 0000000..0b44015 --- /dev/null +++ b/src/citegeist/cli.py @@ -0,0 +1,91 @@ +from __future__ import annotations + +import argparse +import json +import sys +from pathlib import Path + +from .storage import BibliographyStore + + +def build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser(prog="citegeist") + parser.add_argument("--db", default="library.sqlite3", help="Path to the SQLite database") + + subparsers = parser.add_subparsers(dest="command", required=True) + + ingest_parser = subparsers.add_parser("ingest", help="Ingest BibTeX into the database") + ingest_parser.add_argument("input", help="BibTeX file to ingest") + + search_parser = subparsers.add_parser("search", help="Search titles, abstracts, and fulltext") + search_parser.add_argument("query", help="Search query") + search_parser.add_argument("--limit", type=int, default=10, help="Maximum number of results") + + show_parser = subparsers.add_parser("show", help="Show one entry or list entries") + show_parser.add_argument("citation_key", nargs="?", help="Citation key to show") + show_parser.add_argument("--limit", type=int, default=20, help="Maximum entries when listing") + + export_parser = subparsers.add_parser("export", help="Export entries as BibTeX") + export_parser.add_argument("citation_keys", nargs="*", help="Optional citation keys to export") + export_parser.add_argument("--output", help="Write BibTeX to a file instead of stdout") + + return parser + + +def main(argv: list[str] | None = None) -> int: + parser = build_parser() + args = parser.parse_args(argv) + + store = BibliographyStore(args.db) + try: + if args.command == "ingest": + return _run_ingest(store, Path(args.input)) + if args.command == "search": + return _run_search(store, args.query, args.limit) + if args.command == "show": + return _run_show(store, args.citation_key, args.limit) + if args.command == "export": + return 
_run_export(store, args.citation_keys, args.output) + finally: + store.close() + + parser.error(f"Unknown command: {args.command}") + return 2 + + +def _run_ingest(store: BibliographyStore, input_path: Path) -> int: + text = input_path.read_text(encoding="utf-8") + keys = store.ingest_bibtex(text) + for key in keys: + print(key) + return 0 + + +def _run_search(store: BibliographyStore, query: str, limit: int) -> int: + for row in store.search_text(query, limit=limit): + score = row.get("score", 0.0) + print(f"{row['citation_key']}\t{row.get('year') or ''}\t{score:.3f}\t{row.get('title') or ''}") + return 0 + + +def _run_show(store: BibliographyStore, citation_key: str | None, limit: int) -> int: + if citation_key: + entry = store.get_entry(citation_key) + if entry is None: + print(f"Entry not found: {citation_key}", file=sys.stderr) + return 1 + print(json.dumps(entry, indent=2, sort_keys=True)) + return 0 + + print(json.dumps(store.list_entries(limit=limit), indent=2)) + return 0 + + +def _run_export(store: BibliographyStore, citation_keys: list[str], output: str | None) -> int: + rendered = store.export_bibtex(citation_keys or None) + if output: + Path(output).write_text(rendered + ("\n" if rendered else ""), encoding="utf-8") + else: + if rendered: + print(rendered) + return 0 diff --git a/src/citegeist/storage.py b/src/citegeist/storage.py index 5368724..203b2d7 100644 --- a/src/citegeist/storage.py +++ b/src/citegeist/storage.py @@ -2,9 +2,10 @@ from __future__ import annotations import json import sqlite3 +from collections import OrderedDict from pathlib import Path -from .bibtex import BibEntry, parse_bibtex +from .bibtex import BibEntry, parse_bibtex, render_bibtex IDENTIFIER_FIELDS = ("doi", "isbn", "issn", "pmid", "arxiv", "dblp", "oai", "url") RELATION_FIELDS = { @@ -268,6 +269,41 @@ class BibliographyStore: ).fetchone() return dict(row) if row else None + def list_entries(self, limit: int = 50) -> list[dict[str, object]]: + rows = 
self.connection.execute( + """ + SELECT citation_key, entry_type, title, year + FROM entries + ORDER BY COALESCE(year, ''), citation_key + LIMIT ? + """, + (limit,), + ).fetchall() + return [dict(row) for row in rows] + + def get_entry_bibtex(self, citation_key: str) -> str | None: + entry = self._load_bib_entry(citation_key) + if entry is None: + return None + return render_bibtex([entry]) + + def export_bibtex(self, citation_keys: list[str] | None = None) -> str: + if citation_keys is None: + rows = self.connection.execute( + "SELECT citation_key FROM entries ORDER BY COALESCE(year, ''), citation_key" + ).fetchall() + citation_keys = [str(row["citation_key"]) for row in rows] + + chunks: list[str] = [] + entries: list[BibEntry] = [] + for citation_key in citation_keys: + entry = self._load_bib_entry(citation_key) + if entry is not None: + entries.append(entry) + if not entries: + return "" + return render_bibtex(entries) + def _detect_fts5(self) -> bool: try: self.connection.execute("CREATE VIRTUAL TABLE temp.fts_probe USING fts5(content)") @@ -276,6 +312,76 @@ class BibliographyStore: except sqlite3.OperationalError: return False + def _load_bib_entry(self, citation_key: str) -> BibEntry | None: + row = self.connection.execute( + """ + SELECT citation_key, entry_type, title, year, journal, booktitle, publisher, + abstract, keywords, url, doi, isbn, extra_fields_json + FROM entries + WHERE citation_key = ? 
+ """, + (citation_key,), + ).fetchone() + if row is None: + return None + + fields: OrderedDict[str, str] = OrderedDict() + for role in ("author", "editor"): + names = self._load_creator_names(citation_key, role) + if names: + fields[role] = " and ".join(names) + + for field_name in ( + "title", + "year", + "journal", + "booktitle", + "publisher", + "abstract", + "keywords", + "url", + "doi", + "isbn", + ): + value = row[field_name] + if value: + fields[field_name] = str(value) + + extra_fields = json.loads(row["extra_fields_json"]) + for field_name in sorted(extra_fields): + value = extra_fields[field_name] + if value: + fields[field_name] = str(value) + + for relation_type, field_name in ( + ("cites", "references"), + ("cited_by", "cited_by"), + ("crossref", "crossref"), + ): + values = self.get_relations(citation_key, relation_type) + if values: + fields[field_name] = ", ".join(values) + + return BibEntry( + entry_type=str(row["entry_type"]), + citation_key=str(row["citation_key"]), + fields=dict(fields), + ) + + def _load_creator_names(self, citation_key: str, role: str) -> list[str]: + rows = self.connection.execute( + """ + SELECT c.full_name + FROM entry_creators ec + JOIN entries e ON e.id = ec.entry_id + JOIN creators c ON c.id = ec.creator_id + WHERE e.citation_key = ? AND ec.role = ? 
+ ORDER BY ec.ordinal + """, + (citation_key, role), + ).fetchall() + return [str(row["full_name"]) for row in rows] + def _split_names(value: str) -> list[str]: if not value: @@ -305,8 +411,4 @@ def _split_relation_values(value: str) -> list[str]: def _entry_to_bibtex(entry: BibEntry) -> str: - lines = [f"@{entry.entry_type}{{{entry.citation_key},"] - for key, value in entry.fields.items(): - lines.append(f" {key} = {{{value}}},") - lines.append("}") - return "\n".join(lines) + return render_bibtex([entry]) diff --git a/tests/test_cli.py b/tests/test_cli.py new file mode 100644 index 0000000..afc7091 --- /dev/null +++ b/tests/test_cli.py @@ -0,0 +1,61 @@ +from __future__ import annotations +import os +import json +import subprocess +import sys +from pathlib import Path + + +SAMPLE_BIB = """ +@article{smith2024graphs, + author = {Smith, Jane and Doe, Alex}, + title = {Graph-first bibliography augmentation}, + year = {2024}, + abstract = {We study citation graphs for literature discovery.}, + references = {miller2023search} +} + +@inproceedings{miller2023search, + author = {Miller, Sam}, + title = {Semantic search for research corpora}, + year = {2023}, + abstract = {Dense retrieval improves recall for academic search.} +} +""" + + +def run_cli(tmp_path: Path, *args: str) -> subprocess.CompletedProcess[str]: + database = tmp_path / "library.sqlite3" + env = {**os.environ, "PYTHONPATH": "src"} + return subprocess.run( + [sys.executable, "-m", "citegeist", "--db", str(database), *args], + cwd=Path(__file__).resolve().parents[1], + env=env, + capture_output=True, + text=True, + check=False, + ) + + +def test_cli_ingest_show_search_and_export(tmp_path: Path): + bib_path = tmp_path / "input.bib" + bib_path.write_text(SAMPLE_BIB, encoding="utf-8") + + ingest = run_cli(tmp_path, "ingest", str(bib_path)) + assert ingest.returncode == 0 + assert "smith2024graphs" in ingest.stdout + + show = run_cli(tmp_path, "show", "smith2024graphs") + assert show.returncode == 0 + payload = 
json.loads(show.stdout) + assert payload["citation_key"] == "smith2024graphs" + + search = run_cli(tmp_path, "search", "semantic") + assert search.returncode == 0 + assert "miller2023search" in search.stdout + + export_path = tmp_path / "exported.bib" + export_result = run_cli(tmp_path, "export", "--output", str(export_path)) + assert export_result.returncode == 0 + exported = export_path.read_text(encoding="utf-8") + assert "@article{smith2024graphs," in exported diff --git a/tests/test_storage.py b/tests/test_storage.py index c371eaf..a8ed075 100644 --- a/tests/test_storage.py +++ b/tests/test_storage.py @@ -51,3 +51,19 @@ def test_store_ingests_entries_relations_and_search_text(): ] finally: store.close() + + +def test_store_exports_bibtex_from_normalized_rows(): + store = BibliographyStore() + try: + store.ingest_bibtex(SAMPLE_BIB) + + exported = store.export_bibtex() + parsed = {entry.citation_key: entry for entry in parse_bibtex(exported)} + + assert "@article{smith2024graphs," in exported + assert "@inproceedings{miller2023search," in exported + assert parsed["smith2024graphs"].fields["author"] == "Smith, Jane and Doe, Alex" + assert parsed["smith2024graphs"].fields["references"] == "miller2023search" + finally: + store.close()