commit 4f3ac4decb17dd81b1356bf9f566c0ae8d393cbe Author: welsberr Date: Thu Mar 19 20:13:31 2026 -0400 Initial commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..947495c --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +__pycache__/ +.pytest_cache/ +*.pyc +library.sqlite3 diff --git a/README.md b/README.md new file mode 100644 index 0000000..6850f89 --- /dev/null +++ b/README.md @@ -0,0 +1,111 @@ +# citegeist + +`citegeist` is a research-oriented bibliography workbench for building, expanding, and auditing BibTeX libraries. + +The aim is not just to store citations. The aim is to help with the harder problem: finding, improving, connecting, and checking the literature around a topic while keeping BibTeX as a first-class output format. + +## Repo Description + +`citegeist` is a BibTeX-native research tool for citation extraction, metadata enrichment, citation-graph expansion, and semantic search over scholarly sources. + +## Scope + +The project is intended to support a workflow like this: + +1. Start from rough references extracted from papers, notes, syllabi, or dissertations. +2. Convert them into draft BibTeX entries. +3. Enrich and correct those entries using external scholarly metadata sources. +4. Persist entries, identifiers, abstracts, and citation edges in a local database. +5. Traverse the citation graph outward to discover additional relevant works. +6. Search the local corpus semantically using abstracts and extracted full text. +7. Export verified results back into BibTeX for LaTeX use. + +## Why A New Codebase + +This repository starts cleanly rather than extending the older `bib/` toolkit directly. + +The older toolkit is useful as prior art: + +- it demonstrates identifier-driven metadata augmentation; +- it caches PDFs and extracted plaintext; +- it shows one workable model for bibliography growth. 
+ +But it is not the right long-term base: + +- it is Python 2-era code; +- it is shell-script centric; +- it does not provide a normalized database for graph workflows; +- it is not structured as a reusable Python 3 library. + +`citegeist` keeps the useful ideas and rebuilds the foundation around a cleaner Python 3 package boundary. + +## Current Status + +The initial repo includes: + +- a lightweight BibTeX parser for structured ingestion; +- a SQLite-backed bibliography store; +- normalized tables for entries, creators, identifiers, and citation relations; +- full-text-search-ready indexing over title, abstract, and fulltext when SQLite FTS5 is available; +- tests covering parsing, ingestion, relation storage, and search. + +## Layout + +```text +citegeist/ + src/citegeist/ + bibtex.py + storage.py + tests/ + test_storage.py + pyproject.toml +``` + +## Quick Start + +```bash +cd citegeist +PYTHONPATH=src python3 - <<'PY' +from citegeist import BibliographyStore + +bib = """ +@article{smith2024graphs, + author = {Smith, Jane and Doe, Alex}, + title = {Graph-first bibliography augmentation}, + year = {2024}, + abstract = {We study citation graphs for literature discovery.}, + references = {miller2023search} +} + +@inproceedings{miller2023search, + author = {Miller, Sam}, + title = {Semantic search for research corpora}, + year = {2023}, + abstract = {Dense retrieval improves recall for academic search.} +} +""" + +store = BibliographyStore("library.sqlite3") +store.ingest_bibtex(bib) +print(store.get_relations("smith2024graphs")) +print(store.search_text("semantic")) +store.close() +PY +pytest -q +``` + +## Planned Work + +- parse references from raw prose, OCR, PDF text, and bibliography sections into draft BibTeX; +- add modern metadata resolvers for DOI, Crossref, DBLP, arXiv, and similar sources; +- track provenance and confidence for enriched fields; +- add graph expansion workflows over `cites` and `cited_by` edges; +- support acquisition pipelines for 
open-access theses, dissertations, preprints, and publisher metadata pages; +- add embeddings or pluggable semantic indexing beyond SQLite FTS. + +## Naming + +The name is intended to be short, distinct, and memorable: + +- `cite` for citation work; +- `geist` for the organizing intelligence around the literature. diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..8207788 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,13 @@ +[build-system] +requires = ["setuptools>=68"] +build-backend = "setuptools.build_meta" + +[project] +name = "citegeist" +version = "0.1.0" +description = "BibTeX-native tooling for bibliography augmentation, citation graphs, and search" +requires-python = ">=3.10" + +[tool.pytest.ini_options] +pythonpath = ["src"] +testpaths = ["tests"] diff --git a/src/citegeist/__init__.py b/src/citegeist/__init__.py new file mode 100644 index 0000000..1681197 --- /dev/null +++ b/src/citegeist/__init__.py @@ -0,0 +1,4 @@ +from .bibtex import BibEntry, parse_bibtex +from .storage import BibliographyStore + +__all__ = ["BibEntry", "BibliographyStore", "parse_bibtex"] diff --git a/src/citegeist/bibtex.py b/src/citegeist/bibtex.py new file mode 100644 index 0000000..3e5a247 --- /dev/null +++ b/src/citegeist/bibtex.py @@ -0,0 +1,158 @@ +from __future__ import annotations + +from dataclasses import dataclass + + +@dataclass(slots=True) +class BibEntry: + entry_type: str + citation_key: str + fields: dict[str, str] + + +def parse_bibtex(text: str) -> list[BibEntry]: + entries: list[BibEntry] = [] + index = 0 + size = len(text) + + while index < size: + at = text.find("@", index) + if at == -1: + break + entry_type_start = at + 1 + brace = text.find("{", entry_type_start) + if brace == -1: + raise ValueError("Malformed BibTeX: missing opening brace") + entry_type = text[entry_type_start:brace].strip().lower() + body, index = _read_balanced_block(text, brace) + citation_key, fields_blob = _split_key_and_fields(body) + entries.append( + 
BibEntry( + entry_type=entry_type, + citation_key=citation_key, + fields=_parse_fields(fields_blob), + ) + ) + + return entries + + +def _read_balanced_block(text: str, brace_index: int) -> tuple[str, int]: + depth = 0 + in_quotes = False + escaped = False + + for index in range(brace_index, len(text)): + char = text[index] + if in_quotes: + if escaped: + escaped = False + elif char == "\\": + escaped = True + elif char == '"': + in_quotes = False + continue + + if char == '"': + in_quotes = True + elif char == "{": + depth += 1 + elif char == "}": + depth -= 1 + if depth == 0: + return text[brace_index + 1:index], index + 1 + + raise ValueError("Malformed BibTeX: unbalanced braces") + + +def _split_key_and_fields(body: str) -> tuple[str, str]: + depth = 0 + in_quotes = False + escaped = False + + for index, char in enumerate(body): + if in_quotes: + if escaped: + escaped = False + elif char == "\\": + escaped = True + elif char == '"': + in_quotes = False + continue + + if char == '"': + in_quotes = True + elif char == "{": + depth += 1 + elif char == "}": + depth -= 1 + elif char == "," and depth == 0: + return body[:index].strip(), body[index + 1 :] + + return body.strip(), "" + + +def _parse_fields(blob: str) -> dict[str, str]: + fields: dict[str, str] = {} + index = 0 + size = len(blob) + + while index < size: + while index < size and blob[index] in " \t\r\n,": + index += 1 + if index >= size: + break + + name_start = index + while index < size and (blob[index].isalnum() or blob[index] in "-_"): + index += 1 + name = blob[name_start:index].strip().lower() + + while index < size and blob[index].isspace(): + index += 1 + if index >= size or blob[index] != "=": + raise ValueError(f"Malformed BibTeX field near: {blob[name_start:]!r}") + index += 1 + + while index < size and blob[index].isspace(): + index += 1 + value, index = _parse_value(blob, index) + fields[name] = " ".join(value.split()) + + while index < size and blob[index] in " \t\r\n,": + index += 1 + + 
# (The flattened hunk boundary above cut bibtex.py mid-function: the stray
#  "return fields" belongs to _parse_fields, and _parse_value below is the
#  value scanner that _parse_fields calls.)

# --- src/citegeist/bibtex.py (continued) ---
def _parse_value(blob: str, index: int) -> tuple[str, int]:
    """Scan one field value starting at *index*; return ``(value, next_index)``.

    Handles brace-balanced groups, backslash-escaped quoted strings, and bare
    tokens terminated by ',' or end of line.  Raises ValueError for an
    unterminated quoted string.
    """
    if index >= len(blob):
        return "", index

    if blob[index] == "{":
        value, next_index = _read_balanced_block(blob, index)
        return value.strip(), next_index

    if blob[index] == '"':
        index += 1
        chars: list[str] = []
        escaped = False
        while index < len(blob):
            char = blob[index]
            if escaped:
                chars.append(char)
                escaped = False
            elif char == "\\":
                chars.append(char)
                escaped = True
            elif char == '"':
                return "".join(chars).strip(), index + 1
            else:
                chars.append(char)
            index += 1
        raise ValueError("Malformed BibTeX: unterminated quoted string")

    end = index
    while end < len(blob) and blob[end] not in ",\r\n":
        end += 1
    return blob[index:end].strip(), end


# --- src/citegeist/storage.py ---
# SQLite-backed bibliography store: normalized tables for entries, creators,
# identifiers, and citation relations, with FTS5 full-text search when the
# local SQLite build supports it.
#
# NOTE: the original `from __future__ import annotations` was dropped; the
# package declares requires-python >= 3.10, where every annotation used here
# (PEP 604 unions, builtin generics) is valid at runtime.

import json
import re  # used by _split_names for robust "and" splitting
import sqlite3
from pathlib import Path

from .bibtex import BibEntry, parse_bibtex

# BibTeX fields mirrored into the normalized `identifiers` table.
IDENTIFIER_FIELDS = ("doi", "isbn", "issn", "pmid", "arxiv", "dblp", "oai", "url")
# BibTeX field name -> relation_type stored in the `relations` table.
RELATION_FIELDS = {
    "references": "cites",
    "cites": "cites",
    "cited_by": "cited_by",
    "crossref": "crossref",
}
# Fields promoted to dedicated columns on `entries`; everything else (minus
# relation fields) is preserved in extra_fields_json.
CORE_ENTRY_FIELDS = {
    "title",
    "year",
    "journal",
    "booktitle",
    "publisher",
    "abstract",
    "keywords",
    "url",
    "doi",
    "isbn",
}


class BibliographyStore:
    """Persistence layer for BibTeX entries, creators, identifiers, and relations."""

    def __init__(self, path: str | Path = ":memory:") -> None:
        """Open (or create) the SQLite database at *path* and ensure the schema."""
        self.path = str(path)
        self.connection = sqlite3.connect(self.path)
        self.connection.row_factory = sqlite3.Row
        self.connection.execute("PRAGMA foreign_keys = ON")
        self._fts5_enabled = self._detect_fts5()
        self.initialize()

    def __enter__(self) -> "BibliographyStore":
        # Backward-compatible addition: allow `with BibliographyStore(...) as bib:`.
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        self.close()

    def close(self) -> None:
        """Close the underlying SQLite connection."""
        self.connection.close()

    def initialize(self) -> None:
        """Create the schema (idempotent); adds the FTS5 index when available."""
        self.connection.executescript(
            """
            CREATE TABLE IF NOT EXISTS entries (
                id INTEGER PRIMARY KEY,
                citation_key TEXT NOT NULL UNIQUE,
                entry_type TEXT NOT NULL,
                title TEXT,
                year TEXT,
                journal TEXT,
                booktitle TEXT,
                publisher TEXT,
                abstract TEXT,
                keywords TEXT,
                url TEXT,
                doi TEXT,
                isbn TEXT,
                fulltext TEXT,
                raw_bibtex TEXT,
                extra_fields_json TEXT NOT NULL DEFAULT '{}',
                created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
                updated_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
            );

            CREATE TABLE IF NOT EXISTS creators (
                id INTEGER PRIMARY KEY,
                full_name TEXT NOT NULL UNIQUE,
                family_name TEXT,
                given_names TEXT
            );

            CREATE TABLE IF NOT EXISTS entry_creators (
                entry_id INTEGER NOT NULL REFERENCES entries(id) ON DELETE CASCADE,
                creator_id INTEGER NOT NULL REFERENCES creators(id) ON DELETE CASCADE,
                role TEXT NOT NULL,
                ordinal INTEGER NOT NULL,
                PRIMARY KEY (entry_id, role, ordinal)
            );

            CREATE TABLE IF NOT EXISTS identifiers (
                entry_id INTEGER NOT NULL REFERENCES entries(id) ON DELETE CASCADE,
                scheme TEXT NOT NULL,
                value TEXT NOT NULL,
                PRIMARY KEY (scheme, value)
            );

            CREATE TABLE IF NOT EXISTS relations (
                source_entry_id INTEGER NOT NULL REFERENCES entries(id) ON DELETE CASCADE,
                target_citation_key TEXT NOT NULL,
                relation_type TEXT NOT NULL,
                PRIMARY KEY (source_entry_id, target_citation_key, relation_type)
            );
            """
        )

        if self._fts5_enabled:
            self.connection.execute(
                """
                CREATE VIRTUAL TABLE IF NOT EXISTS entry_text_fts
                USING fts5(
                    citation_key UNINDEXED,
                    title,
                    abstract,
                    fulltext
                )
                """
            )
        self.connection.commit()

    def ingest_bibtex(self, text: str, fulltext_by_key: dict[str, str] | None = None) -> list[str]:
        """Parse *text* and upsert every entry; return the citation keys ingested.

        *fulltext_by_key* optionally maps citation keys to extracted full text
        for search indexing.  Commits once after all entries are stored.
        """
        fulltext_by_key = fulltext_by_key or {}
        entries = parse_bibtex(text)
        keys: list[str] = []
        for entry in entries:
            fulltext = fulltext_by_key.get(entry.citation_key)
            self.upsert_entry(entry, fulltext=fulltext, raw_bibtex=_entry_to_bibtex(entry))
            keys.append(entry.citation_key)
        self.connection.commit()
        return keys

    def upsert_entry(self, entry: BibEntry, fulltext: str | None = None,
                     raw_bibtex: str | None = None) -> int:
        """Insert or update one entry plus its creators, identifiers, and relations.

        Returns the ``entries.id`` rowid.  Passing ``None`` for *fulltext* or
        *raw_bibtex* preserves any previously stored value (COALESCE in the
        upsert).  The caller is responsible for committing.
        """
        row = self.connection.execute(
            """
            INSERT INTO entries (
                citation_key, entry_type, title, year, journal, booktitle, publisher,
                abstract, keywords, url, doi, isbn, fulltext, raw_bibtex, extra_fields_json
            ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            ON CONFLICT(citation_key) DO UPDATE SET
                entry_type = excluded.entry_type,
                title = excluded.title,
                year = excluded.year,
                journal = excluded.journal,
                booktitle = excluded.booktitle,
                publisher = excluded.publisher,
                abstract = excluded.abstract,
                keywords = excluded.keywords,
                url = excluded.url,
                doi = excluded.doi,
                isbn = excluded.isbn,
                fulltext = COALESCE(excluded.fulltext, entries.fulltext),
                raw_bibtex = COALESCE(excluded.raw_bibtex, entries.raw_bibtex),
                extra_fields_json = excluded.extra_fields_json,
                updated_at = CURRENT_TIMESTAMP
            RETURNING id, fulltext
            """,
            (
                entry.citation_key,
                entry.entry_type,
                entry.fields.get("title"),
                entry.fields.get("year"),
                entry.fields.get("journal"),
                entry.fields.get("booktitle"),
                entry.fields.get("publisher"),
                entry.fields.get("abstract"),
                entry.fields.get("keywords"),
                entry.fields.get("url"),
                entry.fields.get("doi"),
                entry.fields.get("isbn"),
                fulltext,
                raw_bibtex,
                json.dumps({k: v for k, v in entry.fields.items() if k not in CORE_ENTRY_FIELDS and k not in RELATION_FIELDS}),
            ),
        ).fetchone()
        entry_id = int(row["id"])
        # BUGFIX: use the fulltext actually stored on the row (which COALESCE may
        # have kept from an earlier ingest) for the FTS index below.  Previously
        # a re-ingest without fulltext rebuilt the FTS row with "" while
        # entries.fulltext survived, silently breaking full-text search.
        stored_fulltext = row["fulltext"]

        # Creators are rebuilt from scratch on every upsert.
        self.connection.execute("DELETE FROM entry_creators WHERE entry_id = ?", (entry_id,))
        for role in ("author", "editor"):
            names = _split_names(entry.fields.get(role, ""))
            for ordinal, name in enumerate(names, start=1):
                creator = _split_person_name(name)
                creator_row = self.connection.execute(
                    """
                    INSERT INTO creators (full_name, family_name, given_names)
                    VALUES (?, ?, ?)
                    ON CONFLICT(full_name) DO UPDATE SET
                        family_name = COALESCE(excluded.family_name, creators.family_name),
                        given_names = COALESCE(excluded.given_names, creators.given_names)
                    RETURNING id
                    """,
                    (creator["full_name"], creator["family_name"], creator["given_names"]),
                ).fetchone()
                self.connection.execute(
                    """
                    INSERT INTO entry_creators (entry_id, creator_id, role, ordinal)
                    VALUES (?, ?, ?, ?)
                    """,
                    (entry_id, int(creator_row["id"]), role, ordinal),
                )

        # Identifiers are likewise rebuilt; OR REPLACE resolves (scheme, value)
        # collisions in favor of the most recently ingested entry.
        self.connection.execute("DELETE FROM identifiers WHERE entry_id = ?", (entry_id,))
        for scheme in IDENTIFIER_FIELDS:
            value = entry.fields.get(scheme)
            if value:
                self.connection.execute(
                    "INSERT OR REPLACE INTO identifiers (entry_id, scheme, value) VALUES (?, ?, ?)",
                    (entry_id, scheme, value),
                )

        # Citation-graph edges keyed by target citation key (target need not exist yet).
        self.connection.execute("DELETE FROM relations WHERE source_entry_id = ?", (entry_id,))
        for field_name, relation_type in RELATION_FIELDS.items():
            values = _split_relation_values(entry.fields.get(field_name, ""))
            for target_key in values:
                self.connection.execute(
                    """
                    INSERT OR IGNORE INTO relations (source_entry_id, target_citation_key, relation_type)
                    VALUES (?, ?, ?)
                    """,
                    (entry_id, target_key, relation_type),
                )

        if self._fts5_enabled:
            # Rebuild the FTS row for this key from the persisted column values.
            self.connection.execute("DELETE FROM entry_text_fts WHERE citation_key = ?", (entry.citation_key,))
            self.connection.execute(
                """
                INSERT INTO entry_text_fts (citation_key, title, abstract, fulltext)
                VALUES (?, ?, ?, ?)
                """,
                (entry.citation_key, entry.fields.get("title", ""), entry.fields.get("abstract", ""), stored_fulltext or ""),
            )

        return entry_id

    def search_text(self, query: str, limit: int = 10) -> list[dict[str, object]]:
        """Full-text search over title/abstract/fulltext.

        Uses FTS5 with bm25 ranking when available (lower score = better match);
        otherwise falls back to a LIKE substring scan with score 0.0.
        """
        if self._fts5_enabled:
            rows = self.connection.execute(
                """
                SELECT e.citation_key, e.title, e.year, bm25(entry_text_fts) AS score
                FROM entry_text_fts
                JOIN entries e ON e.citation_key = entry_text_fts.citation_key
                WHERE entry_text_fts MATCH ?
                ORDER BY score
                LIMIT ?
                """,
                (query, limit),
            ).fetchall()
        else:
            pattern = f"%{query}%"
            rows = self.connection.execute(
                """
                SELECT citation_key, title, year, 0.0 AS score
                FROM entries
                WHERE title LIKE ? OR abstract LIKE ? OR fulltext LIKE ?
                LIMIT ?
                """,
                (pattern, pattern, pattern, limit),
            ).fetchall()

        return [dict(row) for row in rows]

    def get_relations(self, citation_key: str, relation_type: str = "cites") -> list[str]:
        """Return sorted target citation keys related to *citation_key*."""
        rows = self.connection.execute(
            """
            SELECT r.target_citation_key
            FROM relations r
            JOIN entries e ON e.id = r.source_entry_id
            WHERE e.citation_key = ? AND r.relation_type = ?
            ORDER BY r.target_citation_key
            """,
            (citation_key, relation_type),
        ).fetchall()
        return [str(row["target_citation_key"]) for row in rows]

    def get_entry(self, citation_key: str) -> dict[str, object] | None:
        """Return the entries row for *citation_key* as a dict, or None."""
        row = self.connection.execute(
            "SELECT * FROM entries WHERE citation_key = ?",
            (citation_key,),
        ).fetchone()
        return dict(row) if row else None

    def _detect_fts5(self) -> bool:
        """Probe for FTS5 support by creating and dropping a temp virtual table."""
        try:
            self.connection.execute("CREATE VIRTUAL TABLE temp.fts_probe USING fts5(content)")
            self.connection.execute("DROP TABLE temp.fts_probe")
            return True
        except sqlite3.OperationalError:
            return False


def _split_names(value: str) -> list[str]:
    """Split a BibTeX author/editor field on the ``and`` keyword."""
    if not value:
        return []
    # FIX: split on whitespace-delimited "and" so separators spanning newlines
    # or multiple spaces are handled; the old `value.split(" and ")` missed them.
    return [part.strip() for part in re.split(r"\s+and\s+", value) if part.strip()]


def _split_person_name(name: str) -> dict[str, str | None]:
    """Split one personal name into full/family/given parts.

    "Family, Given" form is split at the first comma; otherwise the last
    whitespace-separated token is taken as the family name.
    """
    if "," in name:
        family_name, given_names = [part.strip() for part in name.split(",", 1)]
    else:
        parts = name.split()
        family_name = parts[-1] if parts else ""
        given_names = " ".join(parts[:-1]) if len(parts) > 1 else None
    return {
        "full_name": name.strip(),
        "family_name": family_name or None,
        "given_names": given_names or None,
    }


def _split_relation_values(value: str) -> list[str]:
    """Split a relation field (comma/semicolon/newline separated keys) into a list."""
    if not value:
        return []
    normalized = value.replace("\n", ",").replace(";", ",")
    return [part.strip() for part in normalized.split(",") if part.strip()]


def _entry_to_bibtex(entry: BibEntry) -> str:
    """Serialize *entry* back to a braced BibTeX string (canonical raw form)."""
    lines = [f"@{entry.entry_type}{{{entry.citation_key},"]
    for key, value in entry.fields.items():
        lines.append(f"  {key} = {{{value}}},")
    lines.append("}")
    return "\n".join(lines)


# --- tests/test_storage.py (preserved from the flattened patch) ---
# from citegeist import BibliographyStore, parse_bibtex
#
#
# SAMPLE_BIB = """
# @article{smith2024graphs,
#   author = {Smith, Jane and Doe, Alex},
#   title = {Graph-first bibliography augmentation},
#   year = {2024},
#   doi = {10.1000/graph.2024.1},
#   abstract = {We study citation graphs for literature discovery.},
#   references = {miller2023search}
# }
#
# @inproceedings{miller2023search,
#   author = {Miller, Sam},
#   title = {Semantic search for research corpora},
#   year = {2023},
#   abstract = {Dense retrieval improves recall for academic search.}
# }
# """
#
#
# def test_parse_bibtex_extracts_entries_and_fields():
#     entries = parse_bibtex(SAMPLE_BIB)
#
#     assert [entry.citation_key for entry in entries] == ["smith2024graphs", "miller2023search"]
#     assert entries[0].fields["title"] == "Graph-first bibliography augmentation"
#     assert entries[0].fields["references"] == "miller2023search"
#
#
# def test_store_ingests_entries_relations_and_search_text():
#     store = BibliographyStore()
#     try:
#         store.ingest_bibtex(
#             SAMPLE_BIB,
#             fulltext_by_key={
#                 "smith2024graphs": "This paper links citation graphs with semantic search over abstracts."
#             },
#         )
#
#         entry = store.get_entry("smith2024graphs")
#         assert entry is not None
#         assert entry["doi"] == "10.1000/graph.2024.1"
#
#         assert store.get_relations("smith2024graphs") == ["miller2023search"]
#
#         results = store.search_text("semantic")
#         assert [row["citation_key"] for row in results][:2] == [
#             "miller2023search",
#             "smith2024graphs",
#         ]
#     finally:
#         store.close()