Initial commit

2026-03-19 20:13:31 -04:00 · 2026-03-19 20:13:31 -04:00 · 4f3ac4decb
commit 4f3ac4decb
7 changed files with 655 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,4 @@
 __pycache__/
 .pytest_cache/
 *.pyc
 library.sqlite3
--- a/README.md
+++ b/README.md
@ -0,0 +1,111 @@
 # citegeist
 `citegeist` is a research-oriented bibliography workbench for building, expanding, and auditing BibTeX libraries.
 The aim is not just to store citations. The aim is to help with the harder problem: finding, improving, connecting, and checking the literature around a topic while keeping BibTeX as a first-class output format.
 ## Repo Description
 `citegeist` is a BibTeX-native research tool for citation extraction, metadata enrichment, citation-graph expansion, and semantic search over scholarly sources.
 ## Scope
 The project is intended to support a workflow like this:
 1. Start from rough references extracted from papers, notes, syllabi, or dissertations.
 2. Convert them into draft BibTeX entries.
 3. Enrich and correct those entries using external scholarly metadata sources.
 4. Persist entries, identifiers, abstracts, and citation edges in a local database.
 5. Traverse the citation graph outward to discover additional relevant works.
 6. Search the local corpus semantically using abstracts and extracted full text.
 7. Export verified results back into BibTeX for LaTeX use.
 ## Why A New Codebase
 This repository starts cleanly rather than extending the older `bib/` toolkit directly.
 The older toolkit is useful as prior art:
 - it demonstrates identifier-driven metadata augmentation;
 - it caches PDFs and extracted plaintext;
 - it shows one workable model for bibliography growth.
 But it is not the right long-term base:
 - it is Python 2-era code;
 - it is shell-script centric;
 - it does not provide a normalized database for graph workflows;
 - it is not structured as a reusable Python 3 library.
 `citegeist` keeps the useful ideas and rebuilds the foundation around a cleaner Python 3 package boundary.
 ## Current Status
 The initial repo includes:
 - a lightweight BibTeX parser for structured ingestion;
 - a SQLite-backed bibliography store;
 - normalized tables for entries, creators, identifiers, and citation relations;
 - full-text search over title, abstract, and full text, backed by SQLite FTS5 when it is available and by a plain `LIKE` substring match otherwise;
 - tests covering parsing, ingestion, relation storage, and search.
 ## Layout
 ```text
 citegeist/
  src/citegeist/
    bibtex.py
    storage.py
  tests/
    test_storage.py
  pyproject.toml
 ```
 ## Quick Start
 ```bash
 cd citegeist
 PYTHONPATH=src python3 - <<'PY'
 from citegeist import BibliographyStore
 bib = """
@article{smith2024graphs,
  author = {Smith, Jane and Doe, Alex},
  title = {Graph-first bibliography augmentation},
  year = {2024},
  abstract = {We study citation graphs for literature discovery.},
  references = {miller2023search}
 }
@inproceedings{miller2023search,
  author = {Miller, Sam},
  title = {Semantic search for research corpora},
  year = {2023},
  abstract = {Dense retrieval improves recall for academic search.}
 }
 """
 store = BibliographyStore("library.sqlite3")
 store.ingest_bibtex(bib)
 print(store.get_relations("smith2024graphs"))
 print(store.search_text("semantic"))
 store.close()
 PY
 pytest -q
 ```
 ## Planned Work
 - parse references from raw prose, OCR, PDF text, and bibliography sections into draft BibTeX;
 - add modern metadata resolvers for DOI, Crossref, DBLP, arXiv, and similar sources;
 - track provenance and confidence for enriched fields;
 - add graph expansion workflows over `cites` and `cited_by` edges;
 - support acquisition pipelines for open-access theses, dissertations, preprints, and publisher metadata pages;
 - add embeddings or pluggable semantic indexing beyond SQLite FTS.
 ## Naming
 The name is intended to be short, distinct, and memorable:
 - `cite` for citation work;
 - `geist` for the organizing intelligence around the literature.
--- a/pyproject.toml
+++ b/pyproject.toml
@ -0,0 +1,13 @@
 [build-system]
 requires = ["setuptools>=68"]
 build-backend = "setuptools.build_meta"
 [project]
 name = "citegeist"
 version = "0.1.0"
 description = "BibTeX-native tooling for bibliography augmentation, citation graphs, and search"
 requires-python = ">=3.10"
 [tool.pytest.ini_options]
 pythonpath = ["src"]
 testpaths = ["tests"]
--- a/src/citegeist/__init__.py
+++ b/src/citegeist/__init__.py
@ -0,0 +1,4 @@
 from .bibtex import BibEntry, parse_bibtex
 from .storage import BibliographyStore
 # Names re-exported as the package's public API.
 __all__ = ["BibEntry", "BibliographyStore", "parse_bibtex"]
--- a/src/citegeist/bibtex.py
+++ b/src/citegeist/bibtex.py
@ -0,0 +1,158 @@
 from __future__ import annotations
 from dataclasses import dataclass
@dataclass(slots=True)
 class BibEntry:
    """One parsed BibTeX entry: its type, its citation key, and its fields."""

    # Text between "@" and "{"; parse_bibtex stores it stripped and lowercased.
    entry_type: str
    # Text before the first top-level comma of the entry body.
    citation_key: str
    # Field name -> value; _parse_fields lowercases names and collapses
    # whitespace runs in values to single spaces.
    fields: dict[str, str]
 def parse_bibtex(text: str) -> list[BibEntry]:
    """Parse BibTeX source text into a list of entries.

    Every ``@type{...}`` block becomes one ``BibEntry``, in source order.
    ``@comment``, ``@preamble`` and ``@string`` blocks share the same syntax
    but are BibTeX commands rather than bibliography entries, so they are
    consumed and skipped instead of being returned as bogus entries.

    Raises:
        ValueError: if an ``@`` is not followed by an opening brace, or if a
            block's braces are unbalanced.
    """
    # Commands that look like entries but carry no bibliographic record.
    non_entry_types = {"comment", "preamble", "string"}
    entries: list[BibEntry] = []
    index = 0
    size = len(text)
    while index < size:
        at = text.find("@", index)
        if at == -1:
            break
        brace = text.find("{", at + 1)
        if brace == -1:
            raise ValueError("Malformed BibTeX: missing opening brace")
        entry_type = text[at + 1:brace].strip().lower()
        # Always consume the whole block so scanning resumes after it.
        body, index = _read_balanced_block(text, brace)
        if entry_type in non_entry_types:
            continue
        citation_key, fields_blob = _split_key_and_fields(body)
        entries.append(
            BibEntry(
                entry_type=entry_type,
                citation_key=citation_key,
                fields=_parse_fields(fields_blob),
            )
        )
    return entries
 def _read_balanced_block(text: str, brace_index: int) -> tuple[str, int]:
    depth = 0
    in_quotes = False
    escaped = False
    for index in range(brace_index, len(text)):
        char = text[index]
        if in_quotes:
            if escaped:
                escaped = False
            elif char == "\\":
                escaped = True
            elif char == '"':
                in_quotes = False
            continue
        if char == '"':
            in_quotes = True
        elif char == "{":
            depth += 1
        elif char == "}":
            depth -= 1
            if depth == 0:
                return text[brace_index + 1:index], index + 1
    raise ValueError("Malformed BibTeX: unbalanced braces")
 def _split_key_and_fields(body: str) -> tuple[str, str]:
    depth = 0
    in_quotes = False
    escaped = False
    for index, char in enumerate(body):
        if in_quotes:
            if escaped:
                escaped = False
            elif char == "\\":
                escaped = True
            elif char == '"':
                in_quotes = False
            continue
        if char == '"':
            in_quotes = True
        elif char == "{":
            depth += 1
        elif char == "}":
            depth -= 1
        elif char == "," and depth == 0:
            return body[:index].strip(), body[index + 1 :]
    return body.strip(), ""
 def _parse_fields(blob: str) -> dict[str, str]:
    fields: dict[str, str] = {}
    index = 0
    size = len(blob)
    while index < size:
        while index < size and blob[index] in " \t\r\n,":
            index += 1
        if index >= size:
            break
        name_start = index
        while index < size and (blob[index].isalnum() or blob[index] in "-_"):
            index += 1
        name = blob[name_start:index].strip().lower()
        while index < size and blob[index].isspace():
            index += 1
        if index >= size or blob[index] != "=":
            raise ValueError(f"Malformed BibTeX field near: {blob[name_start:]!r}")
        index += 1
        while index < size and blob[index].isspace():
            index += 1
        value, index = _parse_value(blob, index)
        fields[name] = " ".join(value.split())
        while index < size and blob[index] in " \t\r\n,":
            index += 1
    return fields
 def _parse_value(blob: str, index: int) -> tuple[str, int]:
    if index >= len(blob):
        return "", index
    if blob[index] == "{":
        value, next_index = _read_balanced_block(blob, index)
        return value.strip(), next_index
    if blob[index] == '"':
        index += 1
        chars: list[str] = []
        escaped = False
        while index < len(blob):
            char = blob[index]
            if escaped:
                chars.append(char)
                escaped = False
            elif char == "\\":
                chars.append(char)
                escaped = True
            elif char == '"':
                return "".join(chars).strip(), index + 1
            else:
                chars.append(char)
            index += 1
        raise ValueError("Malformed BibTeX: unterminated quoted string")
    end = index
    while end < len(blob) and blob[end] not in ",\r\n":
        end += 1
    return blob[index:end].strip(), end
--- a/src/citegeist/storage.py
+++ b/src/citegeist/storage.py
@ -0,0 +1,312 @@
 from __future__ import annotations
 import json
 import sqlite3
 from pathlib import Path
 from .bibtex import BibEntry, parse_bibtex
 # BibTeX fields that upsert_entry copies into the `identifiers` table,
 # using the field name as the identifier scheme.
 IDENTIFIER_FIELDS = ("doi", "isbn", "issn", "pmid", "arxiv", "dblp", "oai", "url")
 # BibTeX field name -> relation_type stored in the `relations` table.
 # "references" and "cites" both map to the same "cites" relation.
 RELATION_FIELDS = {
    "references": "cites",
    "cites": "cites",
    "cited_by": "cited_by",
    "crossref": "crossref",
 }
 # Fields with a dedicated column in `entries`; upsert_entry leaves these
 # (and the relation fields) out of extra_fields_json.
 CORE_ENTRY_FIELDS = {
    "title",
    "year",
    "journal",
    "booktitle",
    "publisher",
    "abstract",
    "keywords",
    "url",
    "doi",
    "isbn",
 }
 class BibliographyStore:
    def __init__(self, path: str | Path = ":memory:") -> None:
        self.path = str(path)
        self.connection = sqlite3.connect(self.path)
        self.connection.row_factory = sqlite3.Row
        self.connection.execute("PRAGMA foreign_keys = ON")
        self._fts5_enabled = self._detect_fts5()
        self.initialize()
    def close(self) -> None:
        """Close the underlying SQLite connection."""
        self.connection.close()
    def initialize(self) -> None:
        """Create the schema if it does not exist yet, then commit.

        Tables created: ``entries`` (one row per citation key), ``creators``
        and the ``entry_creators`` link table, ``identifiers`` and
        ``relations``. When FTS5 was detected, the ``entry_text_fts`` virtual
        table over title, abstract and fulltext is created as well. Every
        statement uses ``IF NOT EXISTS``, so calling this again on an
        existing database does not recreate anything.
        """
        # Base relational schema. Child tables reference entries(id) with
        # ON DELETE CASCADE.
        self.connection.executescript(
            """
            CREATE TABLE IF NOT EXISTS entries (
                id INTEGER PRIMARY KEY,
                citation_key TEXT NOT NULL UNIQUE,
                entry_type TEXT NOT NULL,
                title TEXT,
                year TEXT,
                journal TEXT,
                booktitle TEXT,
                publisher TEXT,
                abstract TEXT,
                keywords TEXT,
                url TEXT,
                doi TEXT,
                isbn TEXT,
                fulltext TEXT,
                raw_bibtex TEXT,
                extra_fields_json TEXT NOT NULL DEFAULT '{}',
                created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
                updated_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
            );
            CREATE TABLE IF NOT EXISTS creators (
                id INTEGER PRIMARY KEY,
                full_name TEXT NOT NULL UNIQUE,
                family_name TEXT,
                given_names TEXT
            );
            CREATE TABLE IF NOT EXISTS entry_creators (
                entry_id INTEGER NOT NULL REFERENCES entries(id) ON DELETE CASCADE,
                creator_id INTEGER NOT NULL REFERENCES creators(id) ON DELETE CASCADE,
                role TEXT NOT NULL,
                ordinal INTEGER NOT NULL,
                PRIMARY KEY (entry_id, role, ordinal)
            );
            CREATE TABLE IF NOT EXISTS identifiers (
                entry_id INTEGER NOT NULL REFERENCES entries(id) ON DELETE CASCADE,
                scheme TEXT NOT NULL,
                value TEXT NOT NULL,
                PRIMARY KEY (scheme, value)
            );
            CREATE TABLE IF NOT EXISTS relations (
                source_entry_id INTEGER NOT NULL REFERENCES entries(id) ON DELETE CASCADE,
                target_citation_key TEXT NOT NULL,
                relation_type TEXT NOT NULL,
                PRIMARY KEY (source_entry_id, target_citation_key, relation_type)
            );
            """
        )
        # The full-text index is optional; citation_key is stored but not
        # indexed so search hits can be joined back to `entries`.
        if self._fts5_enabled:
            self.connection.execute(
                """
                CREATE VIRTUAL TABLE IF NOT EXISTS entry_text_fts
                USING fts5(
                    citation_key UNINDEXED,
                    title,
                    abstract,
                    fulltext
                )
                """
            )
        self.connection.commit()
    def ingest_bibtex(self, text: str, fulltext_by_key: dict[str, str] | None = None) -> list[str]:
        """Parse *text* as BibTeX and upsert every entry in one transaction.

        Args:
            text: BibTeX source text.
            fulltext_by_key: Optional mapping from citation key to extracted
                full text to store and index alongside the entry.

        Returns:
            The citation keys of the ingested entries, in source order.

        Raises:
            ValueError: if the BibTeX text is malformed (nothing is written).
            sqlite3.Error: if a database write fails; the batch is rolled back
                so the store never keeps a half-ingested file.
        """
        fulltext_by_key = fulltext_by_key or {}
        entries = parse_bibtex(text)
        keys: list[str] = []
        # As a context manager the connection commits on success and rolls
        # back if any upsert raises, making the batch atomic.
        with self.connection:
            for entry in entries:
                self.upsert_entry(
                    entry,
                    fulltext=fulltext_by_key.get(entry.citation_key),
                    raw_bibtex=_entry_to_bibtex(entry),
                )
                keys.append(entry.citation_key)
        return keys
    def upsert_entry(self, entry: BibEntry, fulltext: str | None = None, raw_bibtex: str | None = None) -> int:
        """Insert or update one entry and rebuild its dependent rows.

        The ``entries`` row is keyed by citation key. On update every core
        column is overwritten, except that ``fulltext`` and ``raw_bibtex``
        keep their stored value when the corresponding argument is ``None``.
        Creator links, identifiers, relations and the full-text index row for
        the entry are deleted and rebuilt from *entry*.

        This method does not commit; the caller owns the transaction.

        Args:
            entry: Parsed entry to store.
            fulltext: Extracted full text, or ``None`` to keep what is stored.
            raw_bibtex: BibTeX serialisation, or ``None`` to keep what is stored.

        Returns:
            The integer primary key of the ``entries`` row.
        """
        # Fields without a dedicated column or relation meaning are kept as JSON.
        extra_fields = {
            key: value
            for key, value in entry.fields.items()
            if key not in CORE_ENTRY_FIELDS and key not in RELATION_FIELDS
        }
        row = self.connection.execute(
            """
            INSERT INTO entries (
                citation_key, entry_type, title, year, journal, booktitle, publisher,
                abstract, keywords, url, doi, isbn, fulltext, raw_bibtex, extra_fields_json
            ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            ON CONFLICT(citation_key) DO UPDATE SET
                entry_type = excluded.entry_type,
                title = excluded.title,
                year = excluded.year,
                journal = excluded.journal,
                booktitle = excluded.booktitle,
                publisher = excluded.publisher,
                abstract = excluded.abstract,
                keywords = excluded.keywords,
                url = excluded.url,
                doi = excluded.doi,
                isbn = excluded.isbn,
                fulltext = COALESCE(excluded.fulltext, entries.fulltext),
                raw_bibtex = COALESCE(excluded.raw_bibtex, entries.raw_bibtex),
                extra_fields_json = excluded.extra_fields_json,
                updated_at = CURRENT_TIMESTAMP
            RETURNING id
            """,
            (
                entry.citation_key,
                entry.entry_type,
                entry.fields.get("title"),
                entry.fields.get("year"),
                entry.fields.get("journal"),
                entry.fields.get("booktitle"),
                entry.fields.get("publisher"),
                entry.fields.get("abstract"),
                entry.fields.get("keywords"),
                entry.fields.get("url"),
                entry.fields.get("doi"),
                entry.fields.get("isbn"),
                fulltext,
                raw_bibtex,
                json.dumps(extra_fields),
            ),
        ).fetchone()
        entry_id = int(row["id"])

        # Creator links are rebuilt from scratch; creator rows are shared
        # across entries and matched by their full name.
        self.connection.execute("DELETE FROM entry_creators WHERE entry_id = ?", (entry_id,))
        for role in ("author", "editor"):
            names = _split_names(entry.fields.get(role, ""))
            for ordinal, name in enumerate(names, start=1):
                creator = _split_person_name(name)
                creator_row = self.connection.execute(
                    """
                    INSERT INTO creators (full_name, family_name, given_names)
                    VALUES (?, ?, ?)
                    ON CONFLICT(full_name) DO UPDATE SET
                        family_name = COALESCE(excluded.family_name, creators.family_name),
                        given_names = COALESCE(excluded.given_names, creators.given_names)
                    RETURNING id
                    """,
                    (creator["full_name"], creator["family_name"], creator["given_names"]),
                ).fetchone()
                self.connection.execute(
                    """
                    INSERT INTO entry_creators (entry_id, creator_id, role, ordinal)
                    VALUES (?, ?, ?, ?)
                    """,
                    (entry_id, int(creator_row["id"]), role, ordinal),
                )

        # (scheme, value) is the primary key, so OR REPLACE reassigns an
        # identifier already held by another entry to this one.
        self.connection.execute("DELETE FROM identifiers WHERE entry_id = ?", (entry_id,))
        for scheme in IDENTIFIER_FIELDS:
            value = entry.fields.get(scheme)
            if value:
                self.connection.execute(
                    "INSERT OR REPLACE INTO identifiers (entry_id, scheme, value) VALUES (?, ?, ?)",
                    (entry_id, scheme, value),
                )

        # Several fields can map to the same relation type; OR IGNORE
        # drops the resulting duplicates.
        self.connection.execute("DELETE FROM relations WHERE source_entry_id = ?", (entry_id,))
        for field_name, relation_type in RELATION_FIELDS.items():
            values = _split_relation_values(entry.fields.get(field_name, ""))
            for target_key in values:
                self.connection.execute(
                    """
                    INSERT OR IGNORE INTO relations (source_entry_id, target_citation_key, relation_type)
                    VALUES (?, ?, ?)
                    """,
                    (entry_id, target_key, relation_type),
                )

        if self._fts5_enabled:
            indexed_fulltext = fulltext
            if indexed_fulltext is None:
                # The entries row kept its previous fulltext (COALESCE above);
                # index that same text so a metadata-only re-ingest does not
                # wipe the document from full-text search.
                stored = self.connection.execute(
                    "SELECT fulltext FROM entries WHERE id = ?",
                    (entry_id,),
                ).fetchone()
                indexed_fulltext = stored["fulltext"] if stored is not None else None
            self.connection.execute("DELETE FROM entry_text_fts WHERE citation_key = ?", (entry.citation_key,))
            self.connection.execute(
                """
                INSERT INTO entry_text_fts (citation_key, title, abstract, fulltext)
                VALUES (?, ?, ?, ?)
                """,
                (
                    entry.citation_key,
                    entry.fields.get("title", ""),
                    entry.fields.get("abstract", ""),
                    indexed_fulltext or "",
                ),
            )
        return entry_id
    def search_text(self, query: str, limit: int = 10) -> list[dict[str, object]]:
        if self._fts5_enabled:
            rows = self.connection.execute(
                """
                SELECT e.citation_key, e.title, e.year, bm25(entry_text_fts) AS score
                FROM entry_text_fts
                JOIN entries e ON e.citation_key = entry_text_fts.citation_key
                WHERE entry_text_fts MATCH ?
                ORDER BY score
                LIMIT ?
                """,
                (query, limit),
            ).fetchall()
        else:
            pattern = f"%{query}%"
            rows = self.connection.execute(
                """
                SELECT citation_key, title, year, 0.0 AS score
                FROM entries
                WHERE title LIKE ? OR abstract LIKE ? OR fulltext LIKE ?
                LIMIT ?
                """,
                (pattern, pattern, pattern, limit),
            ).fetchall()
        return [dict(row) for row in rows]
    def get_relations(self, citation_key: str, relation_type: str = "cites") -> list[str]:
        rows = self.connection.execute(
            """
            SELECT r.target_citation_key
            FROM relations r
            JOIN entries e ON e.id = r.source_entry_id
            WHERE e.citation_key = ? AND r.relation_type = ?
            ORDER BY r.target_citation_key
            """,
            (citation_key, relation_type),
        ).fetchall()
        return [str(row["target_citation_key"]) for row in rows]
    def get_entry(self, citation_key: str) -> dict[str, object] | None:
        row = self.connection.execute(
            "SELECT * FROM entries WHERE citation_key = ?",
            (citation_key,),
        ).fetchone()
        return dict(row) if row else None
    def _detect_fts5(self) -> bool:
        try:
            self.connection.execute("CREATE VIRTUAL TABLE temp.fts_probe USING fts5(content)")
            self.connection.execute("DROP TABLE temp.fts_probe")
            return True
        except sqlite3.OperationalError:
            return False
 def _split_names(value: str) -> list[str]:
    if not value:
        return []
    return [part.strip() for part in value.split(" and ") if part.strip()]
 def _split_person_name(name: str) -> dict[str, str | None]:
    if "," in name:
        family_name, given_names = [part.strip() for part in name.split(",", 1)]
    else:
        parts = name.split()
        family_name = parts[-1] if parts else ""
        given_names = " ".join(parts[:-1]) if len(parts) > 1 else None
    return {
        "full_name": name.strip(),
        "family_name": family_name or None,
        "given_names": given_names or None,
    }
 def _split_relation_values(value: str) -> list[str]:
    if not value:
        return []
    normalized = value.replace("\n", ",").replace(";", ",")
    return [part.strip() for part in normalized.split(",") if part.strip()]
 def _entry_to_bibtex(entry: BibEntry) -> str:
    lines = [f"@{entry.entry_type}{{{entry.citation_key},"]
    for key, value in entry.fields.items():
        lines.append(f"  {key} = {{{value}}},")
    lines.append("}")
    return "\n".join(lines)
--- a/tests/test_storage.py
+++ b/tests/test_storage.py
@ -0,0 +1,53 @@
 from citegeist import BibliographyStore, parse_bibtex
 # Two-entry fixture: an article that references an inproceedings entry via
 # the `references` field, which the store records as a "cites" relation.
 SAMPLE_BIB = """
@article{smith2024graphs,
  author = {Smith, Jane and Doe, Alex},
  title = {Graph-first bibliography augmentation},
  year = {2024},
  doi = {10.1000/graph.2024.1},
  abstract = {We study citation graphs for literature discovery.},
  references = {miller2023search}
 }
@inproceedings{miller2023search,
  author = {Miller, Sam},
  title = {Semantic search for research corpora},
  year = {2023},
  abstract = {Dense retrieval improves recall for academic search.}
 }
 """
 def test_parse_bibtex_extracts_entries_and_fields():
    """The parser returns both sample entries, in order, with their fields."""
    parsed = parse_bibtex(SAMPLE_BIB)
    keys = [item.citation_key for item in parsed]
    assert keys == ["smith2024graphs", "miller2023search"]
    article_fields = parsed[0].fields
    assert article_fields["title"] == "Graph-first bibliography augmentation"
    assert article_fields["references"] == "miller2023search"
 def test_store_ingests_entries_relations_and_search_text():
    """Ingesting the sample stores fields and relations and makes text searchable."""
    store = BibliographyStore()
    try:
        store.ingest_bibtex(
            SAMPLE_BIB,
            fulltext_by_key={
                "smith2024graphs": "This paper links citation graphs with semantic search over abstracts."
            },
        )
        entry = store.get_entry("smith2024graphs")
        assert entry is not None
        assert entry["doi"] == "10.1000/graph.2024.1"
        assert store.get_relations("smith2024graphs") == ["miller2023search"]
        results = store.search_text("semantic")
        found = [row["citation_key"] for row in results]
        # Both entries mention "semantic", whichever search backend is active.
        assert sorted(found) == ["miller2023search", "smith2024graphs"]
        if store._fts5_enabled:
            # Relevance ranking exists only with FTS5; the LIKE fallback
            # returns rows in no guaranteed order.
            assert found == ["miller2023search", "smith2024graphs"]
    finally:
        store.close()