Initial commit

commit 4f3ac4decb
@ -0,0 +1,4 @@
__pycache__/
.pytest_cache/
*.pyc
library.sqlite3
@ -0,0 +1,111 @@
# citegeist

`citegeist` is a research-oriented bibliography workbench for building, expanding, and auditing BibTeX libraries.

The aim is not just to store citations. The aim is to help with the harder problem: finding, improving, connecting, and checking the literature around a topic while keeping BibTeX as a first-class output format.

## Repo Description

`citegeist` is a BibTeX-native research tool for citation extraction, metadata enrichment, citation-graph expansion, and semantic search over scholarly sources.

## Scope

The project is intended to support a workflow like this:

1. Start from rough references extracted from papers, notes, syllabi, or dissertations.
2. Convert them into draft BibTeX entries.
3. Enrich and correct those entries using external scholarly metadata sources.
4. Persist entries, identifiers, abstracts, and citation edges in a local database.
5. Traverse the citation graph outward to discover additional relevant works.
6. Search the local corpus semantically using abstracts and extracted full text.
7. Export verified results back into BibTeX for LaTeX use.

## Why A New Codebase

This repository starts cleanly rather than extending the older `bib/` toolkit directly.

The older toolkit is useful as prior art:

- it demonstrates identifier-driven metadata augmentation;
- it caches PDFs and extracted plaintext;
- it shows one workable model for bibliography growth.

But it is not the right long-term base:

- it is Python 2-era code;
- it is shell-script centric;
- it does not provide a normalized database for graph workflows;
- it is not structured as a reusable Python 3 library.

`citegeist` keeps the useful ideas and rebuilds the foundation around a cleaner Python 3 package boundary.

## Current Status

The initial repo includes:

- a lightweight BibTeX parser for structured ingestion;
- a SQLite-backed bibliography store;
- normalized tables for entries, creators, identifiers, and citation relations;
- full-text-search-ready indexing over title, abstract, and fulltext when SQLite FTS5 is available;
- tests covering parsing, ingestion, relation storage, and search.
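Whether the FTS5 index is actually created depends on the SQLite build Python links against. A minimal standalone probe of that capability (a sketch mirroring the store's detection approach; the function name here is illustrative, not part of the package API):

```python
import sqlite3


def fts5_available() -> bool:
    """Probe whether this SQLite build ships the FTS5 extension."""
    conn = sqlite3.connect(":memory:")
    try:
        # Creating a throwaway virtual table fails cleanly when FTS5 is absent.
        conn.execute("CREATE VIRTUAL TABLE probe USING fts5(content)")
        return True
    except sqlite3.OperationalError:
        return False
    finally:
        conn.close()


print(fts5_available())
```

When this returns `False`, the store falls back to plain `LIKE` matching rather than ranked full-text search.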
## Layout

```text
citegeist/
  src/citegeist/
    bibtex.py
    storage.py
  tests/
    test_storage.py
  pyproject.toml
```

## Quick Start

```bash
cd citegeist
PYTHONPATH=src python3 - <<'PY'
from citegeist import BibliographyStore

bib = """
@article{smith2024graphs,
  author = {Smith, Jane and Doe, Alex},
  title = {Graph-first bibliography augmentation},
  year = {2024},
  abstract = {We study citation graphs for literature discovery.},
  references = {miller2023search}
}

@inproceedings{miller2023search,
  author = {Miller, Sam},
  title = {Semantic search for research corpora},
  year = {2023},
  abstract = {Dense retrieval improves recall for academic search.}
}
"""

store = BibliographyStore("library.sqlite3")
store.ingest_bibtex(bib)
print(store.get_relations("smith2024graphs"))
print(store.search_text("semantic"))
store.close()
PY
pytest -q
```

## Planned Work

- parse references from raw prose, OCR, PDF text, and bibliography sections into draft BibTeX;
- add modern metadata resolvers for DOI, Crossref, DBLP, arXiv, and similar sources;
- track provenance and confidence for enriched fields;
- add graph expansion workflows over `cites` and `cited_by` edges;
- support acquisition pipelines for open-access theses, dissertations, preprints, and publisher metadata pages;
- add embeddings or pluggable semantic indexing beyond SQLite FTS.
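The graph expansion item above amounts to a breadth-first traversal over `cites` edges. A minimal standalone sketch of that traversal, with a toy adjacency mapping standing in for `BibliographyStore.get_relations` (the traversal logic, not the store, is the point here):

```python
from collections import deque

# Toy stand-in for store.get_relations(key, "cites").
CITES = {
    "smith2024graphs": ["miller2023search", "lee2022corpora"],
    "miller2023search": ["lee2022corpora"],
    "lee2022corpora": [],
}


def expand_citations(seed: str, max_depth: int = 2) -> list[str]:
    """Collect citation keys reachable from `seed` within `max_depth` hops."""
    seen = {seed}
    queue = deque([(seed, 0)])
    discovered: list[str] = []
    while queue:
        key, depth = queue.popleft()
        if depth == max_depth:
            continue  # do not expand beyond the requested radius
        for target in CITES.get(key, []):
            if target not in seen:
                seen.add(target)
                discovered.append(target)
                queue.append((target, depth + 1))
    return discovered


print(expand_citations("smith2024graphs"))  # nearest works first
```

In the real workflow, newly discovered keys would be resolved against external metadata sources and re-ingested, growing the local graph frontier.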
## Naming

The name is intended to be short, distinct, and memorable:

- `cite` for citation work;
- `geist` for the organizing intelligence around the literature.
@ -0,0 +1,13 @@
[build-system]
requires = ["setuptools>=68"]
build-backend = "setuptools.build_meta"

[project]
name = "citegeist"
version = "0.1.0"
description = "BibTeX-native tooling for bibliography augmentation, citation graphs, and search"
requires-python = ">=3.10"

[tool.pytest.ini_options]
pythonpath = ["src"]
testpaths = ["tests"]
@ -0,0 +1,4 @@
from .bibtex import BibEntry, parse_bibtex
from .storage import BibliographyStore

__all__ = ["BibEntry", "BibliographyStore", "parse_bibtex"]
@ -0,0 +1,158 @@
from __future__ import annotations

from dataclasses import dataclass


@dataclass(slots=True)
class BibEntry:
    entry_type: str
    citation_key: str
    fields: dict[str, str]


def parse_bibtex(text: str) -> list[BibEntry]:
    entries: list[BibEntry] = []
    index = 0
    size = len(text)

    while index < size:
        at = text.find("@", index)
        if at == -1:
            break
        entry_type_start = at + 1
        brace = text.find("{", entry_type_start)
        if brace == -1:
            raise ValueError("Malformed BibTeX: missing opening brace")
        entry_type = text[entry_type_start:brace].strip().lower()
        body, index = _read_balanced_block(text, brace)
        citation_key, fields_blob = _split_key_and_fields(body)
        entries.append(
            BibEntry(
                entry_type=entry_type,
                citation_key=citation_key,
                fields=_parse_fields(fields_blob),
            )
        )

    return entries


def _read_balanced_block(text: str, brace_index: int) -> tuple[str, int]:
    depth = 0
    in_quotes = False
    escaped = False

    for index in range(brace_index, len(text)):
        char = text[index]
        if in_quotes:
            if escaped:
                escaped = False
            elif char == "\\":
                escaped = True
            elif char == '"':
                in_quotes = False
            continue

        if char == '"':
            in_quotes = True
        elif char == "{":
            depth += 1
        elif char == "}":
            depth -= 1
            if depth == 0:
                return text[brace_index + 1:index], index + 1

    raise ValueError("Malformed BibTeX: unbalanced braces")


def _split_key_and_fields(body: str) -> tuple[str, str]:
    depth = 0
    in_quotes = False
    escaped = False

    for index, char in enumerate(body):
        if in_quotes:
            if escaped:
                escaped = False
            elif char == "\\":
                escaped = True
            elif char == '"':
                in_quotes = False
            continue

        if char == '"':
            in_quotes = True
        elif char == "{":
            depth += 1
        elif char == "}":
            depth -= 1
        elif char == "," and depth == 0:
            return body[:index].strip(), body[index + 1 :]

    return body.strip(), ""


def _parse_fields(blob: str) -> dict[str, str]:
    fields: dict[str, str] = {}
    index = 0
    size = len(blob)

    while index < size:
        while index < size and blob[index] in " \t\r\n,":
            index += 1
        if index >= size:
            break

        name_start = index
        while index < size and (blob[index].isalnum() or blob[index] in "-_"):
            index += 1
        name = blob[name_start:index].strip().lower()

        while index < size and blob[index].isspace():
            index += 1
        if index >= size or blob[index] != "=":
            raise ValueError(f"Malformed BibTeX field near: {blob[name_start:]!r}")
        index += 1

        while index < size and blob[index].isspace():
            index += 1
        value, index = _parse_value(blob, index)
        fields[name] = " ".join(value.split())

        while index < size and blob[index] in " \t\r\n,":
            index += 1

    return fields


def _parse_value(blob: str, index: int) -> tuple[str, int]:
    if index >= len(blob):
        return "", index

    if blob[index] == "{":
        value, next_index = _read_balanced_block(blob, index)
        return value.strip(), next_index

    if blob[index] == '"':
        index += 1
        chars: list[str] = []
        escaped = False
        while index < len(blob):
            char = blob[index]
            if escaped:
                chars.append(char)
                escaped = False
            elif char == "\\":
                chars.append(char)
                escaped = True
            elif char == '"':
                return "".join(chars).strip(), index + 1
            else:
                chars.append(char)
            index += 1
        raise ValueError("Malformed BibTeX: unterminated quoted string")

    end = index
    while end < len(blob) and blob[end] not in ",\r\n":
        end += 1
    return blob[index:end].strip(), end
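The core trick in `_read_balanced_block` above is depth counting over nested braces. A stripped-down standalone version of the same idea (quote and escape handling omitted for brevity; this is an illustration of the technique, not the module's implementation):

```python
def read_balanced(text: str, open_index: int) -> tuple[str, int]:
    """Return the contents of the brace block opening at open_index,
    plus the index just past its closing brace."""
    depth = 0
    for index in range(open_index, len(text)):
        if text[index] == "{":
            depth += 1
        elif text[index] == "}":
            depth -= 1
            if depth == 0:  # the brace that balances the opening one
                return text[open_index + 1:index], index + 1
    raise ValueError("unbalanced braces")


body, end = read_balanced("{title = {Nested {Braces} Work}}", 0)
print(body)  # title = {Nested {Braces} Work}
```

Depth counting is what lets BibTeX values keep literal braces (common for protecting capitalization) without confusing the parser.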
@ -0,0 +1,312 @@
from __future__ import annotations

import json
import sqlite3
from pathlib import Path

from .bibtex import BibEntry, parse_bibtex

IDENTIFIER_FIELDS = ("doi", "isbn", "issn", "pmid", "arxiv", "dblp", "oai", "url")
RELATION_FIELDS = {
    "references": "cites",
    "cites": "cites",
    "cited_by": "cited_by",
    "crossref": "crossref",
}
CORE_ENTRY_FIELDS = {
    "title",
    "year",
    "journal",
    "booktitle",
    "publisher",
    "abstract",
    "keywords",
    "url",
    "doi",
    "isbn",
}


class BibliographyStore:
    def __init__(self, path: str | Path = ":memory:") -> None:
        self.path = str(path)
        self.connection = sqlite3.connect(self.path)
        self.connection.row_factory = sqlite3.Row
        self.connection.execute("PRAGMA foreign_keys = ON")
        self._fts5_enabled = self._detect_fts5()
        self.initialize()

    def close(self) -> None:
        self.connection.close()

    def initialize(self) -> None:
        self.connection.executescript(
            """
            CREATE TABLE IF NOT EXISTS entries (
                id INTEGER PRIMARY KEY,
                citation_key TEXT NOT NULL UNIQUE,
                entry_type TEXT NOT NULL,
                title TEXT,
                year TEXT,
                journal TEXT,
                booktitle TEXT,
                publisher TEXT,
                abstract TEXT,
                keywords TEXT,
                url TEXT,
                doi TEXT,
                isbn TEXT,
                fulltext TEXT,
                raw_bibtex TEXT,
                extra_fields_json TEXT NOT NULL DEFAULT '{}',
                created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
                updated_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
            );

            CREATE TABLE IF NOT EXISTS creators (
                id INTEGER PRIMARY KEY,
                full_name TEXT NOT NULL UNIQUE,
                family_name TEXT,
                given_names TEXT
            );

            CREATE TABLE IF NOT EXISTS entry_creators (
                entry_id INTEGER NOT NULL REFERENCES entries(id) ON DELETE CASCADE,
                creator_id INTEGER NOT NULL REFERENCES creators(id) ON DELETE CASCADE,
                role TEXT NOT NULL,
                ordinal INTEGER NOT NULL,
                PRIMARY KEY (entry_id, role, ordinal)
            );

            CREATE TABLE IF NOT EXISTS identifiers (
                entry_id INTEGER NOT NULL REFERENCES entries(id) ON DELETE CASCADE,
                scheme TEXT NOT NULL,
                value TEXT NOT NULL,
                PRIMARY KEY (scheme, value)
            );

            CREATE TABLE IF NOT EXISTS relations (
                source_entry_id INTEGER NOT NULL REFERENCES entries(id) ON DELETE CASCADE,
                target_citation_key TEXT NOT NULL,
                relation_type TEXT NOT NULL,
                PRIMARY KEY (source_entry_id, target_citation_key, relation_type)
            );
            """
        )

        if self._fts5_enabled:
            self.connection.execute(
                """
                CREATE VIRTUAL TABLE IF NOT EXISTS entry_text_fts
                USING fts5(
                    citation_key UNINDEXED,
                    title,
                    abstract,
                    fulltext
                )
                """
            )
        self.connection.commit()

    def ingest_bibtex(self, text: str, fulltext_by_key: dict[str, str] | None = None) -> list[str]:
        fulltext_by_key = fulltext_by_key or {}
        entries = parse_bibtex(text)
        keys: list[str] = []
        for entry in entries:
            fulltext = fulltext_by_key.get(entry.citation_key)
            self.upsert_entry(entry, fulltext=fulltext, raw_bibtex=_entry_to_bibtex(entry))
            keys.append(entry.citation_key)
        self.connection.commit()
        return keys

    def upsert_entry(self, entry: BibEntry, fulltext: str | None = None, raw_bibtex: str | None = None) -> int:
        row = self.connection.execute(
            """
            INSERT INTO entries (
                citation_key, entry_type, title, year, journal, booktitle, publisher,
                abstract, keywords, url, doi, isbn, fulltext, raw_bibtex, extra_fields_json
            ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            ON CONFLICT(citation_key) DO UPDATE SET
                entry_type = excluded.entry_type,
                title = excluded.title,
                year = excluded.year,
                journal = excluded.journal,
                booktitle = excluded.booktitle,
                publisher = excluded.publisher,
                abstract = excluded.abstract,
                keywords = excluded.keywords,
                url = excluded.url,
                doi = excluded.doi,
                isbn = excluded.isbn,
                fulltext = COALESCE(excluded.fulltext, entries.fulltext),
                raw_bibtex = COALESCE(excluded.raw_bibtex, entries.raw_bibtex),
                extra_fields_json = excluded.extra_fields_json,
                updated_at = CURRENT_TIMESTAMP
            RETURNING id
            """,
            (
                entry.citation_key,
                entry.entry_type,
                entry.fields.get("title"),
                entry.fields.get("year"),
                entry.fields.get("journal"),
                entry.fields.get("booktitle"),
                entry.fields.get("publisher"),
                entry.fields.get("abstract"),
                entry.fields.get("keywords"),
                entry.fields.get("url"),
                entry.fields.get("doi"),
                entry.fields.get("isbn"),
                fulltext,
                raw_bibtex,
                json.dumps({k: v for k, v in entry.fields.items() if k not in CORE_ENTRY_FIELDS and k not in RELATION_FIELDS}),
            ),
        ).fetchone()
        entry_id = int(row["id"])

        self.connection.execute("DELETE FROM entry_creators WHERE entry_id = ?", (entry_id,))
        for role in ("author", "editor"):
            names = _split_names(entry.fields.get(role, ""))
            for ordinal, name in enumerate(names, start=1):
                creator = _split_person_name(name)
                creator_row = self.connection.execute(
                    """
                    INSERT INTO creators (full_name, family_name, given_names)
                    VALUES (?, ?, ?)
                    ON CONFLICT(full_name) DO UPDATE SET
                        family_name = COALESCE(excluded.family_name, creators.family_name),
                        given_names = COALESCE(excluded.given_names, creators.given_names)
                    RETURNING id
                    """,
                    (creator["full_name"], creator["family_name"], creator["given_names"]),
                ).fetchone()
                self.connection.execute(
                    """
                    INSERT INTO entry_creators (entry_id, creator_id, role, ordinal)
                    VALUES (?, ?, ?, ?)
                    """,
                    (entry_id, int(creator_row["id"]), role, ordinal),
                )

        self.connection.execute("DELETE FROM identifiers WHERE entry_id = ?", (entry_id,))
        for scheme in IDENTIFIER_FIELDS:
            value = entry.fields.get(scheme)
            if value:
                self.connection.execute(
                    "INSERT OR REPLACE INTO identifiers (entry_id, scheme, value) VALUES (?, ?, ?)",
                    (entry_id, scheme, value),
                )

        self.connection.execute("DELETE FROM relations WHERE source_entry_id = ?", (entry_id,))
        for field_name, relation_type in RELATION_FIELDS.items():
            values = _split_relation_values(entry.fields.get(field_name, ""))
            for target_key in values:
                self.connection.execute(
                    """
                    INSERT OR IGNORE INTO relations (source_entry_id, target_citation_key, relation_type)
                    VALUES (?, ?, ?)
                    """,
                    (entry_id, target_key, relation_type),
                )

        if self._fts5_enabled:
            self.connection.execute("DELETE FROM entry_text_fts WHERE citation_key = ?", (entry.citation_key,))
            self.connection.execute(
                """
                INSERT INTO entry_text_fts (citation_key, title, abstract, fulltext)
                VALUES (?, ?, ?, ?)
                """,
                (entry.citation_key, entry.fields.get("title", ""), entry.fields.get("abstract", ""), fulltext or ""),
            )

        return entry_id

    def search_text(self, query: str, limit: int = 10) -> list[dict[str, object]]:
        if self._fts5_enabled:
            rows = self.connection.execute(
                """
                SELECT e.citation_key, e.title, e.year, bm25(entry_text_fts) AS score
                FROM entry_text_fts
                JOIN entries e ON e.citation_key = entry_text_fts.citation_key
                WHERE entry_text_fts MATCH ?
                ORDER BY score
                LIMIT ?
                """,
                (query, limit),
            ).fetchall()
        else:
            pattern = f"%{query}%"
            rows = self.connection.execute(
                """
                SELECT citation_key, title, year, 0.0 AS score
                FROM entries
                WHERE title LIKE ? OR abstract LIKE ? OR fulltext LIKE ?
                LIMIT ?
                """,
                (pattern, pattern, pattern, limit),
            ).fetchall()

        return [dict(row) for row in rows]

    def get_relations(self, citation_key: str, relation_type: str = "cites") -> list[str]:
        rows = self.connection.execute(
            """
            SELECT r.target_citation_key
            FROM relations r
            JOIN entries e ON e.id = r.source_entry_id
            WHERE e.citation_key = ? AND r.relation_type = ?
            ORDER BY r.target_citation_key
            """,
            (citation_key, relation_type),
        ).fetchall()
        return [str(row["target_citation_key"]) for row in rows]

    def get_entry(self, citation_key: str) -> dict[str, object] | None:
        row = self.connection.execute(
            "SELECT * FROM entries WHERE citation_key = ?",
            (citation_key,),
        ).fetchone()
        return dict(row) if row else None

    def _detect_fts5(self) -> bool:
        try:
            self.connection.execute("CREATE VIRTUAL TABLE temp.fts_probe USING fts5(content)")
            self.connection.execute("DROP TABLE temp.fts_probe")
            return True
        except sqlite3.OperationalError:
            return False


def _split_names(value: str) -> list[str]:
    if not value:
        return []
    return [part.strip() for part in value.split(" and ") if part.strip()]


def _split_person_name(name: str) -> dict[str, str | None]:
    if "," in name:
        family_name, given_names = [part.strip() for part in name.split(",", 1)]
    else:
        parts = name.split()
        family_name = parts[-1] if parts else ""
        given_names = " ".join(parts[:-1]) if len(parts) > 1 else None
    return {
        "full_name": name.strip(),
        "family_name": family_name or None,
        "given_names": given_names or None,
    }


def _split_relation_values(value: str) -> list[str]:
    if not value:
        return []
    normalized = value.replace("\n", ",").replace(";", ",")
    return [part.strip() for part in normalized.split(",") if part.strip()]


def _entry_to_bibtex(entry: BibEntry) -> str:
    lines = [f"@{entry.entry_type}{{{entry.citation_key},"]
    for key, value in entry.fields.items():
        lines.append(f"  {key} = {{{value}}},")
    lines.append("}")
    return "\n".join(lines)
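The `upsert_entry` method above leans on SQLite's `ON CONFLICT ... DO UPDATE` clause so that re-ingesting an existing citation key updates the row in place instead of duplicating it. A self-contained illustration of that pattern on a toy table (stdlib `sqlite3` only; the table and columns here are illustrative, not the store's real schema):

```python
import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE entries (citation_key TEXT PRIMARY KEY, title TEXT)")


def upsert(key: str, title: str) -> None:
    # A second insert with the same key updates the row instead of failing.
    conn.execute(
        """
        INSERT INTO entries (citation_key, title) VALUES (?, ?)
        ON CONFLICT(citation_key) DO UPDATE SET title = excluded.title
        """,
        (key, title),
    )


upsert("smith2024graphs", "Draft title")
upsert("smith2024graphs", "Corrected title")

rows = conn.execute("SELECT citation_key, title FROM entries").fetchall()
print(rows)  # [('smith2024graphs', 'Corrected title')]
conn.close()
```

This is what makes ingestion idempotent: metadata enrichment can re-run over the same BibTeX repeatedly, with `COALESCE` in the real query preserving previously stored `fulltext` and `raw_bibtex` when the new values are missing.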
@ -0,0 +1,53 @@
from citegeist import BibliographyStore, parse_bibtex


SAMPLE_BIB = """
@article{smith2024graphs,
  author = {Smith, Jane and Doe, Alex},
  title = {Graph-first bibliography augmentation},
  year = {2024},
  doi = {10.1000/graph.2024.1},
  abstract = {We study citation graphs for literature discovery.},
  references = {miller2023search}
}

@inproceedings{miller2023search,
  author = {Miller, Sam},
  title = {Semantic search for research corpora},
  year = {2023},
  abstract = {Dense retrieval improves recall for academic search.}
}
"""


def test_parse_bibtex_extracts_entries_and_fields():
    entries = parse_bibtex(SAMPLE_BIB)

    assert [entry.citation_key for entry in entries] == ["smith2024graphs", "miller2023search"]
    assert entries[0].fields["title"] == "Graph-first bibliography augmentation"
    assert entries[0].fields["references"] == "miller2023search"


def test_store_ingests_entries_relations_and_search_text():
    store = BibliographyStore()
    try:
        store.ingest_bibtex(
            SAMPLE_BIB,
            fulltext_by_key={
                "smith2024graphs": "This paper links citation graphs with semantic search over abstracts."
            },
        )

        entry = store.get_entry("smith2024graphs")
        assert entry is not None
        assert entry["doi"] == "10.1000/graph.2024.1"

        assert store.get_relations("smith2024graphs") == ["miller2023search"]

        results = store.search_text("semantic")
        assert [row["citation_key"] for row in results][:2] == [
            "miller2023search",
            "smith2024graphs",
        ]
    finally:
        store.close()