From e819f176077c358e9f413e418bede3016bdfc190 Mon Sep 17 00:00:00 2001 From: welsberr Date: Thu, 23 Apr 2026 07:06:50 -0400 Subject: [PATCH] Initial commit --- docs/README.md | 20 + docs/architecture.md | 93 ++++ .../groundrecall-assistant-architecture.md | 174 ++++++ .../legacy/groundrecall-ingestion-refactor.md | 105 ++++ docs/legacy/groundrecall-llmwiki-import.md | 496 ++++++++++++++++++ docs/legacy/groundrecall-migration-plan.md | 281 ++++++++++ docs/legacy/groundrecall-repo-bootstrap.md | 286 ++++++++++ docs/llmwiki-import.md | 85 +++ docs/quickstart.md | 97 ++++ docs/sync-roadmap.md | 73 +++ pyproject.toml | 31 ++ src/groundrecall/__init__.py | 34 ++ src/groundrecall/__main__.py | 5 + src/groundrecall/artifact_schemas.py | 81 +++ src/groundrecall/assistant_export.py | 59 +++ src/groundrecall/assistants/__init__.py | 2 + src/groundrecall/assistants/base.py | 3 + src/groundrecall/assistants/claude_code.py | 3 + src/groundrecall/assistants/codex.py | 3 + src/groundrecall/citation_support.py | 239 +++++++++ src/groundrecall/cli.py | 40 ++ src/groundrecall/export.py | 136 +++++ .../groundrecall_assistant_export.py | 12 + .../groundrecall_assistants/__init__.py | 9 + .../groundrecall_assistants/base.py | 43 ++ .../groundrecall_assistants/claude_code.py | 69 +++ .../groundrecall_assistants/codex.py | 78 +++ src/groundrecall/groundrecall_discovery.py | 54 ++ src/groundrecall/groundrecall_export.py | 24 + src/groundrecall/groundrecall_import.py | 9 + src/groundrecall/groundrecall_lint.py | 9 + src/groundrecall/groundrecall_models.py | 9 + src/groundrecall/groundrecall_normalizer.py | 136 +++++ src/groundrecall/groundrecall_promotion.py | 9 + src/groundrecall/groundrecall_query.py | 26 + .../groundrecall_review_bridge.py | 138 +++++ src/groundrecall/groundrecall_review_queue.py | 114 ++++ src/groundrecall/groundrecall_segmenter.py | 180 +++++++ .../groundrecall_source_adapters/__init__.py | 15 + .../groundrecall_source_adapters/base.py | 76 +++ .../didactopus_pack.py | 234 +++++++++ .../doclift_bundle.py | 150 ++++++ .../groundrecall_source_adapters/llmwiki.py | 36 ++ .../markdown_notes.py | 41 ++ .../groundrecall_source_adapters/polypaper.py | 106 ++++ .../transcript.py | 38 ++ src/groundrecall/groundrecall_store.py | 9 + src/groundrecall/ingest.py | 231 ++++++++ src/groundrecall/inspect.py | 47 ++ src/groundrecall/lint.py | 196 +++++++ src/groundrecall/models.py | 137 +++++ src/groundrecall/promotion.py | 250 +++++++++ src/groundrecall/query.py | 188 +++++++ src/groundrecall/review_app/app.js | 366 +++++++++++++ src/groundrecall/review_app/index.html | 13 + src/groundrecall/review_app/styles.css | 248 +++++++++ src/groundrecall/review_export.py | 439 ++++++++++++++++ src/groundrecall/review_schema.py | 80 +++ src/groundrecall/review_server.py | 246 +++++++++ src/groundrecall/review_workspace.py | 126 +++++ src/groundrecall/source_adapters/__init__.py | 3 + src/groundrecall/source_adapters/base.py | 3 + .../source_adapters/didactopus_pack.py | 3 + src/groundrecall/source_adapters/llmwiki.py | 3 + .../source_adapters/markdown_notes.py | 3 + src/groundrecall/source_adapters/polypaper.py | 3 + .../source_adapters/transcript.py | 3 + src/groundrecall/store.py | 203 +++++++ tests/test_console_script.py | 17 + tests/test_groundrecall_assistants.py | 134 +++++ tests/test_groundrecall_export.py | 136 +++++ tests/test_groundrecall_import.py | 161 ++++++ tests/test_groundrecall_namespace.py | 70 +++ tests/test_groundrecall_promotion.py | 96 ++++ tests/test_groundrecall_query.py | 190 +++++++ 
tests/test_groundrecall_review_server.py | 60 +++ tests/test_groundrecall_review_workspace.py | 86 +++ tests/test_groundrecall_source_adapters.py | 235 +++++++++ tests/test_groundrecall_store.py | 148 ++++++ 79 files changed, 8094 insertions(+) create mode 100644 docs/README.md create mode 100644 docs/architecture.md create mode 100644 docs/legacy/groundrecall-assistant-architecture.md create mode 100644 docs/legacy/groundrecall-ingestion-refactor.md create mode 100644 docs/legacy/groundrecall-llmwiki-import.md create mode 100644 docs/legacy/groundrecall-migration-plan.md create mode 100644 docs/legacy/groundrecall-repo-bootstrap.md create mode 100644 docs/llmwiki-import.md create mode 100644 docs/quickstart.md create mode 100644 docs/sync-roadmap.md create mode 100644 pyproject.toml create mode 100644 src/groundrecall/__init__.py create mode 100644 src/groundrecall/__main__.py create mode 100644 src/groundrecall/artifact_schemas.py create mode 100644 src/groundrecall/assistant_export.py create mode 100644 src/groundrecall/assistants/__init__.py create mode 100644 src/groundrecall/assistants/base.py create mode 100644 src/groundrecall/assistants/claude_code.py create mode 100644 src/groundrecall/assistants/codex.py create mode 100644 src/groundrecall/citation_support.py create mode 100644 src/groundrecall/cli.py create mode 100644 src/groundrecall/export.py create mode 100644 src/groundrecall/groundrecall_assistant_export.py create mode 100644 src/groundrecall/groundrecall_assistants/__init__.py create mode 100644 src/groundrecall/groundrecall_assistants/base.py create mode 100644 src/groundrecall/groundrecall_assistants/claude_code.py create mode 100644 src/groundrecall/groundrecall_assistants/codex.py create mode 100644 src/groundrecall/groundrecall_discovery.py create mode 100644 src/groundrecall/groundrecall_export.py create mode 100644 src/groundrecall/groundrecall_import.py create mode 100644 src/groundrecall/groundrecall_lint.py create mode 100644 src/groundrecall/groundrecall_models.py create mode 100644 src/groundrecall/groundrecall_normalizer.py create mode 100644 src/groundrecall/groundrecall_promotion.py create mode 100644 src/groundrecall/groundrecall_query.py create mode 100644 src/groundrecall/groundrecall_review_bridge.py create mode 100644 src/groundrecall/groundrecall_review_queue.py create mode 100644 src/groundrecall/groundrecall_segmenter.py create mode 100644 src/groundrecall/groundrecall_source_adapters/__init__.py create mode 100644 src/groundrecall/groundrecall_source_adapters/base.py create mode 100644 src/groundrecall/groundrecall_source_adapters/didactopus_pack.py create mode 100755 src/groundrecall/groundrecall_source_adapters/doclift_bundle.py create mode 100644 src/groundrecall/groundrecall_source_adapters/llmwiki.py create mode 100644 src/groundrecall/groundrecall_source_adapters/markdown_notes.py create mode 100644 src/groundrecall/groundrecall_source_adapters/polypaper.py create mode 100644 src/groundrecall/groundrecall_source_adapters/transcript.py create mode 100644 src/groundrecall/groundrecall_store.py create mode 100644 src/groundrecall/ingest.py create mode 100644 src/groundrecall/inspect.py create mode 100644 src/groundrecall/lint.py create mode 100644 src/groundrecall/models.py create mode 100644 src/groundrecall/promotion.py create mode 100644 src/groundrecall/query.py create mode 100644 src/groundrecall/review_app/app.js create mode 100644 src/groundrecall/review_app/index.html create mode 100644 src/groundrecall/review_app/styles.css create mode 
100644 src/groundrecall/review_export.py create mode 100644 src/groundrecall/review_schema.py create mode 100644 src/groundrecall/review_server.py create mode 100644 src/groundrecall/review_workspace.py create mode 100644 src/groundrecall/source_adapters/__init__.py create mode 100644 src/groundrecall/source_adapters/base.py create mode 100644 src/groundrecall/source_adapters/didactopus_pack.py create mode 100644 src/groundrecall/source_adapters/llmwiki.py create mode 100644 src/groundrecall/source_adapters/markdown_notes.py create mode 100644 src/groundrecall/source_adapters/polypaper.py create mode 100644 src/groundrecall/source_adapters/transcript.py create mode 100644 src/groundrecall/store.py create mode 100644 tests/test_console_script.py create mode 100644 tests/test_groundrecall_assistants.py create mode 100644 tests/test_groundrecall_export.py create mode 100644 tests/test_groundrecall_import.py create mode 100644 tests/test_groundrecall_namespace.py create mode 100644 tests/test_groundrecall_promotion.py create mode 100644 tests/test_groundrecall_query.py create mode 100644 tests/test_groundrecall_review_server.py create mode 100644 tests/test_groundrecall_review_workspace.py create mode 100644 tests/test_groundrecall_source_adapters.py create mode 100644 tests/test_groundrecall_store.py diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 0000000..e6b5d1e --- /dev/null +++ b/docs/README.md @@ -0,0 +1,20 @@ +# Docs + +The top-level documentation in this repository is intended to describe `GroundRecall` as a standalone project. + +Primary docs: + +- [quickstart.md](quickstart.md) +- [architecture.md](architecture.md) +- [llmwiki-import.md](llmwiki-import.md) +- [sync-roadmap.md](sync-roadmap.md) + +Legacy extraction notes: + +- [legacy/groundrecall-assistant-architecture.md](legacy/groundrecall-assistant-architecture.md) +- [legacy/groundrecall-ingestion-refactor.md](legacy/groundrecall-ingestion-refactor.md) +- [legacy/groundrecall-llmwiki-import.md](legacy/groundrecall-llmwiki-import.md) +- [legacy/groundrecall-migration-plan.md](legacy/groundrecall-migration-plan.md) +- [legacy/groundrecall-repo-bootstrap.md](legacy/groundrecall-repo-bootstrap.md) + +Those legacy documents were carried over from the earlier `Didactopus`-embedded phase. They remain useful as design history, but they are not the preferred starting point for current standalone `GroundRecall` work. diff --git a/docs/architecture.md b/docs/architecture.md new file mode 100644 index 0000000..72e24ae --- /dev/null +++ b/docs/architecture.md @@ -0,0 +1,93 @@ +# Architecture + +`GroundRecall` is the grounded knowledge substrate in a larger stack: + +- `GroundRecall`: canonical knowledge ingestion, promotion, query, export, and future sync +- `Didactopus`: learner-facing workflows and educational tooling +- `GenieHive`: model and routing layer where runtime assistant/service resolution is needed + +## Core Design + +The system is built around one canonical flow: + +1. ingest weakly structured sources +2. normalize them into stable knowledge objects +3. lint and queue them for review +4. promote reviewed objects into a canonical store +5. query and export promoted state + +## Core Objects + +The canonical store is built from these object families: + +- `Source` +- `Fragment` +- `Artifact` +- `Observation` +- `Claim` +- `Concept` +- `Relation` +- `ReviewCandidate` +- `PromotionRecord` +- `GroundRecallSnapshot` + +These objects are assistant-neutral. 
Assistant-specific formatting belongs at the adapter layer. + +## Package Surface + +The main standalone package surface is: + +- `groundrecall.ingest` +- `groundrecall.lint` +- `groundrecall.models` +- `groundrecall.store` +- `groundrecall.promotion` +- `groundrecall.query` +- `groundrecall.export` +- `groundrecall.assistant_export` +- `groundrecall.inspect` +- `groundrecall.source_adapters.*` +- `groundrecall.assistants.*` + +There are also compatibility-style helper modules prefixed with `groundrecall_` inside the package. Those exist because the standalone repo was extracted from an earlier monorepo layout. + +## Source Adapters + +Adapters handle source-shape-specific discovery and mapping while the downstream pipeline stays generic. + +Current adapter families include: + +- `llmwiki` +- `markdown_notes` +- `transcript` +- `didactopus_pack` + +## Assistant Boundary + +Assistant integration is intentionally outside the core store and query semantics. + +The rule is: + +- core `GroundRecall` owns truth, provenance, lifecycle, and retrieval semantics +- assistant adapters own presentation, bundle shaping, and tool-specific exports + +Current adapters include: + +- `codex` +- `claude_code` + +## Alpha Boundary + +The current alpha is strong enough for: + +- local import and promotion +- canonical query and export +- assistant-neutral bundles +- assistant-targeted bundle generation + +It is not yet complete for: + +- multi-node sync and merge +- re-import/update semantics +- richer review adjudication +- large-scale distributed corpus integration diff --git a/docs/legacy/groundrecall-assistant-architecture.md b/docs/legacy/groundrecall-assistant-architecture.md new file mode 100644 index 0000000..d06e673 --- /dev/null +++ b/docs/legacy/groundrecall-assistant-architecture.md @@ -0,0 +1,174 @@ +# GroundRecall Assistant Integration Architecture + +This document defines how GroundRecall should support Codex, Claude Code, and +future assistant environments without treating any single assistant as the +authoritative integration target. + +## Design rule + +GroundRecall core must be assistant-agnostic. + +Assistant-specific formats are derived views over promoted GroundRecall objects, +not the canonical representation of knowledge. 
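A minimal sketch of this rule in code, assuming dict-shaped canonical claims; `render_for_assistant` is a hypothetical adapter-side helper, not part of the core model:

```python
# Hypothetical adapter-side helper: derives a disposable assistant-facing
# view from a canonical claim without writing assistant fields back into it.
def render_for_assistant(claim: dict) -> str:
    confidence = claim.get("confidence_hint", "unreviewed")
    return f"- {claim['claim_text']} (confidence: {confidence})"


canonical_claim = {
    "claim_id": "clm_001",
    "claim_text": "Channel capacity is the maximum reliable communication rate.",
    "confidence_hint": 0.74,
}
print(render_for_assistant(canonical_claim))
```

The canonical claim never learns how any assistant formats it, so derived views can be regenerated or discarded freely.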
+ +## Why this boundary matters + +If assistant-specific prompt packaging leaks into the core model too early, +GroundRecall becomes: + +- harder to evolve +- harder to validate +- harder to sync across machines +- harder to support across multiple assistant environments + +The stable boundary should instead be: + +- canonical grounded knowledge objects in core +- assistant adapters at the edge + +## Core vs adapter split + +### Core GroundRecall responsibilities + +These should remain assistant-neutral: + +- schemas for `Source`, `Fragment`, `Artifact`, `Observation`, `Claim`, + `Concept`, `Relation`, `ReviewCandidate`, and `PromotionRecord` +- provenance and confidence modeling +- contradiction and supersession handling +- linting and review queue generation +- review and promotion workflows +- persistent storage for promoted objects +- query and retrieval semantics +- sync and multi-machine consolidation +- canonical export formats + +### Assistant adapter responsibilities + +These should be adapter-specific: + +- prompt/context packaging +- assistant-specific bundle layout +- memory-file rendering +- skill-file rendering +- assistant capability declarations +- token-budget shaping and truncation policy +- tool-specific metadata + +## Canonical export contract + +GroundRecall should export assistant-neutral artifacts first. + +Recommended canonical exports: + +- `groundrecall_snapshot.json` +- `claims.jsonl` +- `concepts.jsonl` +- `relations.jsonl` +- `provenance_manifest.json` +- `query_bundle.json` + +Assistant adapters then derive secondary outputs from those canonical exports. + +## Assistant adapter interface + +GroundRecall should expose a small adapter protocol. + +Example shape: + +```python +class AssistantAdapter(Protocol): + name: str + + def export_bundle(self, snapshot: dict, out_dir: Path) -> list[Path]: + ... + + def build_context(self, query_result: dict) -> dict: + ... + + def supported_capabilities(self) -> dict[str, bool]: + ... +``` + +This is a strategy/plugin boundary. A small registry or factory is acceptable, +but the important architectural decision is the separation of concerns, not the +factory itself. + +## Recommended package layout + +Recommended modules: + +- `didactopus.groundrecall.models` +- `didactopus.groundrecall.store` +- `didactopus.groundrecall.promotion` +- `didactopus.groundrecall.query` +- `didactopus.groundrecall.export` +- `didactopus.groundrecall.assistants.base` +- `didactopus.groundrecall.assistants.codex` +- `didactopus.groundrecall.assistants.claude_code` + +## Export layering + +Recommended filesystem layout: + +- `exports/canonical/` +- `exports/assistants/codex/` +- `exports/assistants/claude-code/` + +Canonical exports remain the durable interchange format. + +Assistant exports remain reproducible derived artifacts. + +## Query layering + +The query layer should return assistant-neutral structures such as: + +- relevant claims +- supporting fragments +- provenance +- contradictions +- supersessions +- confidence and recency +- suggested next actions + +Adapters may then convert this payload into: + +- Codex skill/context bundles +- Claude Code project memory/context bundles +- future assistant context packages + +## Stability policy + +GroundRecall should adopt these rules early: + +1. No assistant-specific fields in canonical `Claim` or `Concept` objects. +2. No assistant-specific persistence formats as authoritative storage. +3. No review or promotion decisions based on assistant-specific packaging. +4. 
Assistant adapters may be added or removed without changing canonical objects. + +## Migration implication + +Current and future GroundRecall work should replace language like: + +- "Codex-facing export" +- "Codex skill bundle" + +with: + +- "assistant adapter bundle" +- "assistant-facing export" +- "assistant-specific derived bundle" + +Codex can still be one adapter and may remain the first implemented adapter, but +it should not define the system boundary. + +## Immediate implementation impact + +The next GroundRecall milestones should be interpreted as: + +1. build assistant-neutral canonical models and storage +2. build review and promotion over canonical objects +3. build canonical query and export layers +4. add assistant adapters as thin renderers over those canonical outputs + +This is the lowest-risk path for long-term stability. diff --git a/docs/legacy/groundrecall-ingestion-refactor.md b/docs/legacy/groundrecall-ingestion-refactor.md new file mode 100644 index 0000000..fc58ae2 --- /dev/null +++ b/docs/legacy/groundrecall-ingestion-refactor.md @@ -0,0 +1,105 @@ +# GroundRecall Ingestion Refactor Plan + +GroundRecall should treat `llmwiki` as one upstream source shape, not as the +defining architecture for grounded knowledge import. + +Didactopus already has broader ambitions around ingestion of weakly structured +materials such as: + +- markdown notes +- transcripts +- HTML/text course materials +- generated draft packs +- review sessions +- learner artifacts + +The GroundRecall import pipeline should therefore be generalized around a shared +normalization and promotion substrate with pluggable source adapters. + +## Design rule + +Source-specific logic should live at the ingestion edge. + +These stages should be generic: + +- segmentation +- extraction +- normalization +- lint +- review queue generation +- review bridge +- promotion +- canonical store +- query +- canonical export + +## Recommended module split + +Recommended package layout: + +- `didactopus.groundrecall_ingest` +- `didactopus.groundrecall_source_adapters.base` +- `didactopus.groundrecall_source_adapters.llmwiki` +- `didactopus.groundrecall_source_adapters.markdown_notes` +- `didactopus.groundrecall_source_adapters.transcript` +- `didactopus.groundrecall_source_adapters.didactopus_pack` +- `didactopus.groundrecall_source_adapters.didactopus_review` + +## Shared intermediate envelope + +Adapters should emit shared discovery records rather than jumping straight into +canonical GroundRecall objects. + +Recommended intermediate types: + +- `DiscoveredImportSource` +- `SegmentCandidate` +- `ImportProfile` + +This keeps adapter-specific parsing separate from the shared import pipeline. + +## Output intent + +Not every imported source should be treated the same way. 
+ +Adapters should declare an output intent: + +- `grounded_knowledge` +- `curriculum` +- `both` + +Examples: + +- `llmwiki` usually targets `grounded_knowledge` +- loose transcripts may target `grounded_knowledge` +- syllabus/course folders often target `curriculum` +- Didactopus packs or review sessions may target `both` + +## First refactor milestones + +### Milestone 1 + +- introduce adapter registry and adapter protocol +- move current `llmwiki` discovery/classification behind an adapter +- preserve the current import CLI behavior + +### Milestone 2 + +- add a `markdown_notes` adapter +- add a `transcript` adapter +- add import profiles that tune extraction strictness + +### Milestone 3 + +- add a `didactopus_pack` adapter for pack and review artifacts +- allow current Didactopus outputs to feed into GroundRecall directly + +## Why this matters + +This avoids building two parallel ingestion stacks inside Didactopus: + +- one for packs and educational structures +- another for grounded knowledge capture + +Instead, the system gets one generic ingestion substrate with multiple source +adapters and multiple downstream promotion/export paths. diff --git a/docs/legacy/groundrecall-llmwiki-import.md b/docs/legacy/groundrecall-llmwiki-import.md new file mode 100644 index 0000000..0954756 --- /dev/null +++ b/docs/legacy/groundrecall-llmwiki-import.md @@ -0,0 +1,496 @@ +# GroundRecall `llmwiki` Import Specification + +This document defines the first-pass import path for users who already have some +form of `llmwiki`-style repository and want to migrate it into the broader +GroundRecall substrate while staying compatible with Didactopus review and +promotion flows. + +## Goal + +The import path should let an existing `llmwiki` corpus become: + +- searchable without immediate manual cleanup +- reviewable rather than blindly trusted +- grounded in explicit provenance +- promotable into durable structured knowledge objects +- exportable back into compiled wiki pages, assistant adapter bundles, and + queryable graph artifacts + +The key rule is: + +Imported wiki pages are **derived artifacts**, not automatic source truth. + +## Import philosophy + +Users coming from `llmwiki` often have a mixture of: + +- raw notes +- compiled markdown pages +- local source files +- generated summaries +- ad hoc link graphs +- session transcripts +- speculative or weakly-supported synthesis + +GroundRecall should preserve that work without pretending all of it is already +promoted knowledge. + +The import pipeline therefore has two responsibilities: + +1. Preserve the original material with minimal loss. +2. Reify explicit structured objects that can later be reviewed and promoted. + +## Scope of the first implementation + +The first implementation should support common `llmwiki` layouts such as: + +- `raw/` +- `wiki/` +- `schema.*` +- `logs/` +- `sources/` +- top-level markdown pages + +The importer should not require a canonical upstream schema. It should operate +from directory conventions plus simple heuristics. + +## Import modes + +### 1. `archive` + +Purpose: +- preserve an existing `llmwiki` tree as read-only imported artifacts +- index it for search and later review + +Behavior: +- no claim promotion +- minimal extraction +- all compiled pages remain `draft` + +Use when: +- the user wants backward compatibility first +- the corpus quality is unknown + +### 2. 
`quick` + +Purpose: +- bootstrap usable structured objects fast + +Behavior: +- import pages and raw sources +- extract candidate claims and concepts heuristically +- attach lightweight provenance +- queue uncertain items for review + +Use when: +- the user wants early utility and accepts heuristic noise + +### 3. `grounded` + +Purpose: +- perform a migration suitable for long-lived shared knowledge + +Behavior: +- require provenance for promoted claims +- mark unsupported statements explicitly +- produce review records and lint findings +- populate promotion queues rather than auto-promoting + +Use when: +- the imported corpus will be shared across machines or agents + +## Pipeline stages + +### 1. Capture + +The importer records the source repository as an import artifact. + +Required metadata: + +- `import_id` +- `import_mode` +- `source_root` +- `imported_at` +- `machine_id` +- `agent_id` +- `source_repo_kind=llmwiki` + +Outputs: + +- import manifest +- artifact records for all discovered files + +### 2. Segment + +Imported content is split into stable units. + +Primary segment types: + +- `source_document` +- `source_fragment` +- `compiled_page` +- `section_summary` +- `candidate_claim` +- `candidate_concept` +- `candidate_relation` +- `session_observation` + +Segmentation should preserve: + +- original path +- section heading +- line or byte offsets when possible +- page title +- frontmatter fields + +### 3. Classify + +Each segment gets a semantic role. + +Recommended roles: + +- `source` +- `derivation` +- `claim` +- `summary` +- `question` +- `todo` +- `speculation` +- `obsolete` +- `transcript` + +This prevents unsupported prose from being confused with grounded knowledge. + +### 4. Ground + +Each imported segment gets provenance and support metadata. + +Required grounding fields: + +- `origin_artifact_id` +- `origin_path` +- `origin_section` +- `source_url` when known +- `retrieval_date` when known +- `machine_id` +- `session_id` when known +- `support_kind` +- `grounding_status` + +Suggested values: + +- `support_kind`: `direct_source`, `derived_from_page`, `derived_from_session`, + `inferred`, `unknown` +- `grounding_status`: `grounded`, `partially_grounded`, `ungrounded` + +### 5. Normalize + +The importer emits explicit GroundRecall objects. + +Minimum object set: + +- `Source` +- `Fragment` +- `Artifact` +- `Observation` +- `Claim` +- `Concept` +- `Relation` + +### 6. Lint + +The importer produces machine-readable findings before promotion. + +Required lint checks: + +- claim has no supporting fragment +- multiple claims appear text-identical +- concept is orphaned +- relation points to missing concept +- page summary has no cited support +- imported item marked `obsolete` still linked as current +- same claim imported with conflicting confidence or polarity + +### 7. Promote + +Imported objects enter existing Didactopus review/promotion lanes rather than +becoming trusted immediately. + +Recommended states: + +- `draft` +- `triaged` +- `reviewed` +- `promoted` +- `superseded` +- `archived` + +### 8. 
Export + +Promoted objects can then be rendered back out as: + +- compiled wiki pages +- graph snapshots +- assistant adapter bundles +- review reports +- query bundles for assistant-facing use + +## Object contracts + +### `ImportedArtifact` + +```json +{ + "artifact_id": "ia_001", + "import_id": "imp_2026_04_16_a", + "artifact_kind": "compiled_page", + "path": "wiki/channel-capacity.md", + "title": "Channel Capacity", + "sha256": "abc123", + "created_at": "2026-04-16T14:00:00Z", + "metadata": { + "frontmatter": {}, + "headings": ["Definition", "Examples"] + }, + "current_status": "draft" +} +``` + +### `ImportedObservation` + +```json +{ + "observation_id": "obs_001", + "import_id": "imp_2026_04_16_a", + "artifact_id": "ia_001", + "role": "summary", + "text": "Capacity bounds reliable communication over a noisy channel.", + "origin_path": "wiki/channel-capacity.md", + "origin_section": "Definition", + "line_start": 12, + "line_end": 14, + "grounding_status": "partially_grounded", + "support_kind": "derived_from_page", + "confidence_hint": 0.63, + "current_status": "draft" +} +``` + +### `ImportedClaim` + +```json +{ + "claim_id": "clm_001", + "import_id": "imp_2026_04_16_a", + "claim_text": "Channel capacity is the maximum reliable communication rate for a channel model.", + "claim_kind": "definition", + "source_observation_ids": ["obs_001"], + "supporting_fragment_ids": ["frag_014"], + "concept_ids": ["concept::channel-capacity"], + "confidence_hint": 0.74, + "grounding_status": "grounded", + "current_status": "triaged" +} +``` + +### `ImportedConcept` + +```json +{ + "concept_id": "concept::channel-capacity", + "import_id": "imp_2026_04_16_a", + "title": "Channel Capacity", + "aliases": [], + "description": "Imported concept from llmwiki corpus.", + "source_artifact_ids": ["ia_001"], + "current_status": "triaged" +} +``` + +### `ImportedRelation` + +```json +{ + "relation_id": "rel_001", + "import_id": "imp_2026_04_16_a", + "source_id": "concept::shannon-entropy", + "target_id": "concept::channel-capacity", + "relation_type": "supports_understanding_of", + "evidence_ids": ["obs_015"], + "current_status": "draft" +} +``` + +## Mapping from `llmwiki` into GroundRecall + +Recommended first-pass mapping: + +- `raw/*` -> `Source` or `Artifact(kind=raw_note)` +- `wiki/*.md` -> `Artifact(kind=compiled_page)` +- frontmatter -> artifact metadata +- headings -> section boundaries +- linked page names -> candidate `Concept` and `Relation` +- bullet or sentence extraction -> candidate `Observation` and `Claim` +- chat or session logs -> `Observation(kind=session_note)` +- schema files -> import metadata only unless a future adapter exists + +## Confidence and trust policy + +Imported confidence must remain clearly separate from reviewed confidence. 
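A minimal sketch of that separation as a promotion gate, under one hedged reading of the field names and policy spelled out below (`may_promote` is a hypothetical helper):

```python
# Hypothetical promotion gate. Heuristic import scoring (confidence_hint)
# never substitutes for review: promotion requires a post-review confidence
# plus at least partial grounding.
def may_promote(claim: dict) -> bool:
    grounded_enough = claim.get("grounding_status") in {"grounded", "partially_grounded"}
    reviewed = claim.get("review_confidence") is not None
    return grounded_enough and reviewed
```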
+ +Recommended fields: + +- `confidence_hint` +- `review_confidence` +- `grounding_status` +- `review_verdict` + +Policy: + +- `confidence_hint` comes from heuristic import scoring +- `review_confidence` exists only after review +- promotion requires at least `partially_grounded` +- fully ungrounded claims can be stored, but only as `draft` or `archived` + +## Provenance policy + +The importer should follow the existing Didactopus provenance direction: + +- preserve source identity +- preserve retrieval date when available +- preserve adaptation status +- keep both human-readable and machine-readable provenance + +When only a compiled wiki page exists and the original source is missing: + +- the compiled page becomes the immediate origin artifact +- all extracted claims must be marked `derived_from_page` +- such claims should not auto-promote in `grounded` mode + +## Review and promotion integration + +Imported `Claim` and `Concept` objects should feed into the same general review +machinery already used for pack-oriented promotion: + +- create candidate records +- attach lint findings +- route to a triage lane +- collect review verdicts +- emit promotion records + +Suggested triage lanes: + +- `knowledge_capture` +- `pack_improvement` +- `skill_export` +- `source_cleanup` +- `conflict_resolution` + +## Module layout + +First-pass module layout: + +- `didactopus.groundrecall_import` + Entry points and top-level orchestration. +- `didactopus.groundrecall_discovery` + Finds `llmwiki`-style files and classifies paths. +- `didactopus.groundrecall_segmenter` + Splits pages and logs into stable observations and candidate claims. +- `didactopus.groundrecall_normalizer` + Emits normalized import objects. +- `didactopus.groundrecall_lint` + Import-time lint checks. +- `didactopus.groundrecall_review_bridge` + Converts imported objects into review candidates and promotion records. +- `didactopus.groundrecall_export` + Renders promoted objects back to wiki, graph, and skill artifacts. + +## CLI shape + +Suggested CLI: + +```bash +python -m didactopus.groundrecall.cli import /path/to/llmwiki --mode archive +python -m didactopus.groundrecall.cli import /path/to/llmwiki --mode quick +python -m didactopus.groundrecall.cli import /path/to/llmwiki --mode grounded +python -m didactopus.groundrecall.cli lint imports/ +python -m didactopus.groundrecall.cli promote imports/ /path/to/store +python -m didactopus.groundrecall.cli export /path/to/store exports/groundrecall --concept channel-capacity +``` + +Compatibility wrappers still exist during migration: + +```bash +python -m didactopus.groundrecall_import /path/to/llmwiki --mode grounded +python -m didactopus.groundrecall_lint imports/ +python -m didactopus.groundrecall_export /path/to/store exports/groundrecall --concept channel-capacity +``` + +## Filesystem layout + +Suggested repository-local layout: + +- `imports//manifest.json` +- `imports//artifacts.jsonl` +- `imports//observations.jsonl` +- `imports//claims.jsonl` +- `imports//concepts.jsonl` +- `imports//relations.jsonl` +- `imports//lint_findings.json` +- `imports//review_queue.json` + +This keeps imported state auditable and easy to sync across machines. + +## Multi-machine sync implication + +For distributed assistant use, imported state should be append-oriented and +rebuildable. 
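A minimal sketch of what append-oriented capture can look like, reusing the import-manifest fields from the capture stage above; the event-log path and the `append_import_event` name are illustrative:

```python
import json
from pathlib import Path


# Append-only capture: records are only ever appended, never rewritten,
# so event logs from multiple machines can merge by concatenation and replay.
def append_import_event(events_path: Path, manifest: dict) -> None:
    events_path.parent.mkdir(parents=True, exist_ok=True)
    with events_path.open("a", encoding="utf-8") as fh:
        fh.write(json.dumps(manifest, sort_keys=True) + "\n")


append_import_event(
    Path("events/import_events.jsonl"),
    {
        "import_id": "imp_2026_04_16_a",
        "import_mode": "grounded",
        "imported_at": "2026-04-16T14:00:00Z",
        "machine_id": "laptop-01",
    },
)
```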
+ +Recommended sync primitives: + +- import manifests +- normalized jsonl object streams +- review records +- promotion records + +Non-authoritative derived artifacts: + +- rendered wiki pages +- local indexes +- embeddings +- cache files + +This allows multiple machines to contribute import events without making the +compiled page tree the merge primitive. + +## First implementation milestones + +### Milestone 1 + +- discover `raw/` and `wiki/` +- import artifacts +- segment markdown by headings +- emit observations and candidate claims +- write import manifest and jsonl outputs + +### Milestone 2 + +- add grounding metadata +- add lint checks +- add triage lanes and review queue output + +### Milestone 3 + +- map promoted claims into assistant-neutral exports plus assistant adapter bundles +- render compiled wiki views from promoted objects +- support multi-machine import manifests and merge-safe event storage + +## Non-goals for the first pass + +- perfect semantic claim extraction +- automatic trust assignment +- full upstream `llmwiki` schema compatibility +- lossless import of every custom plugin or script +- embeddings-first retrieval + +The first pass should be conservative, inspectable, and easy to improve. diff --git a/docs/legacy/groundrecall-migration-plan.md b/docs/legacy/groundrecall-migration-plan.md new file mode 100644 index 0000000..37b54c2 --- /dev/null +++ b/docs/legacy/groundrecall-migration-plan.md @@ -0,0 +1,281 @@ +# GroundRecall Migration Plan + +This document turns the boundary decisions in [deployment-modes.md](deployment-modes.md) into an implementation plan. + +The goal is not an immediate repo split. The goal is to let `GroundRecall` become independently deployable and operable without destabilizing ongoing `Didactopus` learner work. + +## Current State + +Today, GroundRecall exists as a set of modules under `src/didactopus/`: + +- `groundrecall_import` +- `groundrecall_source_adapters/*` +- `groundrecall_lint` +- `groundrecall_review_queue` +- `groundrecall_review_bridge` +- `groundrecall_models` +- `groundrecall_store` +- `groundrecall_promotion` +- `groundrecall_query` +- `groundrecall_export` +- `groundrecall_assistant_export` +- `groundrecall_assistants/*` + +This is acceptable as an implementation phase, but it creates two risks: + +1. generic knowledge-substrate functionality may continue to accrete under `didactopus.main` +2. feature work may silently assume the presence of learner-facing Didactopus components + +## Migration Goal + +Target state: + +- `Didactopus` remains the learner-facing application +- `GroundRecall` becomes the standalone grounded knowledge substrate +- `GenieHive` remains the model and routing control plane + +The package, CLI, and deployment boundaries should eventually reflect that. 
+ +## Target Ownership + +### GroundRecall should own + +- source ingestion and normalization +- claim/concept/relation/artifact/provenance schemas +- canonical store and snapshots +- lint and review queue generation +- promotion and merge semantics +- assistant-neutral query and export +- assistant adapter export +- sync, merge, and team/shared knowledge operations + +### Didactopus should own + +- learner session flows +- mentor/practice/evaluator/project-advisor workflows +- pack and curriculum-specific review UX +- mastery-ledger and learner evidence experiences +- educational packaging over grounded knowledge + +### Shared boundary helpers should stay narrow + +- provider policy that depends on GenieHive route resolution but serves learner workflows +- review bridges where GroundRecall needs to feed an existing Didactopus review process during the transition + +## Packaging Direction + +### Phase 0: Present layout, stricter discipline + +Keep the code in `src/didactopus/`, but use naming and imports that preserve the eventual split. + +Rules: + +- new generic knowledge features go into `groundrecall_*` modules +- new learner-facing features go into `didactopus` learner modules +- do not add generic knowledge operations to `didactopus.main` +- treat review bridges as bridges, not permanent core ownership + +### Phase 1: Explicit namespace inside the repo + +Preferred direction: + +- move GroundRecall modules under `src/didactopus/groundrecall/` + +Target structure: + +- `src/didactopus/groundrecall/ingest.py` +- `src/didactopus/groundrecall/source_adapters/` +- `src/didactopus/groundrecall/models.py` +- `src/didactopus/groundrecall/store.py` +- `src/didactopus/groundrecall/promotion.py` +- `src/didactopus/groundrecall/query.py` +- `src/didactopus/groundrecall/export.py` +- `src/didactopus/groundrecall/assistants/` +- `src/didactopus/groundrecall/sync.py` +- `src/didactopus/groundrecall/merge.py` +- `src/didactopus/groundrecall/cli.py` + +Benefits: + +- cleaner conceptual grouping +- easier extraction later +- clearer import discipline + +Compatibility path: + +- keep thin wrapper modules at old import paths during transition +- deprecate wrappers only after tests and docs have moved + +### Phase 2: Dual CLI identity + +Before any repo split, expose GroundRecall as a first-class CLI namespace. + +Desired commands: + +- `python -m didactopus.groundrecall.cli import ...` +- `python -m didactopus.groundrecall.cli lint ...` +- `python -m didactopus.groundrecall.cli promote ...` +- `python -m didactopus.groundrecall.cli query ...` +- `python -m didactopus.groundrecall.cli export ...` +- `python -m didactopus.groundrecall.cli inspect ...` + +At that point, `didactopus.main` should only surface: + +- learner-facing commands +- review-workflow commands with educational intent +- possibly a pointer to GroundRecall commands, but not ownership of them + +### Phase 3: Optional package extraction + +Only after sync/merge and standalone use are mature: + +- move GroundRecall to its own package or repo if that becomes operationally useful +- keep Didactopus consuming it as a dependency + +This step is optional. A clean package boundary inside one repo may be sufficient for a long time. 
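Whichever phase is active, the Phase 1 compatibility wrappers stay cheap; a minimal sketch of one such shim at an old flat import path, following the module mapping below (transitional only, not a permanent API):

```python
# src/didactopus/groundrecall_models.py -- compatibility wrapper.
# Re-export the relocated module so old import paths keep working during
# the transition; delete once tests and docs target the new namespace.
from didactopus.groundrecall.models import *  # noqa: F401,F403
```

This is the same thin star-re-export style the codebase already uses for its wrapper modules, so the shims carry no logic of their own.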
+ +## CLI Migration Plan + +### Keep under `didactopus.main` + +- `review` +- future learner-facing workbench commands + +### Move toward GroundRecall CLI + +- import +- lint +- review queue +- promotion +- canonical query +- canonical export +- assistant export +- sync and merge + +### Transitional exception + +`provider-inspect` can remain on the Didactopus umbrella CLI for now because: + +- it is already useful operationally +- it supports learner-node deployments +- it is not a GroundRecall-specific operation + +Longer term, it may also belong on a separate operator surface depending on whether Didactopus becomes the standard local application shell. + +## Module Mapping + +### Move first + +Current -> target + +- `didactopus.groundrecall_import` -> `didactopus.groundrecall.ingest` +- `didactopus.groundrecall_source_adapters.*` -> `didactopus.groundrecall.source_adapters.*` +- `didactopus.groundrecall_models` -> `didactopus.groundrecall.models` +- `didactopus.groundrecall_store` -> `didactopus.groundrecall.store` +- `didactopus.groundrecall_promotion` -> `didactopus.groundrecall.promotion` +- `didactopus.groundrecall_query` -> `didactopus.groundrecall.query` +- `didactopus.groundrecall_export` -> `didactopus.groundrecall.export` +- `didactopus.groundrecall_assistants.*` -> `didactopus.groundrecall.assistants.*` + +### Keep as transitional bridges + +- `didactopus.groundrecall_review_bridge` +- source adapters that ingest Didactopus-native artifacts + +These are legitimate but should be documented as cross-boundary adapters rather than intrinsic ownership proof. + +### Stay in Didactopus + +- `learner_session` +- `learner_session_demo` +- `mentor` +- `practice` +- `project_advisor` +- educational review UX modules +- pack and graph-planning modules + +## Service Boundary Direction + +### GroundRecall service candidates + +Once needed, a GroundRecall service should focus on: + +- canonical knowledge query +- import status and queue inspection +- promotion status +- sync/merge status +- assistant-neutral bundle retrieval + +### Didactopus service candidates + +- learner session orchestration +- learner progress and evaluation +- pack/workbench interactions + +### GenieHive service candidates + +- model and service inspection +- route resolution +- cluster health + +## Milestones + +### Milestone 1: Namespace discipline + +Done when: + +- new generic knowledge work lands only in GroundRecall-oriented modules +- `didactopus.main` stops growing generic knowledge commands +- docs consistently describe GroundRecall as a substrate, not a learner feature + +### Milestone 2: Internal package reorganization + +Done when: + +- GroundRecall modules live under an explicit package path +- old flat import paths are wrappers only +- tests target the new package paths + +### Milestone 3: First-class GroundRecall CLI + +Done when: + +- import/lint/promote/query/export/inspect are available under one GroundRecall CLI surface +- operator docs no longer require `Didactopus` framing for generic knowledge tasks + +### Milestone 4: Sync and merge maturity + +Done when: + +- append-only event ingestion exists +- promoted-state merge semantics exist +- team/shared knowledge workflows are practical without learner workflows + +### Milestone 5: Extraction decision + +Done when: + +- the project can make an informed choice between: + - one repo, multiple packages + - separate GroundRecall package/repo + +## Immediate Next Work + +Recommended next implementation steps: + +1. 
Introduce `didactopus.groundrecall` as an internal package namespace. +2. Add a single GroundRecall umbrella CLI module. +3. Keep thin wrapper modules for compatibility. +4. Start moving docs and tests to the new namespace. +5. Begin implementing sync/merge primitives under GroundRecall rather than under Didactopus learner flows. + +## Decision Rule For New Work + +Before adding a new command, module, or service, ask: + +1. Would this still be needed if there were no learner session? +2. Would a team using only shared knowledge still need it? +3. Is the canonical artifact knowledge state or educational interaction? +4. Would it still matter if Didactopus UI vanished? + +If yes, default toward GroundRecall. diff --git a/docs/legacy/groundrecall-repo-bootstrap.md b/docs/legacy/groundrecall-repo-bootstrap.md new file mode 100644 index 0000000..3a02909 --- /dev/null +++ b/docs/legacy/groundrecall-repo-bootstrap.md @@ -0,0 +1,286 @@ +# GroundRecall Repo Bootstrap Checklist + +This document turns the broader [groundrecall-migration-plan.md](groundrecall-migration-plan.md) into a practical checklist for creating a standalone `GroundRecall` repository. + +The goal here is narrower than full feature completion. The goal is to get to a standalone repository that can be installed, run locally, and used for real `llmwiki++`-style work without requiring `Didactopus` as the primary shell. + +## Bootstrap Goal + +Minimum viable standalone `GroundRecall` repo: + +- installable as its own Python package +- exposes a first-class `groundrecall` CLI +- imports and normalizes knowledge sources +- promotes reviewed knowledge into a canonical store +- supports query and export over promoted state +- supports assistant-neutral exports plus adapter exports +- remains consumable by `Didactopus` as a dependency or sibling package + +This is enough for a local standalone alpha. It is not yet the full distributed team and corpus-scale vision. + +## What Already Exists + +The current `Didactopus` codebase already contains most of the implementation spine: + +- `didactopus.groundrecall.ingest` +- `didactopus.groundrecall.source_adapters.*` +- `didactopus.groundrecall.models` +- `didactopus.groundrecall.store` +- `didactopus.groundrecall.promotion` +- `didactopus.groundrecall.query` +- `didactopus.groundrecall.export` +- `didactopus.groundrecall.assistant_export` +- `didactopus.groundrecall.assistants.*` +- `didactopus.groundrecall.inspect` +- `didactopus.groundrecall.cli` + +This means the repo bootstrap is primarily a packaging and boundary exercise, not a greenfield implementation. + +## Target Repo Shape + +Suggested standalone layout: + +```text +groundrecall/ + pyproject.toml + README.md + LICENSE + src/ + groundrecall/ + __init__.py + cli.py + ingest.py + inspect.py + lint.py + models.py + store.py + promotion.py + query.py + export.py + assistant_export.py + review_queue.py + review_bridge.py + source_adapters/ + assistants/ + tests/ + docs/ + quickstart.md + llmwiki-import.md + deployment-modes.md + assistant-architecture.md + sync-roadmap.md +``` + +Notes: + +- `review_bridge.py` may remain optional if the standalone repo only needs generic review artifacts. +- `review_queue.py` belongs in `GroundRecall`; it is not a learner-only concern. +- `review_bridge.py` is the most likely file to stay transitional if it depends too directly on Didactopus review objects. 
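As a concreteness check on this shape, a minimal sketch of the end-to-end CLI smoke test that the checklist below calls for; the assertion is deliberately weak, and the repository's actual `tests/test_console_script.py` may differ:

```python
# Minimal smoke test sketch: asserts only that the module entry point
# parses --help and exits cleanly after an editable install.
import subprocess
import sys


def test_cli_help_runs() -> None:
    result = subprocess.run(
        [sys.executable, "-m", "groundrecall", "--help"],
        capture_output=True,
        text=True,
        check=False,
    )
    assert result.returncode == 0
```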
+ +## Move / Keep / Bridge + +### Move into standalone `GroundRecall` + +Move first: + +- `didactopus.groundrecall.ingest` +- `didactopus.groundrecall.inspect` +- `didactopus.groundrecall.lint` +- `didactopus.groundrecall.models` +- `didactopus.groundrecall.store` +- `didactopus.groundrecall.promotion` +- `didactopus.groundrecall.query` +- `didactopus.groundrecall.export` +- `didactopus.groundrecall.assistant_export` +- `didactopus.groundrecall.review_queue` +- `didactopus.groundrecall.source_adapters.*` +- `didactopus.groundrecall.assistants.*` +- `didactopus.groundrecall.cli` + +### Keep in `Didactopus` + +These should not move: + +- learner session and mentor/practice flows +- educational pack authoring and pack-specific UX +- mastery/evidence learner experiences +- provider demos that exist to support Didactopus learner workflows + +### Keep as temporary bridges + +These may need a staged treatment: + +- `groundrecall_review_bridge` +- `didactopus_pack` source adapter + +Those are useful during transition, but they are cross-boundary integrations, not proof that `GroundRecall` must remain inside `Didactopus`. + +## Bootstrap Checklist + +### 1. Create the new repo skeleton + +Required: + +- create a new repo root +- add `pyproject.toml` +- add `src/groundrecall/` +- add `tests/` +- add `docs/` +- add a minimal `README.md` +- add `LICENSE` + +Definition of done: + +- `pip install -e .` works +- `python -m groundrecall.cli --help` works + +### 2. Move the package code + +Required: + +- copy the current `didactopus.groundrecall.*` package into `src/groundrecall/` +- update relative imports as needed +- remove `didactopus`-prefixed assumptions in docstrings and parser help text + +Definition of done: + +- module imports succeed under `groundrecall.*` +- no package file requires `didactopus` imports except explicit transition bridges + +### 3. Extract the tests + +Required: + +- move GroundRecall-focused tests into the new repo +- keep Didactopus integration tests in Didactopus +- add an end-to-end CLI smoke test that runs: + - `import` + - `promote` + - `query` + - `export` + - `inspect` + +Definition of done: + +- the new repo has its own passing test suite +- Didactopus retains only integration tests that prove interoperability + +### 4. Harden the standalone CLI + +Required commands: + +- `groundrecall import` +- `groundrecall lint` +- `groundrecall promote` +- `groundrecall query` +- `groundrecall export` +- `groundrecall inspect` + +Recommended additions: + +- `groundrecall assistant-export` +- `groundrecall review-queue` + +Definition of done: + +- the CLI help text is standalone and does not refer users back to `Didactopus` + +### 5. Publish a repo-local data layout + +Pick and document a stable layout such as: + +```text +.groundrecall/ + imports/ + store/ + exports/ + events/ +``` + +Required: + +- make these paths configurable +- define sane defaults +- remove assumptions that the caller already knows the Didactopus workspace layout + +Definition of done: + +- a new user can run GroundRecall in an empty directory and get predictable local state + +### 6. Document the standalone workflows + +At minimum: + +- quickstart +- migrate from `llmwiki` +- query and export patterns +- assistant adapter exports +- relationship to `Didactopus` +- relationship to `GenieHive` + +Definition of done: + +- the README can orient a new user without requiring Didactopus-specific context + +### 7. 
Leave compatibility shims in `Didactopus` + +Required: + +- keep thin wrappers at `didactopus.groundrecall_*` or `didactopus.groundrecall.*` integration paths as needed +- make `Didactopus` import the extracted package where possible +- clearly mark the wrappers as compatibility paths + +Definition of done: + +- existing Didactopus workflows do not break during the split + +## Alpha Completion Criteria + +The standalone repo is alpha-ready when: + +- `llmwiki` import works +- `markdown_notes` import works +- at least one Didactopus-native adapter still works as an integration adapter +- canonical store creation and snapshot export work +- query works over promoted objects +- assistant-neutral export works +- at least two assistant adapters export usable bundles + +This is the right threshold for “functional GroundRecall repo.” + +## Still Missing After Alpha + +A standalone alpha is not yet the full target system. These remain post-bootstrap priorities: + +- re-import and update semantics +- append-only event logs for multi-node merge +- shared/private scope support +- merge and sync conflict handling +- stronger claim extraction +- richer claim-level review and adjudication +- corpus-scale distributed coordination + +Those features should be built in `GroundRecall`, but they do not need to block repo extraction. + +## Recommended Execution Order + +Use this order: + +1. create the repo and package skeleton +2. copy the current `groundrecall` package and make imports pass +3. move tests and get the standalone suite green +4. finalize CLI and README +5. switch Didactopus integration points to consume the extracted package +6. only then continue with sync/merge and corpus-scale features + +This keeps the boundary clean without stalling feature progress. + +## First PR-Sized Steps + +If this were executed as concrete work, the first three small changes should be: + +1. create the new repo with package skeleton and copy `src/didactopus/groundrecall/` +2. move the existing namespace-focused tests and make them pass under `groundrecall.*` +3. add a standalone README quickstart and one end-to-end CLI smoke test + +After that, the repo is real enough to iterate in place rather than continuing to plan around it. diff --git a/docs/llmwiki-import.md b/docs/llmwiki-import.md new file mode 100644 index 0000000..423c144 --- /dev/null +++ b/docs/llmwiki-import.md @@ -0,0 +1,85 @@ +# llmwiki Import + +`GroundRecall` treats `llmwiki` as one important source shape, not as the defining architecture. + +An imported `llmwiki` tree is treated as: + +- raw source material +- prior synthesized artifacts +- candidate claims and concepts +- provenance that needs to be normalized and reviewed + +Compiled wiki pages are useful artifacts, but they are not automatically promoted as canonical truth. + +## Import Modes + +### `archive` + +- preserve source material with minimal interpretation +- index and normalize without assuming promotion readiness +- useful for long-tail historical corpora + +### `quick` + +- fast bootstrap mode +- extracts candidate concepts, claims, and relations heuristically +- useful when getting an old corpus into GroundRecall quickly matters more than perfect grounding + +### `grounded` + +- stricter mode +- expects better provenance and cleaner support signals +- better fit for shared or promoted knowledge + +## Import Flow + +The normalized import flow is: + +1. capture source files +2. discover and classify artifacts +3. segment content into observations +4. 
normalize claims, concepts, and relations +5. lint the import +6. emit a review queue and review bundle +7. promote reviewed artifacts into the canonical store + +## Commands + +```bash +groundrecall import /path/to/llmwiki --mode archive +groundrecall import /path/to/llmwiki --mode quick +groundrecall import /path/to/llmwiki --mode grounded + +groundrecall lint imports/ +groundrecall promote imports/ store/ +groundrecall export store/ exports/groundrecall --concept channel-capacity +``` + +## Current Heuristics + +Today’s importer already supports: + +- `raw/` and `wiki/` discovery +- markdown and log segmentation +- claim extraction with inline contradiction and supersession markers +- review queue generation +- review bundle export + +Areas still planned: + +- stronger re-import/update semantics +- more robust transcript and semi-structured document handling +- stronger large-corpus extraction and consolidation + +## Recommended Promotion Rule + +Treat imported wiki pages as derived artifacts. + +That means: + +- preserve them +- mine them for claims and concepts +- review what matters +- promote canonical claims and concepts into the store + +This is the main difference between `GroundRecall` and a plain markdown wiki. diff --git a/docs/quickstart.md b/docs/quickstart.md new file mode 100644 index 0000000..5641c33 --- /dev/null +++ b/docs/quickstart.md @@ -0,0 +1,97 @@ +# Quickstart + +`GroundRecall` is a local-first grounded knowledge substrate for `llmwiki++`-style workflows. + +This quickstart assumes a fresh checkout of the standalone repository. + +## Install + +```bash +pip install -e . +groundrecall --help +``` + +You can also use the module entry point: + +```bash +PYTHONPATH=src python -m groundrecall --help +``` + +## Import A Knowledge Source + +Fast import from an `llmwiki`-style tree: + +```bash +groundrecall import /path/to/llmwiki --mode quick +``` + +More conservative import with stronger grounding expectations: + +```bash +groundrecall import /path/to/llmwiki --mode grounded +``` + +The importer writes normalized artifacts under `imports//`. + +## Review And Promote + +Inspect the import outputs: + +```bash +groundrecall lint imports/ +``` + +Promote the imported review artifacts into a canonical store: + +```bash +groundrecall promote imports/ store/ +``` + +## Query The Canonical Store + +Query a concept: + +```bash +groundrecall query store/ channel-capacity +``` + +Inspect the overall store: + +```bash +groundrecall inspect store/ +``` + +## Export + +Export assistant-neutral artifacts: + +```bash +groundrecall export store/ exports/groundrecall --concept channel-capacity +``` + +Export assistant-targeted bundles: + +```bash +groundrecall assistant-export store/ codex exports/codex --concept channel-capacity +groundrecall assistant-export store/ claude_code exports/claude --concept channel-capacity +``` + +## Default Working Layout + +A simple local layout is: + +```text +.groundrecall/ + imports/ + store/ + exports/ + events/ +``` + +The current alpha does not require this exact layout, but it is a sensible starting point. + +## Next Reading + +- [architecture.md](architecture.md) +- [llmwiki-import.md](llmwiki-import.md) +- [sync-roadmap.md](sync-roadmap.md) diff --git a/docs/sync-roadmap.md b/docs/sync-roadmap.md new file mode 100644 index 0000000..d824e99 --- /dev/null +++ b/docs/sync-roadmap.md @@ -0,0 +1,73 @@ +# Sync Roadmap + +The current standalone alpha is local-first. Sync and merge are planned next-stage features. 
+ +## Goal + +Support these use cases cleanly: + +- one user across multiple machines +- teams with shared and individual knowledge +- parallel corpus transformation and consolidation + +## Planned Model + +The intended model is: + +- append-only event capture at the edge +- canonical promoted store as the durable reviewed state +- generated exports and assistant bundles as derived artifacts + +This avoids treating compiled wiki pages or generated bundles as merge primitives. + +## Likely Local Layout + +```text +.groundrecall/ + events/ + imports/ + store/ + exports/ +``` + +## Planned Phases + +### Phase 1: Re-import And Update Semantics + +- import the same source tree repeatedly without duplicating everything +- support import lineage and supersession +- track object continuity across imports + +### Phase 2: Event Log Capture + +- record machine-local observations and import events +- distinguish machine-local state from promoted shared state +- preserve provenance and timestamps explicitly + +### Phase 3: Merge And Consolidation + +- merge append-only events from multiple machines +- consolidate draft claims and review candidates +- preserve contradiction and supersession history + +### Phase 4: Shared And Private Scopes + +- private notes and private candidate knowledge +- shared promoted knowledge +- controlled promotion from private to shared + +### Phase 5: Team And Corpus Workflows + +- parallel ingestion over large corpora +- coordinated claim review and adjudication +- export of consolidated assistant-neutral snapshots + +## Non-Goals For The Current Alpha + +The current repo does not yet provide: + +- real-time networked sync +- conflict-free replicated data types +- hosted review services + +The next useful milestone is a practical local event-log and re-import model, not a full distributed platform in one step. diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..7429378 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,31 @@ +[build-system] +requires = ["setuptools>=68", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "groundrecall" +version = "0.1.0a0" +description = "Grounded knowledge substrate for llmwiki++ style workflows." 
+readme = "README.md" +requires-python = ">=3.10" +license = { text = "MIT" } +authors = [ + { name = "GroundRecall contributors" } +] +dependencies = [ + "pydantic>=2,<3", + "PyYAML>=6,<7", +] + +[project.scripts] +groundrecall = "groundrecall.cli:main" + +[tool.setuptools] +package-dir = { "" = "src" } + +[tool.setuptools.packages.find] +where = ["src"] + +[tool.pytest.ini_options] +pythonpath = ["src"] +testpaths = ["tests"] diff --git a/src/groundrecall/__init__.py b/src/groundrecall/__init__.py new file mode 100644 index 0000000..d26682f --- /dev/null +++ b/src/groundrecall/__init__.py @@ -0,0 +1,34 @@ +from __future__ import annotations + +from .inspect import inspect_store, summarize_store +from .ingest import ImportResult, build_parser as build_import_parser, main as import_main, run_groundrecall_import +from .models import * # noqa: F403 +from .promotion import build_parser as build_promotion_parser, main as promotion_main, promote_import_to_store +from .query import ( + build_parser as build_query_parser, + build_query_bundle_for_concept, + main as query_main, + query_concept, + query_provenance, + search_claims, +) +from .store import GroundRecallStore + +__all__ = [ + "GroundRecallStore", + "ImportResult", + "run_groundrecall_import", + "build_import_parser", + "import_main", + "promote_import_to_store", + "build_promotion_parser", + "promotion_main", + "query_concept", + "query_provenance", + "search_claims", + "build_query_bundle_for_concept", + "build_query_parser", + "query_main", + "summarize_store", + "inspect_store", +] diff --git a/src/groundrecall/__main__.py b/src/groundrecall/__main__.py new file mode 100644 index 0000000..2f05ddc --- /dev/null +++ b/src/groundrecall/__main__.py @@ -0,0 +1,5 @@ +from .cli import main + + +if __name__ == "__main__": + main() diff --git a/src/groundrecall/artifact_schemas.py b/src/groundrecall/artifact_schemas.py new file mode 100644 index 0000000..aedd100 --- /dev/null +++ b/src/groundrecall/artifact_schemas.py @@ -0,0 +1,81 @@ +from typing import Any +from pydantic import BaseModel, Field + + +class DependencySpec(BaseModel): + name: str + min_version: str = "0.0.0" + max_version: str = "9999.9999.9999" + + +class MasteryProfileSpec(BaseModel): + template: str | None = None + required_dimensions: list[str] = Field(default_factory=list) + dimension_threshold_overrides: dict[str, float] = Field(default_factory=dict) + + +class CrossPackLinkSpec(BaseModel): + source_concept: str + target_concept: str + relation: str + + +class ProfileTemplateSpec(BaseModel): + required_dimensions: list[str] = Field(default_factory=list) + dimension_threshold_overrides: dict[str, float] = Field(default_factory=dict) + + +class PackManifest(BaseModel): + name: str + display_name: str + version: str + schema_version: str + didactopus_min_version: str + didactopus_max_version: str + description: str = "" + author: str = "" + license: str = "unspecified" + dependencies: list[DependencySpec] = Field(default_factory=list) + overrides: list[str] = Field(default_factory=list) + profile_templates: dict[str, ProfileTemplateSpec] = Field(default_factory=dict) + cross_pack_links: list[CrossPackLinkSpec] = Field(default_factory=list) + + +class ConceptEntry(BaseModel): + id: str + title: str + description: str = "" + prerequisites: list[str] = Field(default_factory=list) + mastery_signals: list[str] = Field(default_factory=list) + mastery_profile: MasteryProfileSpec = Field(default_factory=MasteryProfileSpec) + + +class ConceptsFile(BaseModel): + concepts: 
list[ConceptEntry] + + +class RoadmapStageEntry(BaseModel): + id: str + title: str + concepts: list[str] = Field(default_factory=list) + checkpoint: list[str] = Field(default_factory=list) + + +class RoadmapFile(BaseModel): + stages: list[RoadmapStageEntry] + + +class ProjectEntry(BaseModel): + id: str + title: str + difficulty: str = "" + prerequisites: list[str] = Field(default_factory=list) + deliverables: list[str] = Field(default_factory=list) + + +class ProjectsFile(BaseModel): + projects: list[ProjectEntry] + + +class RubricsFile(BaseModel): + rubrics: list[dict[str, Any]] diff --git a/src/groundrecall/assistant_export.py b/src/groundrecall/assistant_export.py new file mode 100644 index 0000000..da26719 --- /dev/null +++ b/src/groundrecall/assistant_export.py @@ -0,0 +1,59 @@ +from __future__ import annotations + +import argparse +import json +from pathlib import Path +from typing import Any + +from .assistants.base import get_assistant_adapter +from .query import build_query_bundle_for_concept +from .store import GroundRecallStore + + +def export_assistant_bundle( + store_dir: str | Path, + assistant: str, + out_dir: str | Path, + concept_refs: list[str] | None = None, +) -> dict[str, Any]: + store = GroundRecallStore(store_dir) + snapshot = store.build_snapshot( + snapshot_id="assistant-export", + created_at="", + metadata={"export_kind": "assistant_adapter", "assistant": assistant}, + ).model_dump() + query_bundles = [] + for concept_ref in concept_refs or []: + payload = build_query_bundle_for_concept(store_dir, concept_ref) + if payload is not None: + query_bundles.append(payload) + adapter = get_assistant_adapter(assistant) + paths = adapter.export_bundle(snapshot, query_bundles, out_dir) + manifest = { + "assistant": assistant, + "output_paths": [str(path) for path in paths], + "query_bundle_count": len(query_bundles), + } + Path(out_dir).mkdir(parents=True, exist_ok=True) + (Path(out_dir) / "assistant_export_manifest.json").write_text(json.dumps(manifest, indent=2), encoding="utf-8") + return manifest + + +def build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser(description="Export assistant-specific GroundRecall bundles from canonical store data.") + parser.add_argument("store_dir") + parser.add_argument("assistant") + parser.add_argument("out_dir") + parser.add_argument("--concept", action="append", default=[]) + return parser + + +def main() -> None: + args = build_parser().parse_args() + payload = export_assistant_bundle( + store_dir=args.store_dir, + assistant=args.assistant, + out_dir=args.out_dir, + concept_refs=list(args.concept or []), + ) + print(json.dumps(payload, indent=2)) diff --git a/src/groundrecall/assistants/__init__.py b/src/groundrecall/assistants/__init__.py new file mode 100644 index 0000000..3ce55a6 --- /dev/null +++ b/src/groundrecall/assistants/__init__.py @@ -0,0 +1,2 @@ +from __future__ import annotations + diff --git a/src/groundrecall/assistants/base.py b/src/groundrecall/assistants/base.py new file mode 100644 index 0000000..b6470b5 --- /dev/null +++ b/src/groundrecall/assistants/base.py @@ -0,0 +1,3 @@ +from __future__ import annotations + +from ..groundrecall_assistants.base import * # noqa: F403 diff --git a/src/groundrecall/assistants/claude_code.py b/src/groundrecall/assistants/claude_code.py new file mode 100644 index 0000000..d0c43af --- /dev/null +++ b/src/groundrecall/assistants/claude_code.py @@ -0,0 +1,3 @@ +from __future__ import annotations + +from ..groundrecall_assistants.claude_code import * # noqa: F403 
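+
+# Note: importing this shim also executes groundrecall_assistants.claude_code,
+# which registers ClaudeCodeAdapter in the adapter registry by calling
+# register_assistant_adapter() at module import time.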
diff --git a/src/groundrecall/assistants/codex.py b/src/groundrecall/assistants/codex.py new file mode 100644 index 0000000..62b24b5 --- /dev/null +++ b/src/groundrecall/assistants/codex.py @@ -0,0 +1,3 @@ +from __future__ import annotations + +from ..groundrecall_assistants.codex import * # noqa: F403 diff --git a/src/groundrecall/citation_support.py b/src/groundrecall/citation_support.py new file mode 100644 index 0000000..93da099 --- /dev/null +++ b/src/groundrecall/citation_support.py @@ -0,0 +1,239 @@ +from __future__ import annotations + +import json +from pathlib import Path +import re +from typing import Any + + +def _load_citegeist_symbols() -> dict[str, Any] | None: + import sys + + citegeist_src = Path("/home/netuser/bin/CiteGeist/src") + if citegeist_src.exists(): + sys.path.insert(0, str(citegeist_src)) + try: + from citegeist.app_api import LiteratureExplorerApi # type: ignore + from citegeist.bibtex import BibEntry, parse_bibtex, render_bibtex # type: ignore + from citegeist.storage import BibliographyStore # type: ignore + except Exception: + return None + return { + "LiteratureExplorerApi": LiteratureExplorerApi, + "BibEntry": BibEntry, + "parse_bibtex": parse_bibtex, + "render_bibtex": render_bibtex, + "BibliographyStore": BibliographyStore, + } + + +def discover_bib_files(source_root: str | Path) -> list[Path]: + root = Path(source_root) + if not root.exists(): + return [] + candidates = [ + path + for path in root.rglob("*.bib") + if path.is_file() and not path.name.endswith("-bak.bib") and not path.name.startswith(".") + ] + + def rank(path: Path) -> tuple[int, int, str]: + rel = path.relative_to(root) + name = path.name + if rel == Path("refs.bib"): + return (0, len(rel.parts), str(rel)) + if rel == Path("biblio.bib"): + return (1, len(rel.parts), str(rel)) + if name == "refs.bib": + return (2, len(rel.parts), str(rel)) + if name == "biblio.bib": + return (3, len(rel.parts), str(rel)) + return (4, len(rel.parts), str(rel)) + + return sorted(candidates, key=rank) + + +def load_bibliography_index(source_root: str | Path) -> dict[str, dict[str, Any]]: + symbols = _load_citegeist_symbols() + root = Path(source_root) + index: dict[str, dict[str, Any]] = {} + for bib_path in discover_bib_files(root): + try: + entries = _parse_bib_entries(bib_path.read_text(encoding="utf-8"), symbols=symbols) + except Exception: + continue + for entry in entries: + raw_bibtex = _render_entry_bibtex(entry, symbols=symbols) + payload = { + "citation_key": entry.citation_key, + "entry_type": entry.entry_type, + "fields": dict(entry.fields), + "source_bib_path": str(bib_path.relative_to(root)), + "raw_bibtex": raw_bibtex, + "duplicate_source_bib_paths": [], + } + existing = index.get(entry.citation_key) + if existing is None: + index[entry.citation_key] = payload + else: + existing.setdefault("duplicate_source_bib_paths", []).append(str(bib_path.relative_to(root))) + return index + + +def materialize_citegeist_store(import_dir: str | Path, source_root: str | Path) -> dict[str, Any]: + symbols = _load_citegeist_symbols() + if symbols is None: + return {"available": False} + BibliographyStore = symbols["BibliographyStore"] + LiteratureExplorerApi = symbols["LiteratureExplorerApi"] + + import_root = Path(import_dir) + db_path = import_root / "citegeist.sqlite3" + if db_path.exists(): + db_path.unlink() + store = BibliographyStore(db_path) + ingested_files: list[str] = [] + for bib_path in discover_bib_files(source_root): + try: + text = bib_path.read_text(encoding="utf-8") + entries = 
_parse_bib_entries(text, symbols=symbols)
+            for entry in entries:
+                store.upsert_entry(
+                    entry,
+                    raw_bibtex=_render_entry_bibtex(entry, symbols=symbols),
+                    source_type="bibtex",
+                    source_label=str(bib_path.relative_to(Path(source_root))),
+                    review_status="draft",
+                )
+            store.connection.commit()
+            ingested_files.append(str(bib_path.relative_to(Path(source_root))))
+        except Exception:
+            continue
+    api = LiteratureExplorerApi(store)
+    return {
+        "available": True,
+        "db_path": str(db_path),
+        "ingested_files": ingested_files,
+        "api": api,
+        "store": store,
+    }
+
+
+def bibliography_summary_payload(source_root: str | Path) -> dict[str, Any]:
+    index = load_bibliography_index(source_root)
+    source_files = discover_bib_files(source_root)
+    return {
+        "enabled": bool(index),
+        "entry_count": len(index),
+        "source_files": [str(path.relative_to(Path(source_root))) for path in source_files],
+    }
+
+
+def serialize_bib_entry(entry: dict[str, Any] | None) -> dict[str, Any] | None:
+    if entry is None:
+        return None
+    return {
+        "citation_key": entry.get("citation_key", ""),
+        "entry_type": entry.get("entry_type", ""),
+        "fields": dict(entry.get("fields", {})),
+        "source_bib_path": entry.get("source_bib_path", ""),
+        "raw_bibtex": entry.get("raw_bibtex", ""),
+        "duplicate_source_bib_paths": list(entry.get("duplicate_source_bib_paths", [])),
+    }
+
+
+def serialize_citegeist_entry_payload(payload: dict[str, Any] | None) -> dict[str, Any] | None:
+    if payload is None:
+        return None
+    result = dict(payload)
+    if "raw_bibtex" in result and isinstance(result["raw_bibtex"], str):
+        return result
+    return json.loads(json.dumps(result))
+
+
+def _parse_bib_entries(text: str, *, symbols: dict[str, Any] | None) -> list[Any]:
+    if symbols is not None:
+        try:
+            return symbols["parse_bibtex"](text)
+        except Exception:
+            pass
+    return _fallback_parse_bibtex(text, symbols=symbols)
+
+
+def _render_entry_bibtex(entry: Any, *, symbols: dict[str, Any] | None) -> str:
+    if symbols is not None:
+        try:
+            return symbols["render_bibtex"]([entry])
+        except Exception:
+            pass
+    fields = []
+    for key, value in entry.fields.items():
+        fields.append(f" {key} = {{{value}}}")
+    body = ",\n".join(fields)
+    return f"@{entry.entry_type}{{{entry.citation_key},\n{body}\n}}"
+
+
+def _fallback_parse_bibtex(text: str, *, symbols: dict[str, Any] | None) -> list[Any]:
+    BibEntry = symbols["BibEntry"] if symbols is not None else None
+    entries: list[Any] = []
+    pattern = re.compile(r"@(?P<entry_type>[A-Za-z]+)\s*\{\s*(?P<citation_key>[^,\s]+)\s*,", re.MULTILINE)
+    matches = list(pattern.finditer(text))
+    for index, match in enumerate(matches):
+        start = match.end()
+        end = matches[index + 1].start() if index + 1 < len(matches) else len(text)
+        body = text[start:end]
+        fields = _fallback_parse_fields(body)
+        if BibEntry is not None:
+            entries.append(BibEntry(entry_type=match.group("entry_type").lower(), citation_key=match.group("citation_key").strip(), fields=fields))
+        else:
+            entries.append(type("BibEntryFallback", (), {"entry_type": match.group("entry_type").lower(), "citation_key": match.group("citation_key").strip(), "fields": fields})())
+    return entries
+
+
+def _fallback_parse_fields(body: str) -> dict[str, str]:
+    fields: dict[str, str] = {}
+    index = 0
+    length = len(body)
+    while index < length:
+        while index < length and body[index] in " \t\r\n,":
+            index += 1
+        if index >= length or body[index] == "}":
+            break
+        key_start = index
+        while index < length and re.match(r"[A-Za-z0-9_:-]", body[index]):
+            index += 1
+        key = body[key_start:index].strip().lower()
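+        # Skip whitespace and the "=" separator, then parse the field value in
+        # one of three BibTeX forms: a brace-delimited group (tracking nesting
+        # depth), a double-quoted string (honoring backslash escapes), or a
+        # bare token terminated by a comma or newline.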
+ while index < length and body[index] in " \t\r\n=": + index += 1 + value = "" + if index < length and body[index] == "{": + depth = 1 + index += 1 + value_start = index + while index < length and depth > 0: + if body[index] == "{": + depth += 1 + elif body[index] == "}": + depth -= 1 + if depth == 0: + break + index += 1 + value = body[value_start:index].strip() + index += 1 + elif index < length and body[index] == '"': + index += 1 + value_start = index + while index < length and body[index] != '"': + if body[index] == "\\": + index += 1 + index += 1 + value = body[value_start:index].strip() + index += 1 + else: + value_start = index + while index < length and body[index] not in ",\n": + index += 1 + value = body[value_start:index].strip() + if key: + fields[key] = value.rstrip(",") + return fields diff --git a/src/groundrecall/cli.py b/src/groundrecall/cli.py new file mode 100644 index 0000000..0206d0e --- /dev/null +++ b/src/groundrecall/cli.py @@ -0,0 +1,40 @@ +from __future__ import annotations + +import argparse +import sys + +from . import assistant_export, export, ingest, inspect, lint, promotion, query, review_server + + +COMMANDS = { + "import": ingest.main, + "lint": lint.main, + "promote": promotion.main, + "query": query.main, + "export": export.main, + "assistant-export": assistant_export.main, + "inspect": inspect.main, + "review-server": review_server.main, +} + + +def build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser(description="GroundRecall command-line tools") + parser.add_argument("command", nargs="?", choices=sorted(COMMANDS)) + return parser + + +def main() -> None: + argv = sys.argv[1:] + parser = build_parser() + args, remainder = parser.parse_known_args(argv) + if not args.command: + parser.print_help() + return + handler = COMMANDS[args.command] + original_argv = sys.argv + try: + sys.argv = [f"groundrecall.cli {args.command}", *remainder] + handler() + finally: + sys.argv = original_argv diff --git a/src/groundrecall/export.py b/src/groundrecall/export.py new file mode 100644 index 0000000..0188e52 --- /dev/null +++ b/src/groundrecall/export.py @@ -0,0 +1,136 @@ +from __future__ import annotations + +import argparse +import json +from datetime import datetime, timezone +from pathlib import Path +from typing import Any + +from .query import build_query_bundle_for_concept +from .store import GroundRecallStore + + +def _now() -> str: + return datetime.now(timezone.utc).replace(microsecond=0).isoformat().replace("+00:00", "Z") + + +def _write_json(path: Path, payload: dict[str, Any]) -> None: + path.write_text(json.dumps(payload, indent=2), encoding="utf-8") + + +def _write_jsonl(path: Path, rows: list[dict[str, Any]]) -> None: + text = "\n".join(json.dumps(row, sort_keys=True) for row in rows) + if text: + text += "\n" + path.write_text(text, encoding="utf-8") + + +def export_canonical_snapshot( + store_dir: str | Path, + out_dir: str | Path, + snapshot_id: str | None = None, + metadata: dict[str, Any] | None = None, +) -> dict[str, str]: + store = GroundRecallStore(store_dir) + target = Path(out_dir) + target.mkdir(parents=True, exist_ok=True) + + actual_snapshot_id = snapshot_id or f"snapshot-export-{datetime.now(timezone.utc).strftime('%Y%m%dT%H%M%SZ')}" + snapshot = store.build_snapshot( + snapshot_id=actual_snapshot_id, + created_at=_now(), + metadata={"export_kind": "canonical", **(metadata or {})}, + ) + store.save_snapshot(snapshot) + + snapshot_path = target / "groundrecall_snapshot.json" + _write_json(snapshot_path, 
snapshot.model_dump()) + _write_jsonl(target / "claims.jsonl", [item.model_dump() for item in snapshot.claims]) + _write_jsonl(target / "concepts.jsonl", [item.model_dump() for item in snapshot.concepts]) + _write_jsonl(target / "relations.jsonl", [item.model_dump() for item in snapshot.relations]) + provenance_manifest = { + "snapshot_id": snapshot.snapshot_id, + "created_at": snapshot.created_at, + "source_count": len(snapshot.sources), + "artifact_count": len(snapshot.artifacts), + "observation_count": len(snapshot.observations), + } + _write_json(target / "provenance_manifest.json", provenance_manifest) + manifest = { + "export_kind": "canonical", + "snapshot_id": snapshot.snapshot_id, + "files": [ + "groundrecall_snapshot.json", + "claims.jsonl", + "concepts.jsonl", + "relations.jsonl", + "provenance_manifest.json", + ], + } + _write_json(target / "export_manifest.json", manifest) + return { + "snapshot_json": str(snapshot_path), + "claims_jsonl": str(target / "claims.jsonl"), + "concepts_jsonl": str(target / "concepts.jsonl"), + "relations_jsonl": str(target / "relations.jsonl"), + "provenance_manifest_json": str(target / "provenance_manifest.json"), + "export_manifest_json": str(target / "export_manifest.json"), + } + + +def export_query_bundle( + store_dir: str | Path, + concept_ref: str, + out_path: str | Path, +) -> dict[str, Any]: + payload = build_query_bundle_for_concept(store_dir, concept_ref) + if payload is None: + raise KeyError(f"Unknown concept reference: {concept_ref}") + path = Path(out_path) + path.parent.mkdir(parents=True, exist_ok=True) + _write_json(path, payload) + return payload + + +def export_canonical_bundle( + store_dir: str | Path, + out_dir: str | Path, + concept_refs: list[str] | None = None, + snapshot_id: str | None = None, +) -> dict[str, Any]: + target = Path(out_dir) + target.mkdir(parents=True, exist_ok=True) + outputs = export_canonical_snapshot(store_dir, target, snapshot_id=snapshot_id) + query_bundle_paths: list[str] = [] + for concept_ref in concept_refs or []: + safe_name = concept_ref.lower().replace(" ", "-").replace("::", "-") + bundle_path = target / f"query_bundle__{safe_name}.json" + export_query_bundle(store_dir, concept_ref, bundle_path) + query_bundle_paths.append(str(bundle_path)) + manifest = json.loads((target / "export_manifest.json").read_text(encoding="utf-8")) + manifest["query_bundles"] = query_bundle_paths + _write_json(target / "export_manifest.json", manifest) + return { + "canonical_outputs": outputs, + "query_bundles": query_bundle_paths, + } + + +def build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser(description="Export canonical GroundRecall artifacts.") + parser.add_argument("store_dir") + parser.add_argument("out_dir") + parser.add_argument("--snapshot-id", default=None) + parser.add_argument("--concept", action="append", default=[]) + return parser + + +def main() -> None: + args = build_parser().parse_args() + payload = export_canonical_bundle( + store_dir=args.store_dir, + out_dir=args.out_dir, + concept_refs=list(args.concept or []), + snapshot_id=args.snapshot_id, + ) + print(json.dumps(payload, indent=2)) diff --git a/src/groundrecall/groundrecall_assistant_export.py b/src/groundrecall/groundrecall_assistant_export.py new file mode 100644 index 0000000..a4dffde --- /dev/null +++ b/src/groundrecall/groundrecall_assistant_export.py @@ -0,0 +1,12 @@ +from __future__ import annotations + +"""Legacy flat GroundRecall assistant export module. 
+
+Compatibility path retained during the internal namespace migration.
+Prefer imports under ``groundrecall.assistant_export`` or CLI usage
+via ``groundrecall.cli`` for new code.
+"""
+
+from .assistant_export import build_parser, export_assistant_bundle, main
+
+__all__ = ["export_assistant_bundle", "build_parser", "main"]
diff --git a/src/groundrecall/groundrecall_assistants/__init__.py b/src/groundrecall/groundrecall_assistants/__init__.py
new file mode 100644
index 0000000..86b3db6
--- /dev/null
+++ b/src/groundrecall/groundrecall_assistants/__init__.py
@@ -0,0 +1,9 @@
+"""Legacy flat GroundRecall assistants package.
+
+Compatibility path retained during the internal namespace migration.
+Prefer imports under ``groundrecall.assistants`` for new code.
+"""
+
+from .base import get_assistant_adapter, list_assistant_adapters
+
+__all__ = ["get_assistant_adapter", "list_assistant_adapters"]
diff --git a/src/groundrecall/groundrecall_assistants/base.py b/src/groundrecall/groundrecall_assistants/base.py
new file mode 100644
index 0000000..9028102
--- /dev/null
+++ b/src/groundrecall/groundrecall_assistants/base.py
@@ -0,0 +1,43 @@
+from __future__ import annotations
+
+"""Legacy flat GroundRecall assistant adapter base module.
+
+Compatibility path retained during the internal namespace migration.
+Prefer imports under ``groundrecall.assistants.base`` for new
+code.
+"""
+
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Protocol
+
+
+class AssistantAdapter(Protocol):
+    name: str
+
+    def export_bundle(self, snapshot: dict, query_bundles: list[dict], out_dir: str | Path) -> list[Path]:
+        ...
+
+    def build_context(self, query_result: dict) -> dict:
+        ...
+
+    def supported_capabilities(self) -> dict[str, bool]:
+        ...
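+
+# A concrete adapter satisfies AssistantAdapter structurally and registers an
+# instance at import time, mirroring the bundled adapters. Sketch (the name
+# "my_assistant" is hypothetical):
+#
+#     class MyAdapter:
+#         name = "my_assistant"
+#         ...  # implement export_bundle, build_context, supported_capabilities
+#
+#     register_assistant_adapter(MyAdapter())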
+ + +_REGISTRY: dict[str, AssistantAdapter] = {} + + +def register_assistant_adapter(adapter: AssistantAdapter) -> AssistantAdapter: + _REGISTRY[adapter.name] = adapter + return adapter + + +def get_assistant_adapter(name: str) -> AssistantAdapter: + try: + return _REGISTRY[name] + except KeyError as exc: + raise KeyError(f"Unknown assistant adapter: {name}") from exc + + +def list_assistant_adapters() -> list[str]: + return sorted(_REGISTRY) diff --git a/src/groundrecall/groundrecall_assistants/claude_code.py b/src/groundrecall/groundrecall_assistants/claude_code.py new file mode 100644 index 0000000..69f5139 --- /dev/null +++ b/src/groundrecall/groundrecall_assistants/claude_code.py @@ -0,0 +1,69 @@ +from __future__ import annotations + +import json +from pathlib import Path + +from .base import register_assistant_adapter + + +class ClaudeCodeAdapter: + name = "claude_code" + + def export_bundle(self, snapshot: dict, query_bundles: list[dict], out_dir: str | Path) -> list[Path]: + target = Path(out_dir) + target.mkdir(parents=True, exist_ok=True) + paths: list[Path] = [] + + memory_md = "\n".join( + [ + "# GroundRecall Memory", + "", + f"- Snapshot: `{snapshot.get('snapshot_id', '')}`", + f"- Concepts: {len(snapshot.get('concepts', []))}", + f"- Claims: {len(snapshot.get('claims', []))}", + "", + "Prefer the canonical GroundRecall snapshot and query bundles over free-form recollection.", + "", + "## Query Bundles", + ] + + [f"- `{bundle.get('concept', {}).get('concept_id', 'unknown')}`" for bundle in query_bundles] + ) + memory_path = target / "CLAUDE.md" + memory_path.write_text(memory_md, encoding="utf-8") + paths.append(memory_path) + + bundle_path = target / "claude_code_bundle.json" + bundle_path.write_text( + json.dumps( + { + "assistant": "claude_code", + "snapshot_id": snapshot.get("snapshot_id", ""), + "query_bundle_count": len(query_bundles), + "query_bundles": query_bundles, + }, + indent=2, + ), + encoding="utf-8", + ) + paths.append(bundle_path) + return paths + + def build_context(self, query_result: dict) -> dict: + return { + "assistant": "claude_code", + "memory_kind": "groundrecall_query_bundle", + "concept": query_result.get("concept", {}), + "claims": query_result.get("relevant_claims", []), + "support": query_result.get("supporting_observations", []), + "next_actions": query_result.get("suggested_next_actions", []), + } + + def supported_capabilities(self) -> dict[str, bool]: + return { + "skill_markdown": False, + "json_bundle": True, + "project_memory": True, + } + + +register_assistant_adapter(ClaudeCodeAdapter()) diff --git a/src/groundrecall/groundrecall_assistants/codex.py b/src/groundrecall/groundrecall_assistants/codex.py new file mode 100644 index 0000000..9a9763f --- /dev/null +++ b/src/groundrecall/groundrecall_assistants/codex.py @@ -0,0 +1,78 @@ +from __future__ import annotations + +import json +from pathlib import Path + +from .base import register_assistant_adapter + + +class CodexAdapter: + name = "codex" + + def export_bundle(self, snapshot: dict, query_bundles: list[dict], out_dir: str | Path) -> list[Path]: + target = Path(out_dir) + target.mkdir(parents=True, exist_ok=True) + paths: list[Path] = [] + + skill_payload = { + "name": f"groundrecall-{snapshot.get('snapshot_id', 'snapshot')}", + "description": "GroundRecall assistant adapter bundle for Codex.", + "snapshot_id": snapshot.get("snapshot_id", ""), + "concept_count": len(snapshot.get("concepts", [])), + "claim_count": len(snapshot.get("claims", [])), + } + skill_md = "\n".join( + [ + "---", 
+ f"name: {skill_payload['name']}", + f"description: {skill_payload['description']}", + "---", + "", + "# GroundRecall Codex Bundle", + "", + f"- Snapshot: `{skill_payload['snapshot_id']}`", + f"- Concepts: {skill_payload['concept_count']}", + f"- Claims: {skill_payload['claim_count']}", + "", + "Use the accompanying canonical JSON and query bundles as the primary source of grounded context.", + ] + ) + skill_path = target / "SKILL.md" + skill_path.write_text(skill_md, encoding="utf-8") + paths.append(skill_path) + + bundle_path = target / "codex_bundle.json" + bundle_path.write_text( + json.dumps( + { + "assistant": "codex", + "snapshot_id": snapshot.get("snapshot_id", ""), + "query_bundle_count": len(query_bundles), + "query_bundles": query_bundles, + }, + indent=2, + ), + encoding="utf-8", + ) + paths.append(bundle_path) + return paths + + def build_context(self, query_result: dict) -> dict: + return { + "assistant": "codex", + "context_kind": "groundrecall_query_bundle", + "concept": query_result.get("concept", {}), + "relevant_claims": query_result.get("relevant_claims", []), + "supporting_observations": query_result.get("supporting_observations", []), + "suggested_next_actions": query_result.get("suggested_next_actions", []), + } + + def supported_capabilities(self) -> dict[str, bool]: + return { + "skill_markdown": True, + "json_bundle": True, + "project_memory": False, + } + + +register_assistant_adapter(CodexAdapter()) diff --git a/src/groundrecall/groundrecall_discovery.py b/src/groundrecall/groundrecall_discovery.py new file mode 100644 index 0000000..3ddddb2 --- /dev/null +++ b/src/groundrecall/groundrecall_discovery.py @@ -0,0 +1,54 @@ +from __future__ import annotations + +from dataclasses import dataclass +from pathlib import Path + + +TEXT_EXTENSIONS = { + ".md", + ".markdown", + ".txt", + ".tex", + ".json", + ".yaml", + ".yml", + ".csv", + ".log", +} + + +@dataclass +class DiscoveredArtifact: + path: Path + relative_path: str + artifact_kind: str + is_text: bool + + +def classify_artifact(root: Path, path: Path) -> DiscoveredArtifact: + rel = path.relative_to(root).as_posix() + top = rel.split("/", 1)[0] + suffix = path.suffix.lower() + is_text = suffix in TEXT_EXTENSIONS or path.name in {"README", "LICENSE"} + artifact_kind = "generic_artifact" + if top == "wiki": + artifact_kind = "compiled_page" + elif top in {"raw", "sources"}: + artifact_kind = "raw_note" + elif top == "logs": + artifact_kind = "session_log" + elif path.name.startswith("schema."): + artifact_kind = "schema_file" + elif suffix in {".md", ".markdown"}: + artifact_kind = "markdown_note" + return DiscoveredArtifact(path=path, relative_path=rel, artifact_kind=artifact_kind, is_text=is_text) + + +def discover_llmwiki_artifacts(root: str | Path) -> list[DiscoveredArtifact]: + base = Path(root) + artifacts: list[DiscoveredArtifact] = [] + for path in sorted(p for p in base.rglob("*") if p.is_file()): + if any(part in {".git", "__pycache__", ".pytest_cache"} for part in path.parts): + continue + artifacts.append(classify_artifact(base, path)) + return artifacts diff --git a/src/groundrecall/groundrecall_export.py b/src/groundrecall/groundrecall_export.py new file mode 100644 index 0000000..c6ee989 --- /dev/null +++ b/src/groundrecall/groundrecall_export.py @@ -0,0 +1,24 @@ +from __future__ import annotations + +"""Legacy flat GroundRecall export module. + +Compatibility path retained during the internal namespace migration. 
+Prefer imports under ``groundrecall.export`` or CLI usage via
+``groundrecall.cli`` for new code.
+"""
+
+from .export import (
+    build_parser,
+    export_canonical_bundle,
+    export_canonical_snapshot,
+    export_query_bundle,
+    main,
+)
+
+__all__ = [
+    "export_canonical_snapshot",
+    "export_query_bundle",
+    "export_canonical_bundle",
+    "build_parser",
+    "main",
+]
diff --git a/src/groundrecall/groundrecall_import.py b/src/groundrecall/groundrecall_import.py
new file mode 100644
index 0000000..280e2af
--- /dev/null
+++ b/src/groundrecall/groundrecall_import.py
@@ -0,0 +1,9 @@
+from __future__ import annotations
+
+"""Legacy extracted GroundRecall import module.
+
+Compatibility path retained while the standalone repo converges on the
+top-level ``groundrecall.ingest`` module as the primary implementation.
+"""
+
+from .ingest import ImportResult, build_parser, main, run_groundrecall_import
diff --git a/src/groundrecall/groundrecall_lint.py b/src/groundrecall/groundrecall_lint.py
new file mode 100644
index 0000000..ec76a9f
--- /dev/null
+++ b/src/groundrecall/groundrecall_lint.py
@@ -0,0 +1,9 @@
+from __future__ import annotations
+
+"""Legacy extracted GroundRecall lint module.
+
+Compatibility path retained while the standalone repo converges on the
+top-level ``groundrecall.lint`` module as the primary implementation.
+"""
+
+from .lint import build_parser, lint_import_directory, main
diff --git a/src/groundrecall/groundrecall_models.py b/src/groundrecall/groundrecall_models.py
new file mode 100644
index 0000000..7c9dee7
--- /dev/null
+++ b/src/groundrecall/groundrecall_models.py
@@ -0,0 +1,9 @@
+from __future__ import annotations
+
+"""Legacy extracted GroundRecall models module.
+
+Compatibility path retained while the standalone repo converges on the
+top-level ``groundrecall.models`` module as the primary implementation.
+""" + +from .models import * # noqa: F403 diff --git a/src/groundrecall/groundrecall_normalizer.py b/src/groundrecall/groundrecall_normalizer.py new file mode 100644 index 0000000..fa0c872 --- /dev/null +++ b/src/groundrecall/groundrecall_normalizer.py @@ -0,0 +1,136 @@ +from __future__ import annotations + +from dataclasses import asdict, dataclass +from hashlib import sha256 +from pathlib import Path +from typing import Any + +from .groundrecall_discovery import DiscoveredArtifact +from .groundrecall_segmenter import SegmentedPage, SegmentedObservation + + +@dataclass +class ImportContext: + import_id: str + import_mode: str + machine_id: str + agent_id: str + source_root: str + imported_at: str + + +def _sanitize_claim_key(value: str) -> str: + text = "".join(ch.lower() if ch.isalnum() else "-" for ch in value).strip("-") + return text or "claim" + + +def _claim_id_for_observation(observation_record: dict[str, Any], observation: SegmentedObservation, index: int) -> str: + if observation.explicit_claim_key: + return f"clm_{_sanitize_claim_key(observation.explicit_claim_key)}" + return f"clm_{observation_record['observation_id']}_{index}" + + +def build_artifact_record(context: ImportContext, artifact: DiscoveredArtifact, page: SegmentedPage | None) -> dict[str, Any]: + record = { + "artifact_id": f"ia_{sha256(artifact.relative_path.encode('utf-8')).hexdigest()[:12]}", + "import_id": context.import_id, + "artifact_kind": artifact.artifact_kind, + "path": artifact.relative_path, + "title": page.title if page else Path(artifact.relative_path).stem, + "sha256": sha256(artifact.path.read_bytes()).hexdigest(), + "created_at": context.imported_at, + "metadata": { + "frontmatter": page.frontmatter if page else {}, + "headings": page.headings if page else [], + }, + "current_status": "draft", + } + return record + + +def build_observation_record( + context: ImportContext, + artifact_record: dict[str, Any], + observation: SegmentedObservation, + index: int, +) -> dict[str, Any]: + return { + "observation_id": f"obs_{artifact_record['artifact_id']}_{index}", + "import_id": context.import_id, + "artifact_id": artifact_record["artifact_id"], + "role": observation.role, + "text": observation.text, + "origin_path": observation.artifact_relative_path, + "origin_section": observation.section, + "line_start": observation.line_start, + "line_end": observation.line_end, + "grounding_status": observation.grounding_status, + "support_kind": observation.support_kind, + "confidence_hint": observation.confidence_hint, + "current_status": "draft", + } + + +def build_claim_record( + context: ImportContext, + observation_record: dict[str, Any], + observation: SegmentedObservation, + concept_ids: list[str], + index: int, +) -> dict[str, Any]: + return { + "claim_id": _claim_id_for_observation(observation_record, observation, index), + "import_id": context.import_id, + "claim_text": observation_record["text"], + "claim_kind": "statement" if observation_record["role"] == "claim" else "summary", + "source_observation_ids": [observation_record["observation_id"]], + "supporting_fragment_ids": [], + "concept_ids": [f"concept::{concept_id}" for concept_id in concept_ids], + "contradicts_claim_ids": [f"clm_{_sanitize_claim_key(value)}" for value in observation.contradict_keys], + "supersedes_claim_ids": [f"clm_{_sanitize_claim_key(value)}" for value in observation.supersede_keys], + "confidence_hint": observation_record["confidence_hint"], + "grounding_status": observation_record["grounding_status"], + "current_status": 
"triaged" if observation_record["grounding_status"] != "ungrounded" else "draft", + } + + +def build_concept_records(context: ImportContext, artifact_record: dict[str, Any], concept_ids: list[str]) -> list[dict[str, Any]]: + records = [] + for concept_id in concept_ids: + records.append( + { + "concept_id": f"concept::{concept_id}", + "import_id": context.import_id, + "title": concept_id.replace("-", " ").title(), + "aliases": [], + "description": "Imported concept from llmwiki corpus.", + "source_artifact_ids": [artifact_record["artifact_id"]], + "current_status": "triaged", + } + ) + return records + + +def build_relation_records(context: ImportContext, artifact_record: dict[str, Any], concept_ids: list[str], links: list[str]) -> list[dict[str, Any]]: + if not concept_ids: + return [] + primary = f"concept::{concept_ids[0]}" + records = [] + for idx, link in enumerate(links, start=1): + target = f"concept::{link.lower().replace(' ', '-')}" + records.append( + { + "relation_id": f"rel_{artifact_record['artifact_id']}_{idx}", + "import_id": context.import_id, + "source_id": primary, + "target_id": target, + "relation_type": "references", + "evidence_ids": [], + "current_status": "draft", + } + ) + return records + + +def manifest_record(context: ImportContext) -> dict[str, Any]: + return asdict(context) | {"source_repo_kind": "llmwiki"} diff --git a/src/groundrecall/groundrecall_promotion.py b/src/groundrecall/groundrecall_promotion.py new file mode 100644 index 0000000..95e19ee --- /dev/null +++ b/src/groundrecall/groundrecall_promotion.py @@ -0,0 +1,9 @@ +from __future__ import annotations + +"""Legacy extracted GroundRecall promotion module. + +Compatibility path retained while the standalone repo converges on the +top-level ``groundrecall.promotion`` module as the primary implementation. +""" + +from .promotion import build_parser, main, promote_import_to_store diff --git a/src/groundrecall/groundrecall_query.py b/src/groundrecall/groundrecall_query.py new file mode 100644 index 0000000..2beec95 --- /dev/null +++ b/src/groundrecall/groundrecall_query.py @@ -0,0 +1,26 @@ +from __future__ import annotations + +"""Legacy flat GroundRecall query module. + +Compatibility path retained during the internal namespace migration. +Prefer imports under ``didactopus.groundrecall.query`` or CLI usage via +``didactopus.groundrecall.cli`` for new code. 
+"""
+
+from .query import (
+    build_parser,
+    build_query_bundle_for_concept,
+    main,
+    query_concept,
+    query_provenance,
+    search_claims,
+)
+
+__all__ = [
+    "query_concept",
+    "search_claims",
+    "query_provenance",
+    "build_query_bundle_for_concept",
+    "build_parser",
+    "main",
+]
diff --git a/src/groundrecall/groundrecall_review_bridge.py b/src/groundrecall/groundrecall_review_bridge.py
new file mode 100644
index 0000000..677b283
--- /dev/null
+++ b/src/groundrecall/groundrecall_review_bridge.py
@@ -0,0 +1,138 @@
+from __future__ import annotations
+
+import argparse
+import json
+from collections import defaultdict
+from pathlib import Path
+from typing import Any
+
+from .review_export import build_citation_review_entries_from_import, export_review_state_json, export_review_ui_data
+from .review_schema import ConceptReviewEntry, DraftPackData, ReviewSession
+
+
+def _read_json(path: Path) -> dict[str, Any]:
+    return json.loads(path.read_text(encoding="utf-8"))
+
+
+def _read_jsonl(path: Path) -> list[dict[str, Any]]:
+    if not path.exists():
+        return []
+    text = path.read_text(encoding="utf-8").strip()
+    if not text:
+        return []
+    return [json.loads(line) for line in text.splitlines()]
+
+
+def _claim_summary(claims: list[dict[str, Any]]) -> list[str]:
+    lines: list[str] = []
+    for claim in claims[:3]:
+        grounding = claim.get("grounding_status", "unknown")
+        lines.append(f"Claim: {claim.get('claim_text', '')} [{grounding}]")
+    if len(claims) > 3:
+        lines.append(f"{len(claims) - 3} additional claims omitted from notes summary.")
+    return lines
+
+
+def build_review_session_from_import(import_dir: str | Path, reviewer: str = "GroundRecall Import") -> ReviewSession:
+    base = Path(import_dir)
+    manifest = _read_json(base / "manifest.json")
+    lint_payload = _read_json(base / "lint_findings.json")
+    claims = _read_jsonl(base / "claims.jsonl")
+    concepts = _read_jsonl(base / "concepts.jsonl")
+
+    claims_by_concept: defaultdict[str, list[dict[str, Any]]] = defaultdict(list)
+    for claim in claims:
+        for concept_id in claim.get("concept_ids", []):
+            claims_by_concept[concept_id].append(claim)
+
+    findings_by_target: defaultdict[str, list[dict[str, Any]]] = defaultdict(list)
+    concept_findings: defaultdict[str, list[dict[str, Any]]] = defaultdict(list)
+    for finding in lint_payload.get("findings", []):
+        findings_by_target[finding["target_id"]].append(finding)
+    for claim in claims:
+        for concept_id in claim.get("concept_ids", []):
+            concept_findings[concept_id].extend(findings_by_target.get(claim["claim_id"], []))
+    for concept in concepts:
+        concept_findings[concept["concept_id"]].extend(findings_by_target.get(concept["concept_id"], []))
+
+    entries: list[ConceptReviewEntry] = []
+    for concept in concepts:
+        concept_id = concept["concept_id"]
+        related_claims = claims_by_concept.get(concept_id, [])
+        related_findings = concept_findings.get(concept_id, [])
+        has_errors = any(item["severity"] == "error" for item in related_findings)
+        all_grounded = bool(related_claims) and all(item.get("grounding_status") == "grounded" for item in related_claims)
+        status = "needs_review"
+        if not has_errors and all_grounded:
+            status = "provisional"
+
+        notes = _claim_summary(related_claims)
+        notes.extend(item["message"] for item in related_findings[:5])
+
+        entries.append(
+            ConceptReviewEntry(
+                concept_id=concept_id.replace("concept::", "", 1),
+                title=concept.get("title", concept_id),
+                description=concept.get("description", ""),
+                prerequisites=[],
+                mastery_signals=[],
+                status=status,
notes=notes, + ) + ) + + conflicts = [item["message"] for item in lint_payload.get("findings", []) if item["severity"] == "error"] + review_flags = [item["message"] for item in lint_payload.get("findings", []) if item["severity"] == "warning"] + pack = { + "name": f"groundrecall-import-{manifest['import_id']}", + "display_name": f"GroundRecall Import {manifest['import_id']}", + "version": "0.1.0-draft", + "source_import_id": manifest["import_id"], + "source_root": manifest.get("source_root", ""), + } + attribution = { + "source_repo_kind": manifest.get("source_repo_kind", "llmwiki"), + "source_root": manifest.get("source_root", ""), + "imported_at": manifest.get("imported_at", ""), + "machine_id": manifest.get("machine_id", ""), + "rights_note": "Imported llmwiki-style corpus requires review before promotion.", + } + return ReviewSession( + reviewer=reviewer, + draft_pack=DraftPackData( + pack=pack, + concepts=entries, + conflicts=conflicts, + review_flags=review_flags, + attribution=attribution, + ), + citation_reviews=build_citation_review_entries_from_import(base), + ) + + +def export_review_bundle_from_import(import_dir: str | Path, out_dir: str | Path | None = None, reviewer: str = "GroundRecall Import") -> dict[str, str]: + base = Path(import_dir) + target = Path(out_dir) if out_dir is not None else base + target.mkdir(parents=True, exist_ok=True) + session = build_review_session_from_import(base, reviewer=reviewer) + review_state_path = target / "review_session.json" + export_review_state_json(session, review_state_path) + export_review_ui_data(session, target, import_dir=base) + return { + "review_session_json": str(review_state_path), + "review_data_json": str(target / "review_data.json"), + } + + +def build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser(description="Build Didactopus review artifacts from a GroundRecall import.") + parser.add_argument("import_dir") + parser.add_argument("--out-dir", default=None) + parser.add_argument("--reviewer", default="GroundRecall Import") + return parser + + +def main() -> None: + args = build_parser().parse_args() + outputs = export_review_bundle_from_import(args.import_dir, out_dir=args.out_dir, reviewer=args.reviewer) + print(json.dumps(outputs, indent=2)) diff --git a/src/groundrecall/groundrecall_review_queue.py b/src/groundrecall/groundrecall_review_queue.py new file mode 100644 index 0000000..23196c9 --- /dev/null +++ b/src/groundrecall/groundrecall_review_queue.py @@ -0,0 +1,114 @@ +from __future__ import annotations + +import argparse +import json +from collections import defaultdict +from pathlib import Path +from typing import Any + + +def _read_json(path: Path) -> dict[str, Any]: + return json.loads(path.read_text(encoding="utf-8")) + + +def _read_jsonl(path: Path) -> list[dict[str, Any]]: + if not path.exists(): + return [] + text = path.read_text(encoding="utf-8").strip() + if not text: + return [] + return [json.loads(line) for line in text.splitlines()] + + +def _triage_lane(item: dict[str, Any], finding_codes: set[str]) -> str: + if {"claim_ungrounded", "ungrounded_summary"} & finding_codes: + return "source_cleanup" + if {"relation_missing_source", "relation_missing_target", "orphan_concept"} & finding_codes: + return "conflict_resolution" + return "knowledge_capture" + + +def _priority(item: dict[str, Any], finding_codes: set[str]) -> int: + priority = 50 + if item.get("grounding_status") == "grounded": + priority -= 10 + if item.get("current_status") == "triaged": + priority -= 5 + if 
any(code.startswith("claim_") or code.startswith("relation_") for code in finding_codes): + priority += 20 + priority -= min(len(finding_codes) * 2, 10) + return max(priority, 1) + + +def build_review_queue(import_dir: str | Path) -> dict[str, Any]: + base = Path(import_dir) + manifest = _read_json(base / "manifest.json") + lint_payload = _read_json(base / "lint_findings.json") + claims = _read_jsonl(base / "claims.jsonl") + concepts = _read_jsonl(base / "concepts.jsonl") + + findings_by_target: defaultdict[str, list[dict[str, Any]]] = defaultdict(list) + for finding in lint_payload.get("findings", []): + findings_by_target[finding["target_id"]].append(finding) + + queue: list[dict[str, Any]] = [] + + for claim in claims: + related = findings_by_target.get(claim["claim_id"], []) + finding_codes = {item["code"] for item in related} + queue.append( + { + "queue_id": f"rq_{claim['claim_id']}", + "candidate_type": "claim", + "candidate_id": claim["claim_id"], + "title": claim["claim_text"][:100], + "triage_lane": _triage_lane(claim, finding_codes), + "priority": _priority(claim, finding_codes), + "grounding_status": claim.get("grounding_status"), + "status": "needs_review", + "finding_codes": sorted(finding_codes), + "concept_ids": list(claim.get("concept_ids", [])), + } + ) + + for concept in concepts: + related = findings_by_target.get(concept["concept_id"], []) + finding_codes = {item["code"] for item in related} + if not finding_codes: + continue + queue.append( + { + "queue_id": f"rq_{concept['concept_id'].replace('::', '_')}", + "candidate_type": "concept", + "candidate_id": concept["concept_id"], + "title": concept["title"], + "triage_lane": _triage_lane(concept, finding_codes), + "priority": _priority(concept, finding_codes), + "grounding_status": concept.get("grounding_status", "triaged"), + "status": "needs_review", + "finding_codes": sorted(finding_codes), + "concept_ids": [concept["concept_id"]], + } + ) + + queue.sort(key=lambda item: (item["priority"], item["candidate_type"], item["candidate_id"])) + return { + "import_id": manifest["import_id"], + "queue_length": len(queue), + "items": queue, + } + + +def build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser(description="Build a GroundRecall review queue from import artifacts.") + parser.add_argument("import_dir") + parser.add_argument("--out", default=None) + return parser + + +def main() -> None: + args = build_parser().parse_args() + payload = build_review_queue(args.import_dir) + out_path = Path(args.out) if args.out else Path(args.import_dir) / "review_queue.json" + out_path.write_text(json.dumps(payload, indent=2), encoding="utf-8") + print(f"Wrote {out_path}") diff --git a/src/groundrecall/groundrecall_segmenter.py b/src/groundrecall/groundrecall_segmenter.py new file mode 100644 index 0000000..1ae4c0e --- /dev/null +++ b/src/groundrecall/groundrecall_segmenter.py @@ -0,0 +1,180 @@ +from __future__ import annotations + +from dataclasses import dataclass, field +from pathlib import Path +import re + +from .groundrecall_discovery import DiscoveredArtifact + + +HEADING_RE = re.compile(r"^(#{1,6})\s+(.*)$") +FRONTMATTER_DELIM = "---" +ANNOTATION_RE = re.compile(r"\[(claim_id|contradicts|supersedes):([^\]]+)\]", re.IGNORECASE) +TABLE_SEPARATOR_RE = re.compile(r"^\|(?:\s*:?-{3,}:?\s*\|)+\s*$") +LATEX_STRUCTURAL_RE = re.compile(r"^\\(begin|end|centering|caption|label|tikzset|node|draw|path|matrix|includegraphics)\b") +LATEX_MATH_ONLY_RE = re.compile(r"^[\\{}[\]()$&_^%.,;:=+\-*/|<>~0-9A-Za-z ]+$") + + 
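+# Inline claim markers recognized by ANNOTATION_RE take the bracketed form
+# shown in this illustrative note line:
+#     - Channel capacity bounds reliable rate. [claim_id:channel-capacity] [supersedes:old-capacity-note]
+# _parse_annotations() strips the markers from the observation text; the keys
+# are later sanitized into clm_* claim identifiers by the normalizer.
+
+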
+@dataclass +class SegmentedObservation: + artifact_relative_path: str + role: str + text: str + section: str + line_start: int + line_end: int + grounding_status: str + support_kind: str + confidence_hint: float + explicit_claim_key: str = "" + contradict_keys: list[str] = field(default_factory=list) + supersede_keys: list[str] = field(default_factory=list) + + +@dataclass +class SegmentedPage: + title: str + headings: list[str] = field(default_factory=list) + frontmatter: dict[str, str] = field(default_factory=dict) + observations: list[SegmentedObservation] = field(default_factory=list) + concepts: list[str] = field(default_factory=list) + links: list[str] = field(default_factory=list) + + +def _parse_frontmatter(lines: list[str]) -> tuple[dict[str, str], int]: + if not lines or lines[0].strip() != FRONTMATTER_DELIM: + return {}, 0 + data: dict[str, str] = {} + idx = 1 + while idx < len(lines): + stripped = lines[idx].strip() + if stripped == FRONTMATTER_DELIM: + return data, idx + 1 + if ":" in stripped: + key, value = stripped.split(":", 1) + data[key.strip()] = value.strip() + idx += 1 + return data, 0 + + +def _extract_links(text: str) -> list[str]: + return re.findall(r"\[\[([^\]]+)\]\]", text) + + +def _to_concept_id(text: str) -> str: + text = re.sub(r"[^a-zA-Z0-9]+", "-", text.strip().lower()).strip("-") + return text or "untitled" + + +def _parse_annotations(text: str) -> tuple[str, str, list[str], list[str]]: + claim_key = "" + contradict_keys: list[str] = [] + supersede_keys: list[str] = [] + for kind, raw_value in ANNOTATION_RE.findall(text): + values = [value.strip() for value in raw_value.split(",") if value.strip()] + kind_lower = kind.lower() + if kind_lower == "claim_id" and values: + claim_key = values[0] + elif kind_lower == "contradicts": + contradict_keys.extend(values) + elif kind_lower == "supersedes": + supersede_keys.extend(values) + cleaned = ANNOTATION_RE.sub("", text) + cleaned = re.sub(r"\s{2,}", " ", cleaned).strip() + return cleaned, claim_key, contradict_keys, supersede_keys + + +def _should_skip_line(text: str) -> bool: + stripped = text.strip() + if not stripped: + return True + if stripped.startswith("!["): + return True + if stripped in {"---", "```", "};", "{", "}", "", "
", ":::"}: + return True + if stripped.startswith(":::"): + return True + if stripped.startswith("|") and stripped.endswith("|"): + return True + if TABLE_SEPARATOR_RE.match(stripped): + return True + if LATEX_STRUCTURAL_RE.match(stripped): + return True + if stripped.startswith("%"): + return True + if stripped.startswith("\\") and LATEX_MATH_ONLY_RE.match(stripped): + return True + return False + + +def segment_markdown_artifact(artifact: DiscoveredArtifact, text: str | None = None) -> SegmentedPage: + text = artifact.path.read_text(encoding="utf-8") if text is None else text + lines = text.splitlines() + frontmatter, start_idx = _parse_frontmatter(lines) + current_section = frontmatter.get("title", Path(artifact.relative_path).stem.replace("-", " ").title()) + title = current_section + headings: list[str] = [] + observations: list[SegmentedObservation] = [] + concepts: list[str] = [] + links: list[str] = [] + + for idx in range(start_idx, len(lines)): + raw_line = lines[idx] + stripped = raw_line.strip() + if _should_skip_line(stripped): + continue + heading_match = HEADING_RE.match(raw_line) + if heading_match: + current_section = heading_match.group(2).strip() + headings.append(current_section) + if not title and heading_match.group(1) == "#": + title = current_section + concepts.append(_to_concept_id(current_section)) + continue + + role = "summary" + obs_text = stripped + if stripped.startswith(("- ", "* ")): + role = "claim" + obs_text = stripped[2:].strip() + elif stripped.lower().startswith(("todo:", "question:", "q:")): + role = "question" + elif stripped.lower().startswith(("speculation:", "hypothesis:")): + role = "speculation" + elif artifact.artifact_kind == "session_log": + role = "transcript" + + obs_text, claim_key, contradict_keys, supersede_keys = _parse_annotations(obs_text) + + links.extend(_extract_links(obs_text)) + if role in {"summary", "claim"}: + concepts.extend(_to_concept_id(link) for link in _extract_links(obs_text)) + observations.append( + SegmentedObservation( + artifact_relative_path=artifact.relative_path, + role=role, + text=obs_text, + section=current_section, + line_start=idx + 1, + line_end=idx + 1, + grounding_status="partially_grounded" if artifact.artifact_kind == "compiled_page" else "grounded", + support_kind="derived_from_page" if artifact.artifact_kind == "compiled_page" else "direct_source", + confidence_hint=0.55 if role == "speculation" else 0.7 if role == "claim" else 0.6, + explicit_claim_key=claim_key, + contradict_keys=contradict_keys, + supersede_keys=supersede_keys, + ) + ) + + if not headings and title: + headings.append(title) + if not concepts and title: + concepts.append(_to_concept_id(title)) + return SegmentedPage( + title=title, + headings=headings, + frontmatter=frontmatter, + observations=observations, + concepts=sorted({c for c in concepts if c}), + links=sorted({link for link in links if link}), + ) diff --git a/src/groundrecall/groundrecall_source_adapters/__init__.py b/src/groundrecall/groundrecall_source_adapters/__init__.py new file mode 100644 index 0000000..b0b9fd3 --- /dev/null +++ b/src/groundrecall/groundrecall_source_adapters/__init__.py @@ -0,0 +1,15 @@ +"""Legacy flat GroundRecall source adapter package. + +Compatibility path retained during the internal namespace migration. +Prefer imports under ``didactopus.groundrecall.source_adapters`` for new code. +""" + +from .base import get_source_adapter, list_source_adapters +from . import llmwiki # noqa: F401 +from . import polypaper # noqa: F401 +from . 
import doclift_bundle # noqa: F401 +from . import markdown_notes # noqa: F401 +from . import transcript # noqa: F401 +from . import didactopus_pack # noqa: F401 + +__all__ = ["get_source_adapter", "list_source_adapters"] diff --git a/src/groundrecall/groundrecall_source_adapters/base.py b/src/groundrecall/groundrecall_source_adapters/base.py new file mode 100644 index 0000000..3ccc5d6 --- /dev/null +++ b/src/groundrecall/groundrecall_source_adapters/base.py @@ -0,0 +1,76 @@ +from __future__ import annotations + +"""Legacy flat GroundRecall source adapter base module. + +Compatibility path retained during the internal namespace migration. +Prefer imports under ``didactopus.groundrecall.source_adapters.base`` for new +code. +""" + +from dataclasses import dataclass +from pathlib import Path +from typing import Literal, Protocol + + +ImportIntent = Literal["grounded_knowledge", "curriculum", "both"] + + +@dataclass +class DiscoveredImportSource: + path: Path + relative_path: str + source_kind: str + artifact_kind: str + is_text: bool + metadata: dict + + +@dataclass +class StructuredImportRows: + artifact_rows: list[dict] + observation_rows: list[dict] + claim_rows: list[dict] + concept_rows: list[dict] + relation_rows: list[dict] + + +class GroundRecallSourceAdapter(Protocol): + name: str + + def detect(self, root: str | Path) -> bool: + ... + + def discover(self, root: str | Path) -> list[DiscoveredImportSource]: + ... + + def import_intent(self) -> ImportIntent: + ... + + def build_rows(self, context, sources: list[DiscoveredImportSource]) -> StructuredImportRows | None: + ... + + +_REGISTRY: dict[str, GroundRecallSourceAdapter] = {} + + +def register_source_adapter(adapter: GroundRecallSourceAdapter) -> GroundRecallSourceAdapter: + _REGISTRY[adapter.name] = adapter + return adapter + + +def get_source_adapter(name: str) -> GroundRecallSourceAdapter: + try: + return _REGISTRY[name] + except KeyError as exc: + raise KeyError(f"Unknown GroundRecall source adapter: {name}") from exc + + +def list_source_adapters() -> list[str]: + return sorted(_REGISTRY) + + +def detect_source_adapter(root: str | Path) -> GroundRecallSourceAdapter: + for adapter in _REGISTRY.values(): + if adapter.detect(root): + return adapter + raise ValueError(f"No GroundRecall source adapter detected for {root}") diff --git a/src/groundrecall/groundrecall_source_adapters/didactopus_pack.py b/src/groundrecall/groundrecall_source_adapters/didactopus_pack.py new file mode 100644 index 0000000..c55a22f --- /dev/null +++ b/src/groundrecall/groundrecall_source_adapters/didactopus_pack.py @@ -0,0 +1,234 @@ +from __future__ import annotations + +from hashlib import sha256 +import yaml +from pathlib import Path + +from ..artifact_schemas import ConceptsFile, RoadmapFile +from .base import DiscoveredImportSource, StructuredImportRows, register_source_adapter + + +class DidactopusPackSourceAdapter: + name = "didactopus_pack" + + def detect(self, root: str | Path) -> bool: + base = Path(root) + required = {"pack.yaml", "concepts.yaml"} + return required.issubset({path.name for path in base.iterdir() if path.exists()}) + + def discover(self, root: str | Path) -> list[DiscoveredImportSource]: + base = Path(root) + rows: list[DiscoveredImportSource] = [] + for filename in ["pack.yaml", "concepts.yaml", "roadmap.yaml", "projects.yaml", "rubrics.yaml", "review_ledger.json"]: + path = base / filename + if not path.exists(): + continue + rows.append( + DiscoveredImportSource( + path=path, + relative_path=path.relative_to(base).as_posix(), + 
source_kind="didactopus_pack", + artifact_kind="didactopus_pack_artifact", + is_text=True, + metadata={}, + ) + ) + return rows + + def import_intent(self) -> str: + return "both" + + def build_rows(self, context, sources: list[DiscoveredImportSource]) -> StructuredImportRows | None: + by_name = {Path(item.relative_path).name: item for item in sources} + concepts_src = by_name.get("concepts.yaml") + if concepts_src is None: + return None + + pack_src = by_name.get("pack.yaml") + pack_payload = {} + if pack_src is not None: + pack_payload = yaml.safe_load(pack_src.path.read_text(encoding="utf-8")) or {} + concepts_payload = ConceptsFile.model_validate( + yaml.safe_load(concepts_src.path.read_text(encoding="utf-8")) or {"concepts": []} + ) + roadmap_payload = None + roadmap_src = by_name.get("roadmap.yaml") + if roadmap_src is not None: + roadmap_payload = RoadmapFile.model_validate( + yaml.safe_load(roadmap_src.path.read_text(encoding="utf-8")) or {"stages": []} + ) + + artifact_rows: list[dict] = [] + observation_rows: list[dict] = [] + claim_rows: list[dict] = [] + concept_rows: list[dict] = [] + relation_rows: list[dict] = [] + + for source in sources: + artifact_rows.append( + { + "artifact_id": f"ia_{sha256(source.relative_path.encode('utf-8')).hexdigest()[:12]}", + "import_id": context.import_id, + "artifact_kind": source.artifact_kind, + "path": source.relative_path, + "title": source.path.stem, + "sha256": sha256(source.path.read_bytes()).hexdigest(), + "created_at": context.imported_at, + "metadata": {"source_kind": source.source_kind}, + "current_status": "draft", + } + ) + + pack_name = pack_payload.get("name", Path(context.source_root).name) + concepts_artifact_id = next( + (row["artifact_id"] for row in artifact_rows if row["path"] == concepts_src.relative_path), + "", + ) + + for index, concept in enumerate(concepts_payload.concepts, start=1): + concept_key = f"concept::{concept.id}" + concept_rows.append( + { + "concept_id": concept_key, + "import_id": context.import_id, + "title": concept.title, + "aliases": [], + "description": concept.description or f"Imported concept from Didactopus pack {pack_name}.", + "source_artifact_ids": [concepts_artifact_id] if concepts_artifact_id else [], + "current_status": "triaged", + } + ) + observation_id = f"obs_pack_{concept.id}_{index}" + observation_rows.append( + { + "observation_id": observation_id, + "import_id": context.import_id, + "artifact_id": concepts_artifact_id, + "role": "summary", + "text": concept.description or concept.title, + "origin_path": concepts_src.relative_path, + "origin_section": concept.title, + "line_start": 0, + "line_end": 0, + "grounding_status": "grounded", + "support_kind": "direct_source", + "confidence_hint": 0.85, + "current_status": "draft", + } + ) + claim_rows.append( + { + "claim_id": f"clm_pack_{concept.id}", + "import_id": context.import_id, + "claim_text": concept.description or f"{concept.title} is a concept in pack {pack_name}.", + "claim_kind": "summary", + "source_observation_ids": [observation_id], + "supporting_fragment_ids": [], + "concept_ids": [concept_key], + "contradicts_claim_ids": [], + "supersedes_claim_ids": [], + "confidence_hint": 0.85, + "grounding_status": "grounded", + "current_status": "triaged", + } + ) + for prereq in concept.prerequisites: + relation_rows.append( + { + "relation_id": f"rel_prereq_{concept.id}_{prereq}", + "import_id": context.import_id, + "source_id": f"concept::{prereq}", + "target_id": concept_key, + "relation_type": "prerequisite", + "evidence_ids": 
[f"clm_pack_{concept.id}"], + "current_status": "draft", + } + ) + for signal_idx, signal in enumerate(concept.mastery_signals, start=1): + signal_obs_id = f"obs_signal_{concept.id}_{signal_idx}" + observation_rows.append( + { + "observation_id": signal_obs_id, + "import_id": context.import_id, + "artifact_id": concepts_artifact_id, + "role": "summary", + "text": signal, + "origin_path": concepts_src.relative_path, + "origin_section": f"{concept.title} mastery signal", + "line_start": 0, + "line_end": 0, + "grounding_status": "grounded", + "support_kind": "direct_source", + "confidence_hint": 0.8, + "current_status": "draft", + } + ) + claim_rows.append( + { + "claim_id": f"clm_signal_{concept.id}_{signal_idx}", + "import_id": context.import_id, + "claim_text": signal, + "claim_kind": "mastery_signal", + "source_observation_ids": [signal_obs_id], + "supporting_fragment_ids": [], + "concept_ids": [concept_key], + "contradicts_claim_ids": [], + "supersedes_claim_ids": [], + "confidence_hint": 0.8, + "grounding_status": "grounded", + "current_status": "triaged", + } + ) + + if roadmap_payload is not None and roadmap_src is not None: + roadmap_artifact_id = next( + (row["artifact_id"] for row in artifact_rows if row["path"] == roadmap_src.relative_path), + "", + ) + for stage in roadmap_payload.stages: + for concept_id in stage.concepts: + observation_id = f"obs_stage_{stage.id}_{concept_id}" + observation_rows.append( + { + "observation_id": observation_id, + "import_id": context.import_id, + "artifact_id": roadmap_artifact_id, + "role": "summary", + "text": f"{concept_id} appears in roadmap stage {stage.title}.", + "origin_path": roadmap_src.relative_path, + "origin_section": stage.title, + "line_start": 0, + "line_end": 0, + "grounding_status": "grounded", + "support_kind": "direct_source", + "confidence_hint": 0.75, + "current_status": "draft", + } + ) + claim_rows.append( + { + "claim_id": f"clm_stage_{stage.id}_{concept_id}", + "import_id": context.import_id, + "claim_text": f"{concept_id} belongs to roadmap stage {stage.title}.", + "claim_kind": "roadmap_stage", + "source_observation_ids": [observation_id], + "supporting_fragment_ids": [], + "concept_ids": [f"concept::{concept_id}"], + "contradicts_claim_ids": [], + "supersedes_claim_ids": [], + "confidence_hint": 0.75, + "grounding_status": "grounded", + "current_status": "triaged", + } + ) + + return StructuredImportRows( + artifact_rows=artifact_rows, + observation_rows=observation_rows, + claim_rows=claim_rows, + concept_rows=concept_rows, + relation_rows=relation_rows, + ) + + +register_source_adapter(DidactopusPackSourceAdapter()) diff --git a/src/groundrecall/groundrecall_source_adapters/doclift_bundle.py b/src/groundrecall/groundrecall_source_adapters/doclift_bundle.py new file mode 100755 index 0000000..c15530f --- /dev/null +++ b/src/groundrecall/groundrecall_source_adapters/doclift_bundle.py @@ -0,0 +1,150 @@ +from __future__ import annotations + +import json +from hashlib import sha256 +from pathlib import Path + +from .base import DiscoveredImportSource, StructuredImportRows, register_source_adapter + + +class DocliftBundleSourceAdapter: + name = "doclift_bundle" + + def detect(self, root: str | Path) -> bool: + base = Path(root) + return (base / "manifest.json").exists() and (base / "documents").exists() + + def discover(self, root: str | Path) -> list[DiscoveredImportSource]: + base = Path(root) + rows: list[DiscoveredImportSource] = [] + for path in sorted(p for p in base.rglob("*") if p.is_file() and p.suffix.lower() in 
{".json", ".md"}): + rows.append( + DiscoveredImportSource( + path=path, + relative_path=path.relative_to(base).as_posix(), + source_kind="doclift_bundle", + artifact_kind="doclift_bundle_artifact", + is_text=True, + metadata={}, + ) + ) + return rows + + def import_intent(self) -> str: + return "both" + + def build_rows(self, context, sources: list[DiscoveredImportSource]) -> StructuredImportRows | None: + base = Path(context.source_root) + manifest_path = base / "manifest.json" + if not manifest_path.exists(): + return None + manifest = json.loads(manifest_path.read_text(encoding="utf-8")) + + artifact_rows: list[dict] = [] + observation_rows: list[dict] = [] + claim_rows: list[dict] = [] + concept_rows: list[dict] = [] + relation_rows: list[dict] = [] + + artifact_by_path: dict[str, str] = {} + for source in sources: + artifact_id = f"ia_{sha256(source.relative_path.encode('utf-8')).hexdigest()[:12]}" + artifact_rows.append( + { + "artifact_id": artifact_id, + "import_id": context.import_id, + "artifact_kind": source.artifact_kind, + "path": source.relative_path, + "title": source.path.stem, + "sha256": sha256(source.path.read_bytes()).hexdigest(), + "created_at": context.imported_at, + "metadata": {"source_kind": source.source_kind}, + "current_status": "draft", + } + ) + artifact_by_path[source.relative_path] = artifact_id + + documents = [item for item in manifest.get("documents", []) if isinstance(item, dict)] + previous_concept_id: str | None = None + for index, document in enumerate(documents, start=1): + title = str(document.get("title") or f"Document {index}") + concept_id = f"concept::{document.get('document_id') or title.lower().replace(' ', '-')}" + markdown_path = Path(document.get("markdown_path", "")) + relative_markdown = markdown_path.relative_to(base).as_posix() if markdown_path.is_absolute() and markdown_path.exists() and markdown_path.is_relative_to(base) else document.get("markdown_path", "") + artifact_id = artifact_by_path.get(str(relative_markdown), "") + figures_path = Path(document.get("figures_path", "")) + figure_payload = {} + if figures_path.exists(): + figure_payload = json.loads(figures_path.read_text(encoding="utf-8")) + source_path = str(figure_payload.get("source_path") or document.get("source_path") or relative_markdown) + + concept_rows.append( + { + "concept_id": concept_id, + "import_id": context.import_id, + "title": title, + "aliases": [], + "description": f"Imported from doclift bundle document kind '{document.get('document_kind', 'document')}'.", + "source_artifact_ids": [artifact_id] if artifact_id else [], + "current_status": "triaged", + } + ) + observation_id = f"obs_doclift_{index}" + observation_rows.append( + { + "observation_id": observation_id, + "import_id": context.import_id, + "artifact_id": artifact_id, + "role": "summary", + "text": title, + "origin_path": relative_markdown, + "origin_section": title, + "line_start": 0, + "line_end": 0, + "source_url": source_path, + "grounding_status": "grounded", + "support_kind": "direct_source", + "confidence_hint": 0.85, + "current_status": "draft", + } + ) + claim_rows.append( + { + "claim_id": f"clm_doclift_{index}", + "import_id": context.import_id, + "claim_text": f"{title} is a {document.get('document_kind', 'document')} in the imported doclift bundle.", + "claim_kind": "summary", + "source_observation_ids": [observation_id], + "supporting_fragment_ids": [], + "concept_ids": [concept_id], + "contradicts_claim_ids": [], + "supersedes_claim_ids": [], + "confidence_hint": 0.85, + 
"grounding_status": "grounded", + "current_status": "triaged", + } + ) + if previous_concept_id is not None: + relation_rows.append( + { + "relation_id": f"rel_doclift_seq_{index}", + "import_id": context.import_id, + "source_id": previous_concept_id, + "target_id": concept_id, + "relation_type": "references", + "evidence_ids": [f"clm_doclift_{index}"], + "current_status": "draft", + } + ) + previous_concept_id = concept_id + + return StructuredImportRows( + artifact_rows=artifact_rows, + observation_rows=observation_rows, + claim_rows=claim_rows, + concept_rows=concept_rows, + relation_rows=relation_rows, + ) + + +register_source_adapter(DocliftBundleSourceAdapter()) diff --git a/src/groundrecall/groundrecall_source_adapters/llmwiki.py b/src/groundrecall/groundrecall_source_adapters/llmwiki.py new file mode 100644 index 0000000..f68799f --- /dev/null +++ b/src/groundrecall/groundrecall_source_adapters/llmwiki.py @@ -0,0 +1,36 @@ +from __future__ import annotations + +from pathlib import Path + +from ..groundrecall_discovery import discover_llmwiki_artifacts +from .base import DiscoveredImportSource, StructuredImportRows, register_source_adapter + + +class LLMWikiSourceAdapter: + name = "llmwiki" + + def detect(self, root: str | Path) -> bool: + base = Path(root) + return (base / "wiki").exists() or (base / "raw").exists() or any(path.name.startswith("schema.") for path in base.iterdir() if path.exists()) + + def discover(self, root: str | Path) -> list[DiscoveredImportSource]: + return [ + DiscoveredImportSource( + path=item.path, + relative_path=item.relative_path, + source_kind="llmwiki", + artifact_kind=item.artifact_kind, + is_text=item.is_text, + metadata={}, + ) + for item in discover_llmwiki_artifacts(root) + ] + + def import_intent(self) -> str: + return "grounded_knowledge" + + def build_rows(self, context, sources: list[DiscoveredImportSource]) -> StructuredImportRows | None: + return None + + +register_source_adapter(LLMWikiSourceAdapter()) diff --git a/src/groundrecall/groundrecall_source_adapters/markdown_notes.py b/src/groundrecall/groundrecall_source_adapters/markdown_notes.py new file mode 100644 index 0000000..78e62be --- /dev/null +++ b/src/groundrecall/groundrecall_source_adapters/markdown_notes.py @@ -0,0 +1,41 @@ +from __future__ import annotations + +from pathlib import Path + +from .base import DiscoveredImportSource, StructuredImportRows, register_source_adapter + + +TEXT_SUFFIXES = {".md", ".markdown", ".txt", ".tex"} + + +class MarkdownNotesSourceAdapter: + name = "markdown_notes" + + def detect(self, root: str | Path) -> bool: + base = Path(root) + return any(path.suffix.lower() in TEXT_SUFFIXES for path in base.rglob("*") if path.is_file()) + + def discover(self, root: str | Path) -> list[DiscoveredImportSource]: + base = Path(root) + rows: list[DiscoveredImportSource] = [] + for path in sorted(p for p in base.rglob("*") if p.is_file() and p.suffix.lower() in TEXT_SUFFIXES): + rows.append( + DiscoveredImportSource( + path=path, + relative_path=path.relative_to(base).as_posix(), + source_kind="markdown_notes", + artifact_kind="markdown_note", + is_text=True, + metadata={}, + ) + ) + return rows + + def import_intent(self) -> str: + return "grounded_knowledge" + + def build_rows(self, context, sources: list[DiscoveredImportSource]) -> StructuredImportRows | None: + return None + + +register_source_adapter(MarkdownNotesSourceAdapter()) diff --git a/src/groundrecall/groundrecall_source_adapters/polypaper.py 
b/src/groundrecall/groundrecall_source_adapters/polypaper.py new file mode 100644 index 0000000..814b455 --- /dev/null +++ b/src/groundrecall/groundrecall_source_adapters/polypaper.py @@ -0,0 +1,108 @@ +from __future__ import annotations + +from pathlib import Path +import re + +from .base import DiscoveredImportSource, StructuredImportRows, register_source_adapter + + +TEXT_SUFFIXES = {".tex"} +EXCLUDED_NAMES = { + ".pp-export-tmp.tex", + "paper.woven.arxiv.tex", + "paper.woven.test.tex", + "paper.woven.org", + "paper.org", + "paper_b.org", + "paper_c.org", + "paper_c.bak.org", + "paper-demo.org", + "paper-orig.org", + "test.output.org", + "tex-blocks.org", +} +EXCLUDED_DIRS = {".git", "__pycache__", ".pytest_cache", "setup"} +EXCLUDED_PREFIXES = ("table-", "figure-", "fig-") +INCLUDE_RE = re.compile(r"\\(?:include|input)\{([^}]+)\}") + + +class PolyPaperSourceAdapter: + name = "polypaper" + + def detect(self, root: str | Path) -> bool: + base = Path(root) + return ( + (base / "main.tex").exists() + and (base / "pieces").is_dir() + and ((base / "paper.org").exists() or (base / "README.md").exists()) + ) + + def discover(self, root: str | Path) -> list[DiscoveredImportSource]: + base = Path(root) + allowed_paths = self._collect_reachable_tex(base) + rows: list[DiscoveredImportSource] = [] + for path in sorted(allowed_paths): + rows.append( + DiscoveredImportSource( + path=path, + relative_path=path.relative_to(base).as_posix(), + source_kind="polypaper", + artifact_kind="markdown_note", + is_text=True, + metadata={}, + ) + ) + return rows + + def import_intent(self) -> str: + return "grounded_knowledge" + + def build_rows(self, context, sources: list[DiscoveredImportSource]) -> StructuredImportRows | None: + return None + + def _collect_reachable_tex(self, base: Path) -> set[Path]: + entrypoint = base / "main.tex" + reachable: set[Path] = set() + # Track visited separately from reachable: the entrypoint is never added to + # reachable, so cycle control keyed on reachable alone would loop forever on a + # file that includes main.tex back (including a self-include). + visited: set[Path] = set() + pending: list[Path] = [entrypoint] + + while pending: + current = pending.pop() + if not current.exists(): + continue + if current in visited: + continue + visited.add(current) + if any(part in EXCLUDED_DIRS for part in current.relative_to(base).parts): + continue + if current.name in EXCLUDED_NAMES or current.suffix.lower() not in TEXT_SUFFIXES: + continue + if current.parent.name == "figs": + continue + if current.name.startswith(EXCLUDED_PREFIXES) or current.name == "tables.tex": + continue + text = current.read_text(encoding="utf-8") + for raw_ref in INCLUDE_RE.findall(text): + candidate = self._resolve_include(base, current.parent, raw_ref.strip()) + if candidate is not None and candidate not in visited: + pending.append(candidate) + if current != entrypoint: + reachable.add(current) + + return reachable + + def _resolve_include(self, base: Path, current_dir: Path, raw_ref: str) -> Path | None: + candidates = [current_dir / raw_ref, base / raw_ref] + resolved: list[Path] = [] + for candidate in candidates: + if candidate.suffix: + resolved.append(candidate) + else: + resolved.append(candidate.with_suffix(".tex")) + for candidate in resolved: + if candidate.exists(): + return candidate + return None + + +register_source_adapter(PolyPaperSourceAdapter()) diff --git a/src/groundrecall/groundrecall_source_adapters/transcript.py b/src/groundrecall/groundrecall_source_adapters/transcript.py new file mode 100644 index 0000000..7efad8a --- /dev/null +++ b/src/groundrecall/groundrecall_source_adapters/transcript.py @@ -0,0 +1,38 @@ +from __future__ import annotations + +from pathlib import Path + +from .base import DiscoveredImportSource, StructuredImportRows, 
register_source_adapter + + +class TranscriptSourceAdapter: + name = "transcript" + + def detect(self, root: str | Path) -> bool: + base = Path(root) + return any("transcript" in path.name.lower() for path in base.rglob("*") if path.is_file()) + + def discover(self, root: str | Path) -> list[DiscoveredImportSource]: + base = Path(root) + rows: list[DiscoveredImportSource] = [] + for path in sorted(p for p in base.rglob("*") if p.is_file() and "transcript" in p.name.lower()): + rows.append( + DiscoveredImportSource( + path=path, + relative_path=path.relative_to(base).as_posix(), + source_kind="transcript", + artifact_kind="session_log", + is_text=True, + metadata={}, + ) + ) + return rows + + def import_intent(self) -> str: + return "grounded_knowledge" + + def build_rows(self, context, sources: list[DiscoveredImportSource]) -> StructuredImportRows | None: + return None + + +register_source_adapter(TranscriptSourceAdapter()) diff --git a/src/groundrecall/groundrecall_store.py b/src/groundrecall/groundrecall_store.py new file mode 100644 index 0000000..a92ec7e --- /dev/null +++ b/src/groundrecall/groundrecall_store.py @@ -0,0 +1,9 @@ +from __future__ import annotations + +"""Legacy extracted GroundRecall store module. + +Compatibility path retained while the standalone repo converges on the +top-level ``groundrecall.store`` module as the primary implementation. +""" + +from .store import GroundRecallStore diff --git a/src/groundrecall/ingest.py b/src/groundrecall/ingest.py new file mode 100644 index 0000000..27854e3 --- /dev/null +++ b/src/groundrecall/ingest.py @@ -0,0 +1,231 @@ +from __future__ import annotations + +import argparse +import json +import shutil +import socket +import subprocess +from collections import OrderedDict +from dataclasses import dataclass +from datetime import datetime, timezone +from pathlib import Path +from typing import Any + +from .groundrecall_discovery import DiscoveredArtifact +from .groundrecall_lint import lint_import_directory +from .groundrecall_normalizer import ( + ImportContext, + build_artifact_record, + build_claim_record, + build_concept_records, + build_observation_record, + build_relation_records, + manifest_record, +) +from .groundrecall_review_bridge import export_review_bundle_from_import +from .groundrecall_review_queue import build_review_queue +from .groundrecall_segmenter import SegmentedPage, segment_markdown_artifact +from .groundrecall_source_adapters.base import detect_source_adapter +import groundrecall.groundrecall_source_adapters # noqa: F401 + + +VALID_MODES = {"archive", "quick", "grounded"} + + +@dataclass +class ImportResult: + manifest: dict[str, Any] + artifacts: list[dict[str, Any]] + observations: list[dict[str, Any]] + claims: list[dict[str, Any]] + concepts: list[dict[str, Any]] + relations: list[dict[str, Any]] + out_dir: Path + + +def _timestamp() -> str: + return datetime.now(timezone.utc).replace(microsecond=0).isoformat().replace("+00:00", "Z") + + +def _default_import_id(source_root: Path) -> str: + stem = source_root.name.lower().replace("_", "-") + stamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ") + return f"{stem}-{stamp}" + + +def _write_json(path: Path, payload: dict[str, Any]) -> None: + path.write_text(json.dumps(payload, indent=2), encoding="utf-8") + + +def _write_jsonl(path: Path, rows: list[dict[str, Any]]) -> None: + text = "\n".join(json.dumps(row, sort_keys=True) for row in rows) + if text: + text += "\n" + path.write_text(text, encoding="utf-8") + + +def _dedupe_by_key(rows: list[dict[str, 
Any]], key: str) -> list[dict[str, Any]]: + unique: OrderedDict[str, dict[str, Any]] = OrderedDict() + for row in rows: + unique.setdefault(str(row[key]), row) + return list(unique.values()) + + +def _convert_tex_to_markdown(path: Path) -> str | None: + pandoc = shutil.which("pandoc") + if pandoc is None: + return None + result = subprocess.run( + [pandoc, "-f", "latex", "-t", "gfm", str(path)], + capture_output=True, + text=True, + check=False, + ) + if result.returncode != 0: + return None + markdown = result.stdout.strip() + return markdown or None + + +def _segment_artifact(artifact: DiscoveredArtifact) -> SegmentedPage | None: + if not artifact.is_text: + return None + suffix = artifact.path.suffix.lower() + if suffix not in {".md", ".markdown", ".txt", ".tex", ".log"}: + return None + if suffix == ".tex": + converted = _convert_tex_to_markdown(artifact.path) + if converted is not None: + return segment_markdown_artifact(artifact, text=converted) + return segment_markdown_artifact(artifact) + + +def run_groundrecall_import( + source_root: str | Path, + out_root: str | Path | None = None, + mode: str = "quick", + import_id: str | None = None, + machine_id: str | None = None, + agent_id: str = "groundrecall.ingest", +) -> ImportResult: + source_path = Path(source_root).resolve() + if mode not in VALID_MODES: + raise ValueError(f"Unsupported import mode: {mode}") + adapter = detect_source_adapter(source_path) + discovered = adapter.discover(source_path) + artifacts = [ + DiscoveredArtifact( + path=item.path, + relative_path=item.relative_path, + artifact_kind=item.artifact_kind, + is_text=item.is_text, + ) + for item in discovered + ] + actual_import_id = import_id or _default_import_id(source_path) + output_root = Path(out_root) if out_root else source_path / "imports" + output_dir = output_root / actual_import_id + output_dir.mkdir(parents=True, exist_ok=True) + + context = ImportContext( + import_id=actual_import_id, + import_mode=mode, + machine_id=machine_id or socket.gethostname(), + agent_id=agent_id, + source_root=str(source_path), + imported_at=_timestamp(), + ) + + artifact_rows: list[dict[str, Any]] = [] + observation_rows: list[dict[str, Any]] = [] + claim_rows: list[dict[str, Any]] = [] + concept_rows: list[dict[str, Any]] = [] + relation_rows: list[dict[str, Any]] = [] + structured_rows = adapter.build_rows(context, discovered) + if structured_rows is not None: + artifact_rows.extend(structured_rows.artifact_rows) + observation_rows.extend(structured_rows.observation_rows) + claim_rows.extend(structured_rows.claim_rows) + concept_rows.extend(structured_rows.concept_rows) + relation_rows.extend(structured_rows.relation_rows) + else: + for artifact in artifacts: + page = _segment_artifact(artifact) + artifact_row = build_artifact_record(context, artifact, page) + artifact_rows.append(artifact_row) + if page is None: + continue + + concept_rows.extend(build_concept_records(context, artifact_row, page.concepts)) + relation_rows.extend(build_relation_records(context, artifact_row, page.concepts, page.links)) + + for index, observation in enumerate(page.observations, start=1): + observation_row = build_observation_record(context, artifact_row, observation, index) + observation_rows.append(observation_row) + if mode == "archive": + continue + if observation.role not in {"claim", "summary"}: + continue + claim_rows.append(build_claim_record(context, observation_row, observation, page.concepts[:3], index)) + + concept_rows = _dedupe_by_key(concept_rows, "concept_id") + relation_rows 
= _dedupe_by_key(relation_rows, "relation_id") + artifact_rows = _dedupe_by_key(artifact_rows, "artifact_id") + observation_rows = _dedupe_by_key(observation_rows, "observation_id") + claim_rows = _dedupe_by_key(claim_rows, "claim_id") + + manifest = manifest_record(context) | { + "source_adapter": adapter.name, + "import_intent": adapter.import_intent(), + "artifact_count": len(artifact_rows), + "observation_count": len(observation_rows), + "claim_count": len(claim_rows), + "concept_count": len(concept_rows), + "relation_count": len(relation_rows), + } + + _write_json(output_dir / "manifest.json", manifest) + _write_jsonl(output_dir / "artifacts.jsonl", artifact_rows) + _write_jsonl(output_dir / "observations.jsonl", observation_rows) + _write_jsonl(output_dir / "claims.jsonl", claim_rows) + _write_jsonl(output_dir / "concepts.jsonl", concept_rows) + _write_jsonl(output_dir / "relations.jsonl", relation_rows) + lint_payload = lint_import_directory(output_dir) + _write_json(output_dir / "lint_findings.json", lint_payload) + review_queue = build_review_queue(output_dir) + _write_json(output_dir / "review_queue.json", review_queue) + export_review_bundle_from_import(output_dir) + + return ImportResult( + manifest=manifest, + artifacts=artifact_rows, + observations=observation_rows, + claims=claim_rows, + concepts=concept_rows, + relations=relation_rows, + out_dir=output_dir, + ) + + +def build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser(description="Import an llmwiki-style repository into GroundRecall import artifacts.") + parser.add_argument("source_root") + parser.add_argument("--out-root", default=None) + parser.add_argument("--mode", choices=sorted(VALID_MODES), default="quick") + parser.add_argument("--import-id", default=None) + parser.add_argument("--machine-id", default=None) + parser.add_argument("--agent-id", default="groundrecall.ingest") + return parser + + +def main() -> None: + args = build_parser().parse_args() + result = run_groundrecall_import( + source_root=args.source_root, + out_root=args.out_root, + mode=args.mode, + import_id=args.import_id, + machine_id=args.machine_id, + agent_id=args.agent_id, + ) + print(f"Wrote import artifacts to {result.out_dir}") diff --git a/src/groundrecall/inspect.py b/src/groundrecall/inspect.py new file mode 100644 index 0000000..08c6bf1 --- /dev/null +++ b/src/groundrecall/inspect.py @@ -0,0 +1,47 @@ +from __future__ import annotations + +import argparse +import json +from pathlib import Path +from typing import Any + +from .store import GroundRecallStore + + +def summarize_store(store_dir: str | Path) -> dict[str, Any]: + store = GroundRecallStore(store_dir) + snapshots = store.list_snapshots() + latest_snapshot = max(snapshots, key=lambda item: item.created_at, default=None) + return { + "store_dir": str(Path(store_dir)), + "source_count": len(store.list_sources()), + "artifact_count": len(store.list_artifacts()), + "observation_count": len(store.list_observations()), + "claim_count": len(store.list_claims()), + "concept_count": len(store.list_concepts()), + "relation_count": len(store.list_relations()), + "review_candidate_count": len(store.list_review_candidates()), + "promotion_count": len(store.list_promotions()), + "snapshot_count": len(snapshots), + "latest_snapshot_id": latest_snapshot.snapshot_id if latest_snapshot is not None else "", + } + + +def inspect_store(store_dir: str | Path, out_path: str | Path | None = None) -> dict[str, Any]: + payload = summarize_store(store_dir) + if out_path is not 
None: + Path(out_path).write_text(json.dumps(payload, indent=2), encoding="utf-8") + return payload + + +def build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser(description="Inspect canonical GroundRecall store contents.") + parser.add_argument("store_dir") + parser.add_argument("--out", default=None) + return parser + + +def main() -> None: + args = build_parser().parse_args() + payload = inspect_store(args.store_dir, out_path=args.out) + print(json.dumps(payload, indent=2)) diff --git a/src/groundrecall/lint.py b/src/groundrecall/lint.py new file mode 100644 index 0000000..dbda355 --- /dev/null +++ b/src/groundrecall/lint.py @@ -0,0 +1,196 @@ +from __future__ import annotations + +import argparse +import json +from collections import Counter, defaultdict +from pathlib import Path +from typing import Any + + +def _read_json(path: Path) -> dict[str, Any]: + return json.loads(path.read_text(encoding="utf-8")) + + +def _read_jsonl(path: Path) -> list[dict[str, Any]]: + if not path.exists(): + return [] + text = path.read_text(encoding="utf-8").strip() + if not text: + return [] + return [json.loads(line) for line in text.splitlines()] + + +def lint_import_directory(import_dir: str | Path) -> dict[str, Any]: + base = Path(import_dir) + manifest = _read_json(base / "manifest.json") + artifacts = _read_jsonl(base / "artifacts.jsonl") + observations = _read_jsonl(base / "observations.jsonl") + claims = _read_jsonl(base / "claims.jsonl") + concepts = _read_jsonl(base / "concepts.jsonl") + relations = _read_jsonl(base / "relations.jsonl") + + findings: list[dict[str, Any]] = [] + observation_by_id = {row["observation_id"]: row for row in observations} + concept_ids = {row["concept_id"] for row in concepts} + + text_counter = Counter(row["claim_text"].strip().lower() for row in claims if row.get("claim_text", "").strip()) + claim_ids = {row["claim_id"] for row in claims} + for claim in claims: + claim_text = claim.get("claim_text", "").strip() + if not claim.get("source_observation_ids"): + findings.append( + { + "severity": "error", + "code": "claim_missing_observation", + "target_id": claim["claim_id"], + "message": "Claim has no source observation ids.", + } + ) + if not claim.get("concept_ids"): + findings.append( + { + "severity": "warning", + "code": "claim_missing_concept", + "target_id": claim["claim_id"], + "message": "Claim is not associated with any concepts.", + } + ) + if claim.get("grounding_status") == "ungrounded": + findings.append( + { + "severity": "warning", + "code": "claim_ungrounded", + "target_id": claim["claim_id"], + "message": "Claim is ungrounded and should not be promoted directly.", + } + ) + if claim_text and text_counter[claim_text.lower()] > 1: + findings.append( + { + "severity": "warning", + "code": "duplicate_claim_text", + "target_id": claim["claim_id"], + "message": "Claim text duplicates another imported claim.", + } + ) + for obs_id in claim.get("source_observation_ids", []): + if obs_id not in observation_by_id: + findings.append( + { + "severity": "error", + "code": "claim_observation_missing", + "target_id": claim["claim_id"], + "message": f"Claim references missing observation {obs_id}.", + } + ) + for target_claim_id in claim.get("contradicts_claim_ids", []): + if target_claim_id not in claim_ids: + findings.append( + { + "severity": "warning", + "code": "unresolved_contradiction_ref", + "target_id": claim["claim_id"], + "message": f"Claim references missing contradiction target {target_claim_id}.", + } + ) + for target_claim_id 
in claim.get("supersedes_claim_ids", []): + if target_claim_id not in claim_ids: + findings.append( + { + "severity": "warning", + "code": "unresolved_supersession_ref", + "target_id": claim["claim_id"], + "message": f"Claim references missing supersession target {target_claim_id}.", + } + ) + if claim.get("contradicts_claim_ids") and claim.get("supersedes_claim_ids"): + findings.append( + { + "severity": "warning", + "code": "claim_mixed_conflict_and_supersession", + "target_id": claim["claim_id"], + "message": "Claim marks both contradiction and supersession targets; review the intended relation.", + } + ) + + concept_sources: defaultdict[str, set[str]] = defaultdict(set) + for claim in claims: + for concept_id in claim.get("concept_ids", []): + concept_sources[concept_id].add(claim["claim_id"]) + for relation in relations: + concept_sources[relation.get("source_id", "")].add(relation["relation_id"]) + concept_sources[relation.get("target_id", "")].add(relation["relation_id"]) + + for concept in concepts: + if not concept_sources.get(concept["concept_id"]): + findings.append( + { + "severity": "warning", + "code": "orphan_concept", + "target_id": concept["concept_id"], + "message": "Concept has no connected claims or relations.", + } + ) + + for relation in relations: + if relation.get("source_id") not in concept_ids: + findings.append( + { + "severity": "error", + "code": "relation_missing_source", + "target_id": relation["relation_id"], + "message": f"Relation source {relation.get('source_id')} is missing.", + } + ) + if relation.get("target_id") not in concept_ids: + findings.append( + { + "severity": "error", + "code": "relation_missing_target", + "target_id": relation["relation_id"], + "message": f"Relation target {relation.get('target_id')} is missing.", + } + ) + + for observation in observations: + role = observation.get("role") + if role == "summary" and observation.get("grounding_status") == "ungrounded": + findings.append( + { + "severity": "warning", + "code": "ungrounded_summary", + "target_id": observation["observation_id"], + "message": "Summary observation is ungrounded.", + } + ) + + summary = { + "artifact_count": len(artifacts), + "observation_count": len(observations), + "claim_count": len(claims), + "concept_count": len(concepts), + "relation_count": len(relations), + "error_count": sum(1 for item in findings if item["severity"] == "error"), + "warning_count": sum(1 for item in findings if item["severity"] == "warning"), + } + return { + "import_id": manifest["import_id"], + "import_mode": manifest["import_mode"], + "summary": summary, + "findings": findings, + } + + +def build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser(description="Lint GroundRecall import artifacts.") + parser.add_argument("import_dir") + parser.add_argument("--out", default=None) + return parser + + +def main() -> None: + args = build_parser().parse_args() + payload = lint_import_directory(args.import_dir) + out_path = Path(args.out) if args.out else Path(args.import_dir) / "lint_findings.json" + out_path.write_text(json.dumps(payload, indent=2), encoding="utf-8") + print(f"Wrote {out_path}") diff --git a/src/groundrecall/models.py b/src/groundrecall/models.py new file mode 100644 index 0000000..3fa5fe4 --- /dev/null +++ b/src/groundrecall/models.py @@ -0,0 +1,137 @@ +from __future__ import annotations + +from typing import Literal + +from pydantic import BaseModel, Field + + +LifecycleStatus = Literal["draft", "triaged", "reviewed", "promoted", "superseded", 
"archived", "rejected"] +GroundingStatus = Literal["grounded", "partially_grounded", "ungrounded"] +SupportKind = Literal["direct_source", "derived_from_page", "derived_from_session", "inferred", "unknown"] + + +class ProvenanceRecord(BaseModel): + origin_artifact_id: str = "" + origin_path: str = "" + origin_section: str = "" + source_url: str = "" + retrieval_date: str = "" + machine_id: str = "" + session_id: str = "" + support_kind: SupportKind = "unknown" + grounding_status: GroundingStatus = "ungrounded" + + +class SourceRecord(BaseModel): + source_id: str + title: str = "" + source_type: str = "document" + path: str = "" + url: str = "" + retrieved_at: str = "" + metadata: dict = Field(default_factory=dict) + current_status: LifecycleStatus = "draft" + + +class FragmentRecord(BaseModel): + fragment_id: str + source_id: str + text: str + section: str = "" + line_start: int = 0 + line_end: int = 0 + metadata: dict = Field(default_factory=dict) + current_status: LifecycleStatus = "draft" + + +class ArtifactRecord(BaseModel): + artifact_id: str + artifact_kind: str + title: str = "" + path: str = "" + sha256: str = "" + created_at: str = "" + metadata: dict = Field(default_factory=dict) + current_status: LifecycleStatus = "draft" + + +class ObservationRecord(BaseModel): + observation_id: str + artifact_id: str = "" + role: str + text: str + provenance: ProvenanceRecord = Field(default_factory=ProvenanceRecord) + confidence_hint: float = 0.0 + current_status: LifecycleStatus = "draft" + + +class ClaimRecord(BaseModel): + claim_id: str + claim_text: str + claim_kind: str = "statement" + source_observation_ids: list[str] = Field(default_factory=list) + supporting_fragment_ids: list[str] = Field(default_factory=list) + concept_ids: list[str] = Field(default_factory=list) + contradicts_claim_ids: list[str] = Field(default_factory=list) + supersedes_claim_ids: list[str] = Field(default_factory=list) + confidence_hint: float = 0.0 + review_confidence: float = 0.0 + last_confirmed_at: str = "" + provenance: ProvenanceRecord = Field(default_factory=ProvenanceRecord) + current_status: LifecycleStatus = "draft" + + +class ConceptRecord(BaseModel): + concept_id: str + title: str + aliases: list[str] = Field(default_factory=list) + description: str = "" + source_artifact_ids: list[str] = Field(default_factory=list) + current_status: LifecycleStatus = "draft" + + +class RelationRecord(BaseModel): + relation_id: str + source_id: str + target_id: str + relation_type: str + evidence_ids: list[str] = Field(default_factory=list) + provenance: ProvenanceRecord = Field(default_factory=ProvenanceRecord) + current_status: LifecycleStatus = "draft" + + +class ReviewCandidateRecord(BaseModel): + review_candidate_id: str + candidate_type: Literal["claim", "concept", "relation"] + candidate_id: str + triage_lane: str = "knowledge_capture" + priority: int = 50 + finding_codes: list[str] = Field(default_factory=list) + rationale: str = "" + current_status: LifecycleStatus = "draft" + + +class PromotionRecord(BaseModel): + promotion_id: str + candidate_type: Literal["claim", "concept", "relation"] + candidate_id: str + promotion_target: str = "groundrecall_store" + verdict: Literal["approved", "rejected", "superseded"] = "approved" + reviewer: str = "" + promoted_object_ids: list[str] = Field(default_factory=list) + notes: str = "" + promoted_at: str = "" + + +class GroundRecallSnapshot(BaseModel): + snapshot_id: str + created_at: str + sources: list[SourceRecord] = Field(default_factory=list) + fragments: 
list[FragmentRecord] = Field(default_factory=list) + artifacts: list[ArtifactRecord] = Field(default_factory=list) + observations: list[ObservationRecord] = Field(default_factory=list) + claims: list[ClaimRecord] = Field(default_factory=list) + concepts: list[ConceptRecord] = Field(default_factory=list) + relations: list[RelationRecord] = Field(default_factory=list) + promotions: list[PromotionRecord] = Field(default_factory=list) + metadata: dict = Field(default_factory=dict) diff --git a/src/groundrecall/promotion.py b/src/groundrecall/promotion.py new file mode 100644 index 0000000..ab51bf7 --- /dev/null +++ b/src/groundrecall/promotion.py @@ -0,0 +1,250 @@ +from __future__ import annotations + +import argparse +import json +from datetime import datetime, timezone +from pathlib import Path +from typing import Any + +from .models import ( + ArtifactRecord, + ClaimRecord, + ConceptRecord, + ObservationRecord, + PromotionRecord, + ProvenanceRecord, + RelationRecord, + ReviewCandidateRecord, +) +from .review_schema import ReviewSession +from .store import GroundRecallStore + + +def _read_json(path: Path) -> dict[str, Any]: + return json.loads(path.read_text(encoding="utf-8")) + + +def _read_jsonl(path: Path) -> list[dict[str, Any]]: + if not path.exists(): + return [] + text = path.read_text(encoding="utf-8").strip() + if not text: + return [] + return [json.loads(line) for line in text.splitlines()] + + +def _now() -> str: + return datetime.now(timezone.utc).replace(microsecond=0).isoformat().replace("+00:00", "Z") + + +def _review_status_map(status: str) -> str: + return { + "trusted": "promoted", + "provisional": "reviewed", + "rejected": "rejected", + "needs_review": "triaged", + }.get(status, "triaged") + + +def _provenance_from_payload(payload: dict[str, Any]) -> ProvenanceRecord: + return ProvenanceRecord( + origin_artifact_id=payload.get("origin_artifact_id", ""), + origin_path=payload.get("origin_path", ""), + origin_section=payload.get("origin_section", ""), + source_url=payload.get("source_url", ""), + retrieval_date=payload.get("retrieval_date", ""), + machine_id=payload.get("machine_id", ""), + session_id=payload.get("session_id", ""), + support_kind=payload.get("support_kind", "unknown"), + grounding_status=payload.get("grounding_status", "ungrounded"), + ) + + +def promote_import_to_store( + import_dir: str | Path, + store_dir: str | Path, + reviewer: str | None = None, + snapshot_id: str | None = None, +) -> dict[str, Any]: + base = Path(import_dir) + manifest = _read_json(base / "manifest.json") + review_session = ReviewSession.model_validate_json((base / "review_session.json").read_text(encoding="utf-8")) + queue_payload = _read_json(base / "review_queue.json") + artifacts = _read_jsonl(base / "artifacts.jsonl") + observations = _read_jsonl(base / "observations.jsonl") + claims = _read_jsonl(base / "claims.jsonl") + concepts = _read_jsonl(base / "concepts.jsonl") + relations = _read_jsonl(base / "relations.jsonl") + + store = GroundRecallStore(store_dir) + reviewed_by_concept = {entry.concept_id: entry for entry in review_session.draft_pack.concepts} + promoted_claim_ids: list[str] = [] + promoted_concept_ids: list[str] = [] + promoted_relation_ids: list[str] = [] + + for artifact in artifacts: + store.save_artifact( + ArtifactRecord( + artifact_id=artifact["artifact_id"], + artifact_kind=artifact["artifact_kind"], + title=artifact.get("title", ""), + path=artifact.get("path", ""), + sha256=artifact.get("sha256", ""), + created_at=artifact.get("created_at", ""), + 
metadata=dict(artifact.get("metadata", {})), + current_status="reviewed", + ) + ) + + for observation in observations: + store.save_observation( + ObservationRecord( + observation_id=observation["observation_id"], + artifact_id=observation.get("artifact_id", ""), + role=observation.get("role", "summary"), + text=observation.get("text", ""), + provenance=_provenance_from_payload(observation), + confidence_hint=float(observation.get("confidence_hint", 0.0)), + current_status="reviewed", + ) + ) + + for concept in concepts: + short_id = concept["concept_id"].replace("concept::", "", 1) + review_entry = reviewed_by_concept.get(short_id) + current_status = _review_status_map(review_entry.status if review_entry else concept.get("current_status", "triaged")) + record = store.save_concept( + ConceptRecord( + concept_id=concept["concept_id"], + title=review_entry.title if review_entry else concept.get("title", concept["concept_id"]), + aliases=list(concept.get("aliases", [])), + description=review_entry.description if review_entry else concept.get("description", ""), + source_artifact_ids=list(concept.get("source_artifact_ids", [])), + current_status=current_status, # type: ignore[arg-type] + ) + ) + if record.current_status in {"promoted", "reviewed"}: + promoted_concept_ids.append(record.concept_id) + + reviewed_concept_ids = set(promoted_concept_ids) + for claim in claims: + concept_ids = list(claim.get("concept_ids", [])) + statuses = [] + for concept_id in concept_ids: + short_id = concept_id.replace("concept::", "", 1) + review_entry = reviewed_by_concept.get(short_id) + statuses.append(_review_status_map(review_entry.status) if review_entry else "triaged") + if statuses and all(status == "rejected" for status in statuses): + current_status = "rejected" + elif statuses and any(status == "promoted" for status in statuses): + current_status = "promoted" + elif statuses and any(status == "reviewed" for status in statuses): + current_status = "reviewed" + else: + current_status = "triaged" + record = store.save_claim( + ClaimRecord( + claim_id=claim["claim_id"], + claim_text=claim.get("claim_text", ""), + claim_kind=claim.get("claim_kind", "statement"), + source_observation_ids=list(claim.get("source_observation_ids", [])), + supporting_fragment_ids=list(claim.get("supporting_fragment_ids", [])), + concept_ids=concept_ids, + contradicts_claim_ids=list(claim.get("contradicts_claim_ids", [])), + supersedes_claim_ids=list(claim.get("supersedes_claim_ids", [])), + confidence_hint=float(claim.get("confidence_hint", 0.0)), + review_confidence=float(claim.get("review_confidence", 0.0)), + last_confirmed_at=claim.get("last_confirmed_at", ""), + provenance=_provenance_from_payload(claim), + current_status=current_status, # type: ignore[arg-type] + ) + ) + if record.current_status in {"promoted", "reviewed"}: + promoted_claim_ids.append(record.claim_id) + + for relation in relations: + src_ok = relation.get("source_id") in reviewed_concept_ids + tgt_ok = relation.get("target_id") in reviewed_concept_ids + current_status = "promoted" if src_ok and tgt_ok else "triaged" + record = store.save_relation( + RelationRecord( + relation_id=relation["relation_id"], + source_id=relation.get("source_id", ""), + target_id=relation.get("target_id", ""), + relation_type=relation.get("relation_type", "references"), + evidence_ids=list(relation.get("evidence_ids", [])), + provenance=_provenance_from_payload(relation), + current_status=current_status, # type: ignore[arg-type] + ) + ) + if record.current_status in 
{"promoted", "reviewed"}: + promoted_relation_ids.append(record.relation_id) + + for item in queue_payload.get("items", []): + store.save_review_candidate( + ReviewCandidateRecord( + review_candidate_id=item["queue_id"], + candidate_type=item["candidate_type"], + candidate_id=item["candidate_id"], + triage_lane=item.get("triage_lane", "knowledge_capture"), + priority=int(item.get("priority", 50)), + finding_codes=list(item.get("finding_codes", [])), + rationale=item.get("title", ""), + current_status="reviewed" if item["candidate_id"] in set(promoted_claim_ids + promoted_concept_ids + promoted_relation_ids) else "triaged", + ) + ) + + promotion = store.save_promotion( + PromotionRecord( + promotion_id=f"promotion-{manifest['import_id']}", + candidate_type="concept", + candidate_id=manifest["import_id"], + promotion_target="groundrecall_store", + verdict="approved", + reviewer=reviewer or review_session.reviewer, + promoted_object_ids=promoted_concept_ids + promoted_claim_ids + promoted_relation_ids, + notes=f"Promoted import {manifest['import_id']} into GroundRecallStore.", + promoted_at=_now(), + ) + ) + + built_snapshot = store.build_snapshot( + snapshot_id=snapshot_id or f"snapshot-{manifest['import_id']}", + created_at=_now(), + metadata={ + "source_import_id": manifest["import_id"], + "reviewer": reviewer or review_session.reviewer, + "export_kind": "canonical", + }, + ) + store.save_snapshot(built_snapshot) + + return { + "import_id": manifest["import_id"], + "store_dir": str(Path(store_dir)), + "promotion_id": promotion.promotion_id, + "promoted_concept_count": len(promoted_concept_ids), + "promoted_claim_count": len(promoted_claim_ids), + "promoted_relation_count": len(promoted_relation_ids), + "snapshot_id": built_snapshot.snapshot_id, + } + + +def build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser(description="Promote a GroundRecall import into canonical store objects.") + parser.add_argument("import_dir") + parser.add_argument("store_dir") + parser.add_argument("--reviewer", default=None) + parser.add_argument("--snapshot-id", default=None) + return parser + + +def main() -> None: + args = build_parser().parse_args() + payload = promote_import_to_store( + import_dir=args.import_dir, + store_dir=args.store_dir, + reviewer=args.reviewer, + snapshot_id=args.snapshot_id, + ) + print(json.dumps(payload, indent=2)) diff --git a/src/groundrecall/query.py b/src/groundrecall/query.py new file mode 100644 index 0000000..a992b0f --- /dev/null +++ b/src/groundrecall/query.py @@ -0,0 +1,188 @@ +from __future__ import annotations + +import argparse +import json +from pathlib import Path +from typing import Any + +from .store import GroundRecallStore + + +def _normalize(text: str) -> str: + return " ".join(text.lower().split()) + + +def _matches(query: str, *values: str) -> bool: + needle = _normalize(query) + return any(needle in _normalize(value) for value in values if value) + + +def query_concept(store_dir: str | Path, concept_ref: str) -> dict[str, Any] | None: + store = GroundRecallStore(store_dir) + concepts = store.list_concepts() + concept = next( + ( + item + for item in concepts + if concept_ref == item.concept_id + or concept_ref == item.concept_id.replace("concept::", "", 1) + or _matches(concept_ref, item.title, item.description, *item.aliases) + ), + None, + ) + if concept is None: + return None + + claims = [item for item in store.list_claims() if concept.concept_id in item.concept_ids and item.current_status != "rejected"] + relations = [ + item + 
for item in store.list_relations() + if (item.source_id == concept.concept_id or item.target_id == concept.concept_id) and item.current_status != "rejected" + ] + artifacts = {item.artifact_id: item for item in store.list_artifacts()} + observations = {item.observation_id: item for item in store.list_observations()} + + supporting_observations = [] + for claim in claims: + for observation_id in claim.source_observation_ids: + observation = observations.get(observation_id) + if observation is not None: + supporting_observations.append( + { + "observation_id": observation.observation_id, + "text": observation.text, + "role": observation.role, + "origin_path": observation.provenance.origin_path, + "grounding_status": observation.provenance.grounding_status, + } + ) + + related_concept_ids = sorted( + { + relation.target_id if relation.source_id == concept.concept_id else relation.source_id + for relation in relations + if relation.source_id != relation.target_id + } + ) + related_concepts = [item.model_dump() for item in concepts if item.concept_id in related_concept_ids] + + source_artifacts = [ + artifact.model_dump() + for artifact in artifacts.values() + if artifact.artifact_id in set(concept.source_artifact_ids) + ] + + return { + "query_type": "concept", + "concept": concept.model_dump(), + "claims": [item.model_dump() for item in claims], + "relations": [item.model_dump() for item in relations], + "related_concepts": related_concepts, + "supporting_observations": supporting_observations, + "source_artifacts": source_artifacts, + } + + +def search_claims( + store_dir: str | Path, + text: str, + include_rejected: bool = False, + limit: int = 20, +) -> dict[str, Any]: + store = GroundRecallStore(store_dir) + concepts = {item.concept_id: item for item in store.list_concepts()} + matches = [] + for claim in store.list_claims(): + if not include_rejected and claim.current_status == "rejected": + continue + concept_titles = [concepts[concept_id].title for concept_id in claim.concept_ids if concept_id in concepts] + if _matches(text, claim.claim_text, *concept_titles): + matches.append( + { + "claim": claim.model_dump(), + "concept_titles": concept_titles, + "provenance": claim.provenance.model_dump(), + } + ) + if len(matches) >= limit: + break + return { + "query_type": "claim_search", + "query": text, + "matches": matches, + } + + +def query_provenance( + store_dir: str | Path, + origin_path: str | None = None, + source_url: str | None = None, +) -> dict[str, Any]: + store = GroundRecallStore(store_dir) + claims = [] + observations = [] + for claim in store.list_claims(): + if origin_path and claim.provenance.origin_path == origin_path: + claims.append(claim.model_dump()) + continue + if source_url and claim.provenance.source_url == source_url: + claims.append(claim.model_dump()) + for observation in store.list_observations(): + if origin_path and observation.provenance.origin_path == origin_path: + observations.append(observation.model_dump()) + continue + if source_url and observation.provenance.source_url == source_url: + observations.append(observation.model_dump()) + return { + "query_type": "provenance", + "origin_path": origin_path or "", + "source_url": source_url or "", + "claims": claims, + "observations": observations, + } + + +def build_query_bundle_for_concept(store_dir: str | Path, concept_ref: str) -> dict[str, Any] | None: + payload = query_concept(store_dir, concept_ref) + if payload is None: + return None + claims = payload["claims"] + contradictions = [item for item in 
claims if item.get("contradicts_claim_ids")] + supersessions = [item for item in claims if item.get("supersedes_claim_ids")] + return { + "bundle_kind": "groundrecall_query_bundle", + "query_type": "concept", + "concept": payload["concept"], + "relevant_claims": claims, + "supporting_observations": payload["supporting_observations"], + "related_concepts": payload["related_concepts"], + "contradictions": contradictions, + "supersessions": supersessions, + "suggested_next_actions": [ + "Review promoted claims with low review confidence.", + "Inspect supporting observations before exporting assistant context.", + "Check related concepts for hidden prerequisite or contradiction edges.", + ], + } + + +def build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser(description="Query canonical GroundRecall objects.") + parser.add_argument("store_dir") + parser.add_argument("query") + parser.add_argument("--kind", choices=["concept", "claim", "provenance", "bundle"], default="concept") + parser.add_argument("--source-url", default=None) + return parser + + +def main() -> None: + args = build_parser().parse_args() + if args.kind == "concept": + payload = query_concept(args.store_dir, args.query) + elif args.kind == "claim": + payload = search_claims(args.store_dir, args.query) + elif args.kind == "provenance": + payload = query_provenance(args.store_dir, origin_path=args.query, source_url=args.source_url) + else: + payload = build_query_bundle_for_concept(args.store_dir, args.query) + print(json.dumps(payload, indent=2)) diff --git a/src/groundrecall/review_app/app.js b/src/groundrecall/review_app/app.js new file mode 100644 index 0000000..561afdf --- /dev/null +++ b/src/groundrecall/review_app/app.js @@ -0,0 +1,366 @@ +const state = { + reviewData: null, + selectedConceptId: null, + selectedCitationId: null, + conceptSearch: "", + citationFilter: "all", + message: "", + verificationResult: null, +}; + +function escapeHtml(value) { + return String(value ?? 
"") + .replaceAll("&", "&") + .replaceAll("<", "<") + .replaceAll(">", ">") + .replaceAll('"', """); +} + +function splitLines(value) { + return String(value || "") + .split("\n") + .map((line) => line.trim()) + .filter(Boolean); +} + +function conceptRows() { + const data = state.reviewData; + if (!data) return []; + const reviewById = new Map((data.concept_reviews || []).map((item) => [item.concept_id, item])); + return (data.draft_pack?.concepts || []).map((concept) => ({ + ...concept, + review: reviewById.get(concept.concept_id) || null, + })); +} + +function citationRows() { + return state.reviewData?.citation_reviews || []; +} + +function selectedConcept() { + return conceptRows().find((item) => item.concept_id === state.selectedConceptId) || conceptRows()[0] || null; +} + +function selectedCitation() { + return citationRows().find((item) => item.citation_review_id === state.selectedCitationId) || citationRows()[0] || null; +} + +async function loadReviewData() { + const response = await fetch("/api/load"); + const payload = await response.json(); + state.reviewData = payload.review_data; + if (!state.selectedConceptId && conceptRows()[0]) { + state.selectedConceptId = conceptRows()[0].concept_id; + } + if (!state.selectedCitationId && citationRows()[0]) { + state.selectedCitationId = citationRows()[0].citation_review_id; + } + render(); +} + +async function saveConcept(form) { + const payload = { + concept_updates: [ + { + concept_id: form.get("concept_id"), + status: form.get("status"), + description: form.get("description"), + prerequisites: splitLines(form.get("prerequisites")), + notes: splitLines(form.get("notes")), + }, + ], + }; + const response = await fetch("/api/save", { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify(payload), + }); + const result = await response.json(); + state.reviewData = result.review_data; + state.message = `Saved concept ${payload.concept_updates[0].concept_id}.`; + render(); +} + +async function saveCitation(form) { + const payload = { + citation_updates: [ + { + citation_review_id: form.get("citation_review_id"), + status: form.get("status"), + notes: splitLines(form.get("notes")), + }, + ], + }; + const response = await fetch("/api/save", { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify(payload), + }); + const result = await response.json(); + state.reviewData = result.review_data; + state.message = `Saved citation review ${payload.citation_updates[0].citation_review_id}.`; + render(); +} + +async function verifyCitation(citationReviewId) { + const response = await fetch("/api/citations/verify", { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ citation_review_id: citationReviewId }), + }); + state.verificationResult = await response.json(); + state.message = `Verification run for ${citationReviewId}.`; + render(); +} + +function statusOptions(specs, selectedValue) { + return (specs?.options || []) + .map((option) => ``) + .join(""); +} + +function renderConceptPanel(concept) { + if (!concept) { + return `
+
+function renderConceptPanel(concept) {
+  if (!concept) {
+    return `<section class="panel"><p class="muted">No concept selected</p></section>`;
+  }
+  const review = concept.review || {};
+  const statusSpec = (state.reviewData.field_specs || []).find((item) => item.field === "status");
+  const guidance = (state.reviewData.review_guidance?.priorities || []).map((item) => `<li>${escapeHtml(item)}</li>`).join("");
+  const claims = (review.top_claims || []).map((claim) => `
+    <article class="claim-card">
+      <div class="claim-head">
+        <span class="chip">${escapeHtml(claim.claim_kind || "claim")}</span>
+        <span class="pill ${claim.grounding_status === "grounded" ? "pill-good" : "pill-warn"}">${escapeHtml(claim.grounding_status || "unknown")}</span>
+      </div>
+      <p>${escapeHtml(claim.claim_text || "")}</p>
+      <p class="tiny">Artifacts: ${escapeHtml((claim.artifact_paths || []).join(", ") || "none")}</p>
+      ${(claim.supporting_observations || []).slice(0, 2).map((obs) => `
+        <div class="support-block">
+          <p class="tiny">${escapeHtml(obs.origin_path || "")}${obs.line_start ? `:${obs.line_start}` : ""}</p>
+          <p>${escapeHtml(obs.text || "")}</p>
+        </div>
+      `).join("")}
+    </article>
+  `).join("");
+
+  return `
+    <section class="panel">
+      <div class="panel-head">
+        <div>
+          <h2>${escapeHtml(concept.title)}</h2>
+          <p class="muted">${escapeHtml(concept.concept_id)} · claims ${escapeHtml(review.claim_count || 0)} · grounded ${escapeHtml(review.grounded_claim_count || 0)} · warnings ${escapeHtml(review.warning_count || 0)}</p>
+        </div>
+        <span class="pill ${review.has_citation_support ? "pill-good" : "pill-warn"}">${review.has_citation_support ? "citation-bearing" : "no citation support"}</span>
+      </div>
+      <p class="help">${escapeHtml(review.review_help || "")}</p>
+      <form id="concept-form" class="stack">
+        <input type="hidden" name="concept_id" value="${escapeHtml(concept.concept_id)}" />
+        <label>Status
+          <select name="status">${statusOptions(statusSpec, concept.status)}</select>
+        </label>
+        <label>Description
+          <textarea name="description" rows="3">${escapeHtml(concept.description || "")}</textarea>
+        </label>
+        <label>Prerequisites (one per line)
+          <textarea name="prerequisites" rows="3">${escapeHtml((concept.prerequisites || []).join("\n"))}</textarea>
+        </label>
+        <label>Notes (one per line)
+          <textarea name="notes" rows="3">${escapeHtml((concept.notes || []).join("\n"))}</textarea>
+        </label>
+        <div class="actions"><button class="primary" type="submit">Save concept</button></div>
+      </form>
+      <div class="subpanel">
+        <h3>Reviewer guidance</h3>
+        <ul>${guidance}</ul>
+      </div>
+      <div class="subpanel">
+        <h3>Representative claims</h3>
+        <div class="stack">${claims || "<p class='muted'>No representative claims available.</p>"}</div>
+      </div>
+    </section>
+  `;
+}
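+
+// Round-trip note (shape mirrored from saveConcept/saveCitation above and the
+// /api/save handler in review_server.py): the server accepts
+//   { concept_updates: [...] } and/or { citation_updates: [...] }
+// and always responds with the refreshed { review_data }, which is why both
+// save paths simply replace state.reviewData and re-render.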
+
+function renderCitationPanel(citation) {
+  const statusSpec = (state.reviewData.citation_field_specs || []).find((item) => item.field === "status");
+  const nextActions = (state.reviewData.citations?.next_actions || []).map((item) => `<li>${escapeHtml(item)}</li>`).join("");
+  if (!citation) {
+    return `<section class="panel"><p class="muted">No citation selected</p></section>`;
+  }
+  return `
+    <section class="panel">
+      <div class="panel-head">
+        <div>
+          <h2>Citation lane</h2>
+          <p class="muted">${escapeHtml(citation.source_kind)} · ${escapeHtml(citation.artifact_path || citation.locator || "")}</p>
+        </div>
+        <span class="chip">${escapeHtml(citation.status)}</span>
+      </div>
+      <form id="citation-form" class="stack">
+        <input type="hidden" name="citation_review_id" value="${escapeHtml(citation.citation_review_id)}" />
+        <label>Status
+          <select name="status">${statusOptions(statusSpec, citation.status)}</select>
+        </label>
+        <p class="tiny">Related concepts: ${escapeHtml((citation.related_concept_ids || []).join(", ") || "none")}</p>
+        <p class="tiny">Related claims: ${escapeHtml((citation.related_claim_ids || []).join(", ") || "none")}</p>
+        <label>Notes (one per line)
+          <textarea name="notes" rows="3">${escapeHtml((citation.notes || []).join("\n"))}</textarea>
+        </label>
+        <div class="actions">
+          <button class="secondary" type="button" id="verify-citation">Verify with CiteGeist</button>
+          <button class="primary" type="submit">Save citation review</button>
+        </div>
+      </form>
+      <div class="subpanel">
+        <h3>Citation guidance</h3>
+        <ul>${(state.reviewData.review_guidance?.citation_guidance || []).map((item) => `<li>${escapeHtml(item)}</li>`).join("")}</ul>
+      </div>
+      <div class="subpanel">
+        <h3>Next actions</h3>
+        <ul>${nextActions}</ul>
+      </div>
+      <div class="subpanel">
+        <h3>Verification</h3>
+        ${
+          state.verificationResult && state.verificationResult.citation_review_id === citation.citation_review_id
+            ? `<pre class="json-block">${escapeHtml(JSON.stringify(state.verificationResult, null, 2))}</pre>`
+            : `<p class="muted">Run CiteGeist verification to inspect the stored entry and candidate matches.</p>`
+        }
+      </div>
+    </section>
+  `;
+}
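+
+// render() rebuilds the entire workbench from `state` on every change; every
+// user-derived value is passed through escapeHtml before being interpolated
+// into innerHTML, so keep that invariant when extending these templates.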
+function render() {
+  const app = document.getElementById("app");
+  if (!state.reviewData) {
+    app.innerHTML = `<main class="shell"><p class="muted">Loading review data…</p></main>`;
+    return;
+  }
+  const summary = state.reviewData.import_context?.manifest || {};
+  const conceptList = conceptRows().filter((item) => {
+    const needle = state.conceptSearch.trim().toLowerCase();
+    return !needle || item.title.toLowerCase().includes(needle) || item.concept_id.toLowerCase().includes(needle);
+  });
+  const citationList = citationRows().filter((item) => {
+    if (state.citationFilter === "all") return true;
+    return item.status === state.citationFilter;
+  });
+  const concept = selectedConcept();
+  const citation = selectedCitation();
+
+  app.innerHTML = `
+    <main class="shell">
+      <header class="hero">
+        <div>
+          <h1>GroundRecall Review Workbench</h1>
+          <p class="muted">Concept-first review with a dedicated citation lane for academic imports.</p>
+          <p class="tiny">${escapeHtml(summary.import_id || "")} · ${escapeHtml(summary.source_root || "")}</p>
+          ${state.message ? `<p class="message">${escapeHtml(state.message)}</p>` : ""}
+        </div>
+        <div class="hero-stats">
+          <div class="stat"><strong>${escapeHtml(summary.artifact_count || 0)}</strong><span>artifacts</span></div>
+          <div class="stat"><strong>${escapeHtml(summary.claim_count || 0)}</strong><span>claims</span></div>
+          <div class="stat"><strong>${escapeHtml(summary.concept_count || 0)}</strong><span>concepts</span></div>
+          <div class="stat"><strong>${escapeHtml(state.reviewData.citations?.summary?.citation_key_total || 0)}</strong><span>citation keys</span></div>
+        </div>
+      </header>
+      <div class="workspace-grid">
+        <section class="panel list-panel">
+          <label class="search">Search concepts
+            <input id="concept-search" type="search" value="${escapeHtml(state.conceptSearch)}" />
+          </label>
+          <div class="stack">
+            ${conceptList.map((item) => `
+              <button class="list-item${item.concept_id === state.selectedConceptId ? " active" : ""}" data-concept-id="${escapeHtml(item.concept_id)}">
+                <strong>${escapeHtml(item.title)}</strong>
+                <span class="tiny">${escapeHtml(item.concept_id)} · ${escapeHtml(item.status)}</span>
+              </button>
+            `).join("")}
+          </div>
+        </section>
+        ${renderConceptPanel(concept)}
+      </div>
+      <div class="workspace-grid">
+        <section class="panel list-panel">
+          <label>Filter citations
+            <select id="citation-filter">
+              ${["all", "unreviewed", "verified", "needs_source_check", "misleading", "irrelevant", "fabricated"].map((value) => `<option value="${value}"${state.citationFilter === value ? " selected" : ""}>${value}</option>`).join("")}
+            </select>
+          </label>
+          <div class="stack">
+            ${citationList.map((item) => `
+              <button class="list-item${item.citation_review_id === state.selectedCitationId ? " active" : ""}" data-citation-id="${escapeHtml(item.citation_review_id)}">
+                <strong>${escapeHtml(item.citation_key || item.title || item.locator || item.citation_review_id)}</strong>
+                <span class="tiny">${escapeHtml(item.source_kind)} · ${escapeHtml(item.status)}</span>
+              </button>
+            `).join("")}
+          </div>
+        </section>
+        ${renderCitationPanel(citation)}
+      </div>
+    </main>
+  `;
+
+  document.querySelectorAll("[data-concept-id]").forEach((node) => {
+    node.addEventListener("click", () => {
+      state.selectedConceptId = node.getAttribute("data-concept-id");
+      render();
+    });
+  });
+  document.querySelectorAll("[data-citation-id]").forEach((node) => {
+    node.addEventListener("click", () => {
+      state.selectedCitationId = node.getAttribute("data-citation-id");
+      render();
+    });
+  });
+  document.getElementById("concept-search")?.addEventListener("input", (event) => {
+    state.conceptSearch = event.target.value;
+    render();
+  });
+  document.getElementById("citation-filter")?.addEventListener("change", (event) => {
+    state.citationFilter = event.target.value;
+    render();
+  });
+  document.getElementById("concept-form")?.addEventListener("submit", async (event) => {
+    event.preventDefault();
+    await saveConcept(new FormData(event.target));
+  });
+  document.getElementById("citation-form")?.addEventListener("submit", async (event) => {
+    event.preventDefault();
+    await saveCitation(new FormData(event.target));
+  });
+  document.getElementById("verify-citation")?.addEventListener("click", async () => {
+    if (state.selectedCitationId) {
+      await verifyCitation(state.selectedCitationId);
+    }
+  });
+}
+
+loadReviewData();
diff --git a/src/groundrecall/review_app/index.html b/src/groundrecall/review_app/index.html
new file mode 100644
index 0000000..bbaab84
--- /dev/null
+++ b/src/groundrecall/review_app/index.html
@@ -0,0 +1,13 @@
+<!DOCTYPE html>
+<html lang="en">
+  <head>
+    <meta charset="utf-8" />
+    <meta name="viewport" content="width=device-width, initial-scale=1" />
+    <title>GroundRecall Review Workbench</title>
+    <link rel="stylesheet" href="/styles.css" />
+  </head>
+  <body>
+    <div id="app"></div>
+    <script src="/app.js"></script>
+  </body>
+</html>
    + + + diff --git a/src/groundrecall/review_app/styles.css b/src/groundrecall/review_app/styles.css new file mode 100644 index 0000000..4bae6ef --- /dev/null +++ b/src/groundrecall/review_app/styles.css @@ -0,0 +1,248 @@ +:root { + --bg: #f4f0e8; + --panel: rgba(255, 251, 245, 0.92); + --panel-strong: #fffdfa; + --ink: #1f2933; + --muted: #5f6c76; + --accent: #0f766e; + --accent-strong: #134e4a; + --warn: #9a3412; + --line: rgba(31, 41, 51, 0.12); + --shadow: 0 20px 60px rgba(31, 41, 51, 0.08); +} + +* { + box-sizing: border-box; +} + +body { + margin: 0; + color: var(--ink); + background: + radial-gradient(circle at top right, rgba(15, 118, 110, 0.14), transparent 28%), + radial-gradient(circle at top left, rgba(154, 52, 18, 0.08), transparent 24%), + linear-gradient(180deg, #fbf8f1 0%, var(--bg) 100%); + font-family: "Iowan Old Style", "Palatino Linotype", "Book Antiqua", serif; +} + +button, +input, +select, +textarea { + font: inherit; +} + +.shell { + width: min(1500px, calc(100vw - 32px)); + margin: 24px auto 48px; +} + +.hero, +.panel { + background: var(--panel); + border: 1px solid var(--line); + border-radius: 20px; + box-shadow: var(--shadow); +} + +.hero { + display: grid; + grid-template-columns: 1.6fr 1fr; + gap: 20px; + padding: 24px; + margin-bottom: 18px; +} + +.hero h1, +.panel h2, +.panel h3 { + margin: 0 0 8px; + font-family: Georgia, "Times New Roman", serif; +} + +.hero-stats { + display: grid; + grid-template-columns: repeat(2, minmax(0, 1fr)); + gap: 12px; +} + +.stat { + padding: 14px; + border-radius: 16px; + background: var(--panel-strong); + border: 1px solid var(--line); +} + +.stat strong { + display: block; + font-size: 1.8rem; +} + +.stat span, +.muted, +.tiny, +.help { + color: var(--muted); +} + +.message { + margin-top: 10px; + color: var(--accent-strong); +} + +.workspace-grid { + display: grid; + grid-template-columns: minmax(280px, 360px) 1fr; + gap: 18px; + margin-bottom: 18px; +} + +.panel { + padding: 18px; +} + +.panel-head { + display: flex; + justify-content: space-between; + align-items: flex-start; + gap: 12px; + margin-bottom: 12px; +} + +.list-panel { + max-height: 78vh; + overflow: auto; +} + +.search, +label { + display: grid; + gap: 6px; + margin-bottom: 12px; +} + +.stack { + display: grid; + gap: 10px; +} + +.list-item, +.primary, +.secondary { + border: 1px solid var(--line); + border-radius: 14px; + background: var(--panel-strong); +} + +.list-item { + display: grid; + gap: 4px; + width: 100%; + padding: 12px; + text-align: left; + cursor: pointer; +} + +.list-item.active { + border-color: rgba(15, 118, 110, 0.45); + background: rgba(15, 118, 110, 0.08); +} + +input, +select, +textarea { + width: 100%; + padding: 10px 12px; + border: 1px solid var(--line); + border-radius: 12px; + background: #fff; +} + +textarea { + resize: vertical; +} + +.actions { + display: flex; + justify-content: flex-end; +} + +.primary { + padding: 10px 16px; + color: white; + background: linear-gradient(135deg, var(--accent) 0%, var(--accent-strong) 100%); + cursor: pointer; +} + +.secondary { + padding: 10px 16px; + color: var(--accent-strong); + cursor: pointer; +} + +.subpanel { + margin-top: 16px; + padding-top: 14px; + border-top: 1px solid var(--line); +} + +.claim-card, +.support-block { + padding: 12px; + border-radius: 14px; + background: var(--panel-strong); + border: 1px solid var(--line); +} + +.claim-head { + display: flex; + justify-content: space-between; + gap: 12px; + margin-bottom: 8px; +} + +.chip, +.pill { + display: inline-flex; + 
align-items: center; + padding: 4px 10px; + border-radius: 999px; + border: 1px solid var(--line); + background: #fff; + font-size: 0.85rem; +} + +.pill-good { + color: var(--accent-strong); +} + +.pill-warn { + color: var(--warn); +} + +ul { + margin: 0; + padding-left: 20px; +} + +.json-block { + padding: 12px; + overflow: auto; + border-radius: 14px; + background: #f8f7f3; + border: 1px solid var(--line); + font-family: "SFMono-Regular", Consolas, "Liberation Mono", monospace; + font-size: 0.9rem; + white-space: pre-wrap; +} + +@media (max-width: 980px) { + .hero, + .workspace-grid { + grid-template-columns: 1fr; + } + + .list-panel { + max-height: none; + } +} diff --git a/src/groundrecall/review_export.py b/src/groundrecall/review_export.py new file mode 100644 index 0000000..22628ea --- /dev/null +++ b/src/groundrecall/review_export.py @@ -0,0 +1,439 @@ +from __future__ import annotations +from pathlib import Path +import hashlib +import json, yaml +import re +import sys +from collections import defaultdict +from typing import Any, Callable +from .citation_support import bibliography_summary_payload, load_bibliography_index, serialize_bib_entry +from .review_schema import CitationReviewEntry, ReviewSession + +def export_review_state_json(session: ReviewSession, path: str | Path) -> None: + Path(path).write_text(session.model_dump_json(indent=2), encoding="utf-8") + +def export_promoted_pack(session: ReviewSession, outdir: str | Path) -> None: + outdir = Path(outdir) + outdir.mkdir(parents=True, exist_ok=True) + promoted_pack = dict(session.draft_pack.pack) + promoted_pack["version"] = str(promoted_pack.get("version", "0.1.0-draft")).replace("-draft", "-reviewed") + promoted_pack["curation"] = {"reviewer": session.reviewer, "ledger_entries": len(session.ledger)} + + concepts = [] + for concept in session.draft_pack.concepts: + if concept.status == "rejected": + continue + concepts.append({ + "id": concept.concept_id, + "title": concept.title, + "description": concept.description, + "prerequisites": concept.prerequisites, + "mastery_signals": concept.mastery_signals, + "status": concept.status, + "notes": concept.notes, + "mastery_profile": {}, + }) + + (outdir / "pack.yaml").write_text(yaml.safe_dump(promoted_pack, sort_keys=False), encoding="utf-8") + (outdir / "concepts.yaml").write_text(yaml.safe_dump({"concepts": concepts}, sort_keys=False), encoding="utf-8") + (outdir / "review_ledger.json").write_text(json.dumps(session.model_dump(), indent=2), encoding="utf-8") + (outdir / "license_attribution.json").write_text(json.dumps(session.draft_pack.attribution, indent=2), encoding="utf-8") + + +def export_promoted_pack_to_course_repo(session: ReviewSession, course_repo: str | Path, outdir: str | Path | None = None) -> Path: + from .course_repo import resolve_course_repo + + resolved = resolve_course_repo(course_repo) + target = Path(outdir) if outdir is not None else Path(resolved.generated_pack_dir or (Path(resolved.repo_root) / "generated" / "pack")) + export_promoted_pack(session, target) + return target + + +LATEX_CITE_RE = re.compile(r"\\cite[a-zA-Z*]*(?:\[[^\]]*\])?(?:\[[^\]]*\])?\{([^}]+)\}") + + +def _read_json(path: Path) -> dict[str, Any]: + return json.loads(path.read_text(encoding="utf-8")) + + +def _read_jsonl(path: Path) -> list[dict[str, Any]]: + text = path.read_text(encoding="utf-8").strip() + if not text: + return [] + return [json.loads(line) for line in text.splitlines()] + + +def _status_field_spec() -> dict[str, Any]: + return { + "field": "status", + "label": 
"Review status", + "input": "select", + "required": True, + "options": [ + { + "value": "trusted", + "label": "Trusted", + "help": "Promote this concept and its supported claims when the evidence and wording are ready.", + }, + { + "value": "provisional", + "label": "Provisional", + "help": "Keep this concept in reviewed state when it is promising but still needs citation or wording cleanup.", + }, + { + "value": "needs_review", + "label": "Needs Review", + "help": "Leave undecided when support, scope, or concept boundaries are still unclear.", + }, + { + "value": "rejected", + "label": "Rejected", + "help": "Exclude this concept when it is noise, unsupported, duplicated, or misleading.", + }, + ], + } + + +def _text_field_spec(field: str, label: str, help_text: str, *, multiline: bool = False) -> dict[str, Any]: + return { + "field": field, + "label": label, + "input": "textarea" if multiline else "text", + "required": False, + "help": help_text, + } + + +def _citation_status_field_spec() -> dict[str, Any]: + return { + "field": "status", + "label": "Citation review status", + "input": "select", + "required": True, + "options": [ + { + "value": "unreviewed", + "label": "Unreviewed", + "help": "Keep this citation candidate in triage until fit and existence are checked.", + }, + { + "value": "verified", + "label": "Verified", + "help": "The cited work exists and materially supports the associated manuscript claim.", + }, + { + "value": "needs_source_check", + "label": "Needs Source Check", + "help": "The citation may be useful but still needs direct source inspection or metadata cleanup.", + }, + { + "value": "misleading", + "label": "Misleading", + "help": "The citation exists but overstates, contradicts, or poorly fits the claim.", + }, + { + "value": "irrelevant", + "label": "Irrelevant", + "help": "The citation does not materially support the concept or claim under review.", + }, + { + "value": "fabricated", + "label": "Fabricated", + "help": "The citation appears invented, malformed, or otherwise not real.", + }, + ], + } + + +def _load_citegeist_extract() -> tuple[Callable[[str], list[Any]] | None, list[str]]: + citegeist_src = Path("/home/netuser/bin/CiteGeist/src") + if citegeist_src.exists(): + sys.path.insert(0, str(citegeist_src)) + try: + from citegeist import available_extraction_backends, extract_references # type: ignore + except Exception: + return None, [] + return extract_references, list(available_extraction_backends()) + + +def _extract_citation_keys(text: str) -> list[str]: + keys: list[str] = [] + for raw_group in LATEX_CITE_RE.findall(text): + keys.extend(part.strip() for part in raw_group.split(",") if part.strip()) + return sorted(set(keys)) + + +def _artifact_citation_payloads( + artifacts: list[dict[str, Any]], + *, + source_root: str, +) -> tuple[list[dict[str, Any]], dict[str, dict[str, Any]]]: + extract_references, backends = _load_citegeist_extract() + artifact_payloads: list[dict[str, Any]] = [] + summaries: dict[str, dict[str, Any]] = {} + root = Path(source_root) if source_root else None + bibliography_index = load_bibliography_index(source_root) if source_root else {} + + for artifact in artifacts: + path = Path(source_root) / artifact["path"] if root is not None else None + raw_text = "" + if path is not None and path.exists(): + try: + raw_text = path.read_text(encoding="utf-8") + except UnicodeDecodeError: + raw_text = "" + citation_keys = _extract_citation_keys(raw_text) if raw_text else [] + extracted_refs: list[dict[str, Any]] = [] + if 
extract_references is not None and raw_text: + try: + for entry in extract_references(raw_text): + extracted_refs.append( + { + "citation_key": "", + "entry_type": entry.entry_type, + "title": entry.fields.get("title", ""), + "author": entry.fields.get("author", ""), + "year": entry.fields.get("year", ""), + "venue": entry.fields.get("journal", "") or entry.fields.get("booktitle", ""), + } + ) + except Exception: + extracted_refs = [] + + payload = { + "artifact_id": artifact["artifact_id"], + "path": artifact["path"], + "title": artifact.get("title", ""), + "citation_keys": citation_keys, + "resolved_entries": [serialize_bib_entry(bibliography_index.get(key)) for key in citation_keys if bibliography_index.get(key)], + "citation_key_count": len(citation_keys), + "extracted_references": extracted_refs[:12], + "extracted_reference_count": len(extracted_refs), + "citegeist_backends": backends, + } + artifact_payloads.append(payload) + summaries[artifact["artifact_id"]] = { + "citation_key_count": len(citation_keys), + "extracted_reference_count": len(extracted_refs), + "has_citation_support": bool(citation_keys or extracted_refs), + } + return artifact_payloads, summaries + + +def build_citation_review_entries_from_import(import_dir: str | Path) -> list[CitationReviewEntry]: + base = Path(import_dir) + manifest = _read_json(base / "manifest.json") + artifacts = _read_jsonl(base / "artifacts.jsonl") + observations = _read_jsonl(base / "observations.jsonl") + claims = _read_jsonl(base / "claims.jsonl") + bibliography_index = load_bibliography_index(manifest.get("source_root", "")) + + artifact_payloads, _ = _artifact_citation_payloads( + artifacts, + source_root=manifest.get("source_root", ""), + ) + observations_by_id = {item["observation_id"]: item for item in observations} + artifact_claim_links: dict[str, dict[str, set[str]]] = defaultdict(lambda: {"claim_ids": set(), "concept_ids": set()}) + + for claim in claims: + artifact_ids = { + observations_by_id[item]["artifact_id"] + for item in claim.get("source_observation_ids", []) + if item in observations_by_id and observations_by_id[item].get("artifact_id") + } + for artifact_id in artifact_ids: + artifact_claim_links[artifact_id]["claim_ids"].add(claim["claim_id"]) + artifact_claim_links[artifact_id]["concept_ids"].update( + concept_id.replace("concept::", "", 1) for concept_id in claim.get("concept_ids", []) + ) + + entries: list[CitationReviewEntry] = [] + for artifact in artifact_payloads: + link_payload = artifact_claim_links.get(artifact["artifact_id"], {"claim_ids": set(), "concept_ids": set()}) + for citation_key in artifact.get("citation_keys", []): + digest = hashlib.sha1(f"{artifact['artifact_id']}|key|{citation_key}".encode("utf-8")).hexdigest()[:12] + bib_entry = bibliography_index.get(citation_key, {}) + fields = bib_entry.get("fields", {}) + entries.append( + CitationReviewEntry( + citation_review_id=f"citrev-{digest}", + artifact_id=artifact["artifact_id"], + artifact_path=artifact.get("path", ""), + artifact_title=artifact.get("title", ""), + source_kind="citation_key", + locator=artifact.get("path", ""), + citation_key=citation_key, + title=str(fields.get("title", "")), + author=str(fields.get("author", "")), + year=str(fields.get("year", "")), + venue=str(fields.get("journal", "") or fields.get("booktitle", "") or fields.get("publisher", "")), + source_bib_path=str(bib_entry.get("source_bib_path", "")), + raw_bibtex=str(bib_entry.get("raw_bibtex", "")), + related_concept_ids=sorted(link_payload["concept_ids"]), + 
related_claim_ids=sorted(link_payload["claim_ids"]), + ) + ) + for index, reference in enumerate(artifact.get("extracted_references", []), start=1): + digest = hashlib.sha1( + f"{artifact['artifact_id']}|ref|{reference.get('title', '')}|{reference.get('author', '')}|{index}".encode("utf-8") + ).hexdigest()[:12] + entries.append( + CitationReviewEntry( + citation_review_id=f"citrev-{digest}", + artifact_id=artifact["artifact_id"], + artifact_path=artifact.get("path", ""), + artifact_title=artifact.get("title", ""), + source_kind="extracted_reference", + locator=f"{artifact.get('path', '')}#ref-{index}", + citation_key="", + title=reference.get("title", ""), + author=reference.get("author", ""), + year=reference.get("year", ""), + venue=reference.get("venue", ""), + related_concept_ids=sorted(link_payload["concept_ids"]), + related_claim_ids=sorted(link_payload["claim_ids"]), + ) + ) + return entries + + +def _build_import_review_payload(session: ReviewSession, import_dir: Path) -> dict[str, Any]: + manifest = _read_json(import_dir / "manifest.json") + lint_payload = _read_json(import_dir / "lint_findings.json") + queue_payload = _read_json(import_dir / "review_queue.json") + artifacts = _read_jsonl(import_dir / "artifacts.jsonl") + observations = _read_jsonl(import_dir / "observations.jsonl") + claims = _read_jsonl(import_dir / "claims.jsonl") + + observations_by_id = {item["observation_id"]: item for item in observations} + claims_by_concept: dict[str, list[dict[str, Any]]] = defaultdict(list) + findings_by_target: dict[str, list[dict[str, Any]]] = defaultdict(list) + for finding in lint_payload.get("findings", []): + findings_by_target[finding["target_id"]].append(finding) + for claim in claims: + for concept_id in claim.get("concept_ids", []): + claims_by_concept[concept_id].append(claim) + + artifact_citations, artifact_citation_summary = _artifact_citation_payloads( + artifacts, + source_root=manifest.get("source_root", ""), + ) + artifact_by_id = {item["artifact_id"]: item for item in artifacts} + + concept_reviews: list[dict[str, Any]] = [] + for concept in session.draft_pack.concepts: + full_concept_id = f"concept::{concept.concept_id}" if not concept.concept_id.startswith("concept::") else concept.concept_id + concept_claims = claims_by_concept.get(full_concept_id, []) + claim_payloads: list[dict[str, Any]] = [] + has_citation_support = False + for claim in concept_claims[:25]: + supporting_observations = [observations_by_id[item] for item in claim.get("source_observation_ids", []) if item in observations_by_id] + artifact_ids = {item["artifact_id"] for item in supporting_observations} + citation_support = [artifact_citation_summary.get(artifact_id, {}) for artifact_id in artifact_ids] + has_citation_support = has_citation_support or any(item.get("has_citation_support") for item in citation_support) + claim_payloads.append( + { + "claim_id": claim["claim_id"], + "claim_text": claim.get("claim_text", ""), + "claim_kind": claim.get("claim_kind", ""), + "grounding_status": claim.get("grounding_status", "unknown"), + "supporting_observations": [ + { + "observation_id": obs["observation_id"], + "origin_path": obs.get("origin_path", ""), + "origin_section": obs.get("origin_section", ""), + "text": obs.get("text", ""), + "line_start": obs.get("line_start", 0), + "line_end": obs.get("line_end", 0), + } + for obs in supporting_observations + ], + "citation_support": citation_support, + "artifact_paths": [artifact_by_id[item]["path"] for item in artifact_ids if item in artifact_by_id], + 
"finding_messages": [item["message"] for item in findings_by_target.get(claim["claim_id"], [])], + } + ) + + concept_reviews.append( + { + "concept_id": concept.concept_id, + "title": concept.title, + "status": concept.status, + "description": concept.description, + "review_help": ( + "Prefer `trusted` when claims are coherent and citation-bearing support is appropriate; " + "prefer `provisional` when the concept is plausible but still needs citation or wording cleanup." + ), + "claim_count": len(concept_claims), + "grounded_claim_count": sum(1 for item in concept_claims if item.get("grounding_status") == "grounded"), + "warning_count": len(findings_by_target.get(full_concept_id, [])), + "has_citation_support": has_citation_support, + "top_claims": claim_payloads, + "notes": list(concept.notes), + } + ) + + return { + "import_context": { + "manifest": manifest, + "lint_summary": lint_payload.get("summary", {}), + "queue_length": queue_payload.get("queue_length", 0), + "source_adapter": manifest.get("source_adapter", ""), + }, + "review_guidance": { + "overview": ( + "Review concepts first, then inspect representative claims and their source observations before promotion." + ), + "priorities": [ + "Focus reviewer effort on concepts with strong grounded claims and explicit citations first.", + "Downgrade or reject concepts whose claims are fragmented, duplicated, or missing meaningful support.", + "For academic material, citation-bearing claims deserve special scrutiny for fit, contradiction, and fabrication risk.", + ], + "citation_guidance": [ + "A citation key or extracted reference is evidence of traceability, not correctness.", + "Check whether the cited work actually supports the claim and whether the claim overstates it.", + "Use the citation track to prioritize claims that can move into a separate citation-ingestion workflow.", + ], + }, + "field_specs": [ + _status_field_spec(), + _text_field_spec("description", "Concept description", "Refine the concept summary to match the strongest supported interpretation."), + _text_field_spec("notes", "Reviewer notes", "Record why this concept is trusted, provisional, rejected, or still unclear.", multiline=True), + _text_field_spec("prerequisites", "Prerequisites", "List prerequisite concepts only when the manuscript support is explicit or defensible.", multiline=True), + ], + "citation_field_specs": [ + _citation_status_field_spec(), + _text_field_spec("notes", "Citation notes", "Record whether the cited work exists, fits the claim, or should move into a dedicated citation-ingestion lane.", multiline=True), + ], + "concept_reviews": concept_reviews, + "citation_reviews": [entry.model_dump() for entry in session.citation_reviews], + "bibliography": bibliography_summary_payload(manifest.get("source_root", "")), + "citations": { + "enabled": True, + "provider": "citegeist" if artifact_citations and artifact_citations[0].get("citegeist_backends") else "none", + "artifacts": artifact_citations, + "summary": { + "artifact_count_with_citations": sum(1 for item in artifact_citations if item["citation_key_count"] or item["extracted_reference_count"]), + "citation_key_total": sum(item["citation_key_count"] for item in artifact_citations), + "extracted_reference_total": sum(item["extracted_reference_count"] for item in artifact_citations), + }, + "next_actions": [ + "Promote citation-bearing claims into a dedicated citation review lane.", + "Use CiteGeist extraction as a first pass, then verify support and metadata before trusting the citation.", + ], 
+ }, + } + + +def export_review_ui_data(session: ReviewSession, outdir: str | Path, import_dir: str | Path | None = None) -> None: + outdir = Path(outdir) + outdir.mkdir(parents=True, exist_ok=True) + payload = { + "reviewer": session.reviewer, + "draft_pack": session.draft_pack.model_dump(), + "citation_reviews": [entry.model_dump() for entry in session.citation_reviews], + "ledger": [entry.model_dump() for entry in session.ledger], + } + if import_dir is not None: + payload.update(_build_import_review_payload(session, Path(import_dir))) + (outdir / "review_data.json").write_text(json.dumps(payload, indent=2), encoding="utf-8") diff --git a/src/groundrecall/review_schema.py b/src/groundrecall/review_schema.py new file mode 100644 index 0000000..e165094 --- /dev/null +++ b/src/groundrecall/review_schema.py @@ -0,0 +1,80 @@ +from __future__ import annotations +from pydantic import BaseModel, Field +from typing import Literal + +TrustStatus = Literal["trusted", "provisional", "rejected", "needs_review"] +CitationStatus = Literal["unreviewed", "verified", "needs_source_check", "misleading", "irrelevant", "fabricated"] + +class ConceptReviewEntry(BaseModel): + concept_id: str + title: str + description: str = "" + prerequisites: list[str] = Field(default_factory=list) + mastery_signals: list[str] = Field(default_factory=list) + status: TrustStatus = "needs_review" + notes: list[str] = Field(default_factory=list) + + +class CitationReviewEntry(BaseModel): + citation_review_id: str + artifact_id: str + artifact_path: str = "" + artifact_title: str = "" + source_kind: Literal["citation_key", "extracted_reference"] = "citation_key" + locator: str = "" + citation_key: str = "" + title: str = "" + author: str = "" + year: str = "" + venue: str = "" + source_bib_path: str = "" + raw_bibtex: str = "" + status: CitationStatus = "unreviewed" + notes: list[str] = Field(default_factory=list) + related_concept_ids: list[str] = Field(default_factory=list) + related_claim_ids: list[str] = Field(default_factory=list) + +class DraftPackData(BaseModel): + pack: dict = Field(default_factory=dict) + concepts: list[ConceptReviewEntry] = Field(default_factory=list) + conflicts: list[str] = Field(default_factory=list) + review_flags: list[str] = Field(default_factory=list) + attribution: dict = Field(default_factory=dict) + +class ReviewAction(BaseModel): + action_type: str + target: str = "" + payload: dict = Field(default_factory=dict) + rationale: str = "" + +class ReviewLedgerEntry(BaseModel): + reviewer: str + action: ReviewAction + +class ReviewSession(BaseModel): + reviewer: str + draft_pack: DraftPackData + citation_reviews: list[CitationReviewEntry] = Field(default_factory=list) + ledger: list[ReviewLedgerEntry] = Field(default_factory=list) + +class WorkspaceMeta(BaseModel): + workspace_id: str + title: str + path: str + created_at: str + last_opened_at: str + notes: str = "" + +class WorkspaceRegistry(BaseModel): + workspaces: list[WorkspaceMeta] = Field(default_factory=list) + recent_workspace_ids: list[str] = Field(default_factory=list) + +class ImportPreview(BaseModel): + ok: bool = False + source_dir: str + workspace_id: str + overwrite_required: bool = False + errors: list[str] = Field(default_factory=list) + warnings: list[str] = Field(default_factory=list) + summary: dict = Field(default_factory=dict) + semantic_warnings: list[str] = Field(default_factory=list) diff --git a/src/groundrecall/review_server.py b/src/groundrecall/review_server.py new file mode 100644 index 0000000..f482e67 --- 
/dev/null +++ b/src/groundrecall/review_server.py @@ -0,0 +1,246 @@ +from __future__ import annotations + +import argparse +import json +import mimetypes +from http.server import BaseHTTPRequestHandler, HTTPServer +from pathlib import Path +from urllib.parse import parse_qs, urlparse + +from .citation_support import materialize_citegeist_store +from .promotion import promote_import_to_store +from .review_workspace import GroundRecallReviewWorkspace + + +def _json_response(handler: BaseHTTPRequestHandler, status: int, payload: dict) -> None: + body = json.dumps(payload, indent=2).encode("utf-8") + handler.send_response(status) + handler.send_header("Content-Type", "application/json") + handler.send_header("Content-Length", str(len(body))) + handler.send_header("Access-Control-Allow-Origin", "*") + handler.send_header("Access-Control-Allow-Methods", "GET,POST,OPTIONS") + handler.send_header("Access-Control-Allow-Headers", "Content-Type") + handler.end_headers() + handler.wfile.write(body) + + +def _serve_static(handler: BaseHTTPRequestHandler, asset_path: Path) -> None: + if not asset_path.exists(): + _json_response(handler, 404, {"error": "asset not found"}) + return + body = asset_path.read_bytes() + handler.send_response(200) + handler.send_header("Content-Type", mimetypes.guess_type(str(asset_path))[0] or "application/octet-stream") + handler.send_header("Content-Length", str(len(body))) + handler.end_headers() + handler.wfile.write(body) + + +def _safe_show_entry(api: object, citation_key: str) -> dict | None: + if not citation_key: + return None + try: + return api.show_entry( # type: ignore[attr-defined] + citation_key, + include_provenance=True, + include_conflicts=True, + include_bibtex=True, + ) + except AttributeError: + pass + + store = getattr(api, "store", None) + if store is None: + return None + entry = store.get_entry(citation_key) + if entry is None: + return None + payload = dict(entry) + if hasattr(store, "get_field_provenance"): + try: + payload["provenance"] = store.get_field_provenance(citation_key) + except Exception: + payload["provenance"] = [] + if hasattr(store, "get_conflicts"): + try: + payload["conflicts"] = store.get_conflicts(citation_key) + except Exception: + payload["conflicts"] = [] + else: + payload["conflicts"] = [] + if hasattr(store, "get_entry_bibtex"): + try: + payload["bibtex"] = store.get_entry_bibtex(citation_key) + except Exception: + payload["bibtex"] = None + return payload + + +def _safe_verify_entry(api: object, entry: object, *, context: str, limit: int) -> dict: + if getattr(entry, "raw_bibtex", ""): + try: + return api.verify_bibtex(entry.raw_bibtex, context=context, limit=limit) # type: ignore[attr-defined] + except Exception: + pass + values = [item for item in [getattr(entry, "citation_key", ""), getattr(entry, "title", ""), getattr(entry, "author", ""), getattr(entry, "year", "")] if item] + try: + return api.verify_strings(values, context=context, limit=limit) # type: ignore[attr-defined] + except Exception as exc: + return { + "context": context, + "results": [], + "error": str(exc), + } + + +class GroundRecallReviewHandler(BaseHTTPRequestHandler): + workspace: GroundRecallReviewWorkspace + default_store_dir: str | None = None + citegeist_bundle: dict | None = None + + def do_OPTIONS(self) -> None: + _json_response(self, 200, {"ok": True}) + + def do_GET(self) -> None: + parsed = urlparse(self.path) + if parsed.path == "/api/healthz": + _json_response(self, 200, {"ok": True}) + return + if parsed.path == "/api/load": + review_data = 
self.workspace.load_review_data() + review_data["citegeist"] = { + "enabled": bool(self.citegeist_bundle and self.citegeist_bundle.get("available")), + "db_path": self.citegeist_bundle.get("db_path") if self.citegeist_bundle else "", + "ingested_files": self.citegeist_bundle.get("ingested_files", []) if self.citegeist_bundle else [], + "show_entry_endpoint": "/api/citations/show-entry", + "verify_endpoint": "/api/citations/verify", + } + _json_response( + self, + 200, + { + "ok": True, + "import_dir": str(self.workspace.import_dir), + "review_data": review_data, + }, + ) + return + if parsed.path == "/api/citations/show-entry": + if not self.citegeist_bundle or not self.citegeist_bundle.get("available"): + _json_response(self, 404, {"ok": False, "error": "citegeist unavailable"}) + return + citation_key = parse_qs(parsed.query).get("citation_key", [""])[0] + if not citation_key: + _json_response(self, 400, {"ok": False, "error": "citation_key is required"}) + return + payload = _safe_show_entry(self.citegeist_bundle["api"], citation_key) + _json_response(self, 200, {"ok": payload is not None, "entry": payload}) + return + + asset_root = Path(__file__).with_name("review_app") + if parsed.path in {"/", "/index.html"}: + _serve_static(self, asset_root / "index.html") + return + if parsed.path == "/app.js": + _serve_static(self, asset_root / "app.js") + return + if parsed.path == "/styles.css": + _serve_static(self, asset_root / "styles.css") + return + _json_response(self, 404, {"error": "not found"}) + + def do_POST(self) -> None: + parsed = urlparse(self.path) + length = int(self.headers.get("Content-Length", "0")) + raw = self.rfile.read(length) if length else b"{}" + payload = json.loads(raw.decode("utf-8") or "{}") + + if parsed.path == "/api/save": + self.workspace.apply_updates( + concept_updates=payload.get("concept_updates"), + citation_updates=payload.get("citation_updates"), + reviewer=payload.get("reviewer"), + ) + _json_response( + self, + 200, + { + "ok": True, + "import_dir": str(self.workspace.import_dir), + "review_data": self.workspace.load_review_data(), + }, + ) + return + + if parsed.path == "/api/promote": + store_dir = payload.get("store_dir") or self.default_store_dir + if not store_dir: + _json_response(self, 400, {"ok": False, "error": "store_dir is required"}) + return + result = promote_import_to_store( + import_dir=self.workspace.import_dir, + store_dir=store_dir, + reviewer=payload.get("reviewer"), + snapshot_id=payload.get("snapshot_id"), + ) + _json_response(self, 200, {"ok": True, "promotion": result}) + return + if parsed.path == "/api/citations/verify": + if not self.citegeist_bundle or not self.citegeist_bundle.get("available"): + _json_response(self, 404, {"ok": False, "error": "citegeist unavailable"}) + return + citation_review_id = str(payload.get("citation_review_id") or "").strip() + if not citation_review_id: + _json_response(self, 400, {"ok": False, "error": "citation_review_id is required"}) + return + session = self.workspace.load_session() + entry = next((item for item in session.citation_reviews if item.citation_review_id == citation_review_id), None) + if entry is None: + _json_response(self, 404, {"ok": False, "error": "citation review entry not found"}) + return + api = self.citegeist_bundle["api"] + show_entry_payload = _safe_show_entry(api, entry.citation_key) if entry.citation_key else None + context = f"{entry.artifact_path} {entry.artifact_title}".strip() + verification = _safe_verify_entry(api, entry, context=context, 
limit=int(payload.get("limit", 5))) + _json_response( + self, + 200, + { + "ok": True, + "citation_review_id": citation_review_id, + "entry": show_entry_payload, + "verification": verification, + }, + ) + return + + _json_response(self, 404, {"error": "not found"}) + + +def build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser(description="GroundRecall local review server") + parser.add_argument("import_dir") + parser.add_argument("--host", default="127.0.0.1") + parser.add_argument("--port", type=int, default=8766) + parser.add_argument("--reviewer", default="GroundRecall Import") + parser.add_argument("--store-dir", default=None) + return parser + + +def main() -> None: + args = build_parser().parse_args() + GroundRecallReviewHandler.workspace = GroundRecallReviewWorkspace(args.import_dir, reviewer=args.reviewer) + GroundRecallReviewHandler.default_store_dir = args.store_dir + GroundRecallReviewHandler.workspace.ensure_review_bundle() + session = GroundRecallReviewHandler.workspace.load_session() + GroundRecallReviewHandler.citegeist_bundle = materialize_citegeist_store( + args.import_dir, + session.draft_pack.pack.get("source_root", ""), + ) + server = HTTPServer((args.host, args.port), GroundRecallReviewHandler) + print(f"GroundRecall review server listening on http://{args.host}:{args.port}") + server.serve_forever() + + +if __name__ == "__main__": + main() diff --git a/src/groundrecall/review_workspace.py b/src/groundrecall/review_workspace.py new file mode 100644 index 0000000..0b9c249 --- /dev/null +++ b/src/groundrecall/review_workspace.py @@ -0,0 +1,126 @@ +from __future__ import annotations + +import json +from pathlib import Path +from typing import Any + +from .groundrecall_review_bridge import export_review_bundle_from_import +from .review_export import build_citation_review_entries_from_import, export_review_state_json, export_review_ui_data +from .review_schema import ReviewAction, ReviewLedgerEntry, ReviewSession + + +def _normalize_lines(value: Any) -> list[str]: + if isinstance(value, list): + return [str(item).strip() for item in value if str(item).strip()] + if isinstance(value, str): + return [line.strip() for line in value.splitlines() if line.strip()] + return [] + + +class GroundRecallReviewWorkspace: + def __init__(self, import_dir: str | Path, reviewer: str = "GroundRecall Import") -> None: + self.import_dir = Path(import_dir) + self.reviewer = reviewer + + @property + def review_session_path(self) -> Path: + return self.import_dir / "review_session.json" + + @property + def review_data_path(self) -> Path: + return self.import_dir / "review_data.json" + + def ensure_review_bundle(self) -> None: + if not self.review_session_path.exists(): + export_review_bundle_from_import(self.import_dir, reviewer=self.reviewer) + return + session = ReviewSession.model_validate_json(self.review_session_path.read_text(encoding="utf-8")) + updated = False + if ( + not session.citation_reviews + or any(entry.source_kind == "citation_key" and not entry.title for entry in session.citation_reviews) + or any(entry.source_kind == "citation_key" and not entry.source_bib_path for entry in session.citation_reviews) + ): + session.citation_reviews = build_citation_review_entries_from_import(self.import_dir) + updated = True + if updated or not self.review_data_path.exists(): + self.save_session(session) + + def load_session(self) -> ReviewSession: + self.ensure_review_bundle() + return 
ReviewSession.model_validate_json(self.review_session_path.read_text(encoding="utf-8")) + + def save_session(self, session: ReviewSession) -> None: + export_review_state_json(session, self.review_session_path) + export_review_ui_data(session, self.import_dir, import_dir=self.import_dir) + + def load_review_data(self) -> dict[str, Any]: + self.ensure_review_bundle() + return json.loads(self.review_data_path.read_text(encoding="utf-8")) + + def apply_updates( + self, + *, + concept_updates: list[dict[str, Any]] | None = None, + citation_updates: list[dict[str, Any]] | None = None, + reviewer: str | None = None, + ) -> ReviewSession: + session = self.load_session() + if reviewer: + session.reviewer = reviewer + concept_by_id = {concept.concept_id: concept for concept in session.draft_pack.concepts} + citation_by_id = {entry.citation_review_id: entry for entry in session.citation_reviews} + + for payload in concept_updates or []: + concept_id = str(payload.get("concept_id", "")).strip() + if not concept_id or concept_id not in concept_by_id: + continue + concept = concept_by_id[concept_id] + if "status" in payload: + concept.status = payload["status"] + if "description" in payload: + concept.description = str(payload.get("description", "")).strip() + if "notes" in payload: + concept.notes = _normalize_lines(payload.get("notes")) + if "prerequisites" in payload: + concept.prerequisites = _normalize_lines(payload.get("prerequisites")) + session.ledger.append( + ReviewLedgerEntry( + reviewer=session.reviewer, + action=ReviewAction( + action_type="edit_concept", + target=concept_id, + payload={ + "status": concept.status, + "description": concept.description, + "notes": concept.notes, + "prerequisites": concept.prerequisites, + }, + rationale=str(payload.get("rationale", "")).strip(), + ), + ) + ) + + for payload in citation_updates or []: + citation_review_id = str(payload.get("citation_review_id", "")).strip() + if not citation_review_id or citation_review_id not in citation_by_id: + continue + entry = citation_by_id[citation_review_id] + if "status" in payload: + entry.status = payload["status"] + if "notes" in payload: + entry.notes = _normalize_lines(payload.get("notes")) + session.ledger.append( + ReviewLedgerEntry( + reviewer=session.reviewer, + action=ReviewAction( + action_type="edit_citation", + target=citation_review_id, + payload={"status": entry.status, "notes": entry.notes}, + rationale=str(payload.get("rationale", "")).strip(), + ), + ) + ) + + self.save_session(session) + return session diff --git a/src/groundrecall/source_adapters/__init__.py b/src/groundrecall/source_adapters/__init__.py new file mode 100644 index 0000000..f5ed856 --- /dev/null +++ b/src/groundrecall/source_adapters/__init__.py @@ -0,0 +1,3 @@ +from __future__ import annotations + +from .. 
import groundrecall_source_adapters as _legacy_source_adapters # noqa: F401 diff --git a/src/groundrecall/source_adapters/base.py b/src/groundrecall/source_adapters/base.py new file mode 100644 index 0000000..b67a196 --- /dev/null +++ b/src/groundrecall/source_adapters/base.py @@ -0,0 +1,3 @@ +from __future__ import annotations + +from ..groundrecall_source_adapters.base import * # noqa: F403 diff --git a/src/groundrecall/source_adapters/didactopus_pack.py b/src/groundrecall/source_adapters/didactopus_pack.py new file mode 100644 index 0000000..cc2bc52 --- /dev/null +++ b/src/groundrecall/source_adapters/didactopus_pack.py @@ -0,0 +1,3 @@ +from __future__ import annotations + +from ..groundrecall_source_adapters.didactopus_pack import * # noqa: F403 diff --git a/src/groundrecall/source_adapters/llmwiki.py b/src/groundrecall/source_adapters/llmwiki.py new file mode 100644 index 0000000..6f397fc --- /dev/null +++ b/src/groundrecall/source_adapters/llmwiki.py @@ -0,0 +1,3 @@ +from __future__ import annotations + +from ..groundrecall_source_adapters.llmwiki import * # noqa: F403 diff --git a/src/groundrecall/source_adapters/markdown_notes.py b/src/groundrecall/source_adapters/markdown_notes.py new file mode 100644 index 0000000..c0c5118 --- /dev/null +++ b/src/groundrecall/source_adapters/markdown_notes.py @@ -0,0 +1,3 @@ +from __future__ import annotations + +from ..groundrecall_source_adapters.markdown_notes import * # noqa: F403 diff --git a/src/groundrecall/source_adapters/polypaper.py b/src/groundrecall/source_adapters/polypaper.py new file mode 100644 index 0000000..c276da4 --- /dev/null +++ b/src/groundrecall/source_adapters/polypaper.py @@ -0,0 +1,3 @@ +from __future__ import annotations + +from ..groundrecall_source_adapters.polypaper import * # noqa: F403 diff --git a/src/groundrecall/source_adapters/transcript.py b/src/groundrecall/source_adapters/transcript.py new file mode 100644 index 0000000..53f3488 --- /dev/null +++ b/src/groundrecall/source_adapters/transcript.py @@ -0,0 +1,3 @@ +from __future__ import annotations + +from ..groundrecall_source_adapters.transcript import * # noqa: F403 diff --git a/src/groundrecall/store.py b/src/groundrecall/store.py new file mode 100644 index 0000000..28d3267 --- /dev/null +++ b/src/groundrecall/store.py @@ -0,0 +1,203 @@ +from __future__ import annotations + +import os +import tempfile +from pathlib import Path +from typing import TypeVar + +from pydantic import BaseModel + +from .models import ( + ArtifactRecord, + ClaimRecord, + ConceptRecord, + FragmentRecord, + GroundRecallSnapshot, + ObservationRecord, + PromotionRecord, + RelationRecord, + ReviewCandidateRecord, + SourceRecord, +) + + +ModelT = TypeVar("ModelT", bound=BaseModel) + + +class GroundRecallStore: + def __init__(self, base_dir: str | Path): + self.base_dir = Path(base_dir) + self.sources_dir = self.base_dir / "sources" + self.fragments_dir = self.base_dir / "fragments" + self.artifacts_dir = self.base_dir / "artifacts" + self.observations_dir = self.base_dir / "observations" + self.claims_dir = self.base_dir / "claims" + self.concepts_dir = self.base_dir / "concepts" + self.relations_dir = self.base_dir / "relations" + self.review_candidates_dir = self.base_dir / "review_candidates" + self.promotions_dir = self.base_dir / "promotions" + self.snapshots_dir = self.base_dir / "snapshots" + for path in [ + self.sources_dir, + self.fragments_dir, + self.artifacts_dir, + self.observations_dir, + self.claims_dir, + self.concepts_dir, + self.relations_dir, + 
self.review_candidates_dir, + self.promotions_dir, + self.snapshots_dir, + ]: + path.mkdir(parents=True, exist_ok=True) + + def _save(self, directory: Path, key: str, model: BaseModel) -> None: + target = directory / f"{key}.json" + payload = model.model_dump_json(indent=2) + self._write_text_atomic(target, payload) + + def _write_text_atomic(self, path: Path, text: str) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + fd, tmp_name = tempfile.mkstemp( + prefix=f".{path.name}.", + suffix=".tmp", + dir=path.parent, + text=True, + ) + tmp_path = Path(tmp_name) + try: + with os.fdopen(fd, "w", encoding="utf-8") as handle: + handle.write(text) + handle.flush() + os.fsync(handle.fileno()) + os.replace(tmp_path, path) + finally: + if tmp_path.exists(): + tmp_path.unlink() + + def _load(self, directory: Path, key: str, model_type: type[ModelT]) -> ModelT | None: + path = directory / f"{key}.json" + if not path.exists(): + return None + return model_type.model_validate_json(path.read_text(encoding="utf-8")) + + def _list(self, directory: Path, model_type: type[ModelT]) -> list[ModelT]: + items: list[ModelT] = [] + for path in sorted(directory.glob("*.json")): + items.append(model_type.model_validate_json(path.read_text(encoding="utf-8"))) + return items + + def save_source(self, record: SourceRecord) -> SourceRecord: + self._save(self.sources_dir, record.source_id, record) + return record + + def get_source(self, source_id: str) -> SourceRecord | None: + return self._load(self.sources_dir, source_id, SourceRecord) + + def list_sources(self) -> list[SourceRecord]: + return self._list(self.sources_dir, SourceRecord) + + def save_fragment(self, record: FragmentRecord) -> FragmentRecord: + self._save(self.fragments_dir, record.fragment_id, record) + return record + + def get_fragment(self, fragment_id: str) -> FragmentRecord | None: + return self._load(self.fragments_dir, fragment_id, FragmentRecord) + + def list_fragments(self) -> list[FragmentRecord]: + return self._list(self.fragments_dir, FragmentRecord) + + def save_artifact(self, record: ArtifactRecord) -> ArtifactRecord: + self._save(self.artifacts_dir, record.artifact_id, record) + return record + + def get_artifact(self, artifact_id: str) -> ArtifactRecord | None: + return self._load(self.artifacts_dir, artifact_id, ArtifactRecord) + + def list_artifacts(self) -> list[ArtifactRecord]: + return self._list(self.artifacts_dir, ArtifactRecord) + + def save_observation(self, record: ObservationRecord) -> ObservationRecord: + self._save(self.observations_dir, record.observation_id, record) + return record + + def get_observation(self, observation_id: str) -> ObservationRecord | None: + return self._load(self.observations_dir, observation_id, ObservationRecord) + + def list_observations(self) -> list[ObservationRecord]: + return self._list(self.observations_dir, ObservationRecord) + + def save_claim(self, record: ClaimRecord) -> ClaimRecord: + self._save(self.claims_dir, record.claim_id, record) + return record + + def get_claim(self, claim_id: str) -> ClaimRecord | None: + return self._load(self.claims_dir, claim_id, ClaimRecord) + + def list_claims(self) -> list[ClaimRecord]: + return self._list(self.claims_dir, ClaimRecord) + + def save_concept(self, record: ConceptRecord) -> ConceptRecord: + self._save(self.concepts_dir, record.concept_id.replace("::", "__"), record) + return record + + def get_concept(self, concept_id: str) -> ConceptRecord | None: + return self._load(self.concepts_dir, concept_id.replace("::", "__"), ConceptRecord) + 
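+    # Concept ids use the "concept::<slug>" form; save_concept/get_concept map
+    # "::" to "__" above so each record lands in a filesystem-safe JSON filename.
+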
+ def list_concepts(self) -> list[ConceptRecord]: + return self._list(self.concepts_dir, ConceptRecord) + + def save_relation(self, record: RelationRecord) -> RelationRecord: + self._save(self.relations_dir, record.relation_id, record) + return record + + def get_relation(self, relation_id: str) -> RelationRecord | None: + return self._load(self.relations_dir, relation_id, RelationRecord) + + def list_relations(self) -> list[RelationRecord]: + return self._list(self.relations_dir, RelationRecord) + + def save_review_candidate(self, record: ReviewCandidateRecord) -> ReviewCandidateRecord: + self._save(self.review_candidates_dir, record.review_candidate_id, record) + return record + + def get_review_candidate(self, review_candidate_id: str) -> ReviewCandidateRecord | None: + return self._load(self.review_candidates_dir, review_candidate_id, ReviewCandidateRecord) + + def list_review_candidates(self) -> list[ReviewCandidateRecord]: + return self._list(self.review_candidates_dir, ReviewCandidateRecord) + + def save_promotion(self, record: PromotionRecord) -> PromotionRecord: + self._save(self.promotions_dir, record.promotion_id, record) + return record + + def get_promotion(self, promotion_id: str) -> PromotionRecord | None: + return self._load(self.promotions_dir, promotion_id, PromotionRecord) + + def list_promotions(self) -> list[PromotionRecord]: + return self._list(self.promotions_dir, PromotionRecord) + + def save_snapshot(self, snapshot: GroundRecallSnapshot) -> GroundRecallSnapshot: + self._save(self.snapshots_dir, snapshot.snapshot_id, snapshot) + return snapshot + + def get_snapshot(self, snapshot_id: str) -> GroundRecallSnapshot | None: + return self._load(self.snapshots_dir, snapshot_id, GroundRecallSnapshot) + + def list_snapshots(self) -> list[GroundRecallSnapshot]: + return self._list(self.snapshots_dir, GroundRecallSnapshot) + + def build_snapshot(self, snapshot_id: str, created_at: str, metadata: dict | None = None) -> GroundRecallSnapshot: + return GroundRecallSnapshot( + snapshot_id=snapshot_id, + created_at=created_at, + sources=self.list_sources(), + fragments=self.list_fragments(), + artifacts=self.list_artifacts(), + observations=self.list_observations(), + claims=self.list_claims(), + concepts=self.list_concepts(), + relations=self.list_relations(), + promotions=self.list_promotions(), + metadata=metadata or {}, + ) diff --git a/tests/test_console_script.py b/tests/test_console_script.py new file mode 100644 index 0000000..0a1f2bf --- /dev/null +++ b/tests/test_console_script.py @@ -0,0 +1,17 @@ +import shutil +import subprocess + + +def test_groundrecall_console_script_help() -> None: + executable = shutil.which("groundrecall") + assert executable is not None + + result = subprocess.run( + [executable, "--help"], + capture_output=True, + text=True, + check=False, + ) + + assert result.returncode == 0 + assert "GroundRecall command-line tools" in result.stdout diff --git a/tests/test_groundrecall_assistants.py b/tests/test_groundrecall_assistants.py new file mode 100644 index 0000000..2913f87 --- /dev/null +++ b/tests/test_groundrecall_assistants.py @@ -0,0 +1,134 @@ +from __future__ import annotations + +import json +from pathlib import Path + +from groundrecall.assistant_export import export_assistant_bundle +from groundrecall.assistants.base import get_assistant_adapter, list_assistant_adapters +import groundrecall.assistants.codex # noqa: F401 +import groundrecall.assistants.claude_code # noqa: F401 +from groundrecall.models import ( + ArtifactRecord, + ClaimRecord, 
+ ConceptRecord, + ObservationRecord, + ProvenanceRecord, + RelationRecord, +) +from groundrecall.query import build_query_bundle_for_concept +from groundrecall.store import GroundRecallStore + + +def _seed_store(store: GroundRecallStore) -> None: + store.save_artifact( + ArtifactRecord( + artifact_id="ia_001", + artifact_kind="compiled_page", + title="Channel Capacity", + path="wiki/channel-capacity.md", + current_status="reviewed", + ) + ) + store.save_observation( + ObservationRecord( + observation_id="obs_001", + artifact_id="ia_001", + role="claim", + text="Reliable communication rate is bounded by channel capacity.", + provenance=ProvenanceRecord( + origin_artifact_id="ia_001", + origin_path="wiki/channel-capacity.md", + support_kind="derived_from_page", + grounding_status="grounded", + ), + current_status="reviewed", + ) + ) + store.save_concept( + ConceptRecord( + concept_id="concept::channel-capacity", + title="Channel Capacity", + description="Reliable communication limit.", + source_artifact_ids=["ia_001"], + current_status="promoted", + ) + ) + store.save_concept( + ConceptRecord( + concept_id="concept::shannon-entropy", + title="Shannon Entropy", + description="Average uncertainty.", + current_status="promoted", + ) + ) + store.save_claim( + ClaimRecord( + claim_id="clm_001", + claim_text="Channel capacity bounds reliable communication rate.", + concept_ids=["concept::channel-capacity"], + source_observation_ids=["obs_001"], + confidence_hint=0.8, + review_confidence=0.9, + provenance=ProvenanceRecord( + origin_artifact_id="ia_001", + origin_path="wiki/channel-capacity.md", + support_kind="derived_from_page", + grounding_status="grounded", + ), + current_status="promoted", + ) + ) + store.save_relation( + RelationRecord( + relation_id="rel_001", + source_id="concept::channel-capacity", + target_id="concept::shannon-entropy", + relation_type="references", + current_status="promoted", + ) + ) + + +def test_assistant_adapter_registry_lists_known_adapters() -> None: + assert "codex" in list_assistant_adapters() + assert "claude_code" in list_assistant_adapters() + + +def test_codex_adapter_exports_skill_and_json_bundle(tmp_path: Path) -> None: + store = GroundRecallStore(tmp_path / "groundrecall") + _seed_store(store) + manifest = export_assistant_bundle(store.base_dir, "codex", tmp_path / "codex", concept_refs=["channel-capacity"]) + + assert (tmp_path / "codex" / "SKILL.md").exists() + assert (tmp_path / "codex" / "codex_bundle.json").exists() + assert (tmp_path / "codex" / "assistant_export_manifest.json").exists() + assert manifest["assistant"] == "codex" + + +def test_claude_code_adapter_exports_memory_and_json_bundle(tmp_path: Path) -> None: + store = GroundRecallStore(tmp_path / "groundrecall") + _seed_store(store) + manifest = export_assistant_bundle(store.base_dir, "claude_code", tmp_path / "claude", concept_refs=["channel-capacity"]) + + assert (tmp_path / "claude" / "CLAUDE.md").exists() + assert (tmp_path / "claude" / "claude_code_bundle.json").exists() + assert manifest["assistant"] == "claude_code" + + +def test_adapter_contexts_are_derived_from_assistant_neutral_query_bundles(tmp_path: Path) -> None: + store = GroundRecallStore(tmp_path / "groundrecall") + _seed_store(store) + query_bundle = build_query_bundle_for_concept(store.base_dir, "channel-capacity") + assert query_bundle is not None + + codex = get_assistant_adapter("codex") + claude = get_assistant_adapter("claude_code") + codex_context = codex.build_context(query_bundle) + claude_context = 
claude.build_context(query_bundle) + + assert codex_context["concept"]["concept_id"] == "concept::channel-capacity" + assert claude_context["concept"]["concept_id"] == "concept::channel-capacity" + assert codex_context["assistant"] == "codex" + assert claude_context["assistant"] == "claude_code" + assert "relevant_claims" in codex_context + assert "claims" in claude_context diff --git a/tests/test_groundrecall_export.py b/tests/test_groundrecall_export.py new file mode 100644 index 0000000..4166b8f --- /dev/null +++ b/tests/test_groundrecall_export.py @@ -0,0 +1,136 @@ +from __future__ import annotations + +import json +from pathlib import Path + +from groundrecall.export import export_canonical_bundle, export_query_bundle +from groundrecall.models import ( + ArtifactRecord, + ClaimRecord, + ConceptRecord, + ObservationRecord, + ProvenanceRecord, + RelationRecord, + SourceRecord, +) +from groundrecall.store import GroundRecallStore + + +def _read_jsonl(path: Path) -> list[dict]: + text = path.read_text(encoding="utf-8").strip() + if not text: + return [] + return [json.loads(line) for line in text.splitlines()] + + +def _seed_store(store: GroundRecallStore) -> None: + store.save_source(SourceRecord(source_id="src_001", title="Source", current_status="promoted")) + store.save_artifact( + ArtifactRecord( + artifact_id="ia_001", + artifact_kind="compiled_page", + title="Channel Capacity", + path="wiki/channel-capacity.md", + current_status="reviewed", + ) + ) + store.save_observation( + ObservationRecord( + observation_id="obs_001", + artifact_id="ia_001", + role="claim", + text="Reliable communication rate is bounded by channel capacity.", + provenance=ProvenanceRecord( + origin_artifact_id="ia_001", + origin_path="wiki/channel-capacity.md", + support_kind="derived_from_page", + grounding_status="grounded", + ), + current_status="reviewed", + ) + ) + store.save_concept( + ConceptRecord( + concept_id="concept::channel-capacity", + title="Channel Capacity", + description="Reliable communication limit.", + source_artifact_ids=["ia_001"], + current_status="promoted", + ) + ) + store.save_concept( + ConceptRecord( + concept_id="concept::shannon-entropy", + title="Shannon Entropy", + description="Average uncertainty.", + current_status="promoted", + ) + ) + store.save_claim( + ClaimRecord( + claim_id="clm_001", + claim_text="Channel capacity bounds reliable communication rate.", + concept_ids=["concept::channel-capacity"], + source_observation_ids=["obs_001"], + confidence_hint=0.8, + review_confidence=0.9, + provenance=ProvenanceRecord( + origin_artifact_id="ia_001", + origin_path="wiki/channel-capacity.md", + support_kind="derived_from_page", + grounding_status="grounded", + ), + current_status="promoted", + ) + ) + store.save_relation( + RelationRecord( + relation_id="rel_001", + source_id="concept::channel-capacity", + target_id="concept::shannon-entropy", + relation_type="references", + current_status="promoted", + ) + ) + + +def test_export_canonical_bundle_writes_expected_files(tmp_path: Path) -> None: + store = GroundRecallStore(tmp_path / "groundrecall") + _seed_store(store) + + out_dir = tmp_path / "exports" + payload = export_canonical_bundle( + store_dir=store.base_dir, + out_dir=out_dir, + concept_refs=["channel-capacity"], + snapshot_id="snap_export_001", + ) + + assert (out_dir / "groundrecall_snapshot.json").exists() + assert (out_dir / "claims.jsonl").exists() + assert (out_dir / "concepts.jsonl").exists() + assert (out_dir / "relations.jsonl").exists() + assert (out_dir / 
"provenance_manifest.json").exists() + assert (out_dir / "export_manifest.json").exists() + assert (out_dir / "query_bundle__channel-capacity.json").exists() + + snapshot = json.loads((out_dir / "groundrecall_snapshot.json").read_text(encoding="utf-8")) + manifest = json.loads((out_dir / "export_manifest.json").read_text(encoding="utf-8")) + claims = _read_jsonl(out_dir / "claims.jsonl") + assert snapshot["snapshot_id"] == "snap_export_001" + assert manifest["export_kind"] == "canonical" + assert len(manifest["query_bundles"]) == 1 + assert claims[0]["claim_id"] == "clm_001" + assert payload["query_bundles"] + + +def test_export_query_bundle_is_assistant_neutral(tmp_path: Path) -> None: + store = GroundRecallStore(tmp_path / "groundrecall") + _seed_store(store) + + out_path = tmp_path / "bundle.json" + payload = export_query_bundle(store.base_dir, "channel capacity", out_path) + assert out_path.exists() + assert payload["bundle_kind"] == "groundrecall_query_bundle" + forbidden = {"assistant", "codex", "claude", "prompt_text"} + assert set(payload).isdisjoint(forbidden) diff --git a/tests/test_groundrecall_import.py b/tests/test_groundrecall_import.py new file mode 100644 index 0000000..2fe12cb --- /dev/null +++ b/tests/test_groundrecall_import.py @@ -0,0 +1,161 @@ +from __future__ import annotations + +import json +from pathlib import Path + +from groundrecall.ingest import run_groundrecall_import +from groundrecall.lint import lint_import_directory + + +def _read_jsonl(path: Path) -> list[dict]: + text = path.read_text(encoding="utf-8").strip() + if not text: + return [] + return [json.loads(line) for line in text.splitlines()] + + +def test_groundrecall_import_emits_normalized_artifacts(tmp_path: Path) -> None: + root = tmp_path / "llmwiki" + (root / "wiki").mkdir(parents=True) + (root / "raw").mkdir() + (root / "logs").mkdir() + + (root / "wiki" / "channel-capacity.md").write_text( + "# Channel Capacity\n\n" + "- Reliable rate upper bound for a noisy channel.\n\n" + "See also [[Shannon Entropy]].\n", + encoding="utf-8", + ) + (root / "raw" / "notes.md").write_text( + "Speculation: Capacity may depend on constraints.\n", + encoding="utf-8", + ) + (root / "logs" / "session.log").write_text( + "Learner asked about entropy and communication limits.\n", + encoding="utf-8", + ) + + result = run_groundrecall_import(root, mode="quick", import_id="import-test") + + assert result.out_dir == root / "imports" / "import-test" + manifest = json.loads((result.out_dir / "manifest.json").read_text(encoding="utf-8")) + assert manifest["source_repo_kind"] == "llmwiki" + assert manifest["artifact_count"] == 3 + assert manifest["claim_count"] >= 1 + + artifacts = _read_jsonl(result.out_dir / "artifacts.jsonl") + assert {item["artifact_kind"] for item in artifacts} == {"compiled_page", "raw_note", "session_log"} + + claims = _read_jsonl(result.out_dir / "claims.jsonl") + assert any("Reliable rate upper bound" in item["claim_text"] for item in claims) + + concepts = _read_jsonl(result.out_dir / "concepts.jsonl") + concept_ids = {item["concept_id"] for item in concepts} + assert "concept::channel-capacity" in concept_ids + assert "concept::shannon-entropy" in concept_ids + + relations = _read_jsonl(result.out_dir / "relations.jsonl") + assert any(item["target_id"] == "concept::shannon-entropy" for item in relations) + + lint_payload = json.loads((result.out_dir / "lint_findings.json").read_text(encoding="utf-8")) + assert "summary" in lint_payload + assert lint_payload["summary"]["warning_count"] >= 0 + + 
review_queue = json.loads((result.out_dir / "review_queue.json").read_text(encoding="utf-8"))
+    assert review_queue["queue_length"] >= 1
+    assert any(item["candidate_type"] == "claim" for item in review_queue["items"])
+    review_session = json.loads((result.out_dir / "review_session.json").read_text(encoding="utf-8"))
+    assert review_session["reviewer"] == "GroundRecall Import"
+    assert review_session["draft_pack"]["pack"]["source_import_id"] == "import-test"
+    assert any(item["concept_id"] == "channel-capacity" for item in review_session["draft_pack"]["concepts"])
+    review_data = json.loads((result.out_dir / "review_data.json").read_text(encoding="utf-8"))
+    assert review_data["reviewer"] == "GroundRecall Import"
+    assert "field_specs" in review_data
+    assert any(item["field"] == "status" for item in review_data["field_specs"])
+    assert "review_guidance" in review_data
+    assert "concept_reviews" in review_data
+    assert "citations" in review_data
+    assert "citation_reviews" in review_data
+
+
+def test_groundrecall_import_parses_explicit_claim_relations(tmp_path: Path) -> None:
+    root = tmp_path / "llmwiki"
+    (root / "wiki").mkdir(parents=True)
+    (root / "wiki" / "notes.md").write_text(
+        "# Notes\n\n"
+        "- [claim_id: base] Channel capacity bounds reliable communication rate.\n"
+        "- [claim_id: revised] [supersedes: base] Channel capacity bounds reliable communication rate for a specified channel model.\n"
+        "- [claim_id: dissent] [contradicts: revised] Channel capacity has no stable interpretation.\n",
+        encoding="utf-8",
+    )
+
+    result = run_groundrecall_import(root, mode="quick", import_id="relations-test")
+    claims = _read_jsonl(result.out_dir / "claims.jsonl")
+    by_id = {item["claim_id"]: item for item in claims}
+
+    assert "clm_base" in by_id
+    assert by_id["clm_revised"]["supersedes_claim_ids"] == ["clm_base"]
+    assert by_id["clm_dissent"]["contradicts_claim_ids"] == ["clm_revised"]
+
+    lint_payload = json.loads((result.out_dir / "lint_findings.json").read_text(encoding="utf-8"))
+    codes = {item["code"] for item in lint_payload["findings"]}
+    assert "unresolved_supersession_ref" not in codes
+    assert "unresolved_contradiction_ref" not in codes
+
+
+def test_groundrecall_lint_flags_orphan_concepts(tmp_path: Path) -> None:
+    # Wiki links such as [[Missing Concept]] auto-create stub concepts during
+    # import, so a normal import cannot yield a dangling relation target; that
+    # case is exercised by the hand-built fixture in the next test.
+    root = tmp_path / "llmwiki"
+    (root / "wiki").mkdir(parents=True)
+    (root / "wiki" / "solo.md").write_text(
+        "# Solo Concept\n",
+        encoding="utf-8",
+    )
+    (root / "wiki" / "broken.md").write_text(
+        "# Broken\n\nSee also [[Missing Concept]].\n",
+        encoding="utf-8",
+    )
+
+    result = run_groundrecall_import(root, mode="quick", import_id="lint-test")
+    lint_payload = json.loads((result.out_dir / "lint_findings.json").read_text(encoding="utf-8"))
+    codes = {item["code"] for item in lint_payload["findings"]}
+    assert "orphan_concept" in codes
+
+
+def test_groundrecall_lint_detects_relation_missing_target(tmp_path: Path) -> None:
+    import_dir = tmp_path / "imports" / "broken-import"
+    import_dir.mkdir(parents=True)
+    (import_dir / "manifest.json").write_text(
+        json.dumps({"import_id": "broken-import", "import_mode": "quick"}),
+        encoding="utf-8",
+    )
+    (import_dir / "artifacts.jsonl").write_text("", encoding="utf-8")
+    (import_dir / "observations.jsonl").write_text("", encoding="utf-8")
+    (import_dir / "claims.jsonl").write_text("", encoding="utf-8")
+    (import_dir / "concepts.jsonl").write_text(
+        json.dumps(
+            {
+                "concept_id": "concept::existing",
+                "title": "Existing",
+                "current_status": "triaged",
+            }
+        )
+        + "\n",
+        encoding="utf-8",
+    )
+    
(import_dir / "relations.jsonl").write_text( + json.dumps( + { + "relation_id": "rel_1", + "source_id": "concept::existing", + "target_id": "concept::missing", + "relation_type": "references", + "current_status": "draft", + } + ) + + "\n", + encoding="utf-8", + ) + + payload = lint_import_directory(import_dir) + codes = {item["code"] for item in payload["findings"]} + assert "relation_missing_target" in codes diff --git a/tests/test_groundrecall_namespace.py b/tests/test_groundrecall_namespace.py new file mode 100644 index 0000000..4942a2b --- /dev/null +++ b/tests/test_groundrecall_namespace.py @@ -0,0 +1,70 @@ +import sys +from pathlib import Path + +from groundrecall.cli import main as groundrecall_cli_main +from groundrecall.export import export_canonical_bundle +from groundrecall.ingest import run_groundrecall_import +from groundrecall.inspect import inspect_store +from groundrecall.models import ClaimRecord +from groundrecall.query import query_concept +from groundrecall.store import GroundRecallStore +from groundrecall.lint import lint_import_directory +from groundrecall.promotion import promote_import_to_store + + +def _build_llmwiki_fixture(root: Path) -> Path: + (root / "wiki").mkdir(parents=True) + (root / "raw").mkdir() + (root / "wiki" / "channel-capacity.md").write_text( + "# Channel Capacity\n\n" + "- Reliable rate upper bound for a noisy channel.\n\n" + "See also [[Shannon Entropy]].\n", + encoding="utf-8", + ) + (root / "raw" / "notes.md").write_text( + "Speculation: Capacity may depend on constraints.\n", + encoding="utf-8", + ) + return root + + +def test_groundrecall_namespace_reexports_core_functions() -> None: + assert run_groundrecall_import.__module__ == "groundrecall.ingest" + assert query_concept.__module__ == "groundrecall.query" + assert export_canonical_bundle.__module__ == "groundrecall.export" + assert lint_import_directory.__module__ == "groundrecall.lint" + assert promote_import_to_store.__module__ == "groundrecall.promotion" + assert GroundRecallStore.__module__ == "groundrecall.store" + assert ClaimRecord.__module__ == "groundrecall.models" + + +def test_groundrecall_inspect_summarizes_store(tmp_path: Path) -> None: + source_root = _build_llmwiki_fixture(tmp_path / "llmwiki") + import_result = run_groundrecall_import(source_root, out_root=tmp_path / "imports", mode="quick", import_id="fixture-import") + store_dir = tmp_path / "store" + promote_import_to_store(import_result.out_dir, store_dir) + + payload = inspect_store(store_dir, out_path=tmp_path / "inspect.json") + + assert (tmp_path / "inspect.json").exists() + assert payload["claim_count"] >= 1 + assert payload["concept_count"] >= 1 + assert payload["snapshot_count"] >= 1 + + +def test_groundrecall_cli_inspect_dispatches(tmp_path: Path, capsys) -> None: + source_root = _build_llmwiki_fixture(tmp_path / "llmwiki") + import_result = run_groundrecall_import(source_root, out_root=tmp_path / "imports", mode="quick", import_id="fixture-import") + store_dir = tmp_path / "store" + promote_import_to_store(import_result.out_dir, store_dir) + + original_argv = sys.argv + try: + sys.argv = ["groundrecall.cli", "inspect", str(store_dir)] + groundrecall_cli_main() + finally: + sys.argv = original_argv + + output = capsys.readouterr().out + assert '"claim_count"' in output + assert '"concept_count"' in output diff --git a/tests/test_groundrecall_promotion.py b/tests/test_groundrecall_promotion.py new file mode 100644 index 0000000..4e05a00 --- /dev/null +++ b/tests/test_groundrecall_promotion.py @@ -0,0 +1,96 @@ 
+from __future__ import annotations + +import json +from pathlib import Path + +from groundrecall.ingest import run_groundrecall_import +from groundrecall.promotion import promote_import_to_store +from groundrecall.store import GroundRecallStore + + +def test_groundrecall_promotion_writes_canonical_objects(tmp_path: Path) -> None: + root = tmp_path / "llmwiki" + (root / "wiki").mkdir(parents=True) + (root / "wiki" / "channel-capacity.md").write_text( + "# Channel Capacity\n\n" + "- Reliable rate upper bound for a noisy channel.\n\n" + "See also [[Shannon Entropy]].\n", + encoding="utf-8", + ) + + result = run_groundrecall_import(root, mode="quick", import_id="promote-test") + review_path = result.out_dir / "review_session.json" + review_payload = json.loads(review_path.read_text(encoding="utf-8")) + for concept in review_payload["draft_pack"]["concepts"]: + concept["status"] = "trusted" + review_path.write_text(json.dumps(review_payload, indent=2), encoding="utf-8") + + store_dir = tmp_path / "groundrecall-store" + payload = promote_import_to_store(result.out_dir, store_dir, reviewer="R") + + store = GroundRecallStore(store_dir) + concepts = store.list_concepts() + claims = store.list_claims() + relations = store.list_relations() + promotions = store.list_promotions() + snapshots = store.list_snapshots() + + assert payload["promoted_concept_count"] >= 1 + assert payload["promoted_claim_count"] >= 1 + assert len(concepts) >= 2 + assert any(item.current_status == "promoted" for item in concepts) + assert any(item.current_status == "promoted" for item in claims) + assert len(relations) >= 1 + assert len(promotions) == 1 + assert promotions[0].reviewer == "R" + assert len(snapshots) == 1 + assert snapshots[0].metadata["source_import_id"] == "promote-test" + + +def test_groundrecall_promotion_respects_rejected_review_status(tmp_path: Path) -> None: + root = tmp_path / "llmwiki" + (root / "wiki").mkdir(parents=True) + (root / "wiki" / "solo.md").write_text( + "# Solo Concept\n\n- A solitary claim.\n", + encoding="utf-8", + ) + + result = run_groundrecall_import(root, mode="quick", import_id="reject-test") + review_path = result.out_dir / "review_session.json" + review_payload = json.loads(review_path.read_text(encoding="utf-8")) + review_payload["draft_pack"]["concepts"][0]["status"] = "rejected" + review_path.write_text(json.dumps(review_payload, indent=2), encoding="utf-8") + + store_dir = tmp_path / "groundrecall-store" + promote_import_to_store(result.out_dir, store_dir, reviewer="R") + + store = GroundRecallStore(store_dir) + assert store.list_concepts()[0].current_status == "rejected" + assert store.list_claims()[0].current_status == "rejected" + + +def test_groundrecall_promotion_preserves_contradiction_and_supersession_links(tmp_path: Path) -> None: + root = tmp_path / "llmwiki" + (root / "wiki").mkdir(parents=True) + (root / "wiki" / "notes.md").write_text( + "# Notes\n\n" + "- [claim_id: base] Channel capacity bounds reliable communication rate.\n" + "- [claim_id: revised] [supersedes: base] Channel capacity bounds reliable communication rate for a specified channel model.\n" + "- [claim_id: dissent] [contradicts: revised] Channel capacity has no stable interpretation.\n", + encoding="utf-8", + ) + + result = run_groundrecall_import(root, mode="quick", import_id="graph-test") + review_path = result.out_dir / "review_session.json" + review_payload = json.loads(review_path.read_text(encoding="utf-8")) + for concept in review_payload["draft_pack"]["concepts"]: + concept["status"] = 
"trusted" + review_path.write_text(json.dumps(review_payload, indent=2), encoding="utf-8") + + store_dir = tmp_path / "groundrecall-store" + promote_import_to_store(result.out_dir, store_dir, reviewer="R") + + store = GroundRecallStore(store_dir) + claims = {item.claim_id: item for item in store.list_claims()} + assert claims["clm_revised"].supersedes_claim_ids == ["clm_base"] + assert claims["clm_dissent"].contradicts_claim_ids == ["clm_revised"] diff --git a/tests/test_groundrecall_query.py b/tests/test_groundrecall_query.py new file mode 100644 index 0000000..f405a5f --- /dev/null +++ b/tests/test_groundrecall_query.py @@ -0,0 +1,190 @@ +from __future__ import annotations + +from pathlib import Path + +from groundrecall.models import ( + ArtifactRecord, + ClaimRecord, + ConceptRecord, + ObservationRecord, + ProvenanceRecord, + RelationRecord, +) +from groundrecall.query import ( + build_query_bundle_for_concept, + query_concept, + query_provenance, + search_claims, +) +from groundrecall.store import GroundRecallStore + + +def _seed_store(store: GroundRecallStore) -> None: + store.save_artifact( + ArtifactRecord( + artifact_id="ia_001", + artifact_kind="compiled_page", + title="Channel Capacity", + path="wiki/channel-capacity.md", + current_status="reviewed", + ) + ) + store.save_observation( + ObservationRecord( + observation_id="obs_001", + artifact_id="ia_001", + role="claim", + text="Reliable communication rate is bounded by channel capacity.", + provenance=ProvenanceRecord( + origin_artifact_id="ia_001", + origin_path="wiki/channel-capacity.md", + support_kind="derived_from_page", + grounding_status="grounded", + ), + current_status="reviewed", + ) + ) + store.save_concept( + ConceptRecord( + concept_id="concept::channel-capacity", + title="Channel Capacity", + description="Reliable communication limit.", + source_artifact_ids=["ia_001"], + current_status="promoted", + ) + ) + store.save_concept( + ConceptRecord( + concept_id="concept::shannon-entropy", + title="Shannon Entropy", + description="Average uncertainty.", + current_status="promoted", + ) + ) + store.save_claim( + ClaimRecord( + claim_id="clm_001", + claim_text="Channel capacity bounds reliable communication rate.", + concept_ids=["concept::channel-capacity"], + source_observation_ids=["obs_001"], + confidence_hint=0.8, + review_confidence=0.9, + provenance=ProvenanceRecord( + origin_artifact_id="ia_001", + origin_path="wiki/channel-capacity.md", + support_kind="derived_from_page", + grounding_status="grounded", + ), + current_status="promoted", + ) + ) + store.save_claim( + ClaimRecord( + claim_id="clm_002", + claim_text="Shannon entropy can inform channel coding intuition.", + concept_ids=["concept::shannon-entropy"], + contradicts_claim_ids=["clm_999"], + provenance=ProvenanceRecord( + origin_artifact_id="ia_001", + origin_path="wiki/channel-capacity.md", + support_kind="derived_from_page", + grounding_status="partially_grounded", + ), + current_status="reviewed", + ) + ) + store.save_relation( + RelationRecord( + relation_id="rel_001", + source_id="concept::channel-capacity", + target_id="concept::shannon-entropy", + relation_type="references", + current_status="promoted", + ) + ) + + +def test_query_concept_returns_neighborhood_and_support(tmp_path: Path) -> None: + store = GroundRecallStore(tmp_path / "groundrecall") + _seed_store(store) + + payload = query_concept(store.base_dir, "channel-capacity") + assert payload is not None + assert payload["concept"]["concept_id"] == "concept::channel-capacity" + assert 
len(payload["claims"]) == 1 + assert len(payload["relations"]) == 1 + assert any(item["concept_id"] == "concept::shannon-entropy" for item in payload["related_concepts"]) + assert payload["supporting_observations"][0]["origin_path"] == "wiki/channel-capacity.md" + + +def test_search_claims_matches_text_and_concept_titles(tmp_path: Path) -> None: + store = GroundRecallStore(tmp_path / "groundrecall") + _seed_store(store) + + payload = search_claims(store.base_dir, "entropy") + assert payload["query_type"] == "claim_search" + assert any(match["claim"]["claim_id"] == "clm_002" for match in payload["matches"]) + + +def test_query_provenance_filters_by_origin_path(tmp_path: Path) -> None: + store = GroundRecallStore(tmp_path / "groundrecall") + _seed_store(store) + + payload = query_provenance(store.base_dir, origin_path="wiki/channel-capacity.md") + assert len(payload["claims"]) == 2 + assert len(payload["observations"]) == 1 + + +def test_build_query_bundle_for_concept_is_assistant_neutral(tmp_path: Path) -> None: + store = GroundRecallStore(tmp_path / "groundrecall") + _seed_store(store) + + payload = build_query_bundle_for_concept(store.base_dir, "channel capacity") + assert payload is not None + assert payload["bundle_kind"] == "groundrecall_query_bundle" + assert payload["concept"]["concept_id"] == "concept::channel-capacity" + assert isinstance(payload["suggested_next_actions"], list) + forbidden = {"assistant", "codex", "claude", "prompt_text"} + assert set(payload).isdisjoint(forbidden) + + +def test_query_bundle_surfaces_contradictions_and_supersessions(tmp_path: Path) -> None: + store = GroundRecallStore(tmp_path / "groundrecall") + _seed_store(store) + store.save_claim( + ClaimRecord( + claim_id="clm_003", + claim_text="Channel capacity is undefined in practice.", + concept_ids=["concept::channel-capacity"], + contradicts_claim_ids=["clm_001"], + provenance=ProvenanceRecord( + origin_artifact_id="ia_001", + origin_path="wiki/channel-capacity.md", + support_kind="derived_from_page", + grounding_status="partially_grounded", + ), + current_status="reviewed", + ) + ) + store.save_claim( + ClaimRecord( + claim_id="clm_004", + claim_text="Channel capacity should be interpreted relative to a specific channel model.", + concept_ids=["concept::channel-capacity"], + supersedes_claim_ids=["clm_001"], + provenance=ProvenanceRecord( + origin_artifact_id="ia_001", + origin_path="wiki/channel-capacity.md", + support_kind="derived_from_page", + grounding_status="grounded", + ), + current_status="reviewed", + ) + ) + + payload = build_query_bundle_for_concept(store.base_dir, "channel-capacity") + assert payload is not None + contradiction_ids = {item["claim_id"] for item in payload["contradictions"]} + supersession_ids = {item["claim_id"] for item in payload["supersessions"]} + assert "clm_003" in contradiction_ids + assert "clm_004" in supersession_ids diff --git a/tests/test_groundrecall_review_server.py b/tests/test_groundrecall_review_server.py new file mode 100644 index 0000000..4f2e884 --- /dev/null +++ b/tests/test_groundrecall_review_server.py @@ -0,0 +1,60 @@ +from __future__ import annotations + +from types import SimpleNamespace + +from groundrecall.review_server import _safe_show_entry, _safe_verify_entry + + +class _StoreWithoutConflicts: + def get_entry(self, citation_key: str): + if citation_key != "baum1974generalized": + return None + return {"citation_key": citation_key, "title": "On two types of deviation"} + + def get_field_provenance(self, citation_key: str): + return 
[{"field_name": "title", "source_label": "refs.bib"}] + + def get_entry_bibtex(self, citation_key: str): + return "@article{baum1974generalized, title={On two types of deviation}}" + + +class _ApiWithPartialSupport: + def __init__(self): + self.store = _StoreWithoutConflicts() + + def show_entry(self, citation_key: str, **kwargs): + raise AttributeError("get_conflicts missing in underlying store") + + def verify_bibtex(self, bibtex_text: str, *, context: str = "", limit: int = 5): + raise RuntimeError("pybtex unavailable") + + def verify_strings(self, values: list[str], *, context: str = "", limit: int = 5): + return {"context": context, "results": [{"values": values, "limit": limit}]} + + +def test_safe_show_entry_falls_back_when_citegeist_show_entry_is_incompatible() -> None: + api = _ApiWithPartialSupport() + + payload = _safe_show_entry(api, "baum1974generalized") + + assert payload is not None + assert payload["citation_key"] == "baum1974generalized" + assert payload["conflicts"] == [] + assert payload["provenance"][0]["source_label"] == "refs.bib" + assert "bibtex" in payload + + +def test_safe_verify_entry_falls_back_to_verify_strings() -> None: + api = _ApiWithPartialSupport() + entry = SimpleNamespace( + citation_key="baum1974generalized", + title="On two types of deviation", + author="W. M. Baum", + year="1974", + raw_bibtex="@article{baum1974generalized, title={On two types of deviation}}", + ) + + payload = _safe_verify_entry(api, entry, context="pieces/intro.tex Intro", limit=5) + + assert payload["results"] + assert payload["results"][0]["values"][0] == "baum1974generalized" diff --git a/tests/test_groundrecall_review_workspace.py b/tests/test_groundrecall_review_workspace.py new file mode 100644 index 0000000..9cef3ca --- /dev/null +++ b/tests/test_groundrecall_review_workspace.py @@ -0,0 +1,86 @@ +from __future__ import annotations + +import json +from pathlib import Path + +from groundrecall.ingest import run_groundrecall_import +from groundrecall.review_workspace import GroundRecallReviewWorkspace + + +def _build_citation_fixture(root: Path) -> Path: + (root / "wiki").mkdir(parents=True) + (root / "wiki" / "learning-theory.md").write_text( + "# Learning Theory\n\n" + "Matching-law style regularities can be compared with machine learning optimization.\n\n" + "See \\\\cite{herrnstein1961matching} for the classic framing.\n", + encoding="utf-8", + ) + return root + + +def test_review_workspace_populates_and_persists_citation_reviews(tmp_path: Path) -> None: + source_root = _build_citation_fixture(tmp_path / "llmwiki") + import_result = run_groundrecall_import(source_root, out_root=tmp_path / "imports", mode="quick", import_id="review-fixture") + + workspace = GroundRecallReviewWorkspace(import_result.out_dir) + payload = workspace.load_review_data() + assert payload["citation_reviews"] + citation_review_id = payload["citation_reviews"][0]["citation_review_id"] + + workspace.apply_updates( + concept_updates=[ + { + "concept_id": "learning-theory", + "status": "trusted", + "notes": ["Strong framing concept.", "Citation support looks plausible."], + } + ], + citation_updates=[ + { + "citation_review_id": citation_review_id, + "status": "verified", + "notes": ["Classic matching-law citation."], + } + ], + reviewer="Unit Test Reviewer", + ) + + session = json.loads((import_result.out_dir / "review_session.json").read_text(encoding="utf-8")) + concept = next(item for item in session["draft_pack"]["concepts"] if item["concept_id"] == "learning-theory") + citation = next(item for 
item in session["citation_reviews"] if item["citation_review_id"] == citation_review_id) + + assert session["reviewer"] == "Unit Test Reviewer" + assert concept["status"] == "trusted" + assert citation["status"] == "verified" + + review_data = json.loads((import_result.out_dir / "review_data.json").read_text(encoding="utf-8")) + assert any(item["citation_review_id"] == citation_review_id for item in review_data["citation_reviews"]) + + +def test_review_workspace_resolves_citation_metadata_from_bibtex(tmp_path: Path) -> None: + root = tmp_path / "llmwiki" + (root / "wiki").mkdir(parents=True) + (root / "wiki" / "matching.md").write_text( + "# Matching\n\n" + "The manuscript cites \\\\cite{baum1974generalized} here.\n", + encoding="utf-8", + ) + (root / "refs.bib").write_text( + "@article{baum1974generalized,\n" + " author = {W. M. Baum},\n" + " title = {On two types of deviation from the matching law: Bias and undermatching},\n" + " journal = {Journal of the Experimental Analysis of Behavior},\n" + " year = {1974}\n" + "}\n", + encoding="utf-8", + ) + + import_result = run_groundrecall_import(root, out_root=tmp_path / "imports", mode="quick", import_id="bib-fixture") + workspace = GroundRecallReviewWorkspace(import_result.out_dir) + payload = workspace.load_review_data() + + entry = next(item for item in payload["citation_reviews"] if item["citation_key"] == "baum1974generalized") + assert entry["title"] == "On two types of deviation from the matching law: Bias and undermatching" + assert entry["source_bib_path"] == "refs.bib" + assert entry["raw_bibtex"] + assert payload["bibliography"]["entry_count"] >= 1 diff --git a/tests/test_groundrecall_source_adapters.py b/tests/test_groundrecall_source_adapters.py new file mode 100644 index 0000000..01f4ca4 --- /dev/null +++ b/tests/test_groundrecall_source_adapters.py @@ -0,0 +1,235 @@ +from __future__ import annotations + +from pathlib import Path + +import groundrecall.ingest as ingest_module +import groundrecall.source_adapters # noqa: F401 +from groundrecall.source_adapters.base import detect_source_adapter, list_source_adapters +from groundrecall.ingest import run_groundrecall_import + + +def test_groundrecall_source_adapter_registry_lists_expected_adapters() -> None: + names = set(list_source_adapters()) + assert "llmwiki" in names + assert "polypaper" in names + assert "markdown_notes" in names + assert "transcript" in names + assert "didactopus_pack" in names + assert "doclift_bundle" in names + + +def test_detect_llmwiki_adapter(tmp_path: Path) -> None: + (tmp_path / "wiki").mkdir() + adapter = detect_source_adapter(tmp_path) + assert adapter.name == "llmwiki" + assert adapter.import_intent() == "grounded_knowledge" + + +def test_detect_didactopus_pack_adapter(tmp_path: Path) -> None: + (tmp_path / "pack.yaml").write_text("name: p\n", encoding="utf-8") + (tmp_path / "concepts.yaml").write_text("concepts: []\n", encoding="utf-8") + adapter = detect_source_adapter(tmp_path) + assert adapter.name == "didactopus_pack" + assert adapter.import_intent() == "both" + + +def test_detect_doclift_bundle_adapter(tmp_path: Path) -> None: + (tmp_path / "documents").mkdir() + (tmp_path / "manifest.json").write_text('{"documents": []}\n', encoding="utf-8") + adapter = detect_source_adapter(tmp_path) + assert adapter.name == "doclift_bundle" + assert adapter.import_intent() == "both" + + +def test_groundrecall_import_records_adapter_and_intent(tmp_path: Path) -> None: + (tmp_path / "wiki").mkdir() + (tmp_path / "wiki" / "note.md").write_text("# Title\n\n- 
A note.\n", encoding="utf-8") + result = run_groundrecall_import(tmp_path, mode="quick", import_id="adapter-test") + assert result.manifest["source_adapter"] == "llmwiki" + assert result.manifest["import_intent"] == "grounded_knowledge" + + +def test_markdown_notes_adapter_ingests_tex_files(tmp_path: Path) -> None: + (tmp_path / "draft.tex").write_text( + "\\section{Related Work}\n\n" + "We connect behaviorism and language models.\n", + encoding="utf-8", + ) + + adapter = detect_source_adapter(tmp_path) + assert adapter.name == "markdown_notes" + + result = run_groundrecall_import(tmp_path, mode="quick", import_id="tex-test") + assert result.manifest["source_adapter"] == "markdown_notes" + assert result.manifest["artifact_count"] == 1 + assert result.artifacts[0]["path"] == "draft.tex" + assert result.claims + + +def test_tex_import_uses_pandoc_markdown_when_available(tmp_path: Path, monkeypatch) -> None: + (tmp_path / "draft.tex").write_text( + "\\section{Ignored by fallback}\n" + "\\usepackage{amsmath}\n", + encoding="utf-8", + ) + + monkeypatch.setattr( + ingest_module, + "_convert_tex_to_markdown", + lambda path: "# Converted Draft\n\n- Converted claim from pandoc.\n", + ) + + result = run_groundrecall_import(tmp_path, mode="quick", import_id="tex-pandoc-test") + claim_texts = [item["claim_text"] for item in result.claims] + concept_ids = {item["concept_id"] for item in result.concepts} + + assert "Converted claim from pandoc." in claim_texts + assert "concept::converted-draft" in concept_ids + + +def test_detect_polypaper_adapter_and_exclude_support_files(tmp_path: Path) -> None: + (tmp_path / "pieces").mkdir() + (tmp_path / "figs").mkdir() + (tmp_path / "setup").mkdir() + (tmp_path / "main.tex").write_text( + "\\include{pieces/discussion}\n" + "\\include{pieces/table-results}\n" + "\\input{figs/figure-system}\n", + encoding="utf-8", + ) + (tmp_path / "paper.org").write_text("* draft\n", encoding="utf-8") + (tmp_path / "pieces" / "discussion.tex").write_text("\\section{Discussion}\n\nMore text.\n", encoding="utf-8") + (tmp_path / "pieces" / "table-results.tex").write_text("\\begin{tabular}x\\end{tabular}\n", encoding="utf-8") + (tmp_path / "pieces" / "unused.tex").write_text("\\section{Unused}\n\nIgnore me.\n", encoding="utf-8") + (tmp_path / "figs" / "figure-system.tex").write_text("\\begin{figure}x\\end{figure}\n", encoding="utf-8") + (tmp_path / "setup" / "venue-arxiv.tex").write_text("\\section{Setup}\n", encoding="utf-8") + (tmp_path / ".pp-export-tmp.tex").write_text("\\section{Tmp}\n", encoding="utf-8") + + adapter = detect_source_adapter(tmp_path) + assert adapter.name == "polypaper" + + result = run_groundrecall_import(tmp_path, mode="quick", import_id="polypaper-test") + paths = {item["path"] for item in result.artifacts} + assert "main.tex" not in paths + assert "pieces/discussion.tex" in paths + assert "pieces/table-results.tex" not in paths + assert "figs/figure-system.tex" not in paths + assert "pieces/unused.tex" not in paths + assert "setup/venue-arxiv.tex" not in paths + assert ".pp-export-tmp.tex" not in paths + + +def test_tex_import_skips_table_and_figure_markup_from_pandoc(tmp_path: Path, monkeypatch) -> None: + (tmp_path / "draft.tex").write_text("\\section{Draft}\n", encoding="utf-8") + + monkeypatch.setattr( + ingest_module, + "_convert_tex_to_markdown", + lambda path: "\n".join( + [ + "# Draft", + "", + "![image](figure.png)", + "| Col A | Col B |", + "| --- | --- |", + "| 1 | 2 |", + "
    ", + "\\begin{tabular}{ll}", + "- Real manuscript claim.", + ] + ), + ) + + result = run_groundrecall_import(tmp_path, mode="quick", import_id="tex-cleanup-test") + claim_texts = [item["claim_text"] for item in result.claims] + + assert claim_texts == ["Real manuscript claim."] + + +def test_didactopus_pack_import_generates_structured_concepts_and_relations(tmp_path: Path) -> None: + (tmp_path / "pack.yaml").write_text( + "\n".join( + [ + "name: sample-pack", + "display_name: Sample Pack", + "version: 0.1.0", + "schema_version: 0.1.0", + "didactopus_min_version: 0.1.0", + "didactopus_max_version: 9.9.9", + ] + ), + encoding="utf-8", + ) + (tmp_path / "concepts.yaml").write_text( + "\n".join( + [ + "concepts:", + " - id: basics", + " title: Basics", + " description: Foundational concept.", + " mastery_signals: [Explain the foundation.]", + " - id: advanced", + " title: Advanced", + " description: Builds on basics.", + " prerequisites: [basics]", + ] + ), + encoding="utf-8", + ) + (tmp_path / "roadmap.yaml").write_text( + "\n".join( + [ + "stages:", + " - id: stage1", + " title: Stage One", + " concepts: [basics, advanced]", + ] + ), + encoding="utf-8", + ) + + result = run_groundrecall_import(tmp_path, mode="quick", import_id="pack-test") + assert result.manifest["source_adapter"] == "didactopus_pack" + assert result.manifest["import_intent"] == "both" + concept_ids = {item["concept_id"] for item in result.concepts} + assert "concept::basics" in concept_ids + assert "concept::advanced" in concept_ids + relation_targets = {(item["source_id"], item["target_id"], item["relation_type"]) for item in result.relations} + assert ("concept::basics", "concept::advanced", "prerequisite") in relation_targets + claim_ids = {item["claim_id"] for item in result.claims} + assert "clm_pack_basics" in claim_ids + assert "clm_stage_stage1_basics" in claim_ids + + +def test_doclift_bundle_import_generates_structured_concepts(tmp_path: Path) -> None: + doc_dir = tmp_path / "documents" / "lesson-a" + doc_dir.mkdir(parents=True) + (tmp_path / "manifest.json").write_text( + '\n'.join( + [ + "{", + ' "documents": [', + " {", + ' "document_id": "lesson-a",', + ' "title": "Lecture 1. Example",', + ' "document_kind": "lecture",', + f' "output_dir": "{doc_dir}",', + f' "markdown_path": "{doc_dir / "document.md"}",', + f' "figures_path": "{doc_dir / "document.figures.json"}"', + " }", + " ]", + "}", + ] + ), + encoding="utf-8", + ) + (doc_dir / "document.md").write_text("# Lecture 1. 
Example\n\nBody.\n", encoding="utf-8") + (doc_dir / "document.figures.json").write_text('{"source_path": "/tmp/source.doc"}\n', encoding="utf-8") + + result = run_groundrecall_import(tmp_path, mode="quick", import_id="doclift-test") + assert result.manifest["source_adapter"] == "doclift_bundle" + assert result.manifest["import_intent"] == "both" + concept_ids = {item["concept_id"] for item in result.concepts} + assert "concept::lesson-a" in concept_ids + claim_ids = {item["claim_id"] for item in result.claims} + assert "clm_doclift_1" in claim_ids diff --git a/tests/test_groundrecall_store.py b/tests/test_groundrecall_store.py new file mode 100644 index 0000000..2c7ba43 --- /dev/null +++ b/tests/test_groundrecall_store.py @@ -0,0 +1,148 @@ +from __future__ import annotations + +import json +from pathlib import Path + +from groundrecall.models import ( + ClaimRecord, + ConceptRecord, + GroundRecallSnapshot, + PromotionRecord, + ProvenanceRecord, + RelationRecord, + ReviewCandidateRecord, + SourceRecord, +) +from groundrecall.store import GroundRecallStore + + +def test_groundrecall_store_round_trips_canonical_objects(tmp_path: Path) -> None: + store = GroundRecallStore(tmp_path / "groundrecall") + + source = store.save_source( + SourceRecord( + source_id="src_001", + title="Channel Notes", + source_type="markdown", + path="wiki/channel-capacity.md", + current_status="promoted", + ) + ) + claim = store.save_claim( + ClaimRecord( + claim_id="clm_001", + claim_text="Channel capacity bounds reliable communication rate.", + claim_kind="definition", + concept_ids=["concept::channel-capacity"], + confidence_hint=0.72, + provenance=ProvenanceRecord( + origin_artifact_id="ia_001", + origin_path="wiki/channel-capacity.md", + support_kind="derived_from_page", + grounding_status="partially_grounded", + ), + current_status="reviewed", + ) + ) + concept = store.save_concept( + ConceptRecord( + concept_id="concept::channel-capacity", + title="Channel Capacity", + description="Imported concept.", + current_status="promoted", + ) + ) + relation = store.save_relation( + RelationRecord( + relation_id="rel_001", + source_id="concept::channel-capacity", + target_id="concept::shannon-entropy", + relation_type="references", + current_status="draft", + ) + ) + review_candidate = store.save_review_candidate( + ReviewCandidateRecord( + review_candidate_id="rc_001", + candidate_type="claim", + candidate_id="clm_001", + triage_lane="knowledge_capture", + priority=10, + current_status="triaged", + ) + ) + promotion = store.save_promotion( + PromotionRecord( + promotion_id="pr_001", + candidate_type="claim", + candidate_id="clm_001", + reviewer="R", + promoted_object_ids=["clm_001"], + promoted_at="2026-04-17T12:00:00Z", + ) + ) + + assert store.get_source(source.source_id) is not None + assert store.get_claim(claim.claim_id) is not None + assert store.get_concept(concept.concept_id) is not None + assert store.get_relation(relation.relation_id) is not None + assert store.get_review_candidate(review_candidate.review_candidate_id) is not None + assert store.get_promotion(promotion.promotion_id) is not None + + +def test_groundrecall_store_builds_and_persists_snapshot(tmp_path: Path) -> None: + store = GroundRecallStore(tmp_path / "groundrecall") + store.save_source(SourceRecord(source_id="src_001", title="T", current_status="promoted")) + store.save_claim( + ClaimRecord( + claim_id="clm_001", + claim_text="A grounded claim.", + concept_ids=["concept::c1"], + current_status="promoted", + ) + ) + 
store.save_concept(ConceptRecord(concept_id="concept::c1", title="C1", current_status="promoted")) + + snapshot = store.build_snapshot( + snapshot_id="snap_001", + created_at="2026-04-17T12:00:00Z", + metadata={"export_kind": "canonical"}, + ) + saved = store.save_snapshot(snapshot) + + loaded = store.get_snapshot(saved.snapshot_id) + assert loaded is not None + assert isinstance(loaded, GroundRecallSnapshot) + assert loaded.metadata["export_kind"] == "canonical" + assert len(loaded.sources) == 1 + assert len(loaded.claims) == 1 + assert len(loaded.concepts) == 1 + + +def test_groundrecall_models_remain_assistant_neutral() -> None: + claim_fields = set(ClaimRecord.model_fields) + concept_fields = set(ConceptRecord.model_fields) + snapshot_fields = set(GroundRecallSnapshot.model_fields) + forbidden = {"assistant", "assistant_name", "codex", "claude", "skill_bundle", "prompt_text"} + + assert claim_fields.isdisjoint(forbidden) + assert concept_fields.isdisjoint(forbidden) + assert snapshot_fields.isdisjoint(forbidden) + + +def test_groundrecall_store_writes_json_atomically_without_tmp_artifacts(tmp_path: Path) -> None: + store = GroundRecallStore(tmp_path / "groundrecall") + + claim = store.save_claim( + ClaimRecord( + claim_id="clm_atomic", + claim_text="Atomic writes should leave valid JSON on disk.", + concept_ids=["concept::atomicity"], + current_status="reviewed", + ) + ) + + claim_path = store.claims_dir / f"{claim.claim_id}.json" + payload = json.loads(claim_path.read_text(encoding="utf-8")) + assert payload["claim_id"] == "clm_atomic" + assert list(store.claims_dir.glob("*.tmp")) == []
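+
+
+# The atomic-write test above only observes the on-disk outcome: valid JSON
+# and no leftover *.tmp files.  A minimal sketch of the write pattern that
+# outcome implies -- hypothetical helper name, not necessarily the store's
+# actual implementation:
+#
+#     def _write_json_atomically(target: Path, payload: dict) -> None:
+#         tmp = target.with_name(target.name + ".tmp")
+#         tmp.write_text(json.dumps(payload), encoding="utf-8")
+#         tmp.replace(target)  # os.replace semantics: atomic on POSIX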