diff --git a/README.md b/README.md index e718190..2a58740 100644 --- a/README.md +++ b/README.md @@ -2,21 +2,25 @@ **Didactopus** is a local-first AI-assisted autodidactic mastery platform. -This revision upgrades the evidence layer from simple averaging to a more realistic weighted and recency-aware mastery model. +This revision upgrades the evidence layer from a single weighted score to a **multi-dimensional mastery model**. ## Added in this revision -- evidence-type weighting -- recency weighting -- confidence estimation from weighted evidence mass -- dimension-level rubric storage -- weighted concept summaries -- mastery decisions using weighted score and confidence -- resurfacing from recent weak evidence -- tests for weighted scoring and recency behavior +- per-concept mastery dimensions: + - correctness + - explanation + - transfer + - project_execution + - critique +- weighted, recency-aware dimension summaries +- per-dimension mastery thresholds +- concept-level mastery determined from all required dimensions +- dimension-specific weakness reporting +- adaptive next-step selection informed by weak dimensions +- tests for multi-dimensional mastery promotion and partial weakness detection ## Why this matters -Not all evidence should count equally. +Real mastery is not one scalar. -A capstone project or transfer task should usually matter more than a short explanation, and recent poor performance should sometimes matter more than older success. This revision begins to model that explicitly. +A learner can be strong at routine correctness and still be weak at transfer, explanation, or critique. This revision lets Didactopus represent that distinction explicitly. 
diff --git a/configs/config.example.yaml b/configs/config.example.yaml index 6e53c12..ff7dd24 100644 --- a/configs/config.example.yaml +++ b/configs/config.example.yaml @@ -14,7 +14,6 @@ platform: verification_required: true require_learner_explanations: true permit_direct_answers: false - mastery_threshold: 0.8 resurfacing_threshold: 0.55 confidence_threshold: 0.8 evidence_weights: @@ -23,6 +22,12 @@ platform: project: 2.5 transfer: 2.0 recent_evidence_multiplier: 1.35 + dimension_thresholds: + correctness: 0.8 + explanation: 0.75 + transfer: 0.7 + project_execution: 0.75 + critique: 0.7 artifacts: local_pack_dirs: diff --git a/docs/multidimensional-mastery.md b/docs/multidimensional-mastery.md new file mode 100644 index 0000000..3dc5f10 --- /dev/null +++ b/docs/multidimensional-mastery.md @@ -0,0 +1,33 @@ +# Multi-Dimensional Mastery Model + +## Purpose + +Didactopus should distinguish among different forms of competence rather than collapsing them into one number. + +## Dimensions in this revision + +- **correctness**: routine technical correctness +- **explanation**: ability to explain clearly and justify reasoning +- **transfer**: ability to apply knowledge in new contexts +- **project_execution**: ability to carry work through in an authentic task +- **critique**: ability to detect flaws and evaluate reasoning + +## Current rule + +A concept counts as mastered only if: +- confidence meets threshold +- every required dimension present for that concept meets its configured threshold + +## Consequences + +A learner can now be: +- **ready in correctness but weak in transfer** +- **strong in explanations but weak in project execution** +- **mastered overall only when all required dimensions are adequate** + +## Future work + +- concept-specific dimension requirements +- different thresholds by domain +- prerequisite softening based on partial dimension mastery +- deliberate weak-area practice generation diff --git a/src/didactopus/artifact_registry.py 
b/src/didactopus/artifact_registry.py index 602f458..a5e9466 100644 --- a/src/didactopus/artifact_registry.py +++ b/src/didactopus/artifact_registry.py @@ -7,14 +7,7 @@ import yaml import networkx as nx from . import __version__ as DIDACTOPUS_VERSION -from .artifact_schemas import ( - ConceptsFile, - PackManifest, - ProjectsFile, - RoadmapFile, - RubricsFile, - validate_top_level_key, -) +from .artifact_schemas import ConceptsFile, PackManifest, ProjectsFile, RoadmapFile, RubricsFile REQUIRED_FILES = ["pack.yaml", "concepts.yaml", "roadmap.yaml", "projects.yaml", "rubrics.yaml"] @@ -48,13 +41,11 @@ def _load_yaml(path: Path) -> dict[str, Any]: def validate_pack(pack_dir: str | Path) -> PackValidationResult: pack_path = Path(pack_dir) result = PackValidationResult(pack_dir=pack_path) - for filename in REQUIRED_FILES: if not (pack_path / filename).exists(): result.errors.append(f"missing required file: {filename}") if result.errors: return result - try: result.manifest = PackManifest.model_validate(_load_yaml(pack_path / "pack.yaml")) if not _version_in_range(DIDACTOPUS_VERSION, result.manifest.didactopus_min_version, result.manifest.didactopus_max_version): @@ -62,19 +53,12 @@ def validate_pack(pack_dir: str | Path) -> PackValidationResult: f"incompatible with Didactopus core version {DIDACTOPUS_VERSION}; supported range is " f"{result.manifest.didactopus_min_version}..{result.manifest.didactopus_max_version}" ) - - concepts = ConceptsFile.model_validate(_load_yaml(pack_path / "concepts.yaml")) - roadmap = RoadmapFile.model_validate(_load_yaml(pack_path / "roadmap.yaml")) - projects = ProjectsFile.model_validate(_load_yaml(pack_path / "projects.yaml")) - rubrics = RubricsFile.model_validate(_load_yaml(pack_path / "rubrics.yaml")) - - result.loaded_files["concepts"] = concepts - result.loaded_files["roadmap"] = roadmap - result.loaded_files["projects"] = projects - result.loaded_files["rubrics"] = rubrics + result.loaded_files["concepts"] = 
ConceptsFile.model_validate(_load_yaml(pack_path / "concepts.yaml")) + result.loaded_files["roadmap"] = RoadmapFile.model_validate(_load_yaml(pack_path / "roadmap.yaml")) + result.loaded_files["projects"] = ProjectsFile.model_validate(_load_yaml(pack_path / "projects.yaml")) + result.loaded_files["rubrics"] = RubricsFile.model_validate(_load_yaml(pack_path / "rubrics.yaml")) except Exception as exc: result.errors.append(str(exc)) - result.is_valid = not result.errors return result diff --git a/src/didactopus/artifact_schemas.py b/src/didactopus/artifact_schemas.py index 8c85946..602dac4 100644 --- a/src/didactopus/artifact_schemas.py +++ b/src/didactopus/artifact_schemas.py @@ -56,15 +56,5 @@ class ProjectsFile(BaseModel): projects: list[ProjectEntry] -class RubricEntry(BaseModel): - id: str - title: str - criteria: list[str] = Field(default_factory=list) - - class RubricsFile(BaseModel): - rubrics: list[RubricEntry] - - -def validate_top_level_key(data: dict[str, Any], required_key: str) -> list[str]: - return [] if required_key in data else [f"missing required top-level key: {required_key}"] + rubrics: list[dict[str, Any]] diff --git a/src/didactopus/config.py b/src/didactopus/config.py index 145c8e6..a8e0740 100644 --- a/src/didactopus/config.py +++ b/src/didactopus/config.py @@ -26,7 +26,6 @@ class PlatformConfig(BaseModel): verification_required: bool = True require_learner_explanations: bool = True permit_direct_answers: bool = False - mastery_threshold: float = 0.8 resurfacing_threshold: float = 0.55 confidence_threshold: float = 0.8 evidence_weights: dict[str, float] = Field( @@ -38,6 +37,15 @@ class PlatformConfig(BaseModel): } ) recent_evidence_multiplier: float = 1.35 + dimension_thresholds: dict[str, float] = Field( + default_factory=lambda: { + "correctness": 0.8, + "explanation": 0.75, + "transfer": 0.7, + "project_execution": 0.75, + "critique": 0.7, + } + ) class ArtifactConfig(BaseModel): diff --git a/src/didactopus/evidence_engine.py 
b/src/didactopus/evidence_engine.py index 5c26429..f70ed37 100644 --- a/src/didactopus/evidence_engine.py +++ b/src/didactopus/evidence_engine.py @@ -6,6 +6,7 @@ from typing import Literal from .adaptive_engine import LearnerProfile EvidenceType = Literal["explanation", "problem", "project", "transfer"] +MASTERY_DIMENSIONS = ["correctness", "explanation", "transfer", "project_execution", "critique"] @dataclass @@ -26,6 +27,8 @@ class ConceptEvidenceSummary: total_weight: float = 0.0 confidence: float = 0.0 dimension_means: dict[str, float] = field(default_factory=dict) + weak_dimensions: list[str] = field(default_factory=list) + mastered: bool = False @dataclass @@ -53,11 +56,13 @@ def recompute_concept_summary( items: list[EvidenceItem], type_weights: dict[str, float], recent_multiplier: float, + dimension_thresholds: dict[str, float], + confidence_threshold: float, ) -> ConceptEvidenceSummary: weighted_score_sum = 0.0 total_weight = 0.0 - dimension_totals: dict[str, float] = {} - dimension_weights: dict[str, float] = {} + dim_totals: dict[str, float] = {} + dim_weights: dict[str, float] = {} for item in items: item.score = clamp_score(item.score) @@ -65,24 +70,42 @@ def recompute_concept_summary( weighted_score_sum += item.score * w total_weight += w - for dim, val in item.rubric_dimensions.items(): - v = clamp_score(val) - dimension_totals[dim] = dimension_totals.get(dim, 0.0) + v * w - dimension_weights[dim] = dimension_weights.get(dim, 0.0) + w + for dim, value in item.rubric_dimensions.items(): + v = clamp_score(value) + dim_totals[dim] = dim_totals.get(dim, 0.0) + v * w + dim_weights[dim] = dim_weights.get(dim, 0.0) + w dimension_means = { - dim: (dimension_totals[dim] / dimension_weights[dim]) - for dim in dimension_totals - if dimension_weights[dim] > 0 + dim: dim_totals[dim] / dim_weights[dim] + for dim in dim_totals + if dim_weights[dim] > 0 } + confidence = confidence_from_weight(total_weight) + + weak_dimensions = [] + for dim, threshold in 
dimension_thresholds.items(): + if dim in dimension_means and dimension_means[dim] < threshold: + weak_dimensions.append(dim) + + mastered = ( + confidence >= confidence_threshold + and all( + (dim in dimension_means and dimension_means[dim] >= threshold) + for dim, threshold in dimension_thresholds.items() + if dim in dimension_means + ) + and len(dimension_means) > 0 + ) return ConceptEvidenceSummary( concept_key=concept_key, count=len(items), weighted_mean_score=(weighted_score_sum / total_weight) if total_weight > 0 else 0.0, total_weight=total_weight, - confidence=confidence_from_weight(total_weight), + confidence=confidence, dimension_means=dimension_means, + weak_dimensions=sorted(weak_dimensions), + mastered=mastered, ) @@ -91,6 +114,8 @@ def add_evidence_item( item: EvidenceItem, type_weights: dict[str, float], recent_multiplier: float, + dimension_thresholds: dict[str, float], + confidence_threshold: float, ) -> None: item.score = clamp_score(item.score) state.evidence_by_concept.setdefault(item.concept_key, []).append(item) @@ -99,30 +124,21 @@ def add_evidence_item( state.evidence_by_concept[item.concept_key], type_weights, recent_multiplier, + dimension_thresholds, + confidence_threshold, ) def update_profile_mastery_from_evidence( profile: LearnerProfile, state: EvidenceState, - mastery_threshold: float, resurfacing_threshold: float, - confidence_threshold: float, ) -> None: for concept_key, summary in state.summary_by_concept.items(): - if summary.count == 0: - continue - - if ( - summary.weighted_mean_score >= mastery_threshold - and summary.confidence >= confidence_threshold - ): + if summary.mastered: profile.mastered_concepts.add(concept_key) state.resurfaced_concepts.discard(concept_key) - elif ( - concept_key in profile.mastered_concepts - and summary.weighted_mean_score < resurfacing_threshold - ): + elif concept_key in profile.mastered_concepts and summary.weighted_mean_score < resurfacing_threshold: 
profile.mastered_concepts.remove(concept_key) state.resurfaced_concepts.add(concept_key) @@ -130,20 +146,25 @@ def update_profile_mastery_from_evidence( def ingest_evidence_bundle( profile: LearnerProfile, items: list[EvidenceItem], - mastery_threshold: float, resurfacing_threshold: float, confidence_threshold: float, type_weights: dict[str, float], recent_multiplier: float, + dimension_thresholds: dict[str, float], ) -> EvidenceState: state = EvidenceState() for item in items: - add_evidence_item(state, item, type_weights, recent_multiplier) + add_evidence_item( + state, + item, + type_weights, + recent_multiplier, + dimension_thresholds, + confidence_threshold, + ) update_profile_mastery_from_evidence( profile=profile, state=state, - mastery_threshold=mastery_threshold, resurfacing_threshold=resurfacing_threshold, - confidence_threshold=confidence_threshold, - ) + ) return state diff --git a/src/didactopus/learning_graph.py b/src/didactopus/learning_graph.py index 15dc379..eac81a7 100644 --- a/src/didactopus/learning_graph.py +++ b/src/didactopus/learning_graph.py @@ -26,8 +26,7 @@ def build_merged_learning_graph(results: list[PackValidationResult]) -> MergedLe for pack_name in merged.load_order: result = valid[pack_name] - concepts_file = result.loaded_files["concepts"] - for concept in concepts_file.concepts: + for concept in result.loaded_files["concepts"].concepts: key = namespaced_concept(pack_name, concept.id) merged.concept_data[key] = { "id": concept.id, @@ -40,16 +39,13 @@ def build_merged_learning_graph(results: list[PackValidationResult]) -> MergedLe for pack_name in merged.load_order: result = valid[pack_name] - concepts_file = result.loaded_files["concepts"] - for concept in concepts_file.concepts: + for concept in result.loaded_files["concepts"].concepts: concept_key = namespaced_concept(pack_name, concept.id) for prereq in concept.prerequisites: prereq_key = namespaced_concept(pack_name, prereq) if prereq_key in merged.graph: 
merged.graph.add_edge(prereq_key, concept_key) - - projects_file = result.loaded_files["projects"] - for project in projects_file.projects: + for project in result.loaded_files["projects"].projects: merged.project_catalog.append({ "id": f"{pack_name}::{project.id}", "pack": pack_name, @@ -58,5 +54,4 @@ def build_merged_learning_graph(results: list[PackValidationResult]) -> MergedLe "prerequisites": [namespaced_concept(pack_name, p) for p in project.prerequisites], "deliverables": list(project.deliverables), }) - return merged diff --git a/src/didactopus/main.py b/src/didactopus/main.py index 3480d46..24b0394 100644 --- a/src/didactopus/main.py +++ b/src/didactopus/main.py @@ -11,7 +11,6 @@ from .artifact_registry import ( ) from .config import load_config from .evidence_engine import EvidenceItem, ingest_evidence_bundle -from .evaluation import score_simple_rubric from .learning_graph import build_merged_learning_graph from .mentor import generate_socratic_prompt from .model_provider import ModelProvider @@ -20,7 +19,7 @@ from .project_advisor import suggest_capstone def build_parser() -> argparse.ArgumentParser: - parser = argparse.ArgumentParser(description="Didactopus weighted evidence scaffold") + parser = argparse.ArgumentParser(description="Didactopus multi-dimensional mastery scaffold") parser.add_argument("--domain", required=True) parser.add_argument("--goal", required=True) parser.add_argument( @@ -68,88 +67,67 @@ def main() -> None: hide_mastered=True, ) - rubric = score_simple_rubric(0.92, 0.86, 0.82, 0.78) evidence_items = [ - EvidenceItem( - concept_key="foundations-statistics::descriptive-statistics", - evidence_type="explanation", - score=rubric.mean(), - is_recent=False, - rubric_dimensions={ - "correctness": rubric.correctness, - "clarity": rubric.clarity, - "justification": rubric.justification, - "transfer": rubric.transfer, - }, - notes="Good explanation.", - ), EvidenceItem( concept_key="foundations-statistics::descriptive-statistics", 
evidence_type="project", - score=0.9, + score=0.88, is_recent=True, rubric_dimensions={ "correctness": 0.9, - "clarity": 0.84, - "justification": 0.88, - "transfer": 0.82, + "explanation": 0.83, + "transfer": 0.79, + "project_execution": 0.88, + "critique": 0.74, }, - notes="Strong project evidence.", + notes="Strong integrated performance.", ), EvidenceItem( concept_key="bayes-extension::prior", evidence_type="problem", - score=0.58, + score=0.68, is_recent=True, rubric_dimensions={ - "correctness": 0.6, - "clarity": 0.55, + "correctness": 0.75, + "explanation": 0.62, + "transfer": 0.55, + "critique": 0.58, }, - notes="Recent weak but informative performance.", + notes="Knows some basics, weak transfer and critique.", ), ] evidence_state = ingest_evidence_bundle( profile=profile, items=evidence_items, - mastery_threshold=config.platform.mastery_threshold, resurfacing_threshold=config.platform.resurfacing_threshold, confidence_threshold=config.platform.confidence_threshold, type_weights=config.platform.evidence_weights, recent_multiplier=config.platform.recent_evidence_multiplier, + dimension_thresholds=config.platform.dimension_thresholds, ) + plan = build_adaptive_plan(merged, profile) - print("== Weighted Evidence Summary ==") + print("== Multi-Dimensional Evidence Summary ==") for concept_key, summary in evidence_state.summary_by_concept.items(): print( - f"- {concept_key}: count={summary.count}, " - f"weighted_mean={summary.weighted_mean_score:.2f}, " - f"confidence={summary.confidence:.2f}, " - f"total_weight={summary.total_weight:.2f}" + f"- {concept_key}: weighted_mean={summary.weighted_mean_score:.2f}, " + f"confidence={summary.confidence:.2f}, mastered={summary.mastered}" ) if summary.dimension_means: dims = ", ".join(f"{k}={v:.2f}" for k, v in sorted(summary.dimension_means.items())) print(f" * dimensions: {dims}") + if summary.weak_dimensions: + print(f" * weak dimensions: {', '.join(summary.weak_dimensions)}") print() - print("== Mastered Concepts 
After Weighted Evidence ==") - for concept_key in sorted(profile.mastered_concepts): - print(f"- {concept_key}") - print() - - print("== Resurfaced Concepts ==") - if evidence_state.resurfaced_concepts: - for concept_key in sorted(evidence_state.resurfaced_concepts): + print("== Mastered Concepts ==") + if profile.mastered_concepts: + for concept_key in sorted(profile.mastered_concepts): print(f"- {concept_key}") else: - print("- none") - print() - - print("== Adaptive Plan Summary ==") - print(f"- roadmap items visible: {len(plan.learner_roadmap)}") - print(f"- next-best concepts: {len(plan.next_best_concepts)}") - print(f"- eligible projects: {len(plan.eligible_projects)}") + print("- none yet") print() print("== Next Best Concepts ==") @@ -157,7 +135,8 @@ def main() -> None: print(f"- {concept}") print() - focus_concept = plan.next_best_concepts[0] if plan.next_best_concepts else args.domain - print(generate_socratic_prompt(provider, focus_concept)) - print(generate_practice_task(provider, focus_concept)) + focus_concept = "bayes-extension::prior" + weak_dims = evidence_state.summary_by_concept.get(focus_concept).weak_dimensions if focus_concept in evidence_state.summary_by_concept else [] + print(generate_socratic_prompt(provider, focus_concept, weak_dims)) + print(generate_practice_task(provider, focus_concept, weak_dims)) print(suggest_capstone(provider, args.domain)) diff --git a/src/didactopus/mentor.py b/src/didactopus/mentor.py index df3b81a..9ded6fd 100644 --- a/src/didactopus/mentor.py +++ b/src/didactopus/mentor.py @@ -1,7 +1,10 @@ from .model_provider import ModelProvider -def generate_socratic_prompt(provider: ModelProvider, concept: str) -> str: +def generate_socratic_prompt(provider: ModelProvider, concept: str, weak_dimensions: list[str] | None = None) -> str: + weak_text = "" + if weak_dimensions: + weak_text = f" Focus especially on weak dimensions: {', '.join(weak_dimensions)}." return provider.generate( - f"You are a Socratic mentor. 
Ask one probing question about '{concept}'." + f"You are a Socratic mentor. Ask one probing question about '{concept}'.{weak_text}" ).text diff --git a/src/didactopus/practice.py b/src/didactopus/practice.py index acf54a2..4d58d69 100644 --- a/src/didactopus/practice.py +++ b/src/didactopus/practice.py @@ -1,7 +1,10 @@ from .model_provider import ModelProvider -def generate_practice_task(provider: ModelProvider, concept: str) -> str: +def generate_practice_task(provider: ModelProvider, concept: str, weak_dimensions: list[str] | None = None) -> str: + weak_text = "" + if weak_dimensions: + weak_text = f" Target the weak dimensions: {', '.join(weak_dimensions)}." return provider.generate( - f"Generate one reasoning-heavy practice task for '{concept}'." + f"Generate one reasoning-heavy practice task for '{concept}'.{weak_text}" ).text diff --git a/tests/test_config.py b/tests/test_config.py index e9e53bd..85c8279 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -4,5 +4,5 @@ from didactopus.config import load_config def test_load_example_config() -> None: config = load_config(Path("configs/config.example.yaml")) - assert config.platform.evidence_weights["project"] == 2.5 - assert config.platform.recent_evidence_multiplier == 1.35 + assert config.platform.dimension_thresholds["transfer"] == 0.7 + assert config.platform.confidence_threshold == 0.8 diff --git a/tests/test_multidim_mastery.py b/tests/test_multidim_mastery.py new file mode 100644 index 0000000..066dc06 --- /dev/null +++ b/tests/test_multidim_mastery.py @@ -0,0 +1,100 @@ +from didactopus.adaptive_engine import LearnerProfile +from didactopus.evidence_engine import EvidenceItem, ingest_evidence_bundle + + +DEFAULT_WEIGHTS = {"explanation": 1.0, "problem": 1.5, "project": 2.5, "transfer": 2.0} +DEFAULT_THRESHOLDS = { + "correctness": 0.8, + "explanation": 0.75, + "transfer": 0.7, + "project_execution": 0.75, + "critique": 0.7, +} + + +def test_full_multidim_mastery() -> None: + profile = 
LearnerProfile(learner_id="u1") + state = ingest_evidence_bundle( + profile, + [ + EvidenceItem( + "c1", + "project", + 0.9, + is_recent=True, + rubric_dimensions={ + "correctness": 0.88, + "explanation": 0.82, + "transfer": 0.77, + "project_execution": 0.9, + "critique": 0.76, + }, + ) + ], + resurfacing_threshold=0.55, + confidence_threshold=0.75, + type_weights=DEFAULT_WEIGHTS, + recent_multiplier=1.35, + dimension_thresholds=DEFAULT_THRESHOLDS, + ) + assert "c1" in profile.mastered_concepts + assert state.summary_by_concept["c1"].mastered is True + assert state.summary_by_concept["c1"].weak_dimensions == [] + + +def test_partial_weakness_blocks_mastery() -> None: + profile = LearnerProfile(learner_id="u1") + state = ingest_evidence_bundle( + profile, + [ + EvidenceItem( + "c1", + "project", + 0.85, + is_recent=True, + rubric_dimensions={ + "correctness": 0.9, + "explanation": 0.86, + "transfer": 0.52, + "project_execution": 0.88, + "critique": 0.8, + }, + ) + ], + resurfacing_threshold=0.55, + confidence_threshold=0.75, + type_weights=DEFAULT_WEIGHTS, + recent_multiplier=1.35, + dimension_thresholds=DEFAULT_THRESHOLDS, + ) + assert "c1" not in profile.mastered_concepts + assert state.summary_by_concept["c1"].mastered is False + assert "transfer" in state.summary_by_concept["c1"].weak_dimensions + + +def test_resurfacing_from_multidim_weakness() -> None: + profile = LearnerProfile(learner_id="u1", mastered_concepts={"c1"}) + state = ingest_evidence_bundle( + profile, + [ + EvidenceItem( + "c1", + "problem", + 0.45, + is_recent=True, + rubric_dimensions={ + "correctness": 0.45, + "explanation": 0.5, + "transfer": 0.4, + "critique": 0.42, + }, + ) + ], + resurfacing_threshold=0.55, + confidence_threshold=0.75, + type_weights=DEFAULT_WEIGHTS, + recent_multiplier=1.35, + dimension_thresholds=DEFAULT_THRESHOLDS, + ) + assert "c1" not in profile.mastered_concepts + assert "c1" in state.resurfaced_concepts