From aa0951ebf155ab96f5cd4df01867f6a221925b15 Mon Sep 17 00:00:00 2001 From: welsberr Date: Wed, 22 Apr 2026 16:42:49 -0400 Subject: [PATCH] Initial commit --- .gitignore | 28 ++++ Dockerfile | 17 +++ README.md | 84 ++++++++++++ docker-compose.yml | 28 ++++ docs/architecture.md | 38 ++++++ docs/bundle-format.md | 41 ++++++ pyproject.toml | 23 ++++ src/doclift/__init__.py | 3 + src/doclift/cli.py | 50 +++++++ src/doclift/convert.py | 101 +++++++++++++++ src/doclift/inspect.py | 26 ++++ src/doclift/legacy_doc.py | 266 ++++++++++++++++++++++++++++++++++++++ src/doclift/schemas.py | 52 ++++++++ src/doclift/utils.py | 16 +++ tests/test_legacy_doc.py | 24 ++++ 15 files changed, 797 insertions(+) create mode 100755 .gitignore create mode 100755 Dockerfile create mode 100755 README.md create mode 100755 docker-compose.yml create mode 100755 docs/architecture.md create mode 100755 docs/bundle-format.md create mode 100755 pyproject.toml create mode 100755 src/doclift/__init__.py create mode 100755 src/doclift/cli.py create mode 100755 src/doclift/convert.py create mode 100755 src/doclift/inspect.py create mode 100755 src/doclift/legacy_doc.py create mode 100755 src/doclift/schemas.py create mode 100755 src/doclift/utils.py create mode 100755 tests/test_legacy_doc.py diff --git a/.gitignore b/.gitignore new file mode 100755 index 0000000..3abe23f --- /dev/null +++ b/.gitignore @@ -0,0 +1,28 @@ +__pycache__/ +*.py[cod] +*.so + +.pytest_cache/ +.mypy_cache/ +.ruff_cache/ +.coverage +.coverage.* +htmlcov/ + +.venv/ +venv/ +env/ + +build/ +dist/ +*.egg-info/ + +.DS_Store + +tmp/ +temp/ +artifacts/ +outputs/ + +*.swp +*~ diff --git a/Dockerfile b/Dockerfile new file mode 100755 index 0000000..20d029e --- /dev/null +++ b/Dockerfile @@ -0,0 +1,17 @@ +FROM python:3.11-slim + +RUN apt-get update && apt-get install -y --no-install-recommends \ + catdoc \ + antiword \ + libreoffice \ + pandoc \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +COPY pyproject.toml README.md /app/ +COPY src /app/src + +RUN pip install --no-cache-dir -e . + +ENTRYPOINT ["doclift"] diff --git a/README.md b/README.md new file mode 100755 index 0000000..557bdeb --- /dev/null +++ b/README.md @@ -0,0 +1,84 @@ +# doclift + +`doclift` is a legacy-document normalization toolkit for turning old office documents into reviewable, structured bundles. + +The initial target is legacy Word `.doc` files, but the repository boundary is intentionally broader: + +- extract legacy document text and metadata +- preserve layout cues that survive extraction +- recover tables, figure references, and other structural signals +- emit normalized Markdown plus JSON sidecars +- produce deterministic conversion reports for downstream systems such as Didactopus and GroundRecall + +## Scope + +`doclift` is not a learner-facing system. It is a source-normalization layer that other projects can consume. + +Current implementation: + +- legacy Word `.doc` conversion through `catdoc` +- bundle emission with: + - `document.md` + - `document.layout.json` + - `document.tables.json` + - `document.figures.json` + - `manifest.json` + - `conversion_report.json` +- course/workspace-level external figure asset inventory + +Planned follow-on formats: + +- WordPerfect +- RTF +- DOCX as a higher-fidelity path +- old HTML +- OCR-assisted scanned documents + +## Install + +```bash +pip install -e . 
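+# verify the doclift console script (from [project.scripts]) is on PATH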
+doclift --help +``` + +## Quick Start + +Inspect a source: + +```bash +doclift inspect /path/to/legacy.doc +``` + +Convert one document: + +```bash +doclift convert /path/to/legacy.doc /tmp/doclift-out +``` + +Convert a directory tree and inventory external figure assets: + +```bash +doclift convert-dir /path/to/source-tree /tmp/doclift-bundle --asset-root /path/to/source-tree +``` + +## Bundle Layout + +```text +out/ + conversion_report.json + manifest.json + assets/ + figure_asset_inventory.json + documents/ + some-doc/ + document.md + document.layout.json + document.tables.json + document.figures.json +``` + +## Relationship To Other Projects + +- `Didactopus` should consume `doclift` bundles rather than own legacy format handling. +- `GroundRecall` can use the same bundles for provenance-aware import. +- other archival or scholarly tooling can reuse the same normalization path without depending on Didactopus. diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100755 index 0000000..bc50613 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,28 @@ +version: "3.9" + +services: + doclift: + build: . + working_dir: /workspace + volumes: + - ./:/app + - ${DOCLIFT_WORKSPACE:-/tmp}:/workspace + environment: + PYTHONUNBUFFERED: "1" + XDG_CONFIG_HOME: /tmp/doclift-config + XDG_CACHE_HOME: /tmp/doclift-cache + XDG_RUNTIME_DIR: /tmp/doclift-runtime + entrypoint: ["doclift"] + + shell: + build: . + working_dir: /workspace + volumes: + - ./:/app + - ${DOCLIFT_WORKSPACE:-/tmp}:/workspace + environment: + PYTHONUNBUFFERED: "1" + XDG_CONFIG_HOME: /tmp/doclift-config + XDG_CACHE_HOME: /tmp/doclift-cache + XDG_RUNTIME_DIR: /tmp/doclift-runtime + entrypoint: ["/bin/bash"] diff --git a/docs/architecture.md b/docs/architecture.md new file mode 100755 index 0000000..63c7a70 --- /dev/null +++ b/docs/architecture.md @@ -0,0 +1,38 @@ +# Architecture + +`doclift` is intended to sit between raw legacy sources and downstream domain-specific systems. + +## Layers + +1. Format detection +2. Format-specific extraction +3. Structural recovery +4. Normalized bundle emission +5. Downstream import by applications such as Didactopus or GroundRecall + +## Design constraints + +- deterministic outputs +- explicit provenance +- structured sidecars for non-prose information +- graceful degradation when exact layout cannot be recovered +- container-friendly execution to reduce cross-platform variance + +## Output philosophy + +The primary artifact is not a page-faithful rendering. It is a normalized bundle: + +- readable by humans +- structured enough for agents and pipelines +- explicit about uncertainty and extraction limits + +## Initial format strategy + +- `.doc`: implemented through `catdoc`, with layout/table recovery on extracted text +- `.docx`: planned as a higher-fidelity path +- `.wpd`: planned as a plugin/adapter target, not hard-coded into core assumptions + +## Why separate from Didactopus + +`doclift` owns document rescue and normalization complexity. +`Didactopus` should stay focused on course ingestion, concept extraction, and learning-path generation. 
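+
+## Pipeline sketch
+
+A minimal sketch of how the first four layers compose for the implemented
+`.doc` path, calling the current `doclift.legacy_doc` helpers directly.
+`doclift convert` wraps the same steps, additionally stripping the detected
+title before layout analysis and writing the bundle files:
+
+```python
+from pathlib import Path
+
+from doclift.legacy_doc import (
+    build_layout_manifest,
+    clean_text,
+    extract_tables,
+    extract_title,
+    normalize_text_preserve_layout,
+    run_catdoc,
+)
+
+source = Path("legacy.doc")                   # 1. detection is suffix-based for now
+raw = run_catdoc(source)                      # 2. format-specific extraction
+cleaned = clean_text(raw)                     #    reflowed text for title/reference scans
+layout = normalize_text_preserve_layout(raw)  #    variant that keeps indentation and tabs
+title = extract_title(cleaned, source.stem)
+tables = extract_tables(layout)               # 3. structural recovery
+manifest = build_layout_manifest(layout)      # 4. inputs to bundle emission
+```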
diff --git a/docs/bundle-format.md b/docs/bundle-format.md
new file mode 100755
index 0000000..71022ec
--- /dev/null
+++ b/docs/bundle-format.md
@@ -0,0 +1,41 @@
+# Bundle Format
+
+## Top-level
+
+`manifest.json`
+- bundle version
+- source root
+- converter summary
+- document list
+
+`conversion_report.json`
+- per-document conversion metrics
+- counts for tables, figure references, and errors
+
+`assets/figure_asset_inventory.json`
+- optional inventory of external image/figure files discovered under an asset root
+
+## Per-document
+
+Each normalized document lives under `documents/<doc-slug>/`, where the slug is derived from the source file name and extracted title.
+
+`document.md`
+- readable normalized text
+- extracted table and figure sections when available
+
+`document.layout.json`
+- line-oriented layout manifest
+- indentation, tabs, and coarse line classification
+
+`document.tables.json`
+- table references found in text
+- recovered tables with captions, raw lines, parsed rows, and source line ranges
+
+`document.figures.json`
+- explicit figure references from text
+- related external assets when available
+
+## Stability
+
+The schema should be stable enough for downstream adapters.
+Converters may improve row parsing or figure linking without breaking field names.
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100755
index 0000000..c33a274
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,23 @@
+[build-system]
+requires = ["setuptools>=68", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "doclift"
+version = "0.1.0"
+description = "Legacy-document normalization and structured conversion toolkit"
+requires-python = ">=3.10"
+dependencies = [
+    "pydantic>=2.7",
+    "PyYAML>=6.0",
+]
+
+[project.scripts]
+doclift = "doclift.cli:main"
+
+[tool.setuptools.packages.find]
+where = ["src"]
+
+[tool.pytest.ini_options]
+pythonpath = ["src"]
+testpaths = ["tests"]
diff --git a/src/doclift/__init__.py b/src/doclift/__init__.py
new file mode 100755
index 0000000..a05eb9a
--- /dev/null
+++ b/src/doclift/__init__.py
@@ -0,0 +1,3 @@
+__all__ = ["__version__"]
+
+__version__ = "0.1.0"
diff --git a/src/doclift/cli.py b/src/doclift/cli.py
new file mode 100755
index 0000000..e0e9b3a
--- /dev/null
+++ b/src/doclift/cli.py
@@ -0,0 +1,50 @@
+from __future__ import annotations
+
+import argparse
+import json
+from pathlib import Path
+
+from .convert import convert_directory, convert_doc
+from .inspect import inspect_path
+from .legacy_doc import collect_figure_assets
+
+
+def build_parser() -> argparse.ArgumentParser:
+    parser = argparse.ArgumentParser(description="Legacy-document normalization toolkit")
+    subparsers = parser.add_subparsers(dest="command", required=True)
+
+    inspect_parser = subparsers.add_parser("inspect", help="Inspect a source file")
+    inspect_parser.add_argument("source")
+
+    convert_parser = subparsers.add_parser("convert", help="Convert a single legacy Word .doc file")
+    convert_parser.add_argument("source")
+    convert_parser.add_argument("out")
+    convert_parser.add_argument("--asset-root", default=None)
+
+    convert_dir_parser = subparsers.add_parser("convert-dir", help="Convert all supported files in a directory tree")
+    convert_dir_parser.add_argument("source_root")
+    convert_dir_parser.add_argument("out")
+    convert_dir_parser.add_argument("--asset-root", default=None)
+    return parser
+
+
+def main() -> None:
+    args = build_parser().parse_args()
+    if args.command == "inspect":
+        print(json.dumps(inspect_path(Path(args.source)), indent=2))
+        return
+    if args.command == "convert":
+        asset_root = Path(args.asset_root) if
args.asset_root else None + assets = collect_figure_assets(asset_root) if asset_root else [] + bundle = convert_doc(Path(args.source), Path(args.out), figure_assets=assets) + print(json.dumps(bundle.model_dump(), indent=2)) + return + if args.command == "convert-dir": + asset_root = Path(args.asset_root) if args.asset_root else None + report = convert_directory(Path(args.source_root), Path(args.out), asset_root=asset_root) + print(json.dumps(report.model_dump(), indent=2)) + return + + +if __name__ == "__main__": + main() diff --git a/src/doclift/convert.py b/src/doclift/convert.py new file mode 100755 index 0000000..47252d3 --- /dev/null +++ b/src/doclift/convert.py @@ -0,0 +1,101 @@ +from __future__ import annotations + +from pathlib import Path + +from .legacy_doc import ( + build_layout_manifest, + clean_text, + collect_figure_assets, + extract_references, + extract_tables, + extract_title, + normalize_text_preserve_layout, + render_markdown, + run_catdoc, + strip_title, +) +from .schemas import ConversionReport, DocumentBundle +from .utils import slugify, write_json + + +def _document_output_dir(out_root: Path, source_path: Path, title: str) -> Path: + return out_root / "documents" / f"{slugify(source_path.stem)}-{slugify(title)}" + + +def convert_doc(source_path: Path, out_root: Path, figure_assets: list | None = None) -> DocumentBundle: + raw = run_catdoc(source_path) + cleaned = clean_text(raw) + title = extract_title(cleaned, source_path.stem) + body = strip_title(cleaned, title) + layout_body = normalize_text_preserve_layout(strip_title(raw, title)) + tables = extract_tables(layout_body) + layout = build_layout_manifest(layout_body) + table_refs = extract_references(body, r"\bTable\s+\d+\b") + figure_refs = extract_references(body, r"\b(?:Fig\.?\s*[\d.]+|Figure\s+[\d.]+)\b") + related_assets = list(figure_assets or []) + + doc_out = _document_output_dir(out_root, source_path, title) + doc_out.mkdir(parents=True, exist_ok=True) + markdown_path = doc_out / "document.md" + layout_path = doc_out / "document.layout.json" + tables_path = doc_out / "document.tables.json" + figures_path = doc_out / "document.figures.json" + + markdown_path.write_text(render_markdown(title, body, tables, figure_refs, related_assets), encoding="utf-8") + write_json(layout_path, layout) + write_json( + tables_path, + { + "source_path": str(source_path), + "table_references": table_refs, + "tables": [table.model_dump() for table in tables], + }, + ) + write_json( + figures_path, + { + "source_path": str(source_path), + "figure_references": figure_refs, + "related_assets": [asset.model_dump() for asset in related_assets], + }, + ) + + return DocumentBundle( + document_id=slugify(title), + title=title, + source_path=str(source_path), + output_dir=str(doc_out), + markdown_path=str(markdown_path), + layout_path=str(layout_path), + tables_path=str(tables_path), + figures_path=str(figures_path), + table_count=len(tables), + figure_reference_count=len(figure_refs), + ) + + +def convert_directory(source_root: Path, out_root: Path, asset_root: Path | None = None) -> ConversionReport: + docs = sorted(path for path in source_root.rglob("*") if path.is_file() and path.suffix.lower() == ".doc") + figure_assets = collect_figure_assets(asset_root) if asset_root is not None else [] + bundles = [convert_doc(path, out_root, figure_assets=figure_assets) for path in docs] + report = ConversionReport( + source_root=str(source_root), + converter="catdoc_doc", + document_count=len(bundles), + documents=bundles, + 
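+        # figure assets are inventoried once per run and shared by every document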
external_figure_asset_count=len(figure_assets), + ) + write_json(out_root / "manifest.json", report.model_dump()) + write_json( + out_root / "conversion_report.json", + report.model_dump() + | { + "summary": { + "documents_with_tables": sum(1 for bundle in bundles if bundle.table_count > 0), + "documents_with_figure_references": sum(1 for bundle in bundles if bundle.figure_reference_count > 0), + } + }, + ) + if figure_assets: + write_json(out_root / "assets" / "figure_asset_inventory.json", [asset.model_dump() for asset in figure_assets]) + return report diff --git a/src/doclift/inspect.py b/src/doclift/inspect.py new file mode 100755 index 0000000..808ed1d --- /dev/null +++ b/src/doclift/inspect.py @@ -0,0 +1,26 @@ +from __future__ import annotations + +from pathlib import Path + +from .legacy_doc import clean_text, extract_title, run_catdoc + + +def inspect_path(path: Path) -> dict: + suffix = path.suffix.lower() + payload = { + "path": str(path), + "suffix": suffix, + "format_family": "unknown", + "supported": False, + } + if suffix == ".doc": + raw = run_catdoc(path) + cleaned = clean_text(raw) + payload |= { + "format_family": "legacy_word_doc", + "supported": True, + "title_guess": extract_title(cleaned, path.stem), + "line_count": len(cleaned.splitlines()), + "char_count": len(cleaned), + } + return payload diff --git a/src/doclift/legacy_doc.py b/src/doclift/legacy_doc.py new file mode 100755 index 0000000..5482680 --- /dev/null +++ b/src/doclift/legacy_doc.py @@ -0,0 +1,266 @@ +from __future__ import annotations + +import re +import subprocess +from pathlib import Path + +from .schemas import FigureAsset, TableArtifact +from .utils import slugify + +IMAGE_SUFFIXES = {".bmp", ".gif", ".jpg", ".jpeg", ".png", ".tif", ".tiff", ".psd"} + + +def run_catdoc(path: Path) -> str: + result = subprocess.run(["catdoc", str(path)], capture_output=True, text=True, check=False) + if result.returncode != 0: + raise RuntimeError(f"catdoc failed for {path}: {result.stderr.strip()}") + return result.stdout.replace("\r\n", "\n").replace("\r", "\n") + + +def clean_text(text: str) -> str: + lines = [line.rstrip() for line in text.replace("\x0b", "\n").replace("\x0c", "\n").splitlines()] + cleaned: list[str] = [] + for line in lines: + stripped = line.strip() + if stripped.startswith("[This was fast-saved"): + continue + if re.match(r"^PAGE\b", stripped): + continue + if not stripped: + if cleaned and cleaned[-1] == "": + continue + cleaned.append("") + continue + cleaned.append(stripped) + return "\n".join(cleaned).strip() + + +def normalize_text_preserve_layout(text: str) -> str: + lines = [line.rstrip() for line in text.replace("\x0b", "\n").replace("\x0c", "\n").splitlines()] + cleaned: list[str] = [] + for line in lines: + stripped = line.strip() + if stripped.startswith("[This was fast-saved"): + continue + if re.match(r"^PAGE\b", stripped): + continue + if not stripped: + if cleaned and cleaned[-1] == "": + continue + cleaned.append("") + continue + cleaned.append(line) + return "\n".join(cleaned).strip() + + +def extract_title(text: str, fallback: str) -> str: + lines = text.splitlines() + for index, line in enumerate(lines): + stripped = line.strip() + if not stripped: + continue + if re.match(r"^Lecture\s+\d+\.", stripped, re.IGNORECASE): + if index + 1 < len(lines): + nxt = lines[index + 1].strip() + if nxt and ( + stripped.endswith(("of", "in", "and", "to")) + or (nxt and nxt[0].islower()) + or nxt in {"Marine Mammals", "the Harbor Seal", "season"} + ): + return f"{stripped} {nxt}".strip() 
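+            # no plausible continuation line; use the bare "Lecture N." heading as the title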
+ return stripped + if stripped.upper() in { + "SPRING 2000", + "MARB 401", + "MARB 482 SEMINAR IN MARINE BIOLOGY", + "COURSE SYLLABUS", + "EXAM I", + "EXAM II", + "FINAL EXAM SPRING 1999", + }: + continue + if stripped.startswith(("February ", "April ")): + continue + return stripped + return fallback + + +def strip_title(text: str, title: str) -> str: + lines = text.splitlines() + normalized_title = " ".join(title.split()) + for index, line in enumerate(lines): + candidate = line.strip() + if not candidate: + continue + if " ".join(candidate.split()) == normalized_title: + return "\n".join(lines[index + 1 :]).strip() + if index + 1 < len(lines): + combined = f"{candidate} {lines[index + 1].strip()}".strip() + if " ".join(combined.split()) == normalized_title: + return "\n".join(lines[index + 2 :]).strip() + return text.strip() + + +def indent_level(line: str) -> int: + tabs = len(line) - len(line.lstrip("\t")) + spaces = len(line) - len(line.lstrip(" ")) + return tabs + (spaces // 4) + + +def classify_layout_line(stripped: str) -> str: + if not stripped: + return "blank" + if re.match(r"^(Table\s+\d+\.?|Fig\.?\s*[\d.]+|Figure\s+[\d.]+)", stripped, re.IGNORECASE): + return "caption" + if re.match(r"^[IVX]+\.", stripped): + return "roman-list" + if re.match(r"^[A-Z]\.", stripped): + return "alpha-list" + if re.match(r"^\d+\.", stripped): + return "numbered-list" + if "=" in stripped: + return "equation" + return "paragraph" + + +def split_cells(line: str) -> list[str]: + if "\t" in line: + parts = [cell.strip() for cell in re.split(r"\t+", line) if cell.strip()] + if len(parts) >= 2: + return parts + parts = [cell.strip() for cell in re.split(r"\s{2,}", line.strip()) if cell.strip()] + return parts if len(parts) >= 2 else [] + + +def extract_tables(layout_body: str) -> list[TableArtifact]: + lines = layout_body.splitlines() + tables: list[TableArtifact] = [] + index = 0 + while index < len(lines): + stripped = lines[index].strip() + if not re.match(r"^Table\s+\d+\.?", stripped, re.IGNORECASE): + index += 1 + continue + caption_lines = [stripped] + start = index + index += 1 + while index < len(lines) and lines[index].strip(): + candidate = lines[index].strip() + if split_cells(candidate): + break + caption_lines.append(candidate) + index += 1 + while index < len(lines) and not lines[index].strip(): + index += 1 + + raw_rows: list[str] = [] + parsed_rows: list[list[str]] = [] + section_labels: list[str] = [] + while index < len(lines): + candidate = lines[index] + stripped_candidate = candidate.strip() + if re.match(r"^Table\s+\d+\.?", stripped_candidate, re.IGNORECASE): + break + if re.match(r"^\d+\.\s", stripped_candidate) and parsed_rows: + break + if re.match(r"^PAGE\b", stripped_candidate): + break + if stripped_candidate: + raw_rows.append(candidate) + cells = split_cells(candidate) + if cells: + parsed_rows.append(cells) + elif stripped_candidate.isupper() and len(stripped_candidate.split()) <= 4: + section_labels.append(stripped_candidate) + index += 1 + + caption = " ".join(caption_lines) + tables.append( + TableArtifact( + table_id=slugify(caption), + caption=caption, + start_line=start + 1, + end_line=max(start + 1, index), + raw_lines=raw_rows, + parsed_rows=parsed_rows, + section_labels=section_labels, + column_count_guess=max((len(row) for row in parsed_rows), default=0), + ) + ) + return tables + + +def extract_references(body: str, pattern: str) -> list[str]: + seen: list[str] = [] + seen_keys: set[str] = set() + for match in re.finditer(pattern, body, re.IGNORECASE): + 
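+        # dedupe case-insensitively while keeping the first-seen casing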
value = match.group(0) + key = value.lower() + if key not in seen_keys: + seen_keys.add(key) + seen.append(value) + return seen + + +def collect_figure_assets(root: Path) -> list[FigureAsset]: + assets: list[FigureAsset] = [] + for path in sorted(root.rglob("*")): + if not path.is_file() or path.suffix.lower() not in IMAGE_SUFFIXES: + continue + relative = path.relative_to(root).as_posix() + assets.append( + FigureAsset( + asset_id=slugify(relative), + path=str(path), + relative_path=relative, + name=path.name, + container=path.parent.name, + looks_like_figure=bool(re.match(r"^fig\.?\s*", path.name, re.IGNORECASE)), + ) + ) + return assets + + +def build_layout_manifest(layout_body: str) -> list[dict]: + manifest: list[dict] = [] + for line_no, line in enumerate(layout_body.splitlines(), start=1): + stripped = line.strip() + if not stripped: + continue + manifest.append( + { + "line_no": line_no, + "indent_level": indent_level(line), + "has_tabs": "\t" in line, + "kind": classify_layout_line(stripped), + "text": stripped, + } + ) + return manifest + + +def render_markdown(title: str, body: str, tables: list[TableArtifact], figure_refs: list[str], related_assets: list[FigureAsset]) -> str: + lines = [f"# {title}", "", "## Converted Text", "", body.strip()] + if tables: + lines.extend(["", "## Extracted Tables", ""]) + for table in tables: + lines.append(f"### {table.caption}") + lines.append("") + lines.append(f"- Source lines: {table.start_line}-{table.end_line}") + lines.append(f"- Parsed row count: {len(table.parsed_rows)}") + lines.append(f"- Column guess: {table.column_count_guess}") + lines.append("") + lines.append("```text") + lines.extend(line.rstrip() for line in table.raw_lines[:40]) + lines.append("```") + lines.append("") + if figure_refs or related_assets: + lines.extend(["", "## Figure Signals", ""]) + if figure_refs: + lines.extend(f"- Referenced in text: {ref}" for ref in figure_refs) + else: + lines.append("- No explicit figure references were recovered from the extracted text.") + if related_assets: + lines.append(f"- Nearby external assets: {len(related_assets)}") + lines.extend(f" - {asset.relative_path}" for asset in related_assets[:12]) + return "\n".join(lines).strip() + "\n" diff --git a/src/doclift/schemas.py b/src/doclift/schemas.py new file mode 100755 index 0000000..286f220 --- /dev/null +++ b/src/doclift/schemas.py @@ -0,0 +1,52 @@ +from __future__ import annotations + +from pydantic import BaseModel, Field + + +class LayoutLine(BaseModel): + line_no: int + indent_level: int = 0 + has_tabs: bool = False + kind: str + text: str + + +class TableArtifact(BaseModel): + table_id: str + caption: str + start_line: int + end_line: int + raw_lines: list[str] = Field(default_factory=list) + parsed_rows: list[list[str]] = Field(default_factory=list) + section_labels: list[str] = Field(default_factory=list) + column_count_guess: int = 0 + + +class FigureAsset(BaseModel): + asset_id: str + path: str + relative_path: str + name: str + container: str = "" + looks_like_figure: bool = False + + +class DocumentBundle(BaseModel): + document_id: str + title: str + source_path: str + output_dir: str + markdown_path: str + layout_path: str + tables_path: str + figures_path: str + table_count: int = 0 + figure_reference_count: int = 0 + + +class ConversionReport(BaseModel): + source_root: str + converter: str + document_count: int = 0 + documents: list[DocumentBundle] = Field(default_factory=list) + external_figure_asset_count: int = 0 diff --git a/src/doclift/utils.py 
b/src/doclift/utils.py new file mode 100755 index 0000000..6b6ac32 --- /dev/null +++ b/src/doclift/utils.py @@ -0,0 +1,16 @@ +from __future__ import annotations + +import json +import re +from pathlib import Path +from typing import Any + + +def slugify(text: str) -> str: + cleaned = re.sub(r"[^a-zA-Z0-9]+", "-", text.strip().lower()).strip("-") + return cleaned or "untitled" + + +def write_json(path: Path, payload: Any) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(json.dumps(payload, indent=2), encoding="utf-8") diff --git a/tests/test_legacy_doc.py b/tests/test_legacy_doc.py new file mode 100755 index 0000000..c787c7a --- /dev/null +++ b/tests/test_legacy_doc.py @@ -0,0 +1,24 @@ +from doclift.legacy_doc import extract_references, extract_tables + + +def test_extract_references_dedupes() -> None: + refs = extract_references("See Table 1 and table 1 and Table 2.", r"\bTable\s+\d+\b") + assert refs == ["Table 1", "Table 2"] + + +def test_extract_tables_parses_tabbed_rows() -> None: + text = "\n".join( + [ + "Intro", + "Table 1. Example caption", + "", + "Metric\tRest\tSwim", + "O2\t1.0\t2.0", + "CO2\t0.5\t1.1", + ] + ) + tables = extract_tables(text) + assert len(tables) == 1 + assert tables[0].caption == "Table 1. Example caption" + assert tables[0].column_count_guess == 3 + assert tables[0].parsed_rows[1] == ["O2", "1.0", "2.0"]
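+
+
+def test_split_cells_requires_two_columns() -> None:
+    # illustrative check: split_cells treats tabs or runs of two or more
+    # spaces as delimiters and returns [] when fewer than two cells remain
+    from doclift.legacy_doc import split_cells
+
+    assert split_cells("O2\t1.0\t2.0") == ["O2", "1.0", "2.0"]
+    assert split_cells("just prose with single spaces") == []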