Initial commit of GPT 5.2 content
parent 21a03466c3
commit db59a84727

README.md | 12

README.md
@@ -1,3 +1,13 @@
# DiffSeeker

This repository provides command-line tooling for characterizing and comparing files across different media, volumes, or directory trees.

DiffSeeker scans directory trees, records file metadata plus content hashes, and supports cross-volume comparison for:

- duplicates (same hash + size) across volumes
- missing files (present on one volume, absent on others by hash + size)
- suspicious divergences (same name, different size)

## Python CLI (mpchunkcfa compatible)

Install (editable dev install):

```bash
pip install -e .
```
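The comparison tooling itself is not part of this commit; as a rough illustration only, here is one way cross-volume duplicates could be queried from the SQLite database produced by `--db` (a sketch against the `file_attributes` schema added in `db.py` below, not the project's own comparison tool; the database filename is a placeholder):

```python
import sqlite3

# Sketch: list (hash_value, size) pairs that appear on more than one volume.
conn = sqlite3.connect("backup.sqlite3")  # placeholder path
rows = conn.execute(
    """
    SELECT hash_value, size, GROUP_CONCAT(DISTINCT volume_name) AS volumes
    FROM file_attributes
    GROUP BY hash_value, size
    HAVING COUNT(DISTINCT volume_name) > 1
    """
).fetchall()
for hash_value, size, volumes in rows:
    print(hash_value, size, volumes)
conn.close()
```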
pyproject.toml
@@ -0,0 +1,22 @@
[build-system]
requires = ["setuptools>=61", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "diffseeker"
version = "0.1.0"
description = "Cross-volume file scanning and matching by hash+size"
requires-python = ">=3.10"
dependencies = [
    "docopt>=0.6.2",
]

[project.scripts]
mpchunkcfa = "diffseeker.cli_mpchunkcfa:main"

[tool.setuptools]
package-dir = {"" = "python/src"}

[tool.setuptools.packages.find]
where = ["python/src"]
mpchunkcfa.py
@@ -0,0 +1,6 @@
#!/usr/bin/env python3
from diffseeker.cli_mpchunkcfa import main

if __name__ == "__main__":
    raise SystemExit(main())
python/src/diffseeker/cli_mpchunkcfa.py
@@ -0,0 +1,60 @@
"""
DiffSeeker - mpchunkcfa compatible CLI

USAGE:
    mpchunkcfa.py --help
    mpchunkcfa.py --version
    mpchunkcfa.py --walk <WALKDIR> [-c <CSVNAME>] [-V <VOLUME_NAME>]
                  [--hash <HASHTYPE>] [--exclude <EXCLUDED>...]
                  [--db <DBPATH>] [--reserve-cores <N>]

OPTIONS:
    -h, --help                       Print this help.
    -v, --version                    Print the version [version=1.00]
    -w, --walk <WALKDIR>             Walk a given directory and output CSV
    -c, --csv <CSVNAME>              Name for the output CSV [default: cfa-output.csv]
    -V, --volume_name <VOLUME_NAME>  Name of the disk volume or resource [default: UNKNOWN]
    --hash <HASHTYPE>                Hash type (sha256, blake2b, md5, ...) [default: sha256]
    --exclude <EXCLUDED>             Path element(s) to exclude (repeatable) [default: .git]
    --db <DBPATH>                    Optional SQLite database path to ingest results
    --reserve-cores <N>              Leave N CPU cores unused [default: 0]
"""

from __future__ import annotations

from pathlib import Path

from docopt import docopt

from .scan import collect_file_attributes
from .csvio import write_records_csv
from .db import ingest_records

VERSION = "1.00"


def main(argv=None) -> int:
    args = docopt(__doc__, argv=argv, version=VERSION)

    # docopt stores values of options declared with arguments under the long
    # option name, not under the argument placeholder.
    walkdir = args["--walk"]
    csvname = args["--csv"]
    volume_name = args["--volume_name"]
    hash_type = args["--hash"]
    excludes = args["--exclude"] or [".git"]
    dbpath = args["--db"]
    reserve = int(args["--reserve-cores"])

    records = collect_file_attributes(
        volume_name=volume_name,
        directory=walkdir,
        hash_type=hash_type,
        excluded=excludes,
        reserve_cores=reserve,
    )

    write_records_csv(csvname, records)

    if dbpath:
        ingest_records(Path(dbpath), records)

    return 0
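For reference, a minimal sketch of driving this entry point from Python instead of the `mpchunkcfa` console script; the directory, volume name, and output paths below are hypothetical placeholders:

```python
from diffseeker.cli_mpchunkcfa import main

if __name__ == "__main__":
    # Placeholder paths and names for illustration only. The __main__ guard
    # matters because the scanner spawns a multiprocessing Pool.
    raise SystemExit(main([
        "--walk", "/mnt/backup",
        "-V", "backup-2024",
        "-c", "backup.csv",
        "--db", "backup.sqlite3",
    ]))
```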
python/src/diffseeker/csvio.py
@@ -0,0 +1,27 @@
from __future__ import annotations

import csv
from typing import Dict, Iterable, List


FIELDNAMES = [
    "name",
    "relative_path",
    "extension",
    "size",
    "creation_date",
    "modified_date",
    "hash_value",
    "file_type",
    "number_of_files",
    "volume_name",
]


def write_records_csv(path: str, records: Iterable[Dict[str, object]]) -> None:
    records_list: List[Dict[str, object]] = list(records)
    with open(path, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=FIELDNAMES)
        writer.writeheader()
        for r in records_list:
            writer.writerow(r)
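A quick usage sketch (every field value below is made up) showing the record shape `write_records_csv` expects, with keys matching `FIELDNAMES`:

```python
from diffseeker.csvio import write_records_csv

# Illustrative record with placeholder values; keys must match FIELDNAMES.
record = {
    "name": "report",
    "relative_path": "docs",
    "extension": ".pdf",
    "size": 1048576,
    "creation_date": 1700000000.0,
    "modified_date": 1700000500.0,
    "hash_value": "deadbeef" * 8,
    "file_type": ".pdf",
    "number_of_files": 3,
    "volume_name": "backup-2024",
}
write_records_csv("example.csv", [record])
```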
python/src/diffseeker/db.py
@@ -0,0 +1,92 @@
from __future__ import annotations

import sqlite3
from pathlib import Path
from typing import Dict, Iterable


SCHEMA_SQL = """
CREATE TABLE IF NOT EXISTS file_attributes (
    id INTEGER PRIMARY KEY,
    name TEXT,
    relative_path TEXT,
    extension TEXT,
    size INTEGER,
    creation_date REAL,
    modified_date REAL,
    hash_value TEXT,
    file_type TEXT,
    number_of_files INTEGER,
    volume_name TEXT
);

CREATE TABLE IF NOT EXISTS matches (
    id INTEGER PRIMARY KEY,
    file1_id INTEGER,
    file2_id INTEGER,
    FOREIGN KEY(file1_id) REFERENCES file_attributes(id),
    FOREIGN KEY(file2_id) REFERENCES file_attributes(id)
);
"""

INDEX_SQL = """
CREATE INDEX IF NOT EXISTS idx_file_hash_size
    ON file_attributes(hash_value, size);
CREATE INDEX IF NOT EXISTS idx_file_volume
    ON file_attributes(volume_name);
"""


def connect(db_path: Path) -> sqlite3.Connection:
    conn = sqlite3.connect(db_path)
    conn.execute("PRAGMA foreign_keys=ON;")
    return conn


def init_db(db_path: Path) -> None:
    conn = connect(db_path)
    try:
        conn.executescript(SCHEMA_SQL)
        conn.executescript(INDEX_SQL)
        conn.commit()
    finally:
        conn.close()


def upsert_file_attribute(conn: sqlite3.Connection, rec: Dict[str, object]) -> int:
    """
    A practical 'upsert' requires a natural key, e.g.
    (volume_name, hash_value, size, relative_path, name), or a separate
    unique constraint. For now this is insert-only and returns the rowid.

    For a true upsert, add:
        UNIQUE(volume_name, hash_value, size, relative_path, name)
    and switch to INSERT ... ON CONFLICT ... DO UPDATE.
    """
    cur = conn.cursor()
    cur.execute(
        """
        INSERT INTO file_attributes
            (name, relative_path, extension, size,
             creation_date, modified_date, hash_value,
             file_type, number_of_files, volume_name)
        VALUES
            (:name, :relative_path, :extension, :size,
             :creation_date, :modified_date, :hash_value,
             :file_type, :number_of_files, :volume_name)
        """,
        rec,
    )
    return cur.lastrowid


def ingest_records(db_path: Path, records: Iterable[Dict[str, object]]) -> None:
    init_db(db_path)
    conn = connect(db_path)
    try:
        for rec in records:
            upsert_file_attribute(conn, rec)
        conn.commit()
    finally:
        conn.close()
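As the docstring above suggests, a true upsert needs a unique natural key. A hedged sketch of that variant follows; the UNIQUE constraint and the ON CONFLICT statement are not part of this commit, and the choice of updated columns is illustrative:

```python
# Sketch only: schema variant with a natural key plus an ON CONFLICT upsert.
UNIQUE_SCHEMA_SQL = """
CREATE TABLE IF NOT EXISTS file_attributes (
    id INTEGER PRIMARY KEY,
    name TEXT,
    relative_path TEXT,
    extension TEXT,
    size INTEGER,
    creation_date REAL,
    modified_date REAL,
    hash_value TEXT,
    file_type TEXT,
    number_of_files INTEGER,
    volume_name TEXT,
    UNIQUE(volume_name, hash_value, size, relative_path, name)
);
"""

UPSERT_SQL = """
INSERT INTO file_attributes
    (name, relative_path, extension, size,
     creation_date, modified_date, hash_value,
     file_type, number_of_files, volume_name)
VALUES
    (:name, :relative_path, :extension, :size,
     :creation_date, :modified_date, :hash_value,
     :file_type, :number_of_files, :volume_name)
ON CONFLICT(volume_name, hash_value, size, relative_path, name)
DO UPDATE SET
    modified_date = excluded.modified_date,
    number_of_files = excluded.number_of_files;
"""
```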
python/src/diffseeker/filter.py
@@ -0,0 +1,43 @@
from __future__ import annotations

import os
from typing import Iterable, List, Tuple


def path_contains_any_element(path: str, excluded: Iterable[str]) -> bool:
    """
    Exclude based on path *elements* (directory/file names), not substring matches.

    Example:
        excluded=['.git']
        /a/b/.git/config  -> True
        /a/b/.gitignore   -> False ('.gitignore' is a file element, not '.git')
    """
    parts = set(os.path.normpath(path).split(os.sep))
    return any(ex in parts for ex in excluded)


def filter_walk_triplet(
    excluded: List[str],
    root: str,
    dirs: List[str],
    files: List[str],
) -> Tuple[str, List[str], List[str]]:
    """
    Designed for use inside os.walk. Returns (root, filtered_dirs, filtered_files);
    the caller should assign dirs[:] = filtered_dirs to prevent descent into
    excluded directories.
    """
    filtered_dirs = []
    for d in dirs:
        full = os.path.join(root, d)
        if not path_contains_any_element(full, excluded):
            filtered_dirs.append(d)

    filtered_files = []
    for f in files:
        full = os.path.join(root, f)
        if not path_contains_any_element(full, excluded):
            filtered_files.append(f)

    return root, filtered_dirs, filtered_files
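A short usage sketch of the pruning pattern the docstring describes; the walked directory is a placeholder:

```python
import os

from diffseeker.filter import filter_walk_triplet

# Walk a placeholder tree, skipping '.git' directories before descending.
for root, dirs, files in os.walk("/mnt/backup"):
    _, kept_dirs, kept_files = filter_walk_triplet([".git"], root, dirs, files)
    dirs[:] = kept_dirs  # in-place assignment so os.walk does not descend
    for name in kept_files:
        print(os.path.join(root, name))
```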
python/src/diffseeker/scan.py
@@ -0,0 +1,148 @@
from __future__ import annotations

import hashlib
import os
from dataclasses import asdict, dataclass
from mmap import ACCESS_READ, mmap
from multiprocessing import Pool, cpu_count
from typing import Callable, Dict, Iterator, List, Optional, Sequence, Tuple

from .filter import filter_walk_triplet


@dataclass(frozen=True)
class FileRecord:
    name: str
    relative_path: str
    extension: str
    size: int
    creation_date: float
    modified_date: float
    hash_value: str
    file_type: str
    number_of_files: int
    volume_name: str


def hash_file_mmap(path: str, hash_type: str) -> str:
    h = hashlib.new(hash_type)
    with open(path, "rb") as f:
        # mmap cannot map an empty file, so hash empty files directly.
        if os.fstat(f.fileno()).st_size == 0:
            return h.hexdigest()
        with mmap(f.fileno(), 0, access=ACCESS_READ) as mm:
            h.update(mm)
    return h.hexdigest()


def hash_file_chunked(path: str, hash_type: str, chunk_size: int = 1024 * 1024) -> str:
    h = hashlib.new(hash_type)
    with open(path, "rb") as f:
        while True:
            buf = f.read(chunk_size)
            if not buf:
                break
            h.update(buf)
    return h.hexdigest()


def _compute_record(args: Tuple[str, str, str, str, str, int, bool]) -> Optional[FileRecord]:
    """
    Worker for multiprocessing.

    args:
        path, root, directory, volume_name, hash_type, number_of_files, use_mmap
    """
    path, root, directory, volume_name, hash_type, number_of_files, use_mmap = args
    try:
        base = os.path.basename(path)
        stem, ext = os.path.splitext(base)

        rel_root = os.path.relpath(root, directory)
        size = os.path.getsize(path)
        ctime = os.path.getctime(path)
        mtime = os.path.getmtime(path)

        if use_mmap:
            hv = hash_file_mmap(path, hash_type)
        else:
            hv = hash_file_chunked(path, hash_type)

        return FileRecord(
            name=stem,
            relative_path=rel_root,
            extension=ext,
            size=size,
            creation_date=ctime,
            modified_date=mtime,
            hash_value=hv,
            file_type=ext,
            number_of_files=number_of_files,
            volume_name=volume_name,
        )
    except OSError:
        # Optionally log these elsewhere; for now, skip unreadable entries.
        return None


def iter_file_tasks(
    directory: str,
    volume_name: str,
    hash_type: str,
    excluded: Sequence[str] = (".git", ".svn"),
    use_mmap: bool = True,
) -> Iterator[Tuple[str, str, str, str, str, int, bool]]:
    """
    Yield worker tasks from os.walk, pruning excluded directories.
    """
    for root, dirs, files in os.walk(directory):
        root, filtered_dirs, filtered_files = filter_walk_triplet(list(excluded), root, dirs, files)
        dirs[:] = filtered_dirs  # prune descent
        if not filtered_files:
            continue

        nfiles = len(filtered_files)
        for fname in filtered_files:
            path = os.path.join(root, fname)
            yield (path, root, directory, volume_name, hash_type, nfiles, use_mmap)


def collect_file_attributes(
    volume_name: str,
    directory: str,
    hash_type: str = "sha256",
    excluded: Sequence[str] = (".git", ".svn"),
    processes: Optional[int] = None,
    reserve_cores: int = 0,
    chunksize: int = 32,
    use_mmap: bool = True,
    progress_cb: Optional[Callable[[int, Optional[int]], None]] = None,
) -> List[Dict[str, object]]:
    """
    Scan a directory, compute file metadata plus a content hash, and return a
    list of dicts suitable for CSV or database output.

    progress_cb(current, total) is called per completed file if provided;
    total may be None if files were not pre-counted.
    """
    if processes is None:
        processes = max(cpu_count() - int(reserve_cores), 1)

    tasks = iter_file_tasks(
        directory=directory,
        volume_name=volume_name,
        hash_type=hash_type,
        excluded=excluded,
        use_mmap=use_mmap,
    )

    out: List[Dict[str, object]] = []
    completed = 0

    with Pool(processes=processes) as pool:
        for rec in pool.imap(_compute_record, tasks, chunksize=chunksize):
            if rec is None:
                continue
            out.append(asdict(rec))
            completed += 1
            if progress_cb:
                progress_cb(completed, None)

    return out
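Finally, a hedged end-to-end sketch of calling the scanner directly and feeding the results to the CSV writer above; the directory, volume name, and output filename are placeholders:

```python
from diffseeker.csvio import write_records_csv
from diffseeker.scan import collect_file_attributes


def report_progress(done, total):
    # total is None because files are not pre-counted.
    print(f"hashed {done} files", end="\r")


if __name__ == "__main__":
    # Placeholder directory and volume name; the guard is required because
    # collect_file_attributes spawns a multiprocessing Pool.
    records = collect_file_attributes(
        volume_name="backup-2024",
        directory="/mnt/backup",
        hash_type="blake2b",
        excluded=(".git", ".svn"),
        reserve_cores=1,
        progress_cb=report_progress,
    )
    write_records_csv("backup.csv", records)
```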