diff --git a/README.md b/README.md
index e16b9fa..95c3967 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,18 @@
 # DiffSeeker
 
-This repository is mostly command-line tooling to support characterizing and comparing files on different media, volumes, or directory trees.
\ No newline at end of file
+DiffSeeker scans directory trees, records file metadata plus content hashes, and supports cross-volume comparison for:
+- duplicates (same hash + size) across volumes
+- missing files (present on one volume, absent on others by hash + size)
+- suspicious divergences (same name, different size)
+
+## Python CLI (mpchunkcfa compatible)
+
+Install (editable dev install):
+```bash
+pip install -e .
+```
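+
+For example, a scan of a mounted volume might look like this (the path, volume name, and database name are illustrative placeholders):
+```bash
+mpchunkcfa --walk /mnt/volume1 -V VOLUME1 --db cfa.sqlite3
+```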
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..3517034
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,22 @@
+[build-system]
+requires = ["setuptools>=61", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "diffseeker"
+version = "0.1.0"
+description = "Cross-volume file scanning and matching by hash+size"
+requires-python = ">=3.10"
+dependencies = [
+    "docopt>=0.6.2",
+]
+
+[project.scripts]
+mpchunkcfa = "diffseeker.cli_mpchunkcfa:main"
+
+[tool.setuptools]
+package-dir = {"" = "python/src"}
+
+[tool.setuptools.packages.find]
+where = ["python/src"]
+
diff --git a/python/scripts/mpchunkcfa.py b/python/scripts/mpchunkcfa.py
new file mode 100644
index 0000000..18ec20b
--- /dev/null
+++ b/python/scripts/mpchunkcfa.py
@@ -0,0 +1,6 @@
+#!/usr/bin/env python3
+from diffseeker.cli_mpchunkcfa import main
+
+if __name__ == "__main__":
+    raise SystemExit(main())
+
diff --git a/python/src/diffseeker/cli_mpchunkcfa.py b/python/src/diffseeker/cli_mpchunkcfa.py
new file mode 100644
index 0000000..89c1818
--- /dev/null
+++ b/python/src/diffseeker/cli_mpchunkcfa.py
@@ -0,0 +1,60 @@
+"""
+DiffSeeker - mpchunkcfa compatible CLI
+
+USAGE:
+    mpchunkcfa.py --help
+    mpchunkcfa.py --version
+    mpchunkcfa.py --walk <walkdir> [-c <csvname>] [-V <volume_name>]
+                  [--hash <type>] [--exclude <element>]...
+                  [--db <dbpath>] [--reserve-cores <n>]
+
+OPTIONS:
+    -h, --help                       Print this help.
+    -v, --version                    Print the version [version=1.00]
+    -w, --walk                       Walk a given directory and output CSV
+    -c, --csv <csvname>              Name for the output CSV [default: cfa-output.csv]
+    -V, --volume_name <volume_name>  Name of the disk volume or resource [default: UNKNOWN]
+    --hash <type>                    Hash type (sha256, blake2b, md5, ...) [default: sha256]
+    --exclude <element>              Path element(s) to exclude (repeatable) [default: .git]
+    --db <dbpath>                    Optional SQLite database path to ingest results
+    --reserve-cores <n>              Leave N CPU cores unused [default: 0]
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+from docopt import docopt
+
+from .scan import collect_file_attributes
+from .csvio import write_records_csv
+from .db import ingest_records
+
+VERSION = "1.00"
+
+
+def main(argv=None) -> int:
+    args = docopt(__doc__, argv=argv, version=VERSION)
+
+    walkdir = args["<walkdir>"]
+    csvname = args["--csv"]
+    volume_name = args["--volume_name"]
+    hash_type = args["--hash"]
+    excludes = args["--exclude"] or [".git"]
+    dbpath = args["--db"]
+    reserve = int(args["--reserve-cores"])
+
+    records = collect_file_attributes(
+        volume_name=volume_name,
+        directory=walkdir,
+        hash_type=hash_type,
+        excluded=excludes,
+        reserve_cores=reserve,
+    )
+
+    write_records_csv(csvname, records)
+
+    if dbpath:
+        ingest_records(Path(dbpath), records)
+
+    return 0
+
diff --git a/python/src/diffseeker/csvio.py b/python/src/diffseeker/csvio.py
new file mode 100644
index 0000000..a430413
--- /dev/null
+++ b/python/src/diffseeker/csvio.py
@@ -0,0 +1,27 @@
+from __future__ import annotations
+
+import csv
+from typing import Dict, Iterable, List
+
+
+FIELDNAMES = [
+    "name",
+    "relative_path",
+    "extension",
+    "size",
+    "creation_date",
+    "modified_date",
+    "hash_value",
+    "file_type",
+    "number_of_files",
+    "volume_name",
+]
+
+
+def write_records_csv(path: str, records: Iterable[Dict[str, object]]) -> None:
+    records_list: List[Dict[str, object]] = list(records)
+    with open(path, "w", newline="", encoding="utf-8") as f:
+        writer = csv.DictWriter(f, fieldnames=FIELDNAMES)
+        writer.writeheader()
+        for r in records_list:
+            writer.writerow(r)
diff --git a/python/src/diffseeker/db.py b/python/src/diffseeker/db.py
new file mode 100644
index 0000000..b8564a7
--- /dev/null
+++ b/python/src/diffseeker/db.py
@@ -0,0 +1,117 @@
+from __future__ import annotations
+
+import sqlite3
+from pathlib import Path
+from typing import Dict, Iterable
+
+
+SCHEMA_SQL = """
+CREATE TABLE IF NOT EXISTS file_attributes (
+    id INTEGER PRIMARY KEY,
+    name TEXT,
+    relative_path TEXT,
+    extension TEXT,
+    size INTEGER,
+    creation_date REAL,
+    modified_date REAL,
+    hash_value TEXT,
+    file_type TEXT,
+    number_of_files INTEGER,
+    volume_name TEXT
+);
+
+CREATE TABLE IF NOT EXISTS matches (
+    id INTEGER PRIMARY KEY,
+    file1_id INTEGER,
+    file2_id INTEGER,
+    FOREIGN KEY(file1_id) REFERENCES file_attributes(id),
+    FOREIGN KEY(file2_id) REFERENCES file_attributes(id)
+);
+"""
+
+INDEX_SQL = """
+CREATE INDEX IF NOT EXISTS idx_file_hash_size
+    ON file_attributes(hash_value, size);
+CREATE INDEX IF NOT EXISTS idx_file_volume
+    ON file_attributes(volume_name);
+"""
+
+
+def connect(db_path: Path) -> sqlite3.Connection:
+    conn = sqlite3.connect(db_path)
+    conn.execute("PRAGMA foreign_keys=ON;")
+    return conn
+
+
+def init_db(db_path: Path) -> None:
+    conn = connect(db_path)
+    try:
+        conn.executescript(SCHEMA_SQL)
+        conn.executescript(INDEX_SQL)
+        conn.commit()
+    finally:
+        conn.close()
+
+
+def upsert_file_attribute(conn: sqlite3.Connection, rec: Dict[str, object]) -> int:
+    """
+    A practical 'upsert' requires a natural key. We recommend
+    (volume_name, hash_value, size, relative_path, name), or introduce a
+    separate unique constraint. For now, we do insert-only and return the rowid.
+
+    If you want a true upsert, add:
+        UNIQUE(volume_name, hash_value, size, relative_path, name)
+    and switch to INSERT ... ON CONFLICT ... DO UPDATE.
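+
+    An illustrative sketch of that upsert (assuming the UNIQUE constraint
+    above has been added):
+
+        INSERT INTO file_attributes (..., volume_name)
+        VALUES (..., :volume_name)
+        ON CONFLICT(volume_name, hash_value, size, relative_path, name)
+        DO UPDATE SET modified_date = excluded.modified_date;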
+    """
+    cur = conn.cursor()
+    cur.execute(
+        """
+        INSERT INTO file_attributes
+            (name, relative_path, extension, size,
+             creation_date, modified_date, hash_value,
+             file_type, number_of_files, volume_name)
+        VALUES
+            (:name, :relative_path, :extension, :size,
+             :creation_date, :modified_date, :hash_value,
+             :file_type, :number_of_files, :volume_name)
+        """,
+        rec,
+    )
+    return cur.lastrowid
+
+
+def ingest_records(db_path: Path, records: Iterable[Dict[str, object]]) -> None:
+    init_db(db_path)
+    conn = connect(db_path)
+    try:
+        for rec in records:
+            upsert_file_attribute(conn, rec)
+        conn.commit()
+    finally:
+        conn.close()
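+
+
+def find_duplicate_groups(conn: sqlite3.Connection) -> list:
+    """
+    Illustrative helper (not wired into the CLI): report (hash_value, size)
+    groups that appear on more than one volume -- the cross-volume
+    duplicates described in the README.
+    """
+    cur = conn.execute(
+        """
+        SELECT hash_value, size, COUNT(DISTINCT volume_name) AS volume_count
+        FROM file_attributes
+        GROUP BY hash_value, size
+        HAVING COUNT(DISTINCT volume_name) > 1
+        """
+    )
+    return cur.fetchall()
+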
diff --git a/python/src/diffseeker/filter.py b/python/src/diffseeker/filter.py
new file mode 100644
index 0000000..e323dc6
--- /dev/null
+++ b/python/src/diffseeker/filter.py
@@ -0,0 +1,43 @@
+from __future__ import annotations
+
+import os
+from typing import Iterable, List, Tuple
+
+
+def path_contains_any_element(path: str, excluded: Iterable[str]) -> bool:
+    """
+    Exclude based on path *elements* (directory/file names), not substring matches.
+
+    Example:
+        excluded=['.git']
+        /a/b/.git/config  -> True
+        /a/b/.gitignore   -> False ('.gitignore' is a file element, not '.git')
+    """
+    parts = set(os.path.normpath(path).split(os.sep))
+    return any(ex in parts for ex in excluded)
+
+
+def filter_walk_triplet(
+    excluded: List[str],
+    root: str,
+    dirs: List[str],
+    files: List[str],
+) -> Tuple[str, List[str], List[str]]:
+    """
+    Designed for use inside os.walk. Modify dirs to prevent descent into excluded dirs.
+    Returns (root, filtered_dirs, filtered_files).
+    """
+    # The caller can prune os.walk's descent by assigning dirs[:] = filtered_dirs.
+    filtered_dirs = []
+    for d in dirs:
+        full = os.path.join(root, d)
+        if not path_contains_any_element(full, excluded):
+            filtered_dirs.append(d)
+
+    filtered_files = []
+    for f in files:
+        full = os.path.join(root, f)
+        if not path_contains_any_element(full, excluded):
+            filtered_files.append(f)
+
+    return root, filtered_dirs, filtered_files
diff --git a/python/src/diffseeker/scan.py b/python/src/diffseeker/scan.py
new file mode 100644
index 0000000..8971066
--- /dev/null
+++ b/python/src/diffseeker/scan.py
@@ -0,0 +1,149 @@
+from __future__ import annotations
+
+import hashlib
+import os
+from dataclasses import asdict, dataclass
+from mmap import ACCESS_READ, mmap
+from multiprocessing import Pool, cpu_count
+from typing import Callable, Dict, Iterator, List, Optional, Sequence, Tuple
+
+from .filter import filter_walk_triplet
+
+
+@dataclass(frozen=True)
+class FileRecord:
+    name: str
+    relative_path: str
+    extension: str
+    size: int
+    creation_date: float
+    modified_date: float
+    hash_value: str
+    file_type: str
+    number_of_files: int
+    volume_name: str
+
+
+def hash_file_mmap(path: str, hash_type: str) -> str:
+    h = hashlib.new(hash_type)
+    with open(path, "rb") as f:
+        # mmap cannot map an empty file; hash empty content directly.
+        if os.fstat(f.fileno()).st_size == 0:
+            return h.hexdigest()
+        with mmap(f.fileno(), 0, access=ACCESS_READ) as mm:
+            h.update(mm)
+    return h.hexdigest()
+
+
+def hash_file_chunked(path: str, hash_type: str, chunk_size: int = 1024 * 1024) -> str:
+    h = hashlib.new(hash_type)
+    with open(path, "rb") as f:
+        while True:
+            buf = f.read(chunk_size)
+            if not buf:
+                break
+            h.update(buf)
+    return h.hexdigest()
+
+
+def _compute_record(args: Tuple[str, str, str, str, str, int, bool]) -> Optional[FileRecord]:
+    """
+    Worker for multiprocessing.
+
+    args:
+        path, root, directory, volume_name, hash_type, number_of_files, use_mmap
+    """
+    path, root, directory, volume_name, hash_type, number_of_files, use_mmap = args
+    try:
+        base = os.path.basename(path)
+        stem, ext = os.path.splitext(base)
+
+        rel_root = os.path.relpath(root, directory)
+        size = os.path.getsize(path)
+        ctime = os.path.getctime(path)
+        mtime = os.path.getmtime(path)
+
+        if use_mmap:
+            hv = hash_file_mmap(path, hash_type)
+        else:
+            hv = hash_file_chunked(path, hash_type)
+
+        return FileRecord(
+            name=stem,
+            relative_path=rel_root,
+            extension=ext,
+            size=size,
+            creation_date=ctime,
+            modified_date=mtime,
+            hash_value=hv,
+            file_type=ext,
+            number_of_files=number_of_files,
+            volume_name=volume_name,
+        )
+    except OSError:
+        # Optionally log these elsewhere; for now, skip unreadable entries.
+        return None
+
+
+def iter_file_tasks(
+    directory: str,
+    volume_name: str,
+    hash_type: str,
+    excluded: Sequence[str] = (".git", ".svn"),
+    use_mmap: bool = True,
+) -> Iterator[Tuple[str, str, str, str, str, int, bool]]:
+    """
+    Yield worker tasks from os.walk, pruning excluded directories.
+    """
+    for root, dirs, files in os.walk(directory):
+        root, filtered_dirs, filtered_files = filter_walk_triplet(list(excluded), root, dirs, files)
+        dirs[:] = filtered_dirs  # prune descent
+        if not filtered_files:
+            continue
+
+        nfiles = len(filtered_files)
+        for fname in filtered_files:
+            path = os.path.join(root, fname)
+            yield (path, root, directory, volume_name, hash_type, nfiles, use_mmap)
+
+
+def collect_file_attributes(
+    volume_name: str,
+    directory: str,
+    hash_type: str = "sha256",
+    excluded: Sequence[str] = (".git", ".svn"),
+    processes: Optional[int] = None,
+    reserve_cores: int = 0,
+    chunksize: int = 32,
+    use_mmap: bool = True,
+    progress_cb: Optional[Callable[[int, Optional[int]], None]] = None,
+) -> List[Dict[str, object]]:
+    """
+    Scan directory, compute file metadata + hash, return list of dicts suitable for CSV/db.
+
+    Calls progress_cb(current, total) if provided; total may be None if not pre-counted.
+    """
+    if processes is None:
+        processes = max(cpu_count() - int(reserve_cores), 1)
+
+    tasks = iter_file_tasks(
+        directory=directory,
+        volume_name=volume_name,
+        hash_type=hash_type,
+        excluded=excluded,
+        use_mmap=use_mmap,
+    )
+
+    out: List[Dict[str, object]] = []
+    completed = 0
+
+    with Pool(processes=processes) as pool:
+        for rec in pool.imap(_compute_record, tasks, chunksize=chunksize):
+            if rec is None:
+                continue
+            out.append(asdict(rec))
+            completed += 1
+            if progress_cb:
+                progress_cb(completed, None)
+
+    return out
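diff --git a/python/examples/scan_and_ingest.py b/python/examples/scan_and_ingest.py
new file mode 100644
--- /dev/null
+++ b/python/examples/scan_and_ingest.py
@@ -0,0 +1,21 @@
+#!/usr/bin/env python3
+"""
+Illustrative end-to-end sketch (not wired into the CLI): scan a directory,
+write the records to CSV, then ingest them into SQLite. The directory,
+volume name, and file names below are placeholders.
+"""
+from pathlib import Path
+
+from diffseeker.csvio import write_records_csv
+from diffseeker.db import ingest_records
+from diffseeker.scan import collect_file_attributes
+
+if __name__ == "__main__":
+    records = collect_file_attributes(
+        volume_name="VOLUME1",
+        directory="/mnt/volume1",
+        hash_type="sha256",
+        excluded=(".git", ".svn"),
+    )
+    write_records_csv("cfa-output.csv", records)
+    ingest_records(Path("cfa.sqlite3"), records)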