Initial commit of GPT 5.2 content

This commit is contained in:
Wesley R. Elsberry 2025-12-17 04:40:30 +00:00
parent 21a03466c3
commit db59a84727
8 changed files with 409 additions and 1 deletion

View File

@@ -1,3 +1,13 @@
# DiffSeeker
This repository is mostly command-line tooling for characterizing and comparing files across different media, volumes, or directory trees.
DiffSeeker scans directory trees, records file metadata plus content hashes, and supports cross-volume comparison for:
- duplicates (same hash + size) across volumes
- missing files (present on one volume, absent on others by hash+size)
- suspicious divergences (same name, different size)
## Python CLI (mpchunkcfa compatible)
Install (editable dev install):
```bash
pip install -e .
```

pyproject.toml (new file, 22 lines)
View File

@@ -0,0 +1,22 @@
[build-system]
requires = ["setuptools>=61", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "diffseeker"
version = "0.1.0"
description = "Cross-volume file scanning and matching by hash+size"
requires-python = ">=3.10"
dependencies = [
    "docopt>=0.6.2",
]

[project.scripts]
mpchunkcfa = "diffseeker.cli_mpchunkcfa:main"

[tool.setuptools]
package-dir = {"" = "python/src"}

[tool.setuptools.packages.find]
where = ["python/src"]
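The `package-dir` mapping above points setuptools at a `python/src` layout. For orientation, the tree implied by this configuration and by the imports later in this commit would look roughly like the sketch below; `__init__.py` is assumed rather than shown in the diff.

```text
python/src/diffseeker/
    __init__.py          # assumed package marker, not shown in this diff
    cli_mpchunkcfa.py    # docopt CLI entry point (mpchunkcfa)
    csvio.py             # CSV output helpers
    db.py                # SQLite schema and ingestion
    filter.py            # path-element exclusion for os.walk
    scan.py              # multiprocessing scan + hashing
```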

View File

@@ -0,0 +1,6 @@
#!/usr/bin/env python3
from diffseeker.cli_mpchunkcfa import main

if __name__ == "__main__":
    raise SystemExit(main())

View File

@@ -0,0 +1,60 @@
"""
DiffSeeker - mpchunkcfa compatible CLI
USAGE:
    mpchunkcfa.py --help
    mpchunkcfa.py --version
    mpchunkcfa.py --walk <WALKDIR> [-c <CSVNAME>] [-V <VOLUME_NAME>]
                  [--hash <HASHTYPE>] [--exclude <EXCLUDED>...]
                  [--db <DBPATH>] [--reserve-cores <N>]

OPTIONS:
    -h, --help                       Print this help.
    -v, --version                    Print the version [version=1.00]
    -w, --walk <WALKDIR>             Walk a given directory and output CSV
    -c, --csv <CSVNAME>              Name for the output CSV [default: cfa-output.csv]
    -V, --volume_name <VOLUME_NAME>  Name of the disk volume or resource [default: UNKNOWN]
    --hash <HASHTYPE>                Hash type (sha256, blake2b, md5, ...) [default: sha256]
    --exclude <EXCLUDED>             Path element(s) to exclude (repeatable) [default: .git]
    --db <DBPATH>                    Optional SQLite database path to ingest results
    --reserve-cores <N>              Leave N CPU cores unused [default: 0]
"""
from __future__ import annotations
from pathlib import Path
from docopt import docopt
from .scan import collect_file_attributes
from .csvio import write_records_csv
from .db import ingest_records
VERSION = "1.00"

def main(argv=None) -> int:
    args = docopt(__doc__, argv=argv, version=VERSION)
    # docopt stores option values under their long option names, not under the
    # <PLACEHOLDER> metavariables, so read them back via the option keys.
    walkdir = args["--walk"]
    csvname = args["--csv"]
    volume_name = args["--volume_name"]
    hash_type = args["--hash"]
    excludes = args["--exclude"] or [".git"]
    dbpath = args["--db"]
    reserve = int(args["--reserve-cores"])

    records = collect_file_attributes(
        volume_name=volume_name,
        directory=walkdir,
        hash_type=hash_type,
        excluded=excludes,
        reserve_cores=reserve,
    )
    write_records_csv(csvname, records)
    if dbpath:
        ingest_records(Path(dbpath), records)
    return 0
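Since `main()` accepts an explicit argv list, the argument handling can be exercised without a shell. A sketch of such a call follows; the paths and names are purely illustrative:

```python
from diffseeker.cli_mpchunkcfa import main

# Equivalent to: mpchunkcfa --walk /mnt/backup01 -c backup01.csv -V backup01 --db catalog.db
if __name__ == "__main__":  # guard for the multiprocessing pool used by the scan
    exit_code = main([
        "--walk", "/mnt/backup01",
        "-c", "backup01.csv",
        "-V", "backup01",
        "--db", "catalog.db",
    ])
```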

View File

@@ -0,0 +1,27 @@
from __future__ import annotations
import csv
from typing import Dict, Iterable, List
FIELDNAMES = [
    "name",
    "relative_path",
    "extension",
    "size",
    "creation_date",
    "modified_date",
    "hash_value",
    "file_type",
    "number_of_files",
    "volume_name",
]


def write_records_csv(path: str, records: Iterable[Dict[str, object]]) -> None:
    """Write scan records to a CSV file with a fixed header row."""
    records_list: List[Dict[str, object]] = list(records)
    with open(path, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=FIELDNAMES)
        writer.writeheader()
        for r in records_list:
            writer.writerow(r)
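A minimal usage sketch; the record values below are invented purely for illustration, and the keys mirror FIELDNAMES:

```python
from diffseeker.csvio import write_records_csv

# One dict per scanned file; keys must match FIELDNAMES.
example = [{
    "name": "report", "relative_path": "docs", "extension": ".txt",
    "size": 1024, "creation_date": 0.0, "modified_date": 0.0,
    "hash_value": "deadbeef", "file_type": ".txt",
    "number_of_files": 1, "volume_name": "backup01",
}]
write_records_csv("cfa-output.csv", example)
```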

View File

@@ -0,0 +1,92 @@
from __future__ import annotations
import sqlite3
from pathlib import Path
from typing import Dict, Iterable
SCHEMA_SQL = """
CREATE TABLE IF NOT EXISTS file_attributes (
id INTEGER PRIMARY KEY,
name TEXT,
relative_path TEXT,
extension TEXT,
size INTEGER,
creation_date REAL,
modified_date REAL,
hash_value TEXT,
file_type TEXT,
number_of_files INTEGER,
volume_name TEXT
);
CREATE TABLE IF NOT EXISTS matches (
id INTEGER PRIMARY KEY,
file1_id INTEGER,
file2_id INTEGER,
FOREIGN KEY(file1_id) REFERENCES file_attributes(id),
FOREIGN KEY(file2_id) REFERENCES file_attributes(id)
);
"""
INDEX_SQL = """
CREATE INDEX IF NOT EXISTS idx_file_hash_size
ON file_attributes(hash_value, size);
CREATE INDEX IF NOT EXISTS idx_file_volume
ON file_attributes(volume_name);
"""
def connect(db_path: Path) -> sqlite3.Connection:
    conn = sqlite3.connect(db_path)
    conn.execute("PRAGMA foreign_keys=ON;")
    return conn


def init_db(db_path: Path) -> None:
    conn = connect(db_path)
    try:
        conn.executescript(SCHEMA_SQL)
        conn.executescript(INDEX_SQL)
        conn.commit()
    finally:
        conn.close()

def upsert_file_attribute(conn: sqlite3.Connection, rec: Dict[str, object]) -> int:
    """
    A practical 'upsert' requires a natural key, e.g.
    (volume_name, hash_value, size, relative_path, name), or a separate
    unique constraint. For now this is insert-only and returns the rowid.
    For a true upsert, add
        UNIQUE(volume_name, hash_value, size, relative_path, name)
    to the schema and switch to INSERT ... ON CONFLICT ... DO UPDATE.
    """
    cur = conn.cursor()
    cur.execute(
        """
        INSERT INTO file_attributes
            (name, relative_path, extension, size,
             creation_date, modified_date, hash_value,
             file_type, number_of_files, volume_name)
        VALUES
            (:name, :relative_path, :extension, :size,
             :creation_date, :modified_date, :hash_value,
             :file_type, :number_of_files, :volume_name)
        """,
        rec,
    )
    return cur.lastrowid
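If the UNIQUE constraint suggested in the docstring were added to the schema, the statement could become a true upsert along these lines. This is a sketch against SQLite 3.24+ (which introduced `ON CONFLICT ... DO UPDATE`), not part of this commit:

```python
# Assumes file_attributes gains:
#   UNIQUE(volume_name, hash_value, size, relative_path, name)
UPSERT_SQL = """
INSERT INTO file_attributes
    (name, relative_path, extension, size,
     creation_date, modified_date, hash_value,
     file_type, number_of_files, volume_name)
VALUES
    (:name, :relative_path, :extension, :size,
     :creation_date, :modified_date, :hash_value,
     :file_type, :number_of_files, :volume_name)
ON CONFLICT(volume_name, hash_value, size, relative_path, name)
DO UPDATE SET
    extension = excluded.extension,
    creation_date = excluded.creation_date,
    modified_date = excluded.modified_date,
    file_type = excluded.file_type,
    number_of_files = excluded.number_of_files
"""
```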

def ingest_records(db_path: Path, records: Iterable[Dict[str, object]]) -> None:
    init_db(db_path)
    conn = connect(db_path)
    try:
        for rec in records:
            upsert_file_attribute(conn, rec)
        conn.commit()
    finally:
        conn.close()
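The `idx_file_hash_size` index above is what makes the cross-volume matching described in the README cheap. As an illustration (catalog.db is an assumed path, not something this commit creates), duplicates that appear on more than one volume can be pulled straight from SQL:

```python
import sqlite3

# Find hash+size groups that occur on more than one distinct volume,
# i.e. the "duplicates across volumes" case from the README.
conn = sqlite3.connect("catalog.db")
rows = conn.execute(
    """
    SELECT hash_value, size, COUNT(DISTINCT volume_name) AS volumes
    FROM file_attributes
    GROUP BY hash_value, size
    HAVING COUNT(DISTINCT volume_name) > 1
    """
).fetchall()
for hash_value, size, volumes in rows:
    print(f"{hash_value} ({size} bytes) appears on {volumes} volumes")
conn.close()
```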

View File

@@ -0,0 +1,43 @@
from __future__ import annotations
import os
from typing import Iterable, List, Tuple
def path_contains_any_element(path: str, excluded: Iterable[str]) -> bool:
    """
    Exclude based on path *elements* (directory/file names), not substring matches.
    Example:
        excluded=['.git']
        /a/b/.git/config -> True
        /a/b/.gitignore  -> False ('.gitignore' is a file element, not '.git')
    """
    parts = set(os.path.normpath(path).split(os.sep))
    return any(ex in parts for ex in excluded)

def filter_walk_triplet(
    excluded: List[str],
    root: str,
    dirs: List[str],
    files: List[str],
) -> Tuple[str, List[str], List[str]]:
    """
    Designed for use inside os.walk. Modify dirs to prevent descent into excluded dirs.
    Returns (root, filtered_dirs, filtered_files).
    """
    # Filter dirs in an in-place-compatible manner: the caller may assign
    # dirs[:] = filtered_dirs to prune the walk.
    filtered_dirs = []
    for d in dirs:
        full = os.path.join(root, d)
        if not path_contains_any_element(full, excluded):
            filtered_dirs.append(d)
    filtered_files = []
    for f in files:
        full = os.path.join(root, f)
        if not path_contains_any_element(full, excluded):
            filtered_files.append(f)
    return root, filtered_dirs, filtered_files
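A minimal sketch of how these helpers pair with os.walk; the directory path and the extra `node_modules` entry are illustrative, and scan.py later in this commit uses the same pattern:

```python
import os

from diffseeker.filter import filter_walk_triplet

excluded = [".git", "node_modules"]
for root, dirs, files in os.walk("/data/projects"):
    _, keep_dirs, keep_files = filter_walk_triplet(excluded, root, dirs, files)
    dirs[:] = keep_dirs  # in-place assignment prunes os.walk's descent
    for name in keep_files:
        print(os.path.join(root, name))
```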

View File

@@ -0,0 +1,148 @@
from __future__ import annotations
import hashlib
import os
from dataclasses import asdict, dataclass
from mmap import ACCESS_READ, mmap
from multiprocessing import Pool, cpu_count
from typing import Callable, Dict, Iterator, List, Optional, Sequence, Tuple

from .filter import filter_walk_triplet

@dataclass(frozen=True)
class FileRecord:
    name: str
    relative_path: str
    extension: str
    size: int
    creation_date: float
    modified_date: float
    hash_value: str
    file_type: str
    number_of_files: int
    volume_name: str

def hash_file_mmap(path: str, hash_type: str) -> str:
    """Hash a file by mapping it into memory."""
    h = hashlib.new(hash_type)
    with open(path, "rb") as f:
        # mmap raises ValueError for zero-length files, so hash those directly.
        if os.path.getsize(path) == 0:
            return h.hexdigest()
        with mmap(f.fileno(), 0, access=ACCESS_READ) as mm:
            h.update(mm)
    return h.hexdigest()

def hash_file_chunked(path: str, hash_type: str, chunk_size: int = 1024 * 1024) -> str:
    """Hash a file by reading fixed-size chunks (no memory mapping)."""
    h = hashlib.new(hash_type)
    with open(path, "rb") as f:
        while True:
            buf = f.read(chunk_size)
            if not buf:
                break
            h.update(buf)
    return h.hexdigest()
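The two hashing paths are interchangeable in output; the `use_mmap` flag used later in this module only selects the I/O strategy. A quick equivalence check, as a sketch (the temporary file content is arbitrary):

```python
import tempfile

from diffseeker.scan import hash_file_chunked, hash_file_mmap

# Write some throwaway content, then confirm both strategies agree.
with tempfile.NamedTemporaryFile(delete=False) as tmp:
    tmp.write(b"hello diffseeker" * 1000)
    path = tmp.name

assert hash_file_mmap(path, "sha256") == hash_file_chunked(path, "sha256")
```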

def _compute_record(args: Tuple[str, str, str, str, str, int, bool]) -> Optional[FileRecord]:
    """
    Worker for multiprocessing.
    args:
        path, root, directory, volume_name, hash_type, number_of_files, use_mmap
    """
    path, root, directory, volume_name, hash_type, number_of_files, use_mmap = args
    try:
        base = os.path.basename(path)
        stem, ext = os.path.splitext(base)
        rel_root = os.path.relpath(root, directory)
        size = os.path.getsize(path)
        ctime = os.path.getctime(path)
        mtime = os.path.getmtime(path)
        if use_mmap:
            hv = hash_file_mmap(path, hash_type)
        else:
            hv = hash_file_chunked(path, hash_type)
        return FileRecord(
            name=stem,
            relative_path=rel_root,
            extension=ext,
            size=size,
            creation_date=ctime,
            modified_date=mtime,
            hash_value=hv,
            file_type=ext,
            number_of_files=number_of_files,
            volume_name=volume_name,
        )
    except OSError:
        # PermissionError is a subclass of OSError, so this also covers unreadable
        # entries. Optionally log these elsewhere; for now, skip them.
        return None

def iter_file_tasks(
    directory: str,
    volume_name: str,
    hash_type: str,
    excluded: Sequence[str] = (".git", ".svn"),
    use_mmap: bool = True,
) -> Iterator[Tuple[str, str, str, str, str, int, bool]]:
    """
    Yield worker tasks from os.walk, pruning excluded directories.
    """
    for root, dirs, files in os.walk(directory):
        root, filtered_dirs, filtered_files = filter_walk_triplet(list(excluded), root, dirs, files)
        dirs[:] = filtered_dirs  # prune descent
        if not filtered_files:
            continue
        nfiles = len(filtered_files)
        for fname in filtered_files:
            path = os.path.join(root, fname)
            yield (path, root, directory, volume_name, hash_type, nfiles, use_mmap)

def collect_file_attributes(
    volume_name: str,
    directory: str,
    hash_type: str = "sha256",
    excluded: Sequence[str] = (".git", ".svn"),
    processes: Optional[int] = None,
    reserve_cores: int = 0,
    chunksize: int = 32,
    use_mmap: bool = True,
    progress_cb: Optional[Callable[[int, Optional[int]], None]] = None,
) -> List[Dict[str, object]]:
    """
    Scan directory, compute file metadata + hash, and return a list of dicts
    suitable for CSV output or database ingestion.
    Calls progress_cb(current, total) if provided; total may be None if not pre-counted.
    """
    if processes is None:
        processes = max(cpu_count() - int(reserve_cores), 1)
    tasks = iter_file_tasks(
        directory=directory,
        volume_name=volume_name,
        hash_type=hash_type,
        excluded=excluded,
        use_mmap=use_mmap,
    )
    out: List[Dict[str, object]] = []
    completed = 0
    with Pool(processes=processes) as pool:
        for rec in pool.imap(_compute_record, tasks, chunksize=chunksize):
            if rec is None:
                continue
            out.append(asdict(rec))
            completed += 1
            if progress_cb:
                progress_cb(completed, None)
    return out
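Putting the modules together, a programmatic scan that bypasses the docopt CLI might look like the sketch below; the paths and the volume name are illustrative. The `if __name__ == "__main__":` guard matters because collect_file_attributes starts a multiprocessing pool.

```python
from pathlib import Path

from diffseeker.csvio import write_records_csv
from diffseeker.db import ingest_records
from diffseeker.scan import collect_file_attributes

if __name__ == "__main__":
    # Scan a tree, label it with a volume name, then persist to CSV and SQLite.
    records = collect_file_attributes(
        volume_name="backup01",
        directory="/mnt/backup01",
        hash_type="sha256",
        reserve_cores=1,  # leave one core free for the rest of the system
    )
    write_records_csv("backup01.csv", records)
    ingest_records(Path("catalog.db"), records)
```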