Initial commit of GPT 5.2 content
parent 21a03466c3
commit db59a84727
README.md
@@ -1,3 +1,13 @@
 # DiffSeeker
 
-This repository is mostly command-line tooling to support characterizing and comparing files on different media, volumes, or directory trees.
+DiffSeeker scans directory trees, records file metadata plus content hashes, and supports cross-volume comparison for:
+
+- duplicates (same hash + size) across volumes
+- missing files (present on one volume, absent on others, by hash + size)
+- suspicious divergences (same name, different size)
+
+## Python CLI (mpchunkcfa compatible)
+
+Install (editable dev install):
+
+```bash
+pip install -e .
+```
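A minimal sketch of the matching rules above, assuming records shaped like the scanner's CSV rows (the values and volume names here are made up):

```python
# Hypothetical records keyed on (hash_value, size), mirroring the README's criteria.
def key(rec):
    return (rec["hash_value"], rec["size"])

vol_a = [{"name": "a", "size": 10, "hash_value": "abc", "volume_name": "VOL_A"}]
vol_b = [{"name": "a", "size": 10, "hash_value": "abc", "volume_name": "VOL_B"}]

keys_a = {key(r) for r in vol_a}
keys_b = {key(r) for r in vol_b}

duplicates = keys_a & keys_b    # same hash + size on both volumes
missing_on_b = keys_a - keys_b  # present on A, absent on B
print(duplicates, missing_on_b)
```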
pyproject.toml
@@ -0,0 +1,22 @@
[build-system]
requires = ["setuptools>=61", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "diffseeker"
version = "0.1.0"
description = "Cross-volume file scanning and matching by hash+size"
requires-python = ">=3.10"
dependencies = [
    "docopt>=0.6.2",
]

[project.scripts]
mpchunkcfa = "diffseeker.cli_mpchunkcfa:main"

[tool.setuptools]
package-dir = {"" = "python/src"}

[tool.setuptools.packages.find]
where = ["python/src"]
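With the editable install from the README, the mpchunkcfa console script declared above should be discoverable at runtime. A quick check, assuming Python 3.10+ and that `pip install -e .` has already been run:

```python
# Sketch: list the console-script entry point declared in [project.scripts].
from importlib.metadata import entry_points

eps = entry_points(group="console_scripts")
print([ep.value for ep in eps if ep.name == "mpchunkcfa"])
# expected (if installed): ['diffseeker.cli_mpchunkcfa:main']
```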
mpchunkcfa.py
@@ -0,0 +1,6 @@
#!/usr/bin/env python3
from diffseeker.cli_mpchunkcfa import main


if __name__ == "__main__":
    raise SystemExit(main())
python/src/diffseeker/cli_mpchunkcfa.py
@@ -0,0 +1,60 @@
"""
DiffSeeker - mpchunkcfa compatible CLI

USAGE:
    mpchunkcfa.py --help
    mpchunkcfa.py --version
    mpchunkcfa.py --walk <WALKDIR> [-c <CSVNAME>] [-V <VOLUME_NAME>]
                  [--hash <HASHTYPE>] [--exclude <EXCLUDED>...]
                  [--db <DBPATH>] [--reserve-cores <N>]

OPTIONS:
    -h, --help                       Print this help.
    -v, --version                    Print the version [version=1.00]
    -w, --walk <WALKDIR>             Walk a given directory and output CSV
    -c, --csv <CSVNAME>              Name for the output CSV [default: cfa-output.csv]
    -V, --volume_name <VOLUME_NAME>  Name of the disk volume or resource [default: UNKNOWN]
    --hash <HASHTYPE>                Hash type (sha256, blake2b, md5, ...) [default: sha256]
    --exclude <EXCLUDED>             Path element(s) to exclude (repeatable) [default: .git]
    --db <DBPATH>                    Optional SQLite database path to ingest results
    --reserve-cores <N>              Leave N CPU cores unused [default: 0]
"""

from __future__ import annotations

from pathlib import Path

from docopt import docopt

from .scan import collect_file_attributes
from .csvio import write_records_csv
from .db import ingest_records

VERSION = "1.00"


def main(argv=None) -> int:
    args = docopt(__doc__, argv=argv, version=VERSION)

    # docopt stores an option's argument under the option name, not the placeholder.
    walkdir = args["--walk"]
    csvname = args["--csv"]
    volume_name = args["--volume_name"]
    hash_type = args["--hash"]
    excludes = args["--exclude"] or [".git"]
    dbpath = args["--db"]
    reserve = int(args["--reserve-cores"])

    records = collect_file_attributes(
        volume_name=volume_name,
        directory=walkdir,
        hash_type=hash_type,
        excluded=excludes,
        reserve_cores=reserve,
    )

    write_records_csv(csvname, records)

    if dbpath:
        ingest_records(Path(dbpath), records)

    return 0
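Because main() accepts an argv list, the CLI can be exercised without installing the console script. A sketch with placeholder paths and names (the __main__ guard matters because the scan uses a process pool):

```python
from diffseeker.cli_mpchunkcfa import main

if __name__ == "__main__":
    # Hypothetical invocation; /data/photos, PHOTOS_2024, and catalog.sqlite are placeholders.
    exit_code = main([
        "--walk", "/data/photos",
        "-V", "PHOTOS_2024",
        "--hash", "blake2b",
        "--exclude", ".git", "--exclude", ".svn",
        "--db", "catalog.sqlite",
    ])
    print(exit_code)  # 0 on success
```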
python/src/diffseeker/csvio.py
@@ -0,0 +1,27 @@
from __future__ import annotations

import csv
from typing import Dict, Iterable, List


FIELDNAMES = [
    "name",
    "relative_path",
    "extension",
    "size",
    "creation_date",
    "modified_date",
    "hash_value",
    "file_type",
    "number_of_files",
    "volume_name",
]


def write_records_csv(path: str, records: Iterable[Dict[str, object]]) -> None:
    records_list: List[Dict[str, object]] = list(records)
    with open(path, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=FIELDNAMES)
        writer.writeheader()
        for r in records_list:
            writer.writerow(r)
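A small round-trip sketch for write_records_csv; the record values below are invented, whereas real records come from the scanner:

```python
import csv

from diffseeker.csvio import write_records_csv

# Hypothetical record carrying the expected FIELDNAMES keys.
rec = {
    "name": "report", "relative_path": ".", "extension": ".pdf", "size": 1234,
    "creation_date": 0.0, "modified_date": 0.0, "hash_value": "deadbeef",
    "file_type": ".pdf", "number_of_files": 1, "volume_name": "VOL_A",
}

write_records_csv("demo.csv", [rec])

with open("demo.csv", newline="", encoding="utf-8") as f:
    rows = list(csv.DictReader(f))
print(rows[0]["hash_value"])  # deadbeef
```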
python/src/diffseeker/db.py
@@ -0,0 +1,92 @@
from __future__ import annotations

import sqlite3
from pathlib import Path
from typing import Dict, Iterable


SCHEMA_SQL = """
CREATE TABLE IF NOT EXISTS file_attributes (
    id INTEGER PRIMARY KEY,
    name TEXT,
    relative_path TEXT,
    extension TEXT,
    size INTEGER,
    creation_date REAL,
    modified_date REAL,
    hash_value TEXT,
    file_type TEXT,
    number_of_files INTEGER,
    volume_name TEXT
);

CREATE TABLE IF NOT EXISTS matches (
    id INTEGER PRIMARY KEY,
    file1_id INTEGER,
    file2_id INTEGER,
    FOREIGN KEY(file1_id) REFERENCES file_attributes(id),
    FOREIGN KEY(file2_id) REFERENCES file_attributes(id)
);
"""

INDEX_SQL = """
CREATE INDEX IF NOT EXISTS idx_file_hash_size
    ON file_attributes(hash_value, size);
CREATE INDEX IF NOT EXISTS idx_file_volume
    ON file_attributes(volume_name);
"""


def connect(db_path: Path) -> sqlite3.Connection:
    conn = sqlite3.connect(db_path)
    conn.execute("PRAGMA foreign_keys=ON;")
    return conn


def init_db(db_path: Path) -> None:
    conn = connect(db_path)
    try:
        conn.executescript(SCHEMA_SQL)
        conn.executescript(INDEX_SQL)
        conn.commit()
    finally:
        conn.close()


def upsert_file_attribute(conn: sqlite3.Connection, rec: Dict[str, object]) -> int:
    """
    A practical 'upsert' requires a natural key. We recommend
    (volume_name, hash_value, size, relative_path, name), or a separate
    unique constraint. For now, this does insert-only and returns the rowid.

    If you want a true upsert, add:
        UNIQUE(volume_name, hash_value, size, relative_path, name)
    and switch to INSERT ... ON CONFLICT ... DO UPDATE.
    """
    cur = conn.cursor()
    cur.execute(
        """
        INSERT INTO file_attributes
            (name, relative_path, extension, size,
             creation_date, modified_date, hash_value,
             file_type, number_of_files, volume_name)
        VALUES
            (:name, :relative_path, :extension, :size,
             :creation_date, :modified_date, :hash_value,
             :file_type, :number_of_files, :volume_name)
        """,
        rec,
    )
    return cur.lastrowid


def ingest_records(db_path: Path, records: Iterable[Dict[str, object]]) -> None:
    init_db(db_path)
    conn = connect(db_path)
    try:
        for rec in records:
            upsert_file_attribute(conn, rec)
        conn.commit()
    finally:
        conn.close()
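Two follow-ups this module points at, sketched under stated assumptions: the "true upsert" described in the upsert_file_attribute docstring (assumes the UNIQUE constraint has been added and SQLite 3.24+), and a cross-volume duplicate query that can use idx_file_hash_size (assumes a database already populated by ingest_records; the path is a placeholder):

```python
import sqlite3

# Assumes file_attributes carries:
#   UNIQUE(volume_name, hash_value, size, relative_path, name)
UPSERT_SQL = """
INSERT INTO file_attributes
    (name, relative_path, extension, size,
     creation_date, modified_date, hash_value,
     file_type, number_of_files, volume_name)
VALUES
    (:name, :relative_path, :extension, :size,
     :creation_date, :modified_date, :hash_value,
     :file_type, :number_of_files, :volume_name)
ON CONFLICT(volume_name, hash_value, size, relative_path, name)
DO UPDATE SET creation_date   = excluded.creation_date,
              modified_date   = excluded.modified_date,
              extension       = excluded.extension,
              file_type       = excluded.file_type,
              number_of_files = excluded.number_of_files
"""

# Pairs of rows on different volumes with the same hash and size (README's "duplicates").
DUPLICATES_SQL = """
SELECT a.volume_name, a.relative_path, a.name,
       b.volume_name, b.relative_path, b.name
FROM file_attributes AS a
JOIN file_attributes AS b
  ON a.hash_value = b.hash_value
 AND a.size = b.size
 AND a.volume_name < b.volume_name
"""

conn = sqlite3.connect("catalog.sqlite")  # placeholder path
for row in conn.execute(DUPLICATES_SQL):
    print(row)
conn.close()
```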
python/src/diffseeker/filter.py
@@ -0,0 +1,43 @@
from __future__ import annotations

import os
from typing import Iterable, List, Tuple


def path_contains_any_element(path: str, excluded: Iterable[str]) -> bool:
    """
    Exclude based on path *elements* (directory/file names), not substring matches.

    Example:
        excluded=['.git']
        /a/b/.git/config -> True
        /a/b/.gitignore  -> False ('.gitignore' is a file element, not '.git')
    """
    parts = set(os.path.normpath(path).split(os.sep))
    return any(ex in parts for ex in excluded)


def filter_walk_triplet(
    excluded: List[str],
    root: str,
    dirs: List[str],
    files: List[str],
) -> Tuple[str, List[str], List[str]]:
    """
    Designed for use inside os.walk. Modify dirs in place to prevent descent
    into excluded directories. Returns (root, filtered_dirs, filtered_files).
    """
    # Filter dirs in an in-place-compatible manner: the caller may assign dirs[:] = filtered_dirs.
    filtered_dirs = []
    for d in dirs:
        full = os.path.join(root, d)
        if not path_contains_any_element(full, excluded):
            filtered_dirs.append(d)

    filtered_files = []
    for f in files:
        full = os.path.join(root, f)
        if not path_contains_any_element(full, excluded):
            filtered_files.append(f)

    return root, filtered_dirs, filtered_files
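A sketch of the intended os.walk integration; the directory and exclusion list are placeholders:

```python
import os

from diffseeker.filter import filter_walk_triplet

excluded = [".git", "__pycache__"]  # hypothetical exclusions

for root, dirs, files in os.walk("/data/photos"):  # placeholder directory
    root, keep_dirs, keep_files = filter_walk_triplet(excluded, root, dirs, files)
    dirs[:] = keep_dirs  # reassigning the slice stops os.walk from descending into excluded dirs
    for fname in keep_files:
        print(os.path.join(root, fname))
```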
python/src/diffseeker/scan.py
@@ -0,0 +1,148 @@
from __future__ import annotations

import hashlib
import os
from dataclasses import asdict, dataclass
from mmap import ACCESS_READ, mmap
from multiprocessing import Pool, cpu_count
from typing import Callable, Dict, Iterator, List, Optional, Sequence, Tuple

from .filter import filter_walk_triplet


@dataclass(frozen=True)
class FileRecord:
    name: str
    relative_path: str
    extension: str
    size: int
    creation_date: float
    modified_date: float
    hash_value: str
    file_type: str
    number_of_files: int
    volume_name: str


def hash_file_mmap(path: str, hash_type: str) -> str:
    h = hashlib.new(hash_type)
    with open(path, "rb") as f:
        # mmap cannot map an empty file; return the digest of b"" in that case.
        if os.path.getsize(path) == 0:
            return h.hexdigest()
        with mmap(f.fileno(), 0, access=ACCESS_READ) as mm:
            h.update(mm)
    return h.hexdigest()


def hash_file_chunked(path: str, hash_type: str, chunk_size: int = 1024 * 1024) -> str:
    h = hashlib.new(hash_type)
    with open(path, "rb") as f:
        while True:
            buf = f.read(chunk_size)
            if not buf:
                break
            h.update(buf)
    return h.hexdigest()


def _compute_record(args: Tuple[str, str, str, str, str, int, bool]) -> Optional[FileRecord]:
    """
    Worker for multiprocessing.

    args:
        path, root, directory, volume_name, hash_type, number_of_files, use_mmap
    """
    path, root, directory, volume_name, hash_type, number_of_files, use_mmap = args
    try:
        base = os.path.basename(path)
        stem, ext = os.path.splitext(base)

        rel_root = os.path.relpath(root, directory)
        size = os.path.getsize(path)
        ctime = os.path.getctime(path)
        mtime = os.path.getmtime(path)

        if use_mmap:
            hv = hash_file_mmap(path, hash_type)
        else:
            hv = hash_file_chunked(path, hash_type)

        return FileRecord(
            name=stem,
            relative_path=rel_root,
            extension=ext,
            size=size,
            creation_date=ctime,
            modified_date=mtime,
            hash_value=hv,
            file_type=ext,
            number_of_files=number_of_files,
            volume_name=volume_name,
        )
    except OSError:
        # Optionally log these elsewhere; for now, skip unreadable entries.
        return None


def iter_file_tasks(
    directory: str,
    volume_name: str,
    hash_type: str,
    excluded: Sequence[str] = (".git", ".svn"),
    use_mmap: bool = True,
) -> Iterator[Tuple[str, str, str, str, str, int, bool]]:
    """
    Yield worker tasks from os.walk, pruning excluded directories.
    """
    for root, dirs, files in os.walk(directory):
        root, filtered_dirs, filtered_files = filter_walk_triplet(list(excluded), root, dirs, files)
        dirs[:] = filtered_dirs  # prune descent
        if not filtered_files:
            continue

        nfiles = len(filtered_files)
        for fname in filtered_files:
            path = os.path.join(root, fname)
            yield (path, root, directory, volume_name, hash_type, nfiles, use_mmap)


def collect_file_attributes(
    volume_name: str,
    directory: str,
    hash_type: str = "sha256",
    excluded: Sequence[str] = (".git", ".svn"),
    processes: Optional[int] = None,
    reserve_cores: int = 0,
    chunksize: int = 32,
    use_mmap: bool = True,
    progress_cb: Optional[Callable[[int, Optional[int]], None]] = None,
) -> List[Dict[str, object]]:
    """
    Scan a directory, compute file metadata + hash, and return a list of dicts
    suitable for CSV output or database ingestion.

    progress_cb(current, total) is called if provided; total may be None if not pre-counted.
    """
    if processes is None:
        processes = max(cpu_count() - int(reserve_cores), 1)

    tasks = iter_file_tasks(
        directory=directory,
        volume_name=volume_name,
        hash_type=hash_type,
        excluded=excluded,
        use_mmap=use_mmap,
    )

    out: List[Dict[str, object]] = []
    completed = 0

    with Pool(processes=processes) as pool:
        for rec in pool.imap(_compute_record, tasks, chunksize=chunksize):
            if rec is None:
                continue
            out.append(asdict(rec))
            completed += 1
            if progress_cb:
                progress_cb(completed, None)

    return out
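A minimal end-to-end sketch tying the scanner to the CSV writer; the directory, volume name, and output file are placeholders, and the __main__ guard matters because collect_file_attributes uses a process pool:

```python
from diffseeker.csvio import write_records_csv
from diffseeker.scan import collect_file_attributes


def progress(done, total):
    # total is None here because the walk is not pre-counted
    print(f"hashed {done} files", end="\r")


if __name__ == "__main__":
    records = collect_file_attributes(
        volume_name="PHOTOS_2024",  # placeholder
        directory="/data/photos",   # placeholder
        hash_type="sha256",
        excluded=(".git", ".svn"),
        reserve_cores=1,
        progress_cb=progress,
    )
    write_records_csv("photos.csv", records)
    print(f"\nwrote {len(records)} records")
```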