Initial commit of GPT 5.2 content
parent 21a03466c3
commit db59a84727

README.md | 12

README.md
@@ -1,3 +1,13 @@
# DiffSeeker

This repository provides command-line tooling for characterizing and comparing files across different media, volumes, or directory trees.

DiffSeeker scans directory trees, records file metadata plus content hashes, and supports cross-volume comparison for:

- duplicates (same hash + size) across volumes
- missing files (present on one volume, absent on others by hash + size)
- suspicious divergences (same name, different size)

## Python CLI (mpchunkcfa compatible)

Install (editable dev install):

```bash
pip install -e .
```
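The comparison tooling itself is not part of this commit; as a rough illustration only, here is one way cross-volume duplicates could be queried from the SQLite database produced by `--db` (a sketch against the `file_attributes` schema added in `db.py` below, not the project's own comparison tool; the database filename is a placeholder):

```python
import sqlite3

# Sketch: list (hash_value, size) pairs that appear on more than one volume.
conn = sqlite3.connect("backup.sqlite3")  # placeholder path
rows = conn.execute(
    """
    SELECT hash_value, size, GROUP_CONCAT(DISTINCT volume_name) AS volumes
    FROM file_attributes
    GROUP BY hash_value, size
    HAVING COUNT(DISTINCT volume_name) > 1
    """
).fetchall()
for hash_value, size, volumes in rows:
    print(hash_value, size, volumes)
conn.close()
```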
pyproject.toml
@@ -0,0 +1,22 @@
[build-system]
requires = ["setuptools>=61", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "diffseeker"
version = "0.1.0"
description = "Cross-volume file scanning and matching by hash+size"
requires-python = ">=3.10"
dependencies = [
    "docopt>=0.6.2",
]

[project.scripts]
mpchunkcfa = "diffseeker.cli_mpchunkcfa:main"

[tool.setuptools]
package-dir = {"" = "python/src"}

[tool.setuptools.packages.find]
where = ["python/src"]
mpchunkcfa.py
@@ -0,0 +1,6 @@
#!/usr/bin/env python3
from diffseeker.cli_mpchunkcfa import main

if __name__ == "__main__":
    raise SystemExit(main())
python/src/diffseeker/cli_mpchunkcfa.py
@@ -0,0 +1,60 @@
"""
DiffSeeker - mpchunkcfa compatible CLI

USAGE:
    mpchunkcfa.py --help
    mpchunkcfa.py --version
    mpchunkcfa.py --walk <WALKDIR> [-c <CSVNAME>] [-V <VOLUME_NAME>]
                  [--hash <HASHTYPE>] [--exclude <EXCLUDED>...]
                  [--db <DBPATH>] [--reserve-cores <N>]

OPTIONS:
    -h, --help                       Print this help.
    -v, --version                    Print the version [version=1.00]
    -w, --walk <WALKDIR>             Walk a given directory and output CSV
    -c, --csv <CSVNAME>              Name for the output CSV [default: cfa-output.csv]
    -V, --volume_name <VOLUME_NAME>  Name of the disk volume or resource [default: UNKNOWN]
    --hash <HASHTYPE>                Hash type (sha256, blake2b, md5, ...) [default: sha256]
    --exclude <EXCLUDED>             Path element(s) to exclude (repeatable) [default: .git]
    --db <DBPATH>                    Optional SQLite database path to ingest results
    --reserve-cores <N>              Leave N CPU cores unused [default: 0]
"""

from __future__ import annotations

from pathlib import Path

from docopt import docopt

from .scan import collect_file_attributes
from .csvio import write_records_csv
from .db import ingest_records

VERSION = "1.00"


def main(argv=None) -> int:
    args = docopt(__doc__, argv=argv, version=VERSION)

    # docopt stores values of options declared with arguments under the long
    # option name, not under the argument placeholder.
    walkdir = args["--walk"]
    csvname = args["--csv"]
    volume_name = args["--volume_name"]
    hash_type = args["--hash"]
    excludes = args["--exclude"] or [".git"]
    dbpath = args["--db"]
    reserve = int(args["--reserve-cores"])

    records = collect_file_attributes(
        volume_name=volume_name,
        directory=walkdir,
        hash_type=hash_type,
        excluded=excludes,
        reserve_cores=reserve,
    )

    write_records_csv(csvname, records)

    if dbpath:
        ingest_records(Path(dbpath), records)

    return 0
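For reference, a minimal sketch of driving this entry point from Python instead of the `mpchunkcfa` console script; the directory, volume name, and output paths below are hypothetical placeholders:

```python
from diffseeker.cli_mpchunkcfa import main

if __name__ == "__main__":
    # Placeholder paths and names for illustration only. The __main__ guard
    # matters because the scanner spawns a multiprocessing Pool.
    raise SystemExit(main([
        "--walk", "/mnt/backup",
        "-V", "backup-2024",
        "-c", "backup.csv",
        "--db", "backup.sqlite3",
    ]))
```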
python/src/diffseeker/csvio.py
@@ -0,0 +1,27 @@
from __future__ import annotations

import csv
from typing import Dict, Iterable, List


FIELDNAMES = [
    "name",
    "relative_path",
    "extension",
    "size",
    "creation_date",
    "modified_date",
    "hash_value",
    "file_type",
    "number_of_files",
    "volume_name",
]


def write_records_csv(path: str, records: Iterable[Dict[str, object]]) -> None:
    records_list: List[Dict[str, object]] = list(records)
    with open(path, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=FIELDNAMES)
        writer.writeheader()
        for r in records_list:
            writer.writerow(r)
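A quick usage sketch (every field value below is made up) showing the record shape `write_records_csv` expects, with keys matching `FIELDNAMES`:

```python
from diffseeker.csvio import write_records_csv

# Illustrative record with placeholder values; keys must match FIELDNAMES.
record = {
    "name": "report",
    "relative_path": "docs",
    "extension": ".pdf",
    "size": 1048576,
    "creation_date": 1700000000.0,
    "modified_date": 1700000500.0,
    "hash_value": "deadbeef" * 8,
    "file_type": ".pdf",
    "number_of_files": 3,
    "volume_name": "backup-2024",
}
write_records_csv("example.csv", [record])
```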
python/src/diffseeker/db.py
@@ -0,0 +1,92 @@
from __future__ import annotations

import sqlite3
from pathlib import Path
from typing import Dict, Iterable


SCHEMA_SQL = """
CREATE TABLE IF NOT EXISTS file_attributes (
    id INTEGER PRIMARY KEY,
    name TEXT,
    relative_path TEXT,
    extension TEXT,
    size INTEGER,
    creation_date REAL,
    modified_date REAL,
    hash_value TEXT,
    file_type TEXT,
    number_of_files INTEGER,
    volume_name TEXT
);

CREATE TABLE IF NOT EXISTS matches (
    id INTEGER PRIMARY KEY,
    file1_id INTEGER,
    file2_id INTEGER,
    FOREIGN KEY(file1_id) REFERENCES file_attributes(id),
    FOREIGN KEY(file2_id) REFERENCES file_attributes(id)
);
"""

INDEX_SQL = """
CREATE INDEX IF NOT EXISTS idx_file_hash_size
    ON file_attributes(hash_value, size);
CREATE INDEX IF NOT EXISTS idx_file_volume
    ON file_attributes(volume_name);
"""


def connect(db_path: Path) -> sqlite3.Connection:
    conn = sqlite3.connect(db_path)
    conn.execute("PRAGMA foreign_keys=ON;")
    return conn


def init_db(db_path: Path) -> None:
    conn = connect(db_path)
    try:
        conn.executescript(SCHEMA_SQL)
        conn.executescript(INDEX_SQL)
        conn.commit()
    finally:
        conn.close()


def upsert_file_attribute(conn: sqlite3.Connection, rec: Dict[str, object]) -> int:
    """
    A practical 'upsert' requires a natural key, e.g.
    (volume_name, hash_value, size, relative_path, name), or a separate
    unique constraint. For now this is insert-only and returns the rowid.

    For a true upsert, add:
        UNIQUE(volume_name, hash_value, size, relative_path, name)
    and switch to INSERT ... ON CONFLICT ... DO UPDATE.
    """
    cur = conn.cursor()
    cur.execute(
        """
        INSERT INTO file_attributes
            (name, relative_path, extension, size,
             creation_date, modified_date, hash_value,
             file_type, number_of_files, volume_name)
        VALUES
            (:name, :relative_path, :extension, :size,
             :creation_date, :modified_date, :hash_value,
             :file_type, :number_of_files, :volume_name)
        """,
        rec,
    )
    return cur.lastrowid


def ingest_records(db_path: Path, records: Iterable[Dict[str, object]]) -> None:
    init_db(db_path)
    conn = connect(db_path)
    try:
        for rec in records:
            upsert_file_attribute(conn, rec)
        conn.commit()
    finally:
        conn.close()
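As the docstring above suggests, a true upsert needs a unique natural key. A hedged sketch of that variant follows; the UNIQUE constraint and the ON CONFLICT statement are not part of this commit, and the choice of updated columns is illustrative:

```python
# Sketch only: schema variant with a natural key plus an ON CONFLICT upsert.
UNIQUE_SCHEMA_SQL = """
CREATE TABLE IF NOT EXISTS file_attributes (
    id INTEGER PRIMARY KEY,
    name TEXT,
    relative_path TEXT,
    extension TEXT,
    size INTEGER,
    creation_date REAL,
    modified_date REAL,
    hash_value TEXT,
    file_type TEXT,
    number_of_files INTEGER,
    volume_name TEXT,
    UNIQUE(volume_name, hash_value, size, relative_path, name)
);
"""

UPSERT_SQL = """
INSERT INTO file_attributes
    (name, relative_path, extension, size,
     creation_date, modified_date, hash_value,
     file_type, number_of_files, volume_name)
VALUES
    (:name, :relative_path, :extension, :size,
     :creation_date, :modified_date, :hash_value,
     :file_type, :number_of_files, :volume_name)
ON CONFLICT(volume_name, hash_value, size, relative_path, name)
DO UPDATE SET
    modified_date = excluded.modified_date,
    number_of_files = excluded.number_of_files;
"""
```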
python/src/diffseeker/filter.py
@@ -0,0 +1,43 @@
from __future__ import annotations

import os
from typing import Iterable, List, Tuple


def path_contains_any_element(path: str, excluded: Iterable[str]) -> bool:
    """
    Exclude based on path *elements* (directory/file names), not substring matches.

    Example:
        excluded=['.git']
        /a/b/.git/config  -> True
        /a/b/.gitignore   -> False ('.gitignore' is a file element, not '.git')
    """
    parts = set(os.path.normpath(path).split(os.sep))
    return any(ex in parts for ex in excluded)


def filter_walk_triplet(
    excluded: List[str],
    root: str,
    dirs: List[str],
    files: List[str],
) -> Tuple[str, List[str], List[str]]:
    """
    Designed for use inside os.walk. Returns (root, filtered_dirs, filtered_files);
    the caller should assign dirs[:] = filtered_dirs to prevent descent into
    excluded directories.
    """
    filtered_dirs = []
    for d in dirs:
        full = os.path.join(root, d)
        if not path_contains_any_element(full, excluded):
            filtered_dirs.append(d)

    filtered_files = []
    for f in files:
        full = os.path.join(root, f)
        if not path_contains_any_element(full, excluded):
            filtered_files.append(f)

    return root, filtered_dirs, filtered_files
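A short usage sketch of the pruning pattern the docstring describes; the walked directory is a placeholder:

```python
import os

from diffseeker.filter import filter_walk_triplet

# Walk a placeholder tree, skipping '.git' directories before descending.
for root, dirs, files in os.walk("/mnt/backup"):
    _, kept_dirs, kept_files = filter_walk_triplet([".git"], root, dirs, files)
    dirs[:] = kept_dirs  # in-place assignment so os.walk does not descend
    for name in kept_files:
        print(os.path.join(root, name))
```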
python/src/diffseeker/scan.py
@@ -0,0 +1,148 @@
from __future__ import annotations

import hashlib
import os
from dataclasses import asdict, dataclass
from mmap import ACCESS_READ, mmap
from multiprocessing import Pool, cpu_count
from typing import Callable, Dict, Iterator, List, Optional, Sequence, Tuple

from .filter import filter_walk_triplet


@dataclass(frozen=True)
class FileRecord:
    name: str
    relative_path: str
    extension: str
    size: int
    creation_date: float
    modified_date: float
    hash_value: str
    file_type: str
    number_of_files: int
    volume_name: str


def hash_file_mmap(path: str, hash_type: str) -> str:
    h = hashlib.new(hash_type)
    with open(path, "rb") as f:
        # mmap cannot map an empty file, so hash empty files directly.
        if os.fstat(f.fileno()).st_size == 0:
            return h.hexdigest()
        with mmap(f.fileno(), 0, access=ACCESS_READ) as mm:
            h.update(mm)
    return h.hexdigest()


def hash_file_chunked(path: str, hash_type: str, chunk_size: int = 1024 * 1024) -> str:
    h = hashlib.new(hash_type)
    with open(path, "rb") as f:
        while True:
            buf = f.read(chunk_size)
            if not buf:
                break
            h.update(buf)
    return h.hexdigest()


def _compute_record(args: Tuple[str, str, str, str, str, int, bool]) -> Optional[FileRecord]:
    """
    Worker for multiprocessing.

    args:
        path, root, directory, volume_name, hash_type, number_of_files, use_mmap
    """
    path, root, directory, volume_name, hash_type, number_of_files, use_mmap = args
    try:
        base = os.path.basename(path)
        stem, ext = os.path.splitext(base)

        rel_root = os.path.relpath(root, directory)
        size = os.path.getsize(path)
        ctime = os.path.getctime(path)
        mtime = os.path.getmtime(path)

        if use_mmap:
            hv = hash_file_mmap(path, hash_type)
        else:
            hv = hash_file_chunked(path, hash_type)

        return FileRecord(
            name=stem,
            relative_path=rel_root,
            extension=ext,
            size=size,
            creation_date=ctime,
            modified_date=mtime,
            hash_value=hv,
            file_type=ext,
            number_of_files=number_of_files,
            volume_name=volume_name,
        )
    except OSError:
        # Optionally log these elsewhere; for now, skip unreadable entries.
        return None


def iter_file_tasks(
    directory: str,
    volume_name: str,
    hash_type: str,
    excluded: Sequence[str] = (".git", ".svn"),
    use_mmap: bool = True,
) -> Iterator[Tuple[str, str, str, str, str, int, bool]]:
    """
    Yield worker tasks from os.walk, pruning excluded directories.
    """
    for root, dirs, files in os.walk(directory):
        root, filtered_dirs, filtered_files = filter_walk_triplet(list(excluded), root, dirs, files)
        dirs[:] = filtered_dirs  # prune descent
        if not filtered_files:
            continue

        nfiles = len(filtered_files)
        for fname in filtered_files:
            path = os.path.join(root, fname)
            yield (path, root, directory, volume_name, hash_type, nfiles, use_mmap)


def collect_file_attributes(
    volume_name: str,
    directory: str,
    hash_type: str = "sha256",
    excluded: Sequence[str] = (".git", ".svn"),
    processes: Optional[int] = None,
    reserve_cores: int = 0,
    chunksize: int = 32,
    use_mmap: bool = True,
    progress_cb: Optional[Callable[[int, Optional[int]], None]] = None,
) -> List[Dict[str, object]]:
    """
    Scan a directory, compute file metadata plus a content hash, and return a
    list of dicts suitable for CSV or database output.

    progress_cb(current, total) is called per completed file if provided;
    total may be None if files were not pre-counted.
    """
    if processes is None:
        processes = max(cpu_count() - int(reserve_cores), 1)

    tasks = iter_file_tasks(
        directory=directory,
        volume_name=volume_name,
        hash_type=hash_type,
        excluded=excluded,
        use_mmap=use_mmap,
    )

    out: List[Dict[str, object]] = []
    completed = 0

    with Pool(processes=processes) as pool:
        for rec in pool.imap(_compute_record, tasks, chunksize=chunksize):
            if rec is None:
                continue
            out.append(asdict(rec))
            completed += 1
            if progress_cb:
                progress_cb(completed, None)

    return out
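Finally, a hedged end-to-end sketch of calling the scanner directly and feeding the results to the CSV writer above; the directory, volume name, and output filename are placeholders:

```python
from diffseeker.csvio import write_records_csv
from diffseeker.scan import collect_file_attributes


def report_progress(done, total):
    # total is None because files are not pre-counted.
    print(f"hashed {done} files", end="\r")


if __name__ == "__main__":
    # Placeholder directory and volume name; the guard is required because
    # collect_file_attributes spawns a multiprocessing Pool.
    records = collect_file_attributes(
        volume_name="backup-2024",
        directory="/mnt/backup",
        hash_type="blake2b",
        excluded=(".git", ".svn"),
        reserve_cores=1,
        progress_cb=report_progress,
    )
    write_records_csv("backup.csv", records)
```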