# Operational-Premise-Taxonomy/doc/mkbib.py
# (176 lines, 6.1 KiB, Python)
#!/usr/bin/env python3
"""
BibTeX Aggregator
Recursively collects BibTeX entries from .bib files in a directory,
merges unique entries by BibTeX key, handles conflicts by selecting
the longest entry, and reports all options with the selected one tagged.
"""
import os
import sys
import argparse
import re
from datetime import datetime
from collections import defaultdict
def backup_existing_file(filepath):
    """Move an existing file out of the way by renaming it with a timestamp.

    ``foo.bib`` becomes ``foo_YYYYMMDD_HHMMSS.bib``.  Does nothing when
    *filepath* does not exist.
    """
    if not os.path.exists(filepath):
        return
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    base, extension = os.path.splitext(filepath)
    destination = f"{base}_{stamp}{extension}"
    os.rename(filepath, destination)
    print(f"Backed up existing {filepath} to {destination}")
def extract_bibtex_key(entry):
    """Return the citation key of a BibTeX entry string, or ``None``.

    Matches the ``@type{key,`` prefix at the start of the (stripped)
    entry text; the key is everything up to the first comma, brace,
    or whitespace.
    """
    head = re.match(r'^@[a-zA-Z]+\s*{\s*([^,}\s]+)', entry.strip(), re.IGNORECASE)
    return head.group(1) if head else None
def parse_bib_file(filepath):
    """Parse one .bib file.

    Returns a dict mapping each BibTeX key to ``(raw entry text, filepath)``.
    Files that are not valid UTF-8 are re-read as latin-1.  Entries whose
    key cannot be extracted are skipped.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as handle:
            text = handle.read()
    except UnicodeDecodeError:
        # Fall back to latin-1, which accepts any byte sequence.
        with open(filepath, 'r', encoding='latin-1') as handle:
            text = handle.read()

    result = {}
    # Split in front of each '@type{' opener; the lookahead keeps the '@'.
    for chunk in re.split(r'(?=@[a-zA-Z]+\s*{)', text):
        if not chunk.strip().startswith('@'):
            continue  # preamble text or an empty fragment
        key = extract_bibtex_key(chunk)
        if key:
            result[key] = (chunk, filepath)
    return result
def collect_bib_entries(root_dir):
    """Recursively gather BibTeX entries from every .bib file under *root_dir*.

    Returns ``(all_entries, conflict_data)``:

    - ``all_entries`` maps each key to its chosen ``(entry, source)`` pair.
    - ``conflict_data`` maps each duplicated key to
      ``{'candidates': [...], 'selected': (entry, source)}``, where the
      selected pair is the candidate with the longest entry text and is
      also the one stored in ``all_entries``.

    Files that fail to parse are reported on stderr and skipped.
    """
    all_entries = {}
    duplicates = defaultdict(list)

    for dirpath, _dirs, filenames in os.walk(root_dir):
        for name in filenames:
            if not name.lower().endswith('.bib'):
                continue
            path = os.path.join(dirpath, name)
            try:
                parsed = parse_bib_file(path)
            except Exception as e:
                print(f"Warning: Skipping {path} due to error: {e}", file=sys.stderr)
                continue
            for key, pair in parsed.items():
                if key in all_entries:
                    duplicates[key].append(pair)
                else:
                    all_entries[key] = pair

    # Resolve duplicated keys: keep the longest entry by character count.
    conflict_data = {}
    for key, extras in duplicates.items():
        candidates = [all_entries[key]] + extras
        winner = max(candidates, key=lambda pair: len(pair[0]))
        all_entries[key] = winner
        conflict_data[key] = {'candidates': candidates, 'selected': winner}

    return all_entries, conflict_data
def write_bib_file(entries, output_file):
    """Write entries to *output_file*, sorted alphabetically by BibTeX key.

    *entries* maps key -> ``(entry text, source path)``; only the text is
    written.  A blank line is guaranteed between consecutive entries.
    """
    with open(output_file, 'w', encoding='utf-8') as out:
        for key in sorted(entries):
            text, _source = entries[key]
            out.write(text)
            if not text.endswith('\n\n'):
                out.write('\n\n')
def write_conflicts(conflicts, output_file):
    """Write an org-mode report of conflicting BibTeX keys.

    For each key, every candidate entry is listed with its source file and
    the one that was kept is tagged `` << SELECTED >>``.  Nothing is
    written (the file is not created) when *conflicts* is empty.

    Parameters
    ----------
    conflicts : dict
        Mapping of key -> ``{'candidates': [(entry, source), ...],
        'selected': (entry, source)}``, where ``'selected'`` is one of
        the tuples in ``'candidates'`` (as built by collect_bib_entries).
    output_file : str
        Path of the .org report to create.
    """
    if not conflicts:
        return
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("#+TITLE: BibTeX Conflicts Report\n")
        f.write("#+AUTHOR: BibTeX Aggregator\n")
        f.write(f"#+DATE: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
        for key in sorted(conflicts.keys()):
            f.write(f"* Conflict for BibTeX key: {key}\n")
            candidates = conflicts[key]['candidates']
            selected = conflicts[key]['selected']
            # Bug fix: the original compared by value (entry == selected[0]
            # and source == selected[1]), so byte-identical entries from
            # different files were ALL tagged as selected.  Prefer identity
            # ('selected' is an element of 'candidates' by construction),
            # falling back to the first equal candidate for defensive
            # compatibility with callers passing a copied tuple.
            selected_idx = next(
                (i for i, cand in enumerate(candidates) if cand is selected),
                None)
            if selected_idx is None:
                selected_idx = next(
                    (i for i, cand in enumerate(candidates) if cand == selected),
                    -1)
            for i, (entry, source) in enumerate(candidates):
                tag = " << SELECTED >>" if i == selected_idx else ""
                f.write(f"** Source {i + 1}: {os.path.relpath(source, os.getcwd())}{tag}\n")
                f.write("#+BEGIN_SRC bibtex\n")
                f.write(entry.strip())
                f.write("\n#+END_SRC\n\n")
def main():
    """CLI entry point: aggregate .bib files under a directory into one file.

    Exits with status 1 if the working directory does not exist.  Existing
    output files are backed up (timestamped rename) before being rewritten.
    """
    parser = argparse.ArgumentParser(description="Aggregate BibTeX entries from multiple files.")
    parser.add_argument('--working-dir', '-w', default='.',
                        help='Working directory to search for .bib files (default: .)')
    parser.add_argument('--output', '-o', default='refs.bib',
                        help='Output filename (default: refs.bib)')
    args = parser.parse_args()

    search_root = os.path.abspath(args.working_dir)
    refs_path = os.path.abspath(args.output)
    # The conflicts report lives next to the main output file.
    report_path = os.path.join(os.path.dirname(refs_path), 'bib_conflicts.org')

    if not os.path.exists(search_root):
        print(f"Error: Working directory '{search_root}' does not exist.", file=sys.stderr)
        sys.exit(1)

    # Move any previous results out of the way before writing new ones.
    backup_existing_file(refs_path)
    if os.path.exists(report_path):
        backup_existing_file(report_path)

    print(f"Searching for .bib files in '{search_root}'...")
    entries, conflicts = collect_bib_entries(search_root)

    write_bib_file(entries, refs_path)
    write_conflicts(conflicts, report_path)

    print(f"Written {len(entries)} unique entries to {refs_path}")
    if conflicts:
        print(f"Found {len(conflicts)} conflicting keys; details in {report_path}")
    else:
        print("No conflicts found.")


if __name__ == '__main__':
    main()