#!/usr/bin/env python3
"""
BibTeX Aggregator

Recursively collects BibTeX entries from .bib files in a directory,
merges unique entries by BibTeX key, handles conflicts by selecting the
longest entry, and reports all options with the selected one tagged.
"""

import argparse
import os
import re
import sys
from collections import defaultdict
from datetime import datetime


def backup_existing_file(filepath):
    """Rename an existing file out of the way by appending a timestamp.

    No-op when *filepath* does not exist.  NOTE: two runs within the
    same second produce the same backup name; on POSIX the second
    rename would overwrite the first backup.
    """
    if os.path.exists(filepath):
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        name, ext = os.path.splitext(filepath)
        backup_name = f"{name}_{timestamp}{ext}"
        os.rename(filepath, backup_name)
        print(f"Backed up existing {filepath} to {backup_name}")


def extract_bibtex_key(entry):
    """Return the citation key of a raw BibTeX entry, or None.

    Matches the ``@type{key,`` prefix; the key runs up to the first
    comma, closing brace, or whitespace character.
    """
    match = re.match(r'^@[a-zA-Z]+\s*{\s*([^,}\s]+)',
                     entry.strip(), re.IGNORECASE)
    if match:
        return match.group(1)
    return None


def parse_bib_file(filepath):
    """Parse a .bib file into ``{key: (raw_entry, filepath)}``.

    Falls back to latin-1 decoding when the file is not valid UTF-8.

    NOTE: if the same key appears twice *within one file*, the last
    occurrence silently wins here — only cross-file duplicates are
    reported as conflicts by collect_bib_entries.  ``@string`` /
    ``@comment`` / ``@preamble`` directives are not special-cased and
    will be treated as ordinary entries.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            content = f.read()
    except UnicodeDecodeError:
        # Try with latin-1 if utf-8 fails (latin-1 accepts any byte).
        with open(filepath, 'r', encoding='latin-1') as f:
            content = f.read()

    entries = {}
    # Split at the start of each entry; the lookahead keeps the '@'.
    raw_entries = re.split(r'(?=@[a-zA-Z]+\s*{)', content)
    for raw_entry in raw_entries:
        stripped = raw_entry.strip()
        if not stripped or not stripped.startswith('@'):
            continue
        key = extract_bibtex_key(raw_entry)
        if key:
            entries[key] = (raw_entry, filepath)
    return entries


def collect_bib_entries(root_dir):
    """Recursively collect BibTeX entries from .bib files under *root_dir*.

    Returns ``(all_entries, conflict_data)``:

    * ``all_entries`` maps each key to its selected ``(entry, source)``
      tuple (the longest entry wins when a key appears in several files).
    * ``conflict_data`` maps each duplicated key to a dict with all
      ``candidates`` and the ``selected`` tuple.
    """
    all_entries = {}
    conflicts = defaultdict(list)

    for dirpath, _, filenames in os.walk(root_dir):
        for filename in filenames:
            if not filename.lower().endswith('.bib'):
                continue
            filepath = os.path.join(dirpath, filename)
            # Only the parse itself is guarded: a malformed file is
            # skipped with a warning, but merge errors would be bugs
            # and should surface.
            try:
                entries = parse_bib_file(filepath)
            except Exception as e:
                print(f"Warning: Skipping {filepath} due to error: {e}",
                      file=sys.stderr)
                continue
            for key, (entry, source) in entries.items():
                if key in all_entries:
                    conflicts[key].append((entry, source))
                else:
                    all_entries[key] = (entry, source)

    # Resolve conflicts: keep the longest entry (assumed most complete).
    conflict_data = {}
    for key, extra in conflicts.items():
        # Include the first occurrence already stored in all_entries.
        candidates = [all_entries[key]] + extra
        selected = max(candidates, key=lambda pair: len(pair[0]))
        all_entries[key] = selected
        conflict_data[key] = {
            'candidates': candidates,
            'selected': selected,
        }

    return all_entries, conflict_data


def write_bib_file(entries, output_file):
    """Write entries sorted by key, separated by exactly one blank line.

    Trailing whitespace of each raw entry is normalized so the output
    spacing is uniform regardless of how the source files ended their
    entries (the previous logic could emit one or three newlines).
    """
    with open(output_file, 'w', encoding='utf-8') as f:
        for key in sorted(entries):
            entry, _ = entries[key]
            f.write(entry.rstrip() + '\n\n')


def write_conflicts(conflicts, output_file):
    """Write an org-mode conflict report, tagging the selected entry.

    Does nothing (and creates no file) when there are no conflicts.
    """
    if not conflicts:
        return
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("#+TITLE: BibTeX Conflicts Report\n")
        f.write("#+AUTHOR: BibTeX Aggregator\n")
        f.write(f"#+DATE: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
        for key in sorted(conflicts.keys()):
            f.write(f"* Conflict for BibTeX key: {key}\n")
            candidates = conflicts[key]['candidates']
            selected = conflicts[key]['selected']
            for i, candidate in enumerate(candidates, 1):
                entry, source = candidate
                # Identity check: max() returned one tuple object out of
                # candidates, so tag exactly that one — a value-equality
                # test would tag every byte-identical duplicate.
                tag = " << SELECTED >>" if candidate is selected else ""
                try:
                    shown = os.path.relpath(source, os.getcwd())
                except ValueError:
                    # relpath fails across drives on Windows; fall back
                    # to the absolute path.
                    shown = os.path.abspath(source)
                f.write(f"** Source {i}: {shown}{tag}\n")
                f.write("#+BEGIN_SRC bibtex\n")
                f.write(entry.strip())
                f.write("\n#+END_SRC\n\n")


def main():
    """CLI entry point: aggregate .bib files and report conflicts."""
    parser = argparse.ArgumentParser(
        description="Aggregate BibTeX entries from multiple files.")
    parser.add_argument('--working-dir', '-w', default='.',
                        help='Working directory to search for .bib files (default: .)')
    parser.add_argument('--output', '-o', default='refs.bib',
                        help='Output filename (default: refs.bib)')
    args = parser.parse_args()

    working_dir = os.path.abspath(args.working_dir)
    output_file = os.path.abspath(args.output)
    conflicts_file = os.path.join(os.path.dirname(output_file),
                                  'bib_conflicts.org')

    if not os.path.exists(working_dir):
        print(f"Error: Working directory '{working_dir}' does not exist.",
              file=sys.stderr)
        sys.exit(1)

    # Backup existing output file if it exists
    backup_existing_file(output_file)
    # Only backup conflicts file if it exists (don't create empty backup)
    if os.path.exists(conflicts_file):
        backup_existing_file(conflicts_file)

    # Collect entries
    print(f"Searching for .bib files in '{working_dir}'...")
    entries, conflicts = collect_bib_entries(working_dir)

    # Write output files
    write_bib_file(entries, output_file)
    write_conflicts(conflicts, conflicts_file)

    # Summary
    print(f"Written {len(entries)} unique entries to {output_file}")
    if conflicts:
        print(f"Found {len(conflicts)} conflicting keys; details in {conflicts_file}")
    else:
        print("No conflicts found.")


if __name__ == '__main__':
    main()