# Operational-Premise-Taxonomy/doc/mkbib.py
# (176 lines, 6.1 KiB, Python)
#!/usr/bin/env python3
"""
BibTeX Aggregator
Recursively collects BibTeX entries from .bib files in a directory,
merges unique entries by BibTeX key, handles conflicts by selecting
the longest entry, and reports all options with the selected one tagged.
"""
import os
import sys
import argparse
import re
from datetime import datetime
from collections import defaultdict
def backup_existing_file(filepath):
    """Move an existing file out of the way by renaming it with a timestamp.

    ``foo.bib`` becomes ``foo_YYYYMMDD_HHMMSS.bib``.  Does nothing when
    *filepath* does not exist.
    """
    if not os.path.exists(filepath):
        return
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    base, extension = os.path.splitext(filepath)
    destination = f"{base}_{stamp}{extension}"
    os.rename(filepath, destination)
    print(f"Backed up existing {filepath} to {destination}")
def extract_bibtex_key(entry):
    """Return the citation key of a BibTeX entry string, or ``None``.

    Matches the ``@type{key,`` prefix at the start of the (stripped)
    entry text; the key is everything up to the first comma, brace,
    or whitespace.
    """
    head = re.match(r'^@[a-zA-Z]+\s*{\s*([^,}\s]+)', entry.strip(), re.IGNORECASE)
    return head.group(1) if head else None
def parse_bib_file(filepath):
    """Parse one .bib file.

    Returns a dict mapping each BibTeX key to ``(raw entry text, filepath)``.
    Files that are not valid UTF-8 are re-read as latin-1.  Entries whose
    key cannot be extracted are skipped.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as handle:
            text = handle.read()
    except UnicodeDecodeError:
        # Fall back to latin-1, which accepts any byte sequence.
        with open(filepath, 'r', encoding='latin-1') as handle:
            text = handle.read()

    result = {}
    # Split in front of each '@type{' opener; the lookahead keeps the '@'.
    for chunk in re.split(r'(?=@[a-zA-Z]+\s*{)', text):
        if not chunk.strip().startswith('@'):
            continue  # preamble text or an empty fragment
        key = extract_bibtex_key(chunk)
        if key:
            result[key] = (chunk, filepath)
    return result
def collect_bib_entries(root_dir):
    """Recursively gather BibTeX entries from every .bib file under *root_dir*.

    Returns ``(all_entries, conflict_data)``:

    - ``all_entries`` maps each key to its chosen ``(entry, source)`` pair.
    - ``conflict_data`` maps each duplicated key to
      ``{'candidates': [...], 'selected': (entry, source)}``, where the
      selected pair is the candidate with the longest entry text and is
      also the one stored in ``all_entries``.

    Files that fail to parse are reported on stderr and skipped.
    """
    all_entries = {}
    duplicates = defaultdict(list)

    for dirpath, _dirs, filenames in os.walk(root_dir):
        for name in filenames:
            if not name.lower().endswith('.bib'):
                continue
            path = os.path.join(dirpath, name)
            try:
                parsed = parse_bib_file(path)
            except Exception as e:
                print(f"Warning: Skipping {path} due to error: {e}", file=sys.stderr)
                continue
            for key, pair in parsed.items():
                if key in all_entries:
                    duplicates[key].append(pair)
                else:
                    all_entries[key] = pair

    # Resolve duplicated keys: keep the longest entry by character count.
    conflict_data = {}
    for key, extras in duplicates.items():
        candidates = [all_entries[key]] + extras
        winner = max(candidates, key=lambda pair: len(pair[0]))
        all_entries[key] = winner
        conflict_data[key] = {'candidates': candidates, 'selected': winner}

    return all_entries, conflict_data
def write_bib_file(entries, output_file):
    """Write entries to *output_file*, sorted alphabetically by BibTeX key.

    *entries* maps key -> ``(entry text, source path)``; only the text is
    written.  A blank line is guaranteed between consecutive entries.
    """
    with open(output_file, 'w', encoding='utf-8') as out:
        for key in sorted(entries):
            text, _source = entries[key]
            out.write(text)
            if not text.endswith('\n\n'):
                out.write('\n\n')
def write_conflicts(conflicts, output_file):
    """Write an org-mode report of conflicting BibTeX keys.

    For each key, every candidate entry is listed with its source file and
    the one that was kept is tagged `` << SELECTED >>``.  Nothing is
    written (the file is not created) when *conflicts* is empty.

    Parameters
    ----------
    conflicts : dict
        Mapping of key -> ``{'candidates': [(entry, source), ...],
        'selected': (entry, source)}``, where ``'selected'`` is one of
        the tuples in ``'candidates'`` (as built by collect_bib_entries).
    output_file : str
        Path of the .org report to create.
    """
    if not conflicts:
        return
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("#+TITLE: BibTeX Conflicts Report\n")
        f.write("#+AUTHOR: BibTeX Aggregator\n")
        f.write(f"#+DATE: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
        for key in sorted(conflicts.keys()):
            f.write(f"* Conflict for BibTeX key: {key}\n")
            candidates = conflicts[key]['candidates']
            selected = conflicts[key]['selected']
            # Bug fix: the original compared by value (entry == selected[0]
            # and source == selected[1]), so byte-identical entries from
            # different files were ALL tagged as selected.  Prefer identity
            # ('selected' is an element of 'candidates' by construction),
            # falling back to the first equal candidate for defensive
            # compatibility with callers passing a copied tuple.
            selected_idx = next(
                (i for i, cand in enumerate(candidates) if cand is selected),
                None)
            if selected_idx is None:
                selected_idx = next(
                    (i for i, cand in enumerate(candidates) if cand == selected),
                    -1)
            for i, (entry, source) in enumerate(candidates):
                tag = " << SELECTED >>" if i == selected_idx else ""
                f.write(f"** Source {i + 1}: {os.path.relpath(source, os.getcwd())}{tag}\n")
                f.write("#+BEGIN_SRC bibtex\n")
                f.write(entry.strip())
                f.write("\n#+END_SRC\n\n")
def main():
    """CLI entry point: aggregate .bib files under a directory into one file.

    Exits with status 1 if the working directory does not exist.  Existing
    output files are backed up (timestamped rename) before being rewritten.
    """
    parser = argparse.ArgumentParser(description="Aggregate BibTeX entries from multiple files.")
    parser.add_argument('--working-dir', '-w', default='.',
                        help='Working directory to search for .bib files (default: .)')
    parser.add_argument('--output', '-o', default='refs.bib',
                        help='Output filename (default: refs.bib)')
    args = parser.parse_args()

    search_root = os.path.abspath(args.working_dir)
    refs_path = os.path.abspath(args.output)
    # The conflicts report lives next to the main output file.
    report_path = os.path.join(os.path.dirname(refs_path), 'bib_conflicts.org')

    if not os.path.exists(search_root):
        print(f"Error: Working directory '{search_root}' does not exist.", file=sys.stderr)
        sys.exit(1)

    # Move any previous results out of the way before writing new ones.
    backup_existing_file(refs_path)
    if os.path.exists(report_path):
        backup_existing_file(report_path)

    print(f"Searching for .bib files in '{search_root}'...")
    entries, conflicts = collect_bib_entries(search_root)

    write_bib_file(entries, refs_path)
    write_conflicts(conflicts, report_path)

    print(f"Written {len(entries)} unique entries to {refs_path}")
    if conflicts:
        print(f"Found {len(conflicts)} conflicting keys; details in {report_path}")
    else:
        print("No conflicts found.")


if __name__ == '__main__':
    main()