#!/usr/bin/env python3
"""
BibTeX Aggregator

Recursively collects BibTeX entries from .bib files in a directory,
merges unique entries by BibTeX key, handles conflicts by selecting
the longest entry, and reports all options with the selected one tagged.
"""

import os
import sys
import argparse
import re
from datetime import datetime
from collections import defaultdict
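
# Example invocation (illustrative; the script name and directory are
# hypothetical):
#
#   python3 bib_aggregator.py --working-dir ./papers --output refs.bib
#
# This walks ./papers recursively for .bib files, writes the merged and
# sorted entries to refs.bib, and reports any conflicting keys in
# bib_conflicts.org next to the output file.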

def backup_existing_file(filepath):
    """Rename existing file by appending a timestamp."""
    if os.path.exists(filepath):
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        name, ext = os.path.splitext(filepath)
        backup_name = f"{name}_{timestamp}{ext}"
        os.rename(filepath, backup_name)
        print(f"Backed up existing {filepath} to {backup_name}")


def extract_bibtex_key(entry):
    """Extract the BibTeX key from an entry string."""
    # Match @type{key, ...}
    match = re.match(r'^@[a-zA-Z]+\s*{\s*([^,}\s]+)', entry.strip(), re.IGNORECASE)
    if match:
        return match.group(1)
    return None
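
# Illustrative behavior of extract_bibtex_key (the entry text below is a
# hypothetical example):
#
#   extract_bibtex_key('@Article{knuth1984, title={Literate Programming}}')
#   -> 'knuth1984'
#
# Entries whose key cannot be parsed yield None and are skipped by callers.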

def parse_bib_file(filepath):
    """
    Parse a .bib file and return a dict of BibTeX keys to (entry, filepath).
    """
    entries = {}
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            content = f.read()
    except UnicodeDecodeError:
        # Try with latin-1 if utf-8 fails
        with open(filepath, 'r', encoding='latin-1') as f:
            content = f.read()

    # Split entries by '@' but keep the '@'
    raw_entries = re.split(r'(?=@[a-zA-Z]+\s*{)', content)

    for raw_entry in raw_entries:
        if not raw_entry.strip() or not raw_entry.strip().startswith('@'):
            continue
        key = extract_bibtex_key(raw_entry)
        if key:
            entries[key] = (raw_entry, filepath)
    return entries
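
# Illustrative split behavior (hypothetical file content): a string such as
#   '@book{a1, ...}\n@misc{b2, ...}'
# splits into ['', '@book{a1, ...}\n', '@misc{b2, ...}'] because the lookahead
# match is zero-width; the empty leading chunk is discarded by the emptiness
# check in the loop above.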

def collect_bib_entries(root_dir):
    """Recursively collect all BibTeX entries from .bib files."""
    all_entries = {}
    conflicts = defaultdict(list)

    for dirpath, _, filenames in os.walk(root_dir):
        for filename in filenames:
            if filename.lower().endswith('.bib'):
                filepath = os.path.join(dirpath, filename)
                try:
                    entries = parse_bib_file(filepath)
                    for key, (entry, source) in entries.items():
                        if key in all_entries:
                            conflicts[key].append((entry, source))
                        else:
                            all_entries[key] = (entry, source)
                except Exception as e:
                    print(f"Warning: Skipping {filepath} due to error: {e}", file=sys.stderr)

    # Process conflicts: select the longest entry and prepare conflict data
    conflict_data = {}

    for key in conflicts:
        # Include the first occurrence that was in all_entries
        candidates = [all_entries[key]] + conflicts[key]
        # Select the longest entry by character count
        selected = max(candidates, key=lambda x: len(x[0]))
        all_entries[key] = selected
        conflict_data[key] = {
            'candidates': candidates,
            'selected': selected
        }

    return all_entries, conflict_data
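
# Tie-breaking note: max() returns the first maximal element, so when two
# conflicting entries have the same length, the one encountered first in
# os.walk traversal order is kept.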

def write_bib_file(entries, output_file):
    """Write sorted BibTeX entries to output file."""
    sorted_keys = sorted(entries.keys())
    with open(output_file, 'w', encoding='utf-8') as f:
        for key in sorted_keys:
            entry, _ = entries[key]
            f.write(entry)
            if not entry.endswith('\n\n'):
                f.write('\n\n')

def write_conflicts(conflicts, output_file):
    """Write conflict report to org-mode file, tagging the selected entry."""
    if not conflicts:
        return

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("#+TITLE: BibTeX Conflicts Report\n")
        f.write("#+AUTHOR: BibTeX Aggregator\n")
        f.write(f"#+DATE: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")

        for key in sorted(conflicts.keys()):
            f.write(f"* Conflict for BibTeX key: {key}\n")
            candidates = conflicts[key]['candidates']
            selected = conflicts[key]['selected']

            for i, (entry, source) in enumerate(candidates, 1):
                is_selected = ((entry, source) == selected)
                tag = " << SELECTED >>" if is_selected else ""
                f.write(f"** Source {i}: {os.path.relpath(source, os.getcwd())}{tag}\n")
                f.write("#+BEGIN_SRC bibtex\n")
                f.write(entry.strip())
                f.write("\n#+END_SRC\n\n")
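
# The generated org-mode report looks like this (illustrative key and path):
#
#   * Conflict for BibTeX key: knuth1984
#   ** Source 1: papers/a.bib << SELECTED >>
#   #+BEGIN_SRC bibtex
#   @book{knuth1984, ...}
#   #+END_SRC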

def main():
    parser = argparse.ArgumentParser(description="Aggregate BibTeX entries from multiple files.")
    parser.add_argument('--working-dir', '-w', default='.',
                        help='Working directory to search for .bib files (default: .)')
    parser.add_argument('--output', '-o', default='refs.bib',
                        help='Output filename (default: refs.bib)')

    args = parser.parse_args()

    working_dir = os.path.abspath(args.working_dir)
    output_file = os.path.abspath(args.output)
    conflicts_file = os.path.join(os.path.dirname(output_file), 'bib_conflicts.org')

    if not os.path.exists(working_dir):
        print(f"Error: Working directory '{working_dir}' does not exist.", file=sys.stderr)
        sys.exit(1)

    # Back up existing output files. backup_existing_file is a no-op when a
    # file does not exist, so no empty backups are created.
    backup_existing_file(output_file)
    backup_existing_file(conflicts_file)

    # Collect entries
    print(f"Searching for .bib files in '{working_dir}'...")
    entries, conflicts = collect_bib_entries(working_dir)

    # Write output files
    write_bib_file(entries, output_file)
    write_conflicts(conflicts, conflicts_file)

    # Summary
    print(f"Wrote {len(entries)} unique entries to {output_file}")
    if conflicts:
        print(f"Found {len(conflicts)} conflicting keys; details in {conflicts_file}")
    else:
        print("No conflicts found.")

if __name__ == '__main__':
    main()