# TriuneCadence/tools/extract_thesis_bibtex.py
#
# (The hosting page's scrape residue -- "438 lines / 14 KiB / Python /
# Raw Permalink Blame History" -- has been folded into this comment so the
# file parses as Python.)

from __future__ import annotations
from pathlib import Path
import re
# Repository root: this script lives in <root>/tools/, so go up one level.
ROOT = Path(__file__).resolve().parents[1]
# Plain-text thesis scan containing the BIBLIOGRAPHY section to extract.
THES = ROOT / "THES" / "INT_ANN.TXT"
# Destination .bib file generated by this script.
OUT = ROOT / "latex" / "integration_and_hybridization_in_neural_network_modelling.bib"

# Hand-curated corrections, keyed by the citation key that bib_key() derives
# for the entry.  Each value may supply "kind" (the BibTeX entry type) and/or
# any field name recognised by apply_override(); when an override exists, its
# fields replace whatever the text heuristics guessed for that entry.
MANUAL_OVERRIDES: dict[str, dict[str, str]] = {
    "Farhat1986": {
        "kind": "incollection",
        "title": "Neural net models and optical computing: an overview",
        "booktitle": "Hybrid and Optical Computing",
        "editor": "Harold Szu",
        "publisher": "SPIE",
        "address": "Bellingham, Washington",
        "volume": "634",
        "pages": "277-306",
    },
    "Harmon1970": {
        "kind": "incollection",
        "title": "Neural subsystems: an interpretive summary",
        "booktitle": "The Neurosciences Second Study Program",
        "editor": "F. O. Schmitt",
        "publisher": "Rockefeller University Press",
        "address": "New York",
        "pages": "486-494",
    },
    "HechtNielsen1986": {
        "kind": "incollection",
        "title": "Performance limits of optical, electro-optical, and electronic neurocomputers",
        "booktitle": "Hybrid and Optical Computing",
        "editor": "H. Szu",
        "publisher": "SPIE",
        "address": "Bellingham, Washington",
        "volume": "634",
        # NOTE(review): identical volume/pages to Farhat1986 -- possibly a
        # copy-paste slip; verify against the printed proceedings.
        "pages": "277-306",
    },
    "Hopfield1982": {
        "kind": "article",
        "journal": "Proceedings of the National Academy of Sciences",
        "volume": "79",
        "pages": "2554-2558",
    },
    "Leven1987a": {
        "kind": "phdthesis",
        "title": "Choice and Neural process: A dissertation",
        "school": "University of Texas at Arlington",
        "note": "Chapter 5: Neural process and form -- mathematics and meaning.",
    },
    "Leven1987b": {
        "kind": "inproceedings",
        "title": "S.A.M.: a triune extension to the ART model",
        "note": "Poster presentation at the North Texas State University Symposium on Neural Networks.",
    },
    "Levine1990": {
        "kind": "unpublished",
        "note": "To appear in Motivation, Emotion, and Goal Direction in Neural Networks, D. Levine and S. Leven, eds., Erlbaum, Hillsdale, New Jersey.",
    },
    "Lippmann1987": {
        "kind": "article",
        "journal": "IEEE ASSP Magazine",
        "month": "apr",
        "pages": "4-22",
    },
    "MacLean1970": {
        "kind": "incollection",
        "title": "The triune brain, emotion, and scientific bias",
        "booktitle": "The Neurosciences Second Study Program",
        "editor": "F. O. Schmitt",
        "publisher": "Rockefeller University Press",
        "address": "New York",
        "pages": "486-494",
    },
    # Author-only fix: the scanned text mangles the author list.
    "Matsuoka1989": {
        "author": "Matsuoka, T. and Hamada, H. and Nakatsu, R.",
    },
    # Corporate author, braced so BibTeX does not split it into name parts.
    "Neuroscience1988": {
        "kind": "techreport",
        "author": "{Metroplex Study Group on Computational Neuroscience}",
        "institution": "North Texas Commission Regional Technology Program",
        "note": "Report to the North Texas Commission Regional Technology Program.",
    },
    "Newell1976": {
        "kind": "article",
        "journal": "Communications of the ACM",
        "volume": "19",
        "number": "3",
        "pages": "113-126",
    },
    "Nottebohm1989": {
        "kind": "article",
        "journal": "Scientific American",
        "month": "feb",
        "pages": "74-79",
    },
    "Pao1989": {
        "kind": "book",
        "publisher": "Addison-Wesley",
        "address": "Reading, Massachusetts",
    },
    "Parker1985": {
        "kind": "techreport",
        "institution": "Massachusetts Institute of Technology, Center for Computational Research in Economics and Management Science",
        "address": "Cambridge, Massachusetts",
        "number": "TR-47",
        "title": "Learning-logic",
    },
    "Rumelhart1986": {
        "kind": "incollection",
        "title": "Learning internal representations by back propagation",
        "booktitle": "Parallel Distributed Processing",
        "editor": "D. Rumelhart and J. McClelland and the PDP Research Group",
        "publisher": "MIT Press",
        "address": "Cambridge, Massachusetts",
        "volume": "1",
        "pages": "365-422",
    },
    "Simpson1988": {
        "kind": "unpublished",
        "note": "Submitted to CRC Critical Reviews in Artificial Intelligence.",
    },
    "Sontag1989": {
        "kind": "inproceedings",
        "title": "Back-propagation separates when perceptrons do",
        "booktitle": "Proceedings of the IEEE/INNS International Joint Conference on Neural Networks (IJCNN-89) Vol. I",
        "pages": "639-642",
    },
    "Tsutsumi1989": {
        "kind": "inproceedings",
        "title": "A multi-layered neural network composed of backprop. and Hopfield nets and internal space representation",
        "booktitle": "Proceedings of the IEEE/INNS International Joint Conference on Neural Networks (IJCNN-89) Vol. I",
        "pages": "507-512",
    },
    "Hewitt1985": {
        "kind": "article",
        "journal": "Byte",
        "volume": "10",
        "number": "4",
        "pages": "223-242",
    },
    "Widrow1988": {
        "kind": "article",
        "journal": "IEEE Computer",
        "volume": "21",
        "number": "3",
        "pages": "25-39",
    },
    "Charniak1985": {
        "kind": "book",
        "publisher": "Addison-Wesley",
        "address": "Reading, Massachusetts",
        "note": "701 pp.",
    },
    "Hebb1949": {
        "kind": "book",
        "publisher": "Wiley",
        "address": "New York",
    },
}
def clean_line(line: str) -> str:
    """Strip control characters, repair mis-encoded glyphs, and collapse whitespace."""
    cleaned = re.sub(r"[\x00-\x1f]", "", line)
    # Encoding/OCR artifacts seen in the scanned source, mapped to ASCII.
    for artifact, replacement in (
        ("<EFBFBD>", "'"),
        ("®", "'"),
        ("÷", "-"),
        ("`", "'"),
    ):
        cleaned = cleaned.replace(artifact, replacement)
    return re.sub(r"\s+", " ", cleaned).strip()
def load_entries() -> list[str]:
    """Return the thesis bibliography as one flattened string per reference."""
    lines = THES.read_text(encoding="latin-1").splitlines()
    # Everything before the literal BIBLIOGRAPHY heading is thesis body text.
    start = next(i for i, line in enumerate(lines) if line.strip() == "BIBLIOGRAPHY")

    entries: list[str] = []
    buffer: list[str] = []

    def flush() -> None:
        # Emit the accumulated lines of one entry, if any.
        if buffer:
            entries.append(" ".join(buffer))
            buffer.clear()

    # Blank lines and bare page numbers (arabic digits or roman numerals)
    # act as separators between bibliography entries.
    separator = re.compile(r"[ivxlcdmIVXLCDM]+|\d+")
    for raw in lines[start + 1:]:
        cleaned = clean_line(raw)
        if not cleaned or separator.fullmatch(cleaned):
            flush()
        else:
            buffer.append(cleaned)
    flush()
    return entries
def bib_key(entry: str, index: int) -> str:
    """Derive a citation key like "Smith1984a" from the entry's author/year head."""
    head = re.match(r"([A-Za-z][A-Za-z\-\.\s,&']+?)\s+(\d{4}[a-z]?)\.", entry)
    if head is None:
        # No parseable author/year: fall back to a positional key.
        return f"elsberryRef{index:03d}"
    # Keep only the letters of the first author's final surname token.
    first_author = head.group(1).split(",")[0]
    surname = re.sub(r"[^A-Za-z]", "", first_author.split()[-1])
    return f"{surname}{head.group(2)}"
def entry_type(entry: str) -> str:
    """Guess the BibTeX entry type from keyword and formatting heuristics."""
    text = entry.lower()
    if "dissertation" in text:
        return "phdthesis"
    if "personal communication" in text:
        return "misc"
    if any(cue in text for cue in ("proceedings", "conference", "poster presentation")):
        return "inproceedings"
    # A trailing "<volume>, <first>-<last>." strongly suggests a journal article.
    if re.search(r"\b\d+\s*,\s*\d+\s*-\s*\d+\.?$", entry):
        return "article"
    if any(cue in text for cue in ("press", "books", "wiley", "addison-wesley")):
        return "book"
    if any(cue in text for cue in ("journal", "magazine", "cybernetics", "biosciences")):
        return "article"
    return "misc"
def split_author_year(entry: str) -> tuple[str, str, str]:
    """Split "<authors> <year>. <rest>" into its three parts, with a safe fallback."""
    parts = re.match(r"(.+?)\s+(\d{4}[a-z]?)\.\s+(.*)$", entry)
    if parts is None:
        # Unparseable head: keep the whole entry as the "rest" component.
        return "Unknown", "0000", entry
    author, year, rest = parts.groups()
    return author.strip(), year, rest.strip()
def split_title_note(rest: str) -> tuple[str, str]:
    """Split "<title>. <publication info>" using known publication-text cues.

    Cue order matters: the first cue whose text follows a sentence boundary
    wins.  When no cue matches, everything before the first period is treated
    as the title and the remainder as the note.
    """
    cues = (
        "Proceedings",
        "Proc.",
        "In ",
        "American ",
        "Computer ",
        "Applied ",
        "Mathematical ",
        "Studies ",
        "International ",
        "Rockefeller ",
        "Bantam ",
        "Wiley",
        "SPIE",
        "Biological ",
        "Byte ",
        "Communications ",
        "Scientific ",
        "Bell ",
        "IEEE ",
        "IRE ",
        "Neural Networks ",
        "Bull.",
        "Addison-Wesley",
        "Massachusetts Institute",
        "Report ",
        "Submitted ",
        "To appear ",
        "Unpublished ",
        "University ",
        "Poster ",
        "'North",
    )
    for cue in cues:
        found = re.match(
            rf"^(?P<title>.+?)\.\s+(?P<note>{re.escape(cue)}.*)$", rest
        )
        if found is not None:
            return found.group("title").strip(), found.group("note").strip()
    # No cue matched: first sentence is the title, the rest is the note.
    title = rest.split(".")[0].strip()
    note = rest[len(title):].strip().lstrip(".").strip()
    return title, note
def normalize_author(author: str) -> str:
author = re.sub(r"\s+", " ", author.strip().rstrip("."))
author = author.replace("Foo,Y.", "Foo, Y.")
author = author.replace("Pao,Y.-H.", "Pao, Y.-H.")
author = author.replace("F. O. Scmitt", "F. O. Schmitt")
return author
def _field(name: str, value: str) -> str:
return f" {name} = {{{value}}}"
def _extract_inproceedings_fields(note: str) -> list[str]:
    """Pull booktitle and pages out of a conference-proceedings note."""
    out: list[str] = []
    # The booktitle runs from "Proceedings of" up to a period, ", pp.", a
    # leading page digit, or the end of the note; an optional parenthesised
    # acronym or "Vol. N" suffix is kept inside the booktitle.
    booktitle = re.search(
        r"(Proceedings of .*?(?:\(.*?\))?(?:\s+Vol\.\s*[IVX0-9]+)?)"
        r"(?:\.\s*|,\s*(?:pp\.|\d)|$)",
        note,
        flags=re.IGNORECASE,
    )
    if booktitle:
        out.append(_field("booktitle", booktitle.group(1).rstrip(" .,;")))
    pages = re.search(r"(\d+\s*-\s*\d+)", note)
    if pages:
        out.append(_field("pages", pages.group(1).replace(" ", "")))
    return out
def _extract_article_fields(note: str) -> list[str]:
    """Parse a "<journal> <volume>, <pages>." note into journal/volume/pages.

    Returns an empty list when the note does not look like a journal citation.
    """
    # One pattern replaces the two near-duplicate patterns this function used
    # to try in sequence ("(\d+)," then "(\d+)\s*,"): the optional \s* before
    # the comma accepts every string the stricter pattern accepted -- with
    # identical captures -- plus the OCR-damaged variants that have a stray
    # space before the comma.
    match = re.match(r"(.+?)\s+(\d+)\s*,\s*(\d+\s*-\s*\d+)\.?\s*$", note)
    if not match:
        return []
    return [
        _field("journal", match.group(1).rstrip(" .,;")),
        _field("volume", match.group(2)),
        _field("pages", match.group(3).replace(" ", "")),
    ]
def _extract_book_fields(note: str) -> list[str]:
    """Parse a "<publisher>, <address>." note; fall back to a bare note field.

    Bug fix: the publisher prefix is now ``[^,.]*`` rather than ``[^,.]+``,
    so a publisher that IS one of the keywords (e.g. "Wiley, New York" --
    which occurs in this file's own data -- or "SPIE, Bellingham") is
    recognised instead of being dumped into the note field.  Inputs the old
    pattern matched still match with identical captures.
    """
    fields: list[str] = []
    publisher_match = re.match(
        r"([^,.]*(?:Press|Books|Wiley|Addison-Wesley|SPIE|University Press))[,\.]\s*(.*)$",
        note,
    )
    if publisher_match:
        fields.append(_field("publisher", publisher_match.group(1).strip()))
        # Only emit an address when something non-blank follows the publisher.
        if publisher_match.group(2).strip():
            fields.append(_field("address", publisher_match.group(2).strip(" .")))
        return fields
    if note:
        fields.append(_field("note", note))
    return fields
def _extract_phdthesis_fields(note: str) -> list[str]:
    """Split a dissertation note into school and trailing note fields."""
    school_match = re.search(r"(The University of .*?|.*?University.*?)\.", note)
    if school_match is None:
        # No recognisable school: keep the raw note, if there is one.
        return [_field("note", note)] if note else []
    result = [_field("school", school_match.group(1).strip())]
    # Whatever follows the school sentence becomes a free-form note.
    leftover = note.replace(school_match.group(0), "", 1).strip(" .")
    if leftover:
        result.append(_field("note", leftover))
    return result
def _extra_fields(kind: str, note: str) -> list[str]:
    """Dispatch note parsing by entry kind; default to a literal note field."""
    # Strip stray quote artifacts before any parsing.
    cleaned = note.replace("In '", "In ").replace(",'", ",")
    if not cleaned:
        return []
    if kind == "inproceedings":
        parsed = _extract_inproceedings_fields(cleaned)
        if parsed:
            return parsed
    if kind == "article":
        parsed = _extract_article_fields(cleaned)
        if parsed:
            return parsed
    if kind == "book":
        return _extract_book_fields(cleaned)
    if kind == "phdthesis":
        return _extract_phdthesis_fields(cleaned)
    # Fallback (also for unparsed inproceedings/article): verbatim note.
    return [_field("note", cleaned)]
def apply_override(
    key: str,
    kind: str,
    author: str,
    year: str,
    title: str,
    note: str,
    fields: list[str],
) -> tuple[str, str, str, str, list[str]]:
    """Replace heuristic results with curated MANUAL_OVERRIDES data, if any.

    When an override exists for ``key``, the incoming field list is discarded
    and rebuilt in canonical BibTeX order; otherwise every argument passes
    through untouched.
    """
    override = MANUAL_OVERRIDES.get(key)
    if override is None:
        return kind, author, year, title, fields
    kind = override.get("kind", kind)
    author = override.get("author", author)
    title = override.get("title", title)
    rebuilt = [
        _field("author", author),
        _field("year", year),
        _field("title", title),
    ]
    field_order = (
        "journal",
        "booktitle",
        "editor",
        "publisher",
        "institution",
        "school",
        "address",
        "volume",
        "number",
        "pages",
        "month",
        "note",
    )
    rebuilt.extend(
        _field(name, override[name]) for name in field_order if override.get(name)
    )
    return kind, author, year, title, rebuilt
def to_bibtex(entry: str, index: int) -> str:
    """Convert one flattened bibliography entry into a BibTeX record string."""
    key = bib_key(entry, index)
    kind = entry_type(entry)
    author, year, rest = split_author_year(entry)
    author = normalize_author(author)
    title, note = split_title_note(rest)
    fields = [
        _field("author", author),
        _field("year", year),
        _field("title", title),
    ]
    fields += _extra_fields(kind, note)
    # Curated overrides win over all of the heuristics above.
    kind, author, year, title, fields = apply_override(
        key, kind, author, year, title, note, fields
    )
    body = ",\n".join(fields)
    return f"@{kind}{{{key},\n{body}\n}}"
def main() -> int:
    """Extract the bibliography and write the .bib file; return exit status."""
    records = [
        to_bibtex(entry, position)
        for position, entry in enumerate(load_entries(), start=1)
    ]
    OUT.parent.mkdir(parents=True, exist_ok=True)
    # Records are separated by blank lines; the file ends with a newline.
    OUT.write_text("\n\n".join(records) + "\n", encoding="utf-8")
    return 0
# Script entry point: SystemExit propagates main()'s return code to the shell.
if __name__ == "__main__":
    raise SystemExit(main())