# TriuneCadence/tools/extract_thesis_bibtex.py
#
# (The hosting page's scrape residue -- "438 lines / 14 KiB / Python /
# Raw Permalink Blame History" -- has been folded into this comment so the
# file parses as Python.)

from __future__ import annotations
from pathlib import Path
import re
# Repository root: this script lives in <root>/tools/, so go up one level.
ROOT = Path(__file__).resolve().parents[1]
# Plain-text thesis scan containing the BIBLIOGRAPHY section to extract.
THES = ROOT / "THES" / "INT_ANN.TXT"
# Destination .bib file generated by this script.
OUT = ROOT / "latex" / "integration_and_hybridization_in_neural_network_modelling.bib"

# Hand-curated corrections, keyed by the citation key that bib_key() derives
# for the entry.  Each value may supply "kind" (the BibTeX entry type) and/or
# any field name recognised by apply_override(); when an override exists, its
# fields replace whatever the text heuristics guessed for that entry.
MANUAL_OVERRIDES: dict[str, dict[str, str]] = {
    "Farhat1986": {
        "kind": "incollection",
        "title": "Neural net models and optical computing: an overview",
        "booktitle": "Hybrid and Optical Computing",
        "editor": "Harold Szu",
        "publisher": "SPIE",
        "address": "Bellingham, Washington",
        "volume": "634",
        "pages": "277-306",
    },
    "Harmon1970": {
        "kind": "incollection",
        "title": "Neural subsystems: an interpretive summary",
        "booktitle": "The Neurosciences Second Study Program",
        "editor": "F. O. Schmitt",
        "publisher": "Rockefeller University Press",
        "address": "New York",
        "pages": "486-494",
    },
    "HechtNielsen1986": {
        "kind": "incollection",
        "title": "Performance limits of optical, electro-optical, and electronic neurocomputers",
        "booktitle": "Hybrid and Optical Computing",
        "editor": "H. Szu",
        "publisher": "SPIE",
        "address": "Bellingham, Washington",
        "volume": "634",
        # NOTE(review): identical volume/pages to Farhat1986 -- possibly a
        # copy-paste slip; verify against the printed proceedings.
        "pages": "277-306",
    },
    "Hopfield1982": {
        "kind": "article",
        "journal": "Proceedings of the National Academy of Sciences",
        "volume": "79",
        "pages": "2554-2558",
    },
    "Leven1987a": {
        "kind": "phdthesis",
        "title": "Choice and Neural process: A dissertation",
        "school": "University of Texas at Arlington",
        "note": "Chapter 5: Neural process and form -- mathematics and meaning.",
    },
    "Leven1987b": {
        "kind": "inproceedings",
        "title": "S.A.M.: a triune extension to the ART model",
        "note": "Poster presentation at the North Texas State University Symposium on Neural Networks.",
    },
    "Levine1990": {
        "kind": "unpublished",
        "note": "To appear in Motivation, Emotion, and Goal Direction in Neural Networks, D. Levine and S. Leven, eds., Erlbaum, Hillsdale, New Jersey.",
    },
    "Lippmann1987": {
        "kind": "article",
        "journal": "IEEE ASSP Magazine",
        "month": "apr",
        "pages": "4-22",
    },
    "MacLean1970": {
        "kind": "incollection",
        "title": "The triune brain, emotion, and scientific bias",
        "booktitle": "The Neurosciences Second Study Program",
        "editor": "F. O. Schmitt",
        "publisher": "Rockefeller University Press",
        "address": "New York",
        "pages": "486-494",
    },
    # Author-only fix: the scanned text mangles the author list.
    "Matsuoka1989": {
        "author": "Matsuoka, T. and Hamada, H. and Nakatsu, R.",
    },
    # Corporate author, braced so BibTeX does not split it into name parts.
    "Neuroscience1988": {
        "kind": "techreport",
        "author": "{Metroplex Study Group on Computational Neuroscience}",
        "institution": "North Texas Commission Regional Technology Program",
        "note": "Report to the North Texas Commission Regional Technology Program.",
    },
    "Newell1976": {
        "kind": "article",
        "journal": "Communications of the ACM",
        "volume": "19",
        "number": "3",
        "pages": "113-126",
    },
    "Nottebohm1989": {
        "kind": "article",
        "journal": "Scientific American",
        "month": "feb",
        "pages": "74-79",
    },
    "Pao1989": {
        "kind": "book",
        "publisher": "Addison-Wesley",
        "address": "Reading, Massachusetts",
    },
    "Parker1985": {
        "kind": "techreport",
        "institution": "Massachusetts Institute of Technology, Center for Computational Research in Economics and Management Science",
        "address": "Cambridge, Massachusetts",
        "number": "TR-47",
        "title": "Learning-logic",
    },
    "Rumelhart1986": {
        "kind": "incollection",
        "title": "Learning internal representations by back propagation",
        "booktitle": "Parallel Distributed Processing",
        "editor": "D. Rumelhart and J. McClelland and the PDP Research Group",
        "publisher": "MIT Press",
        "address": "Cambridge, Massachusetts",
        "volume": "1",
        "pages": "365-422",
    },
    "Simpson1988": {
        "kind": "unpublished",
        "note": "Submitted to CRC Critical Reviews in Artificial Intelligence.",
    },
    "Sontag1989": {
        "kind": "inproceedings",
        "title": "Back-propagation separates when perceptrons do",
        "booktitle": "Proceedings of the IEEE/INNS International Joint Conference on Neural Networks (IJCNN-89) Vol. I",
        "pages": "639-642",
    },
    "Tsutsumi1989": {
        "kind": "inproceedings",
        "title": "A multi-layered neural network composed of backprop. and Hopfield nets and internal space representation",
        "booktitle": "Proceedings of the IEEE/INNS International Joint Conference on Neural Networks (IJCNN-89) Vol. I",
        "pages": "507-512",
    },
    "Hewitt1985": {
        "kind": "article",
        "journal": "Byte",
        "volume": "10",
        "number": "4",
        "pages": "223-242",
    },
    "Widrow1988": {
        "kind": "article",
        "journal": "IEEE Computer",
        "volume": "21",
        "number": "3",
        "pages": "25-39",
    },
    "Charniak1985": {
        "kind": "book",
        "publisher": "Addison-Wesley",
        "address": "Reading, Massachusetts",
        "note": "701 pp.",
    },
    "Hebb1949": {
        "kind": "book",
        "publisher": "Wiley",
        "address": "New York",
    },
}
def clean_line(line: str) -> str:
    """Strip control characters, repair mis-encoded glyphs, and collapse whitespace."""
    cleaned = re.sub(r"[\x00-\x1f]", "", line)
    # Encoding/OCR artifacts seen in the scanned source, mapped to ASCII.
    for artifact, replacement in (
        ("<EFBFBD>", "'"),
        ("®", "'"),
        ("÷", "-"),
        ("`", "'"),
    ):
        cleaned = cleaned.replace(artifact, replacement)
    return re.sub(r"\s+", " ", cleaned).strip()
def load_entries() -> list[str]:
    """Return the thesis bibliography as one flattened string per reference."""
    lines = THES.read_text(encoding="latin-1").splitlines()
    # Everything before the literal BIBLIOGRAPHY heading is thesis body text.
    start = next(i for i, line in enumerate(lines) if line.strip() == "BIBLIOGRAPHY")

    entries: list[str] = []
    buffer: list[str] = []

    def flush() -> None:
        # Emit the accumulated lines of one entry, if any.
        if buffer:
            entries.append(" ".join(buffer))
            buffer.clear()

    # Blank lines and bare page numbers (arabic digits or roman numerals)
    # act as separators between bibliography entries.
    separator = re.compile(r"[ivxlcdmIVXLCDM]+|\d+")
    for raw in lines[start + 1:]:
        cleaned = clean_line(raw)
        if not cleaned or separator.fullmatch(cleaned):
            flush()
        else:
            buffer.append(cleaned)
    flush()
    return entries
def bib_key(entry: str, index: int) -> str:
    """Derive a citation key like "Smith1984a" from the entry's author/year head."""
    head = re.match(r"([A-Za-z][A-Za-z\-\.\s,&']+?)\s+(\d{4}[a-z]?)\.", entry)
    if head is None:
        # No parseable author/year: fall back to a positional key.
        return f"elsberryRef{index:03d}"
    # Keep only the letters of the first author's final surname token.
    first_author = head.group(1).split(",")[0]
    surname = re.sub(r"[^A-Za-z]", "", first_author.split()[-1])
    return f"{surname}{head.group(2)}"
def entry_type(entry: str) -> str:
    """Guess the BibTeX entry type from keyword and formatting heuristics."""
    text = entry.lower()
    if "dissertation" in text:
        return "phdthesis"
    if "personal communication" in text:
        return "misc"
    if any(cue in text for cue in ("proceedings", "conference", "poster presentation")):
        return "inproceedings"
    # A trailing "<volume>, <first>-<last>." strongly suggests a journal article.
    if re.search(r"\b\d+\s*,\s*\d+\s*-\s*\d+\.?$", entry):
        return "article"
    if any(cue in text for cue in ("press", "books", "wiley", "addison-wesley")):
        return "book"
    if any(cue in text for cue in ("journal", "magazine", "cybernetics", "biosciences")):
        return "article"
    return "misc"
def split_author_year(entry: str) -> tuple[str, str, str]:
    """Split "<authors> <year>. <rest>" into its three parts, with a safe fallback."""
    parts = re.match(r"(.+?)\s+(\d{4}[a-z]?)\.\s+(.*)$", entry)
    if parts is None:
        # Unparseable head: keep the whole entry as the "rest" component.
        return "Unknown", "0000", entry
    author, year, rest = parts.groups()
    return author.strip(), year, rest.strip()
def split_title_note(rest: str) -> tuple[str, str]:
    """Split "<title>. <publication info>" using known publication-text cues.

    Cue order matters: the first cue whose text follows a sentence boundary
    wins.  When no cue matches, everything before the first period is treated
    as the title and the remainder as the note.
    """
    cues = (
        "Proceedings",
        "Proc.",
        "In ",
        "American ",
        "Computer ",
        "Applied ",
        "Mathematical ",
        "Studies ",
        "International ",
        "Rockefeller ",
        "Bantam ",
        "Wiley",
        "SPIE",
        "Biological ",
        "Byte ",
        "Communications ",
        "Scientific ",
        "Bell ",
        "IEEE ",
        "IRE ",
        "Neural Networks ",
        "Bull.",
        "Addison-Wesley",
        "Massachusetts Institute",
        "Report ",
        "Submitted ",
        "To appear ",
        "Unpublished ",
        "University ",
        "Poster ",
        "'North",
    )
    for cue in cues:
        found = re.match(
            rf"^(?P<title>.+?)\.\s+(?P<note>{re.escape(cue)}.*)$", rest
        )
        if found is not None:
            return found.group("title").strip(), found.group("note").strip()
    # No cue matched: first sentence is the title, the rest is the note.
    title = rest.split(".")[0].strip()
    note = rest[len(title):].strip().lstrip(".").strip()
    return title, note
def normalize_author(author: str) -> str:
author = re.sub(r"\s+", " ", author.strip().rstrip("."))
author = author.replace("Foo,Y.", "Foo, Y.")
author = author.replace("Pao,Y.-H.", "Pao, Y.-H.")
author = author.replace("F. O. Scmitt", "F. O. Schmitt")
return author
def _field(name: str, value: str) -> str:
return f" {name} = {{{value}}}"
def _extract_inproceedings_fields(note: str) -> list[str]:
    """Pull booktitle and pages out of a conference-proceedings note."""
    out: list[str] = []
    # The booktitle runs from "Proceedings of" up to a period, ", pp.", a
    # leading page digit, or the end of the note; an optional parenthesised
    # acronym or "Vol. N" suffix is kept inside the booktitle.
    booktitle = re.search(
        r"(Proceedings of .*?(?:\(.*?\))?(?:\s+Vol\.\s*[IVX0-9]+)?)"
        r"(?:\.\s*|,\s*(?:pp\.|\d)|$)",
        note,
        flags=re.IGNORECASE,
    )
    if booktitle:
        out.append(_field("booktitle", booktitle.group(1).rstrip(" .,;")))
    pages = re.search(r"(\d+\s*-\s*\d+)", note)
    if pages:
        out.append(_field("pages", pages.group(1).replace(" ", "")))
    return out
def _extract_article_fields(note: str) -> list[str]:
    """Parse a "<journal> <volume>, <pages>." note into journal/volume/pages.

    Returns an empty list when the note does not look like a journal citation.
    """
    # One pattern replaces the two near-duplicate patterns this function used
    # to try in sequence ("(\d+)," then "(\d+)\s*,"): the optional \s* before
    # the comma accepts every string the stricter pattern accepted -- with
    # identical captures -- plus the OCR-damaged variants that have a stray
    # space before the comma.
    match = re.match(r"(.+?)\s+(\d+)\s*,\s*(\d+\s*-\s*\d+)\.?\s*$", note)
    if not match:
        return []
    return [
        _field("journal", match.group(1).rstrip(" .,;")),
        _field("volume", match.group(2)),
        _field("pages", match.group(3).replace(" ", "")),
    ]
def _extract_book_fields(note: str) -> list[str]:
    """Parse a "<publisher>, <address>." note; fall back to a bare note field.

    Bug fix: the publisher prefix is now ``[^,.]*`` rather than ``[^,.]+``,
    so a publisher that IS one of the keywords (e.g. "Wiley, New York" --
    which occurs in this file's own data -- or "SPIE, Bellingham") is
    recognised instead of being dumped into the note field.  Inputs the old
    pattern matched still match with identical captures.
    """
    fields: list[str] = []
    publisher_match = re.match(
        r"([^,.]*(?:Press|Books|Wiley|Addison-Wesley|SPIE|University Press))[,\.]\s*(.*)$",
        note,
    )
    if publisher_match:
        fields.append(_field("publisher", publisher_match.group(1).strip()))
        # Only emit an address when something non-blank follows the publisher.
        if publisher_match.group(2).strip():
            fields.append(_field("address", publisher_match.group(2).strip(" .")))
        return fields
    if note:
        fields.append(_field("note", note))
    return fields
def _extract_phdthesis_fields(note: str) -> list[str]:
    """Split a dissertation note into school and trailing note fields."""
    school_match = re.search(r"(The University of .*?|.*?University.*?)\.", note)
    if school_match is None:
        # No recognisable school: keep the raw note, if there is one.
        return [_field("note", note)] if note else []
    result = [_field("school", school_match.group(1).strip())]
    # Whatever follows the school sentence becomes a free-form note.
    leftover = note.replace(school_match.group(0), "", 1).strip(" .")
    if leftover:
        result.append(_field("note", leftover))
    return result
def _extra_fields(kind: str, note: str) -> list[str]:
    """Dispatch note parsing by entry kind; default to a literal note field."""
    # Strip stray quote artifacts before any parsing.
    cleaned = note.replace("In '", "In ").replace(",'", ",")
    if not cleaned:
        return []
    if kind == "inproceedings":
        parsed = _extract_inproceedings_fields(cleaned)
        if parsed:
            return parsed
    if kind == "article":
        parsed = _extract_article_fields(cleaned)
        if parsed:
            return parsed
    if kind == "book":
        return _extract_book_fields(cleaned)
    if kind == "phdthesis":
        return _extract_phdthesis_fields(cleaned)
    # Fallback (also for unparsed inproceedings/article): verbatim note.
    return [_field("note", cleaned)]
def apply_override(
    key: str,
    kind: str,
    author: str,
    year: str,
    title: str,
    note: str,
    fields: list[str],
) -> tuple[str, str, str, str, list[str]]:
    """Replace heuristic results with curated MANUAL_OVERRIDES data, if any.

    When an override exists for ``key``, the incoming field list is discarded
    and rebuilt in canonical BibTeX order; otherwise every argument passes
    through untouched.
    """
    override = MANUAL_OVERRIDES.get(key)
    if override is None:
        return kind, author, year, title, fields
    kind = override.get("kind", kind)
    author = override.get("author", author)
    title = override.get("title", title)
    rebuilt = [
        _field("author", author),
        _field("year", year),
        _field("title", title),
    ]
    field_order = (
        "journal",
        "booktitle",
        "editor",
        "publisher",
        "institution",
        "school",
        "address",
        "volume",
        "number",
        "pages",
        "month",
        "note",
    )
    rebuilt.extend(
        _field(name, override[name]) for name in field_order if override.get(name)
    )
    return kind, author, year, title, rebuilt
def to_bibtex(entry: str, index: int) -> str:
    """Convert one flattened bibliography entry into a BibTeX record string."""
    key = bib_key(entry, index)
    kind = entry_type(entry)
    author, year, rest = split_author_year(entry)
    author = normalize_author(author)
    title, note = split_title_note(rest)
    fields = [
        _field("author", author),
        _field("year", year),
        _field("title", title),
    ]
    fields += _extra_fields(kind, note)
    # Curated overrides win over all of the heuristics above.
    kind, author, year, title, fields = apply_override(
        key, kind, author, year, title, note, fields
    )
    body = ",\n".join(fields)
    return f"@{kind}{{{key},\n{body}\n}}"
def main() -> int:
    """Extract the bibliography and write the .bib file; return exit status."""
    records = [
        to_bibtex(entry, position)
        for position, entry in enumerate(load_entries(), start=1)
    ]
    OUT.parent.mkdir(parents=True, exist_ok=True)
    # Records are separated by blank lines; the file ends with a newline.
    OUT.write_text("\n\n".join(records) + "\n", encoding="utf-8")
    return 0
# Script entry point: SystemExit propagates main()'s return code to the shell.
if __name__ == "__main__":
    raise SystemExit(main())