# TriuneCadence/tools/convert_thes_to_latex.py
#
# NOTE(review): the original capture of this file included GitHub UI chrome
# ("357 lines / 12 KiB / Python / Raw Blame History") and a warning that the
# file contains invisible Unicode characters that are indistinguishable to
# humans but may be processed differently by a computer. That warning is
# preserved here as a comment so the file itself parses as Python.

from __future__ import annotations
from pathlib import Path
import re
# Repository root: this script lives one level below it (in tools/), so
# parents[1] is the TriuneCadence directory.
ROOT = Path(__file__).resolve().parents[1]
# Directory holding the legacy plain-text thesis sources.
THES = ROOT / "THES"
# Destination directory for the generated .tex files.
OUT = ROOT / "latex"
# Conversion manifest: legacy file name -> output stem ("target"), document
# title, and converter selection ("thesis" uses the structured report
# converter; anything else uses the generic article converter).
FILES = {
    "INT_ANN.TXT": {
        "target": "integration_and_hybridization_in_neural_network_modelling",
        "title": "Integration and Hybridization in Neural Network Modelling",
        "mode": "thesis",
    },
    "COMPCOOP.TXT": {
        "target": "competing_network_models_and_problem_solving",
        "title": "Competing Network Models and Problem-Solving",
        "mode": "generic",
    },
    "THPROPOS.TXT": {
        "target": "thesis_proposal",
        "title": "Thesis Proposal",
        "mode": "generic",
    },
}
# Front-matter headings matched verbatim; rendered as unnumbered sections.
FRONT_MATTER_HEADINGS = {
    "ACKNOWLEDGEMENTS",
    "ABSTRACT",
    "TABLE OF CONTENTS",
    "LIST OF ILLUSTRATIONS",
    "LIST OF TABLES",
}
def escape_latex(text: str) -> str:
    """Escape LaTeX special characters in *text* and return the result.

    Bug fixed: the original replaced ``\\`` first with ``\\textbackslash{}``
    and only afterwards escaped ``{`` and ``}``, which mangled the braces of
    the freshly inserted replacement (producing ``\\textbackslash\\{\\}``).
    Backslashes are now routed through a sentinel so the brace pass cannot
    touch them, and brace escaping runs before any replacement that
    introduces braces (``~`` and ``^``).
    """
    # NUL never survives clean_line(), so it is a safe sentinel here;
    # it is restored as the final step regardless.
    sentinel = "\x00"
    text = text.replace("\\", sentinel)
    # Braces first, then plain specials, then the brace-introducing pair.
    for src, dst in (
        ("{", r"\{"),
        ("}", r"\}"),
        ("&", r"\&"),
        ("%", r"\%"),
        ("$", r"\$"),
        ("#", r"\#"),
        ("_", r"\_"),
        ("~", r"\textasciitilde{}"),
        ("^", r"\textasciicircum{}"),
    ):
        text = text.replace(src, dst)
    return text.replace(sentinel, r"\textbackslash{}")
def clean_line(line: str) -> str:
    """Normalise one legacy text line.

    Drops ASCII control characters and BOMs, maps mojibake quote stand-ins
    to apostrophes, collapses whitespace runs to single spaces, and strips
    trailing whitespace.
    """
    # Control characters except tab/newline ranges the regex excludes.
    cleaned = re.sub(r"[\x00-\x08\x0b-\x1f]", "", line)
    # SUB/ETB are already caught by the range above; kept for parity with
    # the BOM removal.
    for junk in ("\x1a", "\x17", "\ufeff"):
        cleaned = cleaned.replace(junk, "")
    # Encoding artefacts from the legacy word-processor files; "<EFBFBD>" is
    # presumably a mangled U+FFFD replacement character — preserved as-is.
    for mojibake in ("<EFBFBD>", "®", "™"):
        cleaned = cleaned.replace(mojibake, "'")
    return re.sub(r"\s+", " ", cleaned).rstrip()
def looks_like_heading(line: str) -> bool:
    """Heuristically decide whether *line* is a heading in the legacy text."""
    text = line.strip()
    # Guard clauses: blank, over-long, or a word-processor dot directive.
    if not text or len(text) > 80 or text.startswith("."):
        return False
    if text in FRONT_MATTER_HEADINGS:
        return True
    # Bare roman/arabic page numbers are page artifacts, never headings.
    if re.fullmatch(r"[ivxlcdmIVXLCDM]+", text) or re.fullmatch(r"\d+", text):
        return False
    word_count = len(text.split())
    if word_count <= 10 and re.fullmatch(r"[A-Z][A-Z\s\-]{2,}", text):
        return True
    # Numbered headings: roman ("IV. ...") or arabic ("3. ...").
    if re.fullmatch(r"[IVX]+\.\s+.*", text) or re.fullmatch(r"\d+\.\s+.*", text):
        return True
    return text.isupper() and word_count <= 12 and len(text) >= 4
def is_page_artifact(line: str) -> bool:
    """Return True for page-layout residue in the legacy text.

    Artifacts are bare page numbers (roman or arabic), underscore rules,
    and the "Publication No." stub line.
    """
    text = line.strip()
    if not text:
        return False
    for pattern in (r"[ivxlcdmIVXLCDM]+", r"\d+", r"_+"):
        if re.fullmatch(pattern, text):
            return True
    return text == "Publication No."
def heading_command(line: str) -> str:
    """Return the LaTeX sectioning command name for a heading line.

    Every branch of the original (front-matter heading, roman-numbered,
    arabic-numbered, fallback) returned the identical value, so the dead
    dispatch is collapsed to a single return.  The parameter is kept — and
    currently ignored — so existing callers and any future per-heading
    dispatch keep working.
    """
    return "section*"
def convert_text_file(source: Path, destination: Path, title: str) -> None:
    """Convert a generic legacy text file into a standalone LaTeX article.

    Reads *source* as latin-1, cleans each line, groups text into escaped
    paragraphs, maps heading-like lines to sectioning commands, preserves
    word-processor dot directives as LaTeX comments, and writes the
    complete document to *destination* as UTF-8.
    """
    cleaned = [clean_line(raw) for raw in source.read_text(encoding="latin-1").splitlines()]
    body: list[str] = []
    pending: list[str] = []

    def emit_paragraph() -> None:
        # Flush any accumulated paragraph as one escaped line plus a blank.
        if not pending:
            return
        joined = " ".join(piece.strip() for piece in pending if piece.strip())
        body.append(escape_latex(joined))
        body.append("")
        pending.clear()

    for raw in cleaned:
        text = raw.strip()
        if not text or is_page_artifact(text):
            emit_paragraph()
            continue
        if text.startswith("."):
            # Legacy formatter directives are kept, commented out.
            emit_paragraph()
            body.append(f"% raw formatter directive: {escape_latex(text)}")
            continue
        if looks_like_heading(text):
            emit_paragraph()
            body.append(rf"\{heading_command(text)}{{{escape_latex(text)}}}")
            continue
        pending.append(text)
    emit_paragraph()

    document = [
        r"\documentclass[12pt]{article}",
        r"\usepackage[utf8]{inputenc}",
        r"\usepackage[T1]{fontenc}",
        r"\usepackage{geometry}",
        r"\geometry{margin=1in}",
        r"\title{" + escape_latex(title) + "}",
        r"\author{Converted from legacy plain-text source}",
        r"\date{}",
        r"\begin{document}",
        r"\maketitle",
        r"\begin{flushleft}",
        "% This is a conservative automated conversion from the legacy text file.",
        "% Manual cleanup will still be necessary for figures, references, footnotes, and formatting.",
        "",
    ]
    document.extend(body)
    document.extend([r"\end{flushleft}", r"\end{document}", ""])
    destination.write_text("\n".join(document), encoding="utf-8")
def _clean_lines(source: Path) -> list[str]:
    """Read *source* as latin-1 and return its lines run through clean_line."""
    raw = source.read_text(encoding="latin-1")
    return [clean_line(entry) for entry in raw.splitlines()]
def _find_line_index(lines: list[str], needle: str, start: int = 0) -> int:
for idx in range(start, len(lines)):
if lines[idx].strip() == needle:
return idx
raise ValueError(f"could not find line: {needle}")
def _collect_paragraphs(lines: list[str]) -> list[str]:
    """Group *lines* into LaTeX-escaped paragraph strings.

    Paragraph breaks occur at blank lines and page artifacts; the break
    lines themselves are discarded.
    """
    paragraphs: list[str] = []
    buffer: list[str] = []

    def close() -> None:
        if buffer:
            paragraphs.append(escape_latex(" ".join(buffer)))
            buffer.clear()

    for raw in lines:
        text = raw.strip()
        if not text or is_page_artifact(text):
            close()
        else:
            buffer.append(text)
    close()
    return paragraphs
def _collect_uppercase_heading_lines(lines: list[str], start_idx: int) -> tuple[list[str], int]:
    """Gather the all-caps title lines that follow a CHAPTER/APPENDIX marker.

    Scans forward from ``start_idx``, skipping blank lines and page
    artifacts, and accumulates consecutive upper-case lines (at most 120
    characters each) into the heading.  Scanning stops — without consuming
    the stopping line — at the next "CHAPTER n"/"APPENDIX n" marker or at
    the first line that is not an upper-case heading line.

    Returns ``(heading_lines, idx)`` where ``idx`` is the index of the
    first unconsumed line, for the caller to resume from.
    """
    heading_lines: list[str] = []
    idx = start_idx
    while idx < len(lines):
        candidate = lines[idx].strip()
        idx += 1  # tentatively consume; backed out below on overshoot
        if not candidate or is_page_artifact(candidate):
            continue
        if re.fullmatch(r"(CHAPTER|APPENDIX)\s+\d+", candidate):
            idx -= 1  # leave the next chapter/appendix marker for the caller
            break
        if candidate.isupper() and len(candidate) <= 120:
            heading_lines.append(candidate)
            continue
        idx -= 1  # first body line: un-consume it and stop
        break
    return heading_lines, idx
def convert_thesis_file(source: Path, destination: Path, title: str) -> None:
    """Convert the structured thesis text into a complete LaTeX report.

    Locates the front-matter landmarks (ACKNOWLEDGEMENTS, ABSTRACT, TABLE
    OF CONTENTS, CHAPTER 1, BIBLIOGRAPHY) by exact stripped-line match,
    extracts the acknowledgements and abstract as paragraphs, converts the
    chapter/appendix region via _convert_thesis_body, and writes the
    finished document (with a hand-built title page) to *destination* as
    UTF-8.  Raises ValueError (via _find_line_index) if any landmark line
    is missing from the source.
    """
    lines = _clean_lines(source)
    # Landmark indices; each search starts just past the previous landmark
    # so repeated occurrences (e.g. in the table of contents) are skipped.
    ack_idx = _find_line_index(lines, "ACKNOWLEDGEMENTS")
    abs_idx = _find_line_index(lines, "ABSTRACT", ack_idx + 1)
    toc_idx = _find_line_index(lines, "TABLE OF CONTENTS", abs_idx + 1)
    chap1_idx = _find_line_index(lines, "CHAPTER 1", abs_idx + 1)
    bib_idx = _find_line_index(lines, "BIBLIOGRAPHY", chap1_idx + 1)
    acknowledgements = _collect_paragraphs(lines[ack_idx + 1 : abs_idx])
    abstract_lines = []
    for line in lines[abs_idx + 1 : toc_idx]:
        stripped = line.strip()
        if not stripped or is_page_artifact(stripped):
            # Keep a break marker so paragraph boundaries survive filtering.
            abstract_lines.append("")
            continue
        # Drop the abstract page's repeated title-block lines.
        if stripped in {
            "INTEGRATION AND HYBRIDIZATION IN NEURAL NETWORK MODELLING",
            "Wesley Royce Elsberry, M.S.",
            "The University of Texas at Arlington, 1989",
            "Supervising Professor: Karan Briggs",
        }:
            continue
        # Drop dot-leader lines (table-of-contents style "x . . . 12").
        if ". . ." in stripped:
            continue
        abstract_lines.append(stripped)
    abstract = _collect_paragraphs(abstract_lines)
    body = _convert_thesis_body(lines[chap1_idx:bib_idx])
    tex = [
        r"\documentclass[12pt]{report}",
        r"\usepackage[utf8]{inputenc}",
        r"\usepackage[T1]{fontenc}",
        r"\usepackage{geometry}",
        r"\geometry{margin=1in}",
        r"\title{" + escape_latex(title) + "}",
        r"\author{Wesley Royce Elsberry}",
        r"\date{August 1989}",
        r"\begin{document}",
        r"\begin{titlepage}",
        r"\centering",
        r"{\Large " + escape_latex(title) + r"\par}",
        r"\vspace{1.5cm}",
        r"{\large Wesley Royce Elsberry\par}",
        r"\vspace{1cm}",
        r"Presented to the Faculty of the Graduate School of\par",
        r"The University of Texas at Arlington in Partial Fulfillment\par",
        r"of the Requirements for the Degree of\par",
        r"\vspace{0.5cm}",
        r"{\large Master of Science in Computer Science\par}",
        r"\vfill",
        r"The University of Texas at Arlington\par",
        r"August 1989\par",
        r"\end{titlepage}",
        r"\chapter*{Acknowledgements}",
        *[p + "\n" for p in acknowledgements],
        r"\chapter*{Abstract}",
        *[p + "\n" for p in abstract],
        r"\tableofcontents",
        *body,
        r"\nocite{*}",
        r"\bibliographystyle{plain}",
        r"\bibliography{integration_and_hybridization_in_neural_network_modelling}",
        r"\end{document}",
        "",
    ]
    destination.write_text("\n".join(tex), encoding="utf-8")
def _convert_thesis_body(lines: list[str]) -> list[str]:
    """Translate the chapter/appendix region of the thesis into LaTeX lines.

    Walks *lines* as a small state machine: "CHAPTER n" / "APPENDIX n"
    markers become \\chapter commands (with their following upper-case
    title lines merged via _collect_uppercase_heading_lines), short
    upper-case lines become unnumbered sections, dot directives become
    LaTeX comments, and everything else accumulates into paragraphs.
    Program/data listing appendices are suppressed and replaced with a
    pointer to the repository files.
    """
    body: list[str] = []
    paragraph: list[str] = []
    idx = 0
    in_appendix = False
    appendix_mode = "normal"  # "listing" => swallow appendix content

    def flush_paragraph() -> None:
        # Emit the accumulated paragraph (escaped) followed by a blank line.
        if paragraph:
            body.append(escape_latex(" ".join(paragraph)))
            body.append("")
            paragraph.clear()

    while idx < len(lines):
        stripped = lines[idx].strip()
        idx += 1
        if not stripped or is_page_artifact(stripped):
            flush_paragraph()
            continue
        if stripped.startswith("."):
            # Legacy word-processor dot directive: keep as a LaTeX comment.
            flush_paragraph()
            body.append(f"% raw formatter directive: {escape_latex(stripped)}")
            continue
        if re.fullmatch(r"CHAPTER\s+\d+", stripped):
            flush_paragraph()
            # Consume the chapter's upper-case title lines; idx advances
            # past whatever _collect_uppercase_heading_lines consumed.
            heading_lines, idx = _collect_uppercase_heading_lines(lines, idx)
            title = " ".join(heading_lines) if heading_lines else stripped
            body.append(rf"\chapter{{{escape_latex(title.title())}}}")
            continue
        if re.fullmatch(r"APPENDIX\s+\d+", stripped):
            flush_paragraph()
            if not in_appendix:
                # First appendix switches LaTeX to appendix numbering.
                body.append(r"\appendix")
                in_appendix = True
            heading_lines, idx = _collect_uppercase_heading_lines(lines, idx)
            title = " ".join(heading_lines) if heading_lines else stripped
            body.append(rf"\chapter{{{escape_latex(title.title())}}}")
            if (
                "PROGRAM SOURCE LISTING" in title
                or "DATA FILE LISTING" in title
            ):
                # Listings are huge; replace them with a repository pointer.
                appendix_mode = "listing"
                body.append(
                    "This appendix is represented in the repository by the legacy source and data files in \\texttt{THES/}. "
                    "The automated thesis conversion suppresses the full listing here to keep the document manageable."
                )
                body.append("")
            else:
                appendix_mode = "normal"
            continue
        if in_appendix and appendix_mode == "listing":
            # Swallow suppressed listing content entirely.
            continue
        if stripped.isupper() and len(stripped.split()) <= 10 and len(stripped) > 4:
            flush_paragraph()
            body.append(rf"\section*{{{escape_latex(stripped.title())}}}")
            continue
        paragraph.append(stripped)
    flush_paragraph()
    return body
def main() -> int:
    """Convert every configured legacy source file to LaTeX.

    Creates the output directory if needed and dispatches each entry in
    FILES to the thesis or generic converter.  Returns 0 on success.
    """
    OUT.mkdir(parents=True, exist_ok=True)
    for name, spec in FILES.items():
        src = THES / name
        dst = OUT / "{}.tex".format(spec["target"])
        converter = (
            convert_thesis_file if spec["mode"] == "thesis" else convert_text_file
        )
        converter(src, dst, spec["title"])
    return 0


if __name__ == "__main__":
    raise SystemExit(main())