# TriuneCadence/tools/convert_thes_to_latex.py
#
# NOTE(review): the original capture of this file included GitHub UI chrome
# ("357 lines / 12 KiB / Python / Raw Blame History") and a warning that the
# file contains invisible Unicode characters that are indistinguishable to
# humans but may be processed differently by a computer. That warning is
# preserved here as a comment so the file itself parses as Python.

from __future__ import annotations
from pathlib import Path
import re
# Repository root: this script lives one level below it (in tools/), so
# parents[1] is the TriuneCadence directory.
ROOT = Path(__file__).resolve().parents[1]
# Directory holding the legacy plain-text thesis sources.
THES = ROOT / "THES"
# Destination directory for the generated .tex files.
OUT = ROOT / "latex"
# Conversion manifest: legacy file name -> output stem ("target"), document
# title, and converter selection ("thesis" uses the structured report
# converter; anything else uses the generic article converter).
FILES = {
    "INT_ANN.TXT": {
        "target": "integration_and_hybridization_in_neural_network_modelling",
        "title": "Integration and Hybridization in Neural Network Modelling",
        "mode": "thesis",
    },
    "COMPCOOP.TXT": {
        "target": "competing_network_models_and_problem_solving",
        "title": "Competing Network Models and Problem-Solving",
        "mode": "generic",
    },
    "THPROPOS.TXT": {
        "target": "thesis_proposal",
        "title": "Thesis Proposal",
        "mode": "generic",
    },
}
# Front-matter headings matched verbatim; rendered as unnumbered sections.
FRONT_MATTER_HEADINGS = {
    "ACKNOWLEDGEMENTS",
    "ABSTRACT",
    "TABLE OF CONTENTS",
    "LIST OF ILLUSTRATIONS",
    "LIST OF TABLES",
}
def escape_latex(text: str) -> str:
    """Escape LaTeX special characters in *text* and return the result.

    Bug fixed: the original replaced ``\\`` first with ``\\textbackslash{}``
    and only afterwards escaped ``{`` and ``}``, which mangled the braces of
    the freshly inserted replacement (producing ``\\textbackslash\\{\\}``).
    Backslashes are now routed through a sentinel so the brace pass cannot
    touch them, and brace escaping runs before any replacement that
    introduces braces (``~`` and ``^``).
    """
    # NUL never survives clean_line(), so it is a safe sentinel here;
    # it is restored as the final step regardless.
    sentinel = "\x00"
    text = text.replace("\\", sentinel)
    # Braces first, then plain specials, then the brace-introducing pair.
    for src, dst in (
        ("{", r"\{"),
        ("}", r"\}"),
        ("&", r"\&"),
        ("%", r"\%"),
        ("$", r"\$"),
        ("#", r"\#"),
        ("_", r"\_"),
        ("~", r"\textasciitilde{}"),
        ("^", r"\textasciicircum{}"),
    ):
        text = text.replace(src, dst)
    return text.replace(sentinel, r"\textbackslash{}")
def clean_line(line: str) -> str:
    """Normalise one legacy text line.

    Drops ASCII control characters and BOMs, maps mojibake quote stand-ins
    to apostrophes, collapses whitespace runs to single spaces, and strips
    trailing whitespace.
    """
    # Control characters except tab/newline ranges the regex excludes.
    cleaned = re.sub(r"[\x00-\x08\x0b-\x1f]", "", line)
    # SUB/ETB are already caught by the range above; kept for parity with
    # the BOM removal.
    for junk in ("\x1a", "\x17", "\ufeff"):
        cleaned = cleaned.replace(junk, "")
    # Encoding artefacts from the legacy word-processor files; "<EFBFBD>" is
    # presumably a mangled U+FFFD replacement character — preserved as-is.
    for mojibake in ("<EFBFBD>", "®", "™"):
        cleaned = cleaned.replace(mojibake, "'")
    return re.sub(r"\s+", " ", cleaned).rstrip()
def looks_like_heading(line: str) -> bool:
    """Heuristically decide whether *line* is a heading in the legacy text."""
    text = line.strip()
    # Guard clauses: blank, over-long, or a word-processor dot directive.
    if not text or len(text) > 80 or text.startswith("."):
        return False
    if text in FRONT_MATTER_HEADINGS:
        return True
    # Bare roman/arabic page numbers are page artifacts, never headings.
    if re.fullmatch(r"[ivxlcdmIVXLCDM]+", text) or re.fullmatch(r"\d+", text):
        return False
    word_count = len(text.split())
    if word_count <= 10 and re.fullmatch(r"[A-Z][A-Z\s\-]{2,}", text):
        return True
    # Numbered headings: roman ("IV. ...") or arabic ("3. ...").
    if re.fullmatch(r"[IVX]+\.\s+.*", text) or re.fullmatch(r"\d+\.\s+.*", text):
        return True
    return text.isupper() and word_count <= 12 and len(text) >= 4
def is_page_artifact(line: str) -> bool:
    """Return True for page-layout residue in the legacy text.

    Artifacts are bare page numbers (roman or arabic), underscore rules,
    and the "Publication No." stub line.
    """
    text = line.strip()
    if not text:
        return False
    for pattern in (r"[ivxlcdmIVXLCDM]+", r"\d+", r"_+"):
        if re.fullmatch(pattern, text):
            return True
    return text == "Publication No."
def heading_command(line: str) -> str:
    """Return the LaTeX sectioning command name for a heading line.

    Every branch of the original (front-matter heading, roman-numbered,
    arabic-numbered, fallback) returned the identical value, so the dead
    dispatch is collapsed to a single return.  The parameter is kept — and
    currently ignored — so existing callers and any future per-heading
    dispatch keep working.
    """
    return "section*"
def convert_text_file(source: Path, destination: Path, title: str) -> None:
    """Convert a generic legacy text file into a standalone LaTeX article.

    Reads *source* as latin-1, cleans each line, groups text into escaped
    paragraphs, maps heading-like lines to sectioning commands, preserves
    word-processor dot directives as LaTeX comments, and writes the
    complete document to *destination* as UTF-8.
    """
    cleaned = [clean_line(raw) for raw in source.read_text(encoding="latin-1").splitlines()]
    body: list[str] = []
    pending: list[str] = []

    def emit_paragraph() -> None:
        # Flush any accumulated paragraph as one escaped line plus a blank.
        if not pending:
            return
        joined = " ".join(piece.strip() for piece in pending if piece.strip())
        body.append(escape_latex(joined))
        body.append("")
        pending.clear()

    for raw in cleaned:
        text = raw.strip()
        if not text or is_page_artifact(text):
            emit_paragraph()
            continue
        if text.startswith("."):
            # Legacy formatter directives are kept, commented out.
            emit_paragraph()
            body.append(f"% raw formatter directive: {escape_latex(text)}")
            continue
        if looks_like_heading(text):
            emit_paragraph()
            body.append(rf"\{heading_command(text)}{{{escape_latex(text)}}}")
            continue
        pending.append(text)
    emit_paragraph()

    document = [
        r"\documentclass[12pt]{article}",
        r"\usepackage[utf8]{inputenc}",
        r"\usepackage[T1]{fontenc}",
        r"\usepackage{geometry}",
        r"\geometry{margin=1in}",
        r"\title{" + escape_latex(title) + "}",
        r"\author{Converted from legacy plain-text source}",
        r"\date{}",
        r"\begin{document}",
        r"\maketitle",
        r"\begin{flushleft}",
        "% This is a conservative automated conversion from the legacy text file.",
        "% Manual cleanup will still be necessary for figures, references, footnotes, and formatting.",
        "",
    ]
    document.extend(body)
    document.extend([r"\end{flushleft}", r"\end{document}", ""])
    destination.write_text("\n".join(document), encoding="utf-8")
def _clean_lines(source: Path) -> list[str]:
    """Read *source* as latin-1 and return its lines run through clean_line."""
    raw = source.read_text(encoding="latin-1")
    return [clean_line(entry) for entry in raw.splitlines()]
def _find_line_index(lines: list[str], needle: str, start: int = 0) -> int:
for idx in range(start, len(lines)):
if lines[idx].strip() == needle:
return idx
raise ValueError(f"could not find line: {needle}")
def _collect_paragraphs(lines: list[str]) -> list[str]:
    """Group *lines* into LaTeX-escaped paragraph strings.

    Paragraph breaks occur at blank lines and page artifacts; the break
    lines themselves are discarded.
    """
    paragraphs: list[str] = []
    buffer: list[str] = []

    def close() -> None:
        if buffer:
            paragraphs.append(escape_latex(" ".join(buffer)))
            buffer.clear()

    for raw in lines:
        text = raw.strip()
        if not text or is_page_artifact(text):
            close()
        else:
            buffer.append(text)
    close()
    return paragraphs
def _collect_uppercase_heading_lines(lines: list[str], start_idx: int) -> tuple[list[str], int]:
    """Gather the all-caps title lines that follow a CHAPTER/APPENDIX marker.

    Scans forward from ``start_idx``, skipping blank lines and page
    artifacts, and accumulates consecutive upper-case lines (at most 120
    characters each) into the heading.  Scanning stops — without consuming
    the stopping line — at the next "CHAPTER n"/"APPENDIX n" marker or at
    the first line that is not an upper-case heading line.

    Returns ``(heading_lines, idx)`` where ``idx`` is the index of the
    first unconsumed line, for the caller to resume from.
    """
    heading_lines: list[str] = []
    idx = start_idx
    while idx < len(lines):
        candidate = lines[idx].strip()
        idx += 1  # tentatively consume; backed out below on overshoot
        if not candidate or is_page_artifact(candidate):
            continue
        if re.fullmatch(r"(CHAPTER|APPENDIX)\s+\d+", candidate):
            idx -= 1  # leave the next chapter/appendix marker for the caller
            break
        if candidate.isupper() and len(candidate) <= 120:
            heading_lines.append(candidate)
            continue
        idx -= 1  # first body line: un-consume it and stop
        break
    return heading_lines, idx
def convert_thesis_file(source: Path, destination: Path, title: str) -> None:
    """Convert the structured thesis text into a complete LaTeX report.

    Locates the front-matter landmarks (ACKNOWLEDGEMENTS, ABSTRACT, TABLE
    OF CONTENTS, CHAPTER 1, BIBLIOGRAPHY) by exact stripped-line match,
    extracts the acknowledgements and abstract as paragraphs, converts the
    chapter/appendix region via _convert_thesis_body, and writes the
    finished document (with a hand-built title page) to *destination* as
    UTF-8.  Raises ValueError (via _find_line_index) if any landmark line
    is missing from the source.
    """
    lines = _clean_lines(source)
    # Landmark indices; each search starts just past the previous landmark
    # so repeated occurrences (e.g. in the table of contents) are skipped.
    ack_idx = _find_line_index(lines, "ACKNOWLEDGEMENTS")
    abs_idx = _find_line_index(lines, "ABSTRACT", ack_idx + 1)
    toc_idx = _find_line_index(lines, "TABLE OF CONTENTS", abs_idx + 1)
    chap1_idx = _find_line_index(lines, "CHAPTER 1", abs_idx + 1)
    bib_idx = _find_line_index(lines, "BIBLIOGRAPHY", chap1_idx + 1)
    acknowledgements = _collect_paragraphs(lines[ack_idx + 1 : abs_idx])
    abstract_lines = []
    for line in lines[abs_idx + 1 : toc_idx]:
        stripped = line.strip()
        if not stripped or is_page_artifact(stripped):
            # Keep a break marker so paragraph boundaries survive filtering.
            abstract_lines.append("")
            continue
        # Drop the abstract page's repeated title-block lines.
        if stripped in {
            "INTEGRATION AND HYBRIDIZATION IN NEURAL NETWORK MODELLING",
            "Wesley Royce Elsberry, M.S.",
            "The University of Texas at Arlington, 1989",
            "Supervising Professor: Karan Briggs",
        }:
            continue
        # Drop dot-leader lines (table-of-contents style "x . . . 12").
        if ". . ." in stripped:
            continue
        abstract_lines.append(stripped)
    abstract = _collect_paragraphs(abstract_lines)
    body = _convert_thesis_body(lines[chap1_idx:bib_idx])
    tex = [
        r"\documentclass[12pt]{report}",
        r"\usepackage[utf8]{inputenc}",
        r"\usepackage[T1]{fontenc}",
        r"\usepackage{geometry}",
        r"\geometry{margin=1in}",
        r"\title{" + escape_latex(title) + "}",
        r"\author{Wesley Royce Elsberry}",
        r"\date{August 1989}",
        r"\begin{document}",
        r"\begin{titlepage}",
        r"\centering",
        r"{\Large " + escape_latex(title) + r"\par}",
        r"\vspace{1.5cm}",
        r"{\large Wesley Royce Elsberry\par}",
        r"\vspace{1cm}",
        r"Presented to the Faculty of the Graduate School of\par",
        r"The University of Texas at Arlington in Partial Fulfillment\par",
        r"of the Requirements for the Degree of\par",
        r"\vspace{0.5cm}",
        r"{\large Master of Science in Computer Science\par}",
        r"\vfill",
        r"The University of Texas at Arlington\par",
        r"August 1989\par",
        r"\end{titlepage}",
        r"\chapter*{Acknowledgements}",
        *[p + "\n" for p in acknowledgements],
        r"\chapter*{Abstract}",
        *[p + "\n" for p in abstract],
        r"\tableofcontents",
        *body,
        r"\nocite{*}",
        r"\bibliographystyle{plain}",
        r"\bibliography{integration_and_hybridization_in_neural_network_modelling}",
        r"\end{document}",
        "",
    ]
    destination.write_text("\n".join(tex), encoding="utf-8")
def _convert_thesis_body(lines: list[str]) -> list[str]:
    """Translate the chapter/appendix region of the thesis into LaTeX lines.

    Walks *lines* as a small state machine: "CHAPTER n" / "APPENDIX n"
    markers become \\chapter commands (with their following upper-case
    title lines merged via _collect_uppercase_heading_lines), short
    upper-case lines become unnumbered sections, dot directives become
    LaTeX comments, and everything else accumulates into paragraphs.
    Program/data listing appendices are suppressed and replaced with a
    pointer to the repository files.
    """
    body: list[str] = []
    paragraph: list[str] = []
    idx = 0
    in_appendix = False
    appendix_mode = "normal"  # "listing" => swallow appendix content

    def flush_paragraph() -> None:
        # Emit the accumulated paragraph (escaped) followed by a blank line.
        if paragraph:
            body.append(escape_latex(" ".join(paragraph)))
            body.append("")
            paragraph.clear()

    while idx < len(lines):
        stripped = lines[idx].strip()
        idx += 1
        if not stripped or is_page_artifact(stripped):
            flush_paragraph()
            continue
        if stripped.startswith("."):
            # Legacy word-processor dot directive: keep as a LaTeX comment.
            flush_paragraph()
            body.append(f"% raw formatter directive: {escape_latex(stripped)}")
            continue
        if re.fullmatch(r"CHAPTER\s+\d+", stripped):
            flush_paragraph()
            # Consume the chapter's upper-case title lines; idx advances
            # past whatever _collect_uppercase_heading_lines consumed.
            heading_lines, idx = _collect_uppercase_heading_lines(lines, idx)
            title = " ".join(heading_lines) if heading_lines else stripped
            body.append(rf"\chapter{{{escape_latex(title.title())}}}")
            continue
        if re.fullmatch(r"APPENDIX\s+\d+", stripped):
            flush_paragraph()
            if not in_appendix:
                # First appendix switches LaTeX to appendix numbering.
                body.append(r"\appendix")
                in_appendix = True
            heading_lines, idx = _collect_uppercase_heading_lines(lines, idx)
            title = " ".join(heading_lines) if heading_lines else stripped
            body.append(rf"\chapter{{{escape_latex(title.title())}}}")
            if (
                "PROGRAM SOURCE LISTING" in title
                or "DATA FILE LISTING" in title
            ):
                # Listings are huge; replace them with a repository pointer.
                appendix_mode = "listing"
                body.append(
                    "This appendix is represented in the repository by the legacy source and data files in \\texttt{THES/}. "
                    "The automated thesis conversion suppresses the full listing here to keep the document manageable."
                )
                body.append("")
            else:
                appendix_mode = "normal"
            continue
        if in_appendix and appendix_mode == "listing":
            # Swallow suppressed listing content entirely.
            continue
        if stripped.isupper() and len(stripped.split()) <= 10 and len(stripped) > 4:
            flush_paragraph()
            body.append(rf"\section*{{{escape_latex(stripped.title())}}}")
            continue
        paragraph.append(stripped)
    flush_paragraph()
    return body
def main() -> int:
    """Convert every configured legacy source file to LaTeX.

    Creates the output directory if needed and dispatches each entry in
    FILES to the thesis or generic converter.  Returns 0 on success.
    """
    OUT.mkdir(parents=True, exist_ok=True)
    for name, spec in FILES.items():
        src = THES / name
        dst = OUT / "{}.tex".format(spec["target"])
        converter = (
            convert_thesis_file if spec["mode"] == "thesis" else convert_text_file
        )
        converter(src, dst, spec["title"])
    return 0


if __name__ == "__main__":
    raise SystemExit(main())