"""Convert legacy plain-text thesis documents into LaTeX sources.

Each entry of ``FILES`` names a text file under ``THES/`` and the LaTeX file
to generate under ``latex/``.  Files in ``"thesis"`` mode are split into front
matter, chapters and appendices; every other file goes through a generic
line-oriented conversion.
"""

from __future__ import annotations

import re
from pathlib import Path

ROOT = Path(__file__).resolve().parents[1]
THES = ROOT / "THES"
OUT = ROOT / "latex"

FILES = {
    "INT_ANN.TXT": {
        "target": "integration_and_hybridization_in_neural_network_modelling",
        "title": "Integration and Hybridization in Neural Network Modelling",
        "mode": "thesis",
    },
    "COMPCOOP.TXT": {
        "target": "competing_network_models_and_problem_solving",
        "title": "Competing Network Models and Problem-Solving",
        "mode": "generic",
    },
    "THPROPOS.TXT": {
        "target": "thesis_proposal",
        "title": "Thesis Proposal",
        "mode": "generic",
    },
}

FRONT_MATTER_HEADINGS = {
    "ACKNOWLEDGEMENTS",
    "ABSTRACT",
    "TABLE OF CONTENTS",
    "LIST OF ILLUSTRATIONS",
    "LIST OF TABLES",
}

# Single-pass translation table.  Escaping every character in one pass matters:
# chaining ``str.replace`` calls would re-escape the braces that belong to the
# ``\textbackslash{}`` replacement and emit ``\textbackslash\{\}``.
_LATEX_ESCAPES = str.maketrans(
    {
        "\\": r"\textbackslash{}",
        "&": r"\&",
        "%": r"\%",
        "$": r"\$",
        "#": r"\#",
        "_": r"\_",
        "{": r"\{",
        "}": r"\}",
        "~": r"\textasciitilde{}",
        "^": r"\textasciicircum{}",
    }
)

# Patterns are compiled once because they are applied to every input line.
_CONTROL_CHARS_RE = re.compile(r"[\x00-\x08\x0b-\x1f]")
_WHITESPACE_RE = re.compile(r"\s+")
_ROMAN_NUMERAL_RE = re.compile(r"[ivxlcdmIVXLCDM]+")
_DIGITS_RE = re.compile(r"\d+")
_UNDERSCORE_RULE_RE = re.compile(r"_+")
_UPPERCASE_HEADING_RE = re.compile(r"[A-Z][A-Z\s\-]{2,}")
_ROMAN_SECTION_RE = re.compile(r"[IVX]+\.\s+.*")
_NUMBERED_SECTION_RE = re.compile(r"\d+\.\s+.*")
_CHAPTER_RE = re.compile(r"CHAPTER\s+\d+")
_APPENDIX_RE = re.compile(r"APPENDIX\s+\d+")
_CHAPTER_OR_APPENDIX_RE = re.compile(r"(CHAPTER|APPENDIX)\s+\d+")

# Title-page lines repeated inside the abstract section of the thesis; they
# are dropped because the generated title page already carries them.
_ABSTRACT_BOILERPLATE = frozenset(
    {
        "INTEGRATION AND HYBRIDIZATION IN NEURAL NETWORK MODELLING",
        "Wesley Royce Elsberry, M.S.",
        "The University of Texas at Arlington, 1989",
        "Supervising Professor: Karan Briggs",
    }
)


def escape_latex(text: str) -> str:
    """Return *text* with LaTeX special characters escaped.

    All ten specials (``\\ & % $ # _ { } ~ ^``) are replaced in a single pass,
    so the output of one replacement is never escaped again by another.
    """
    return text.translate(_LATEX_ESCAPES)


def clean_line(line: str) -> str:
    """Normalise one raw input line.

    Control characters other than tab and newline are removed, a byte-order
    mark is dropped, three stray symbols are mapped to an apostrophe, runs of
    whitespace collapse to a single space and trailing whitespace is stripped.
    """
    # The character class already covers \x17 and \x1a, so they need no
    # separate handling.
    line = _CONTROL_CHARS_RE.sub("", line)
    line = line.replace("\ufeff", "")
    # NOTE(review): presumably these stand in for mis-encoded quote marks in
    # the legacy files -- confirm against the actual sources.
    for stray in ("�", "®", "™"):
        line = line.replace(stray, "'")
    line = _WHITESPACE_RE.sub(" ", line)
    return line.rstrip()


def looks_like_heading(line: str) -> bool:
    """Return True when *line* heuristically looks like a section heading.

    Blank lines, lines longer than 80 characters, formatter directives
    (leading ``.``) and bare roman/arabic numerals are never headings.  Known
    front-matter titles, short all-uppercase lines and lines starting with a
    roman or arabic ordinal followed by a period are.
    """
    stripped = line.strip()
    if not stripped or len(stripped) > 80 or stripped.startswith("."):
        return False
    if stripped in FRONT_MATTER_HEADINGS:
        return True
    # Bare numerals are rejected before the uppercase rules can claim them.
    if _ROMAN_NUMERAL_RE.fullmatch(stripped) or _DIGITS_RE.fullmatch(stripped):
        return False

    word_count = len(stripped.split())
    if _UPPERCASE_HEADING_RE.fullmatch(stripped) and word_count <= 10:
        return True
    if _ROMAN_SECTION_RE.fullmatch(stripped):
        return True
    if _NUMBERED_SECTION_RE.fullmatch(stripped):
        return True
    return stripped.isupper() and word_count <= 12 and len(stripped) >= 4


def is_page_artifact(line: str) -> bool:
    """Return True for lines that are pagination or layout residue.

    Matches lines consisting only of roman-numeral letters, only of digits or
    only of underscores, plus the literal ``Publication No.`` line.  Blank
    lines are not artifacts.
    """
    stripped = line.strip()
    if not stripped:
        return False
    if stripped == "Publication No.":
        return True
    return any(
        pattern.fullmatch(stripped)
        for pattern in (_ROMAN_NUMERAL_RE, _DIGITS_RE, _UNDERSCORE_RULE_RE)
    )


def heading_command(line: str) -> str:
    """Return the LaTeX sectioning command name to use for heading *line*.

    Every heading kind currently maps to the unnumbered ``section*`` command,
    so the result does not depend on the text of *line*.
    """
    return "section*"


def convert_text_file(source: Path, destination: Path, title: str) -> None:
    """Convert a generic legacy text file into a standalone LaTeX article.

    Args:
        source: Text file to read (decoded as latin-1).
        destination: ``.tex`` file to write (encoded as UTF-8).
        title: Document title; escaped before being placed in ``\\title``.
    """
    body: list[str] = []
    paragraph: list[str] = []

    def flush_paragraph() -> None:
        # Emit the pending lines as one escaped paragraph plus a blank line.
        if paragraph:
            body.append(escape_latex(" ".join(paragraph)))
            body.append("")
            paragraph.clear()

    for line in _clean_lines(source):
        stripped = line.strip()
        if not stripped or is_page_artifact(stripped):
            flush_paragraph()
        elif stripped.startswith("."):
            # Dot-prefixed lines are kept, but only as LaTeX comments.
            flush_paragraph()
            body.append(f"% raw formatter directive: {escape_latex(stripped)}")
        elif looks_like_heading(stripped):
            flush_paragraph()
            body.append(
                "\\" + heading_command(stripped) + "{" + escape_latex(stripped) + "}"
            )
        else:
            paragraph.append(stripped)
    flush_paragraph()

    tex = [
        r"\documentclass[12pt]{article}",
        r"\usepackage[utf8]{inputenc}",
        r"\usepackage[T1]{fontenc}",
        r"\usepackage{geometry}",
        r"\geometry{margin=1in}",
        r"\title{" + escape_latex(title) + "}",
        r"\author{Converted from legacy plain-text source}",
        r"\date{}",
        r"\begin{document}",
        r"\maketitle",
        r"\begin{flushleft}",
        "% This is a conservative automated conversion from the legacy text file.",
        "% Manual cleanup will still be necessary for figures, references, footnotes, and formatting.",
        "",
        *body,
        r"\end{flushleft}",
        r"\end{document}",
        "",
    ]
    destination.write_text("\n".join(tex), encoding="utf-8")


def _clean_lines(source: Path) -> list[str]:
    """Read *source* as latin-1 and return its lines passed through clean_line."""
    return [clean_line(line) for line in source.read_text(encoding="latin-1").splitlines()]


def _find_line_index(lines: list[str], needle: str, start: int = 0) -> int:
    """Return the index of the first line at or after *start* equal to *needle*.

    Lines are compared after stripping surrounding whitespace.

    Raises:
        ValueError: If no such line exists.
    """
    for idx in range(start, len(lines)):
        if lines[idx].strip() == needle:
            return idx
    raise ValueError(f"could not find line: {needle}")


def _collect_paragraphs(lines: list[str]) -> list[str]:
    """Group *lines* into LaTeX-escaped paragraphs.

    Blank lines and page artifacts act as paragraph separators; the lines of
    each paragraph are joined with single spaces.
    """
    paragraphs: list[str] = []
    current: list[str] = []
    for raw in lines:
        stripped = raw.strip()
        if stripped and not is_page_artifact(stripped):
            current.append(stripped)
        elif current:
            paragraphs.append(escape_latex(" ".join(current)))
            current.clear()
    if current:
        paragraphs.append(escape_latex(" ".join(current)))
    return paragraphs


def _collect_uppercase_heading_lines(lines: list[str], start_idx: int) -> tuple[list[str], int]:
    """Collect the uppercase title lines that follow a chapter/appendix marker.

    Scanning starts at *start_idx*, skips blank lines and page artifacts, and
    stops at the next ``CHAPTER n`` / ``APPENDIX n`` marker or at the first
    line that is not all-uppercase (or is longer than 120 characters).

    Returns:
        The collected title lines and the index of the line that ended the
        scan (``len(lines)`` when the input was exhausted).
    """
    heading_lines: list[str] = []
    idx = start_idx
    while idx < len(lines):
        candidate = lines[idx].strip()
        if not candidate or is_page_artifact(candidate):
            idx += 1
            continue
        if _CHAPTER_OR_APPENDIX_RE.fullmatch(candidate):
            break
        if not (candidate.isupper() and len(candidate) <= 120):
            break
        heading_lines.append(candidate)
        idx += 1
    return heading_lines, idx


def convert_thesis_file(source: Path, destination: Path, title: str) -> None:
    """Convert the thesis text file into a LaTeX ``report`` document.

    The input is located by its literal section markers (ACKNOWLEDGEMENTS,
    ABSTRACT, TABLE OF CONTENTS, CHAPTER 1, BIBLIOGRAPHY).  Acknowledgements
    and abstract become unnumbered chapters, the original table of contents is
    replaced by ``\\tableofcontents`` and everything from chapter 1 up to the
    bibliography is converted by ``_convert_thesis_body``.

    Args:
        source: Thesis text file to read (decoded as latin-1).
        destination: ``.tex`` file to write (encoded as UTF-8).
        title: Thesis title; escaped before use.

    Raises:
        ValueError: If one of the section markers is missing.
    """
    lines = _clean_lines(source)
    ack_idx = _find_line_index(lines, "ACKNOWLEDGEMENTS")
    abs_idx = _find_line_index(lines, "ABSTRACT", ack_idx + 1)
    toc_idx = _find_line_index(lines, "TABLE OF CONTENTS", abs_idx + 1)
    chap1_idx = _find_line_index(lines, "CHAPTER 1", abs_idx + 1)
    bib_idx = _find_line_index(lines, "BIBLIOGRAPHY", chap1_idx + 1)

    acknowledgements = _collect_paragraphs(lines[ack_idx + 1 : abs_idx])

    abstract_lines: list[str] = []
    for line in lines[abs_idx + 1 : toc_idx]:
        stripped = line.strip()
        if not stripped or is_page_artifact(stripped):
            # Keep an empty entry so paragraph boundaries survive.
            abstract_lines.append("")
        elif stripped in _ABSTRACT_BOILERPLATE or ". . ." in stripped:
            # Repeated title-page lines and dot-leader lines are dropped.
            continue
        else:
            abstract_lines.append(stripped)
    abstract = _collect_paragraphs(abstract_lines)

    body = _convert_thesis_body(lines[chap1_idx:bib_idx])

    tex = [
        r"\documentclass[12pt]{report}",
        r"\usepackage[utf8]{inputenc}",
        r"\usepackage[T1]{fontenc}",
        r"\usepackage{geometry}",
        r"\geometry{margin=1in}",
        r"\title{" + escape_latex(title) + "}",
        r"\author{Wesley Royce Elsberry}",
        r"\date{August 1989}",
        r"\begin{document}",
        r"\begin{titlepage}",
        r"\centering",
        r"{\Large " + escape_latex(title) + r"\par}",
        r"\vspace{1.5cm}",
        r"{\large Wesley Royce Elsberry\par}",
        r"\vspace{1cm}",
        r"Presented to the Faculty of the Graduate School of\par",
        r"The University of Texas at Arlington in Partial Fulfillment\par",
        r"of the Requirements for the Degree of\par",
        r"\vspace{0.5cm}",
        r"{\large Master of Science in Computer Science\par}",
        r"\vfill",
        r"The University of Texas at Arlington\par",
        r"August 1989\par",
        r"\end{titlepage}",
        r"\chapter*{Acknowledgements}",
        # The trailing newline yields a blank line after each paragraph.
        *[p + "\n" for p in acknowledgements],
        r"\chapter*{Abstract}",
        *[p + "\n" for p in abstract],
        r"\tableofcontents",
        *body,
        r"\nocite{*}",
        r"\bibliographystyle{plain}",
        r"\bibliography{integration_and_hybridization_in_neural_network_modelling}",
        r"\end{document}",
        "",
    ]
    destination.write_text("\n".join(tex), encoding="utf-8")


def _convert_thesis_body(lines: list[str]) -> list[str]:
    """Convert the chapter and appendix lines of the thesis into LaTeX lines.

    ``CHAPTER n`` / ``APPENDIX n`` markers become ``\\chapter`` commands titled
    with the uppercase lines that follow them (title-cased); the first
    appendix also emits ``\\appendix``.  Appendices whose title mentions a
    program source or data file listing are replaced by a short notice.
    Short all-uppercase lines become ``\\section*`` headings and everything
    else is gathered into paragraphs.
    """
    body: list[str] = []
    paragraph: list[str] = []
    idx = 0
    in_appendix = False
    appendix_mode = "normal"

    def flush_paragraph() -> None:
        # Emit the pending lines as one escaped paragraph plus a blank line.
        if paragraph:
            body.append(escape_latex(" ".join(paragraph)))
            body.append("")
            paragraph.clear()

    while idx < len(lines):
        stripped = lines[idx].strip()
        idx += 1

        if not stripped or is_page_artifact(stripped):
            flush_paragraph()
            continue

        if stripped.startswith("."):
            # NOTE(review): this runs before the listing-suppression check
            # below, so dot-prefixed lines inside a suppressed listing are
            # still emitted as comments -- confirm that is intended.
            flush_paragraph()
            body.append(f"% raw formatter directive: {escape_latex(stripped)}")
            continue

        if _CHAPTER_RE.fullmatch(stripped):
            flush_paragraph()
            heading_lines, idx = _collect_uppercase_heading_lines(lines, idx)
            title = " ".join(heading_lines) if heading_lines else stripped
            body.append("\\chapter{" + escape_latex(title.title()) + "}")
            continue

        if _APPENDIX_RE.fullmatch(stripped):
            flush_paragraph()
            if not in_appendix:
                body.append(r"\appendix")
                in_appendix = True
            heading_lines, idx = _collect_uppercase_heading_lines(lines, idx)
            title = " ".join(heading_lines) if heading_lines else stripped
            body.append("\\chapter{" + escape_latex(title.title()) + "}")
            if "PROGRAM SOURCE LISTING" in title or "DATA FILE LISTING" in title:
                appendix_mode = "listing"
                body.append(
                    "This appendix is represented in the repository by the legacy source and data files in \\texttt{THES/}. "
                    "The automated thesis conversion suppresses the full listing here to keep the document manageable."
                )
                body.append("")
            else:
                appendix_mode = "normal"
            continue

        if in_appendix and appendix_mode == "listing":
            # Listing content is skipped until the next appendix marker.
            continue

        if stripped.isupper() and len(stripped.split()) <= 10 and len(stripped) > 4:
            flush_paragraph()
            body.append("\\section*{" + escape_latex(stripped.title()) + "}")
            continue

        paragraph.append(stripped)

    flush_paragraph()
    return body


def main() -> int:
    """Convert every file listed in ``FILES`` and return the exit status 0."""
    OUT.mkdir(parents=True, exist_ok=True)
    for source_name, config in FILES.items():
        source = THES / source_name
        destination = OUT / f"{config['target']}.tex"
        if config["mode"] == "thesis":
            convert_thesis_file(source, destination, config["title"])
        else:
            convert_text_file(source, destination, config["title"])
    return 0


if __name__ == "__main__":
    raise SystemExit(main())