# Legacy thesis-to-LaTeX conversion script (plain-text sources in THES/ -> latex/).
from __future__ import annotations

from pathlib import Path

import re

# Repository root: this script resolves its own location and goes one
# directory up, so it is expected to live in a first-level subdirectory.
ROOT = Path(__file__).resolve().parents[1]
# Directory holding the legacy plain-text thesis sources.
THES = ROOT / "THES"
# Output directory for the generated .tex files.
OUT = ROOT / "latex"

# Conversion configuration, keyed by legacy source filename.
#   target: basename of the generated .tex file
#   title:  document title used in the LaTeX preamble
#   mode:   "thesis" selects the structured thesis converter;
#           any other value selects the generic article converter.
FILES = {
    "INT_ANN.TXT": {
        "target": "integration_and_hybridization_in_neural_network_modelling",
        "title": "Integration and Hybridization in Neural Network Modelling",
        "mode": "thesis",
    },
    "COMPCOOP.TXT": {
        "target": "competing_network_models_and_problem_solving",
        "title": "Competing Network Models and Problem-Solving",
        "mode": "generic",
    },
    "THPROPOS.TXT": {
        "target": "thesis_proposal",
        "title": "Thesis Proposal",
        "mode": "generic",
    },
}

# Front-matter section titles recognized verbatim as headings by
# looks_like_heading() and heading_command().
FRONT_MATTER_HEADINGS = {
    "ACKNOWLEDGEMENTS",
    "ABSTRACT",
    "TABLE OF CONTENTS",
    "LIST OF ILLUSTRATIONS",
    "LIST OF TABLES",
}
def escape_latex(text: str) -> str:
    """Escape LaTeX special characters in *text* and return the result.

    Uses a single-pass ``str.translate`` so replacement text is never itself
    re-escaped.  The previous sequential ``.replace()`` chain escaped the
    backslash first, producing ``\\textbackslash{}``, and the later brace
    replacements then corrupted it into ``\\textbackslash\\{\\}``.
    """
    # str.maketrans maps each single character to its LaTeX-safe spelling;
    # translate applies all mappings in one pass over the input.
    table = str.maketrans({
        "\\": r"\textbackslash{}",
        "&": r"\&",
        "%": r"\%",
        "$": r"\$",
        "#": r"\#",
        "_": r"\_",
        "{": r"\{",
        "}": r"\}",
        "~": r"\textasciitilde{}",
        "^": r"\textasciicircum{}",
    })
    return text.translate(table)
def clean_line(line: str) -> str:
    """Normalize one line of legacy text.

    Strips control characters and BOMs, repairs mojibake apostrophes, and
    collapses whitespace runs to single spaces (right-trimmed).

    Fixes a defect in the original chain: ``line.replace("", "'")`` had an
    empty needle, which inserts an apostrophe between every pair of
    characters.  The intended needle was evidently the Unicode replacement
    character U+FFFD left behind by a bad decode.
    """
    # Drop ASCII control characters (tab/newline excluded from the range);
    # this range already covers \x17 and \x1a, so the original's explicit
    # removals of those two were redundant.
    line = re.sub(r"[\x00-\x08\x0b-\x1f]", "", line)
    # Remove any byte-order mark that leaked into the text.
    line = line.replace("\ufeff", "")
    # Repair characters that encoding confusion turned into garbage where an
    # apostrophe belongs.  NOTE(review): "<EFBFBD>" is the hex spelling of
    # U+FFFD as some tools render it, and "®" presumably arose from a
    # cp1252/latin-1 mix-up of a right single quote — confirm against the
    # legacy sources.
    line = line.replace("<EFBFBD>", "'").replace("®", "'").replace("\ufffd", "'")
    # Collapse every run of whitespace to one space, then trim the right edge.
    line = re.sub(r"\s+", " ", line)
    return line.rstrip()
def looks_like_heading(line: str) -> bool:
    """Decide whether a cleaned text line should be rendered as a heading.

    Known front-matter titles are always headings; bare page numbers (roman
    or arabic) never are.  Otherwise, short all-caps lines and lines opening
    with a roman- or arabic-numeral section prefix qualify.
    """
    text = line.strip()
    # Guard clauses: empty, overly long, or formatter-directive lines.
    if not text or len(text) > 80 or text.startswith("."):
        return False
    if text in FRONT_MATTER_HEADINGS:
        return True
    # Bare roman/arabic numerals are page numbers, not headings.
    if re.fullmatch(r"[ivxlcdmIVXLCDM]+", text) or re.fullmatch(r"\d+", text):
        return False
    word_count = len(text.split())
    if re.fullmatch(r"[A-Z][A-Z\s\-]{2,}", text) and word_count <= 10:
        return True
    if re.fullmatch(r"[IVX]+\.\s+.*", text) or re.fullmatch(r"\d+\.\s+.*", text):
        return True
    return text.isupper() and word_count <= 12 and len(text) >= 4
def is_page_artifact(line: str) -> bool:
    """Return True for pagination residue that should be discarded.

    Artifacts are bare page numbers (roman or arabic numerals), runs of
    underscores, and the stray "Publication No." label.
    """
    text = line.strip()
    if not text:
        return False
    number_like = (r"[ivxlcdmIVXLCDM]+", r"\d+", r"_+")
    if any(re.fullmatch(pattern, text) for pattern in number_like):
        return True
    return text == "Publication No."
def heading_command(line: str) -> str:
    """Return the LaTeX sectioning command for a heading line.

    Every branch of the original returned the identical string
    ``"section*"``, so the redundant pattern checks (front-matter titles,
    roman-numeral prefixes, arabic-numeral prefixes) have been removed.
    The parameter is retained so callers are unaffected and a future
    revision can differentiate heading levels.
    """
    return "section*"
def convert_text_file(source: Path, destination: Path, title: str) -> None:
    """Convert a generic legacy text file into a standalone LaTeX article.

    Blank lines and page artifacts terminate the paragraph in progress,
    leading-dot lines are preserved as commented-out formatter directives,
    heading-like lines become unnumbered sections, and everything else is
    accumulated into paragraphs.  The result is written to *destination*
    as UTF-8.
    """
    body: list[str] = []
    pending: list[str] = []

    def emit_pending() -> None:
        # Close out the paragraph under construction, if any.
        if not pending:
            return
        joined = " ".join(piece.strip() for piece in pending if piece.strip())
        body.append(escape_latex(joined))
        body.append("")
        pending.clear()

    for raw in source.read_text(encoding="latin-1").splitlines():
        text = clean_line(raw).strip()
        if not text or is_page_artifact(text):
            emit_pending()
        elif text.startswith("."):
            emit_pending()
            body.append(f"% raw formatter directive: {escape_latex(text)}")
        elif looks_like_heading(text):
            emit_pending()
            body.append(rf"\{heading_command(text)}{{{escape_latex(text)}}}")
        else:
            pending.append(text)
    emit_pending()

    tex = [
        r"\documentclass[12pt]{article}",
        r"\usepackage[utf8]{inputenc}",
        r"\usepackage[T1]{fontenc}",
        r"\usepackage{geometry}",
        r"\geometry{margin=1in}",
        r"\title{" + escape_latex(title) + "}",
        r"\author{Converted from legacy plain-text source}",
        r"\date{}",
        r"\begin{document}",
        r"\maketitle",
        r"\begin{flushleft}",
        "% This is a conservative automated conversion from the legacy text file.",
        "% Manual cleanup will still be necessary for figures, references, footnotes, and formatting.",
        "",
        *body,
        r"\end{flushleft}",
        r"\end{document}",
        "",
    ]
    destination.write_text("\n".join(tex), encoding="utf-8")
def _clean_lines(source: Path) -> list[str]:
    """Read *source* as latin-1 and return its lines run through clean_line."""
    raw = source.read_text(encoding="latin-1")
    return [clean_line(entry) for entry in raw.splitlines()]
||
|
||
def _find_line_index(lines: list[str], needle: str, start: int = 0) -> int:
|
||
for idx in range(start, len(lines)):
|
||
if lines[idx].strip() == needle:
|
||
return idx
|
||
raise ValueError(f"could not find line: {needle}")
|
||
|
||
def _collect_paragraphs(lines: list[str]) -> list[str]:
    """Group *lines* into LaTeX-escaped paragraph strings.

    Blank lines and page artifacts end the paragraph being accumulated;
    consecutive content lines are joined with single spaces.
    """
    paragraphs: list[str] = []
    buffer: list[str] = []

    def close() -> None:
        # Flush the accumulated lines as one escaped paragraph.
        if buffer:
            paragraphs.append(escape_latex(" ".join(buffer)))
            buffer.clear()

    for raw in lines:
        text = raw.strip()
        if text and not is_page_artifact(text):
            buffer.append(text)
        else:
            close()
    close()
    return paragraphs
def _collect_uppercase_heading_lines(lines: list[str], start_idx: int) -> tuple[list[str], int]:
    """Gather the all-uppercase title lines following a CHAPTER/APPENDIX marker.

    Scans forward from *start_idx*, skipping blank lines and page artifacts,
    and collects consecutive uppercase lines (max length 120).  Scanning
    stops — without consuming the stopping line — at the next
    ``CHAPTER n``/``APPENDIX n`` marker or at the first ordinary body line.

    Returns the collected heading lines and the index of the first
    unconsumed line, so the caller can resume iteration there.
    """
    heading_lines: list[str] = []
    idx = start_idx
    while idx < len(lines):
        candidate = lines[idx].strip()
        idx += 1
        if not candidate or is_page_artifact(candidate):
            # Blanks and page numbers inside the heading area are skipped.
            continue
        if re.fullmatch(r"(CHAPTER|APPENDIX)\s+\d+", candidate):
            # A new chapter/appendix marker: back up so the caller sees it.
            idx -= 1
            break
        if candidate.isupper() and len(candidate) <= 120:
            heading_lines.append(candidate)
            continue
        # First non-uppercase body line: back up so the caller processes it.
        idx -= 1
        break
    return heading_lines, idx
def convert_thesis_file(source: Path, destination: Path, title: str) -> None:
    """Convert the legacy thesis text into a structured LaTeX report.

    Locates the major landmarks (ACKNOWLEDGEMENTS, ABSTRACT, TABLE OF
    CONTENTS, CHAPTER 1, BIBLIOGRAPHY) by exact stripped-line match,
    extracts the acknowledgements and abstract as paragraphs, converts the
    chapter body via _convert_thesis_body, and writes a complete .tex
    document (with a hand-built title page) to *destination* as UTF-8.

    Raises ValueError (from _find_line_index) when a landmark is missing.
    """
    lines = _clean_lines(source)
    ack_idx = _find_line_index(lines, "ACKNOWLEDGEMENTS")
    abs_idx = _find_line_index(lines, "ABSTRACT", ack_idx + 1)
    toc_idx = _find_line_index(lines, "TABLE OF CONTENTS", abs_idx + 1)
    # NOTE(review): this search starts after ABSTRACT, not after the table of
    # contents; if the TOC itself contains a line reading exactly "CHAPTER 1",
    # chap1_idx would land inside the TOC — confirm against the legacy source.
    chap1_idx = _find_line_index(lines, "CHAPTER 1", abs_idx + 1)
    bib_idx = _find_line_index(lines, "BIBLIOGRAPHY", chap1_idx + 1)

    acknowledgements = _collect_paragraphs(lines[ack_idx + 1 : abs_idx])
    abstract_lines = []
    for line in lines[abs_idx + 1 : toc_idx]:
        stripped = line.strip()
        if not stripped or is_page_artifact(stripped):
            # Keep a blank so _collect_paragraphs still sees paragraph breaks.
            abstract_lines.append("")
            continue
        # Drop the abstract page's repeated title-block lines; equivalents
        # are regenerated on the LaTeX title page below.
        if stripped in {
            "INTEGRATION AND HYBRIDIZATION IN NEURAL NETWORK MODELLING",
            "Wesley Royce Elsberry, M.S.",
            "The University of Texas at Arlington, 1989",
            "Supervising Professor: Karan Briggs",
        }:
            continue
        # Dotted leader lines are layout residue from the original pagination.
        if ". . ." in stripped:
            continue
        abstract_lines.append(stripped)
    abstract = _collect_paragraphs(abstract_lines)

    body = _convert_thesis_body(lines[chap1_idx:bib_idx])
    tex = [
        r"\documentclass[12pt]{report}",
        r"\usepackage[utf8]{inputenc}",
        r"\usepackage[T1]{fontenc}",
        r"\usepackage{geometry}",
        r"\geometry{margin=1in}",
        r"\title{" + escape_latex(title) + "}",
        r"\author{Wesley Royce Elsberry}",
        r"\date{August 1989}",
        r"\begin{document}",
        r"\begin{titlepage}",
        r"\centering",
        r"{\Large " + escape_latex(title) + r"\par}",
        r"\vspace{1.5cm}",
        r"{\large Wesley Royce Elsberry\par}",
        r"\vspace{1cm}",
        r"Presented to the Faculty of the Graduate School of\par",
        r"The University of Texas at Arlington in Partial Fulfillment\par",
        r"of the Requirements for the Degree of\par",
        r"\vspace{0.5cm}",
        r"{\large Master of Science in Computer Science\par}",
        r"\vfill",
        r"The University of Texas at Arlington\par",
        r"August 1989\par",
        r"\end{titlepage}",
        r"\chapter*{Acknowledgements}",
        *[p + "\n" for p in acknowledgements],
        r"\chapter*{Abstract}",
        *[p + "\n" for p in abstract],
        r"\tableofcontents",
        *body,
        r"\nocite{*}",
        r"\bibliographystyle{plain}",
        r"\bibliography{integration_and_hybridization_in_neural_network_modelling}",
        r"\end{document}",
        "",
    ]
    destination.write_text("\n".join(tex), encoding="utf-8")
def _convert_thesis_body(lines: list[str]) -> list[str]:
    """Convert the chapter/appendix region of the thesis into LaTeX body lines.

    Walks *lines* once, turning ``CHAPTER n``/``APPENDIX n`` markers (plus
    their following uppercase title lines) into ``\\chapter`` commands,
    short uppercase lines into unnumbered sections, leading-dot lines into
    commented-out formatter directives, and everything else into paragraphs.
    Appendices whose titles mention program/data listings are suppressed and
    replaced by a pointer to the repository files.
    """
    body: list[str] = []
    paragraph: list[str] = []
    idx = 0
    # Set once the first APPENDIX marker is seen (emits \appendix exactly once).
    in_appendix = False
    # "listing" suppresses body text until the next chapter/appendix marker.
    appendix_mode = "normal"

    def flush_paragraph() -> None:
        # Emit the accumulated paragraph (escaped) followed by a blank line.
        if paragraph:
            body.append(escape_latex(" ".join(paragraph)))
            body.append("")
            paragraph.clear()

    while idx < len(lines):
        stripped = lines[idx].strip()
        idx += 1
        if not stripped or is_page_artifact(stripped):
            flush_paragraph()
            continue
        if stripped.startswith("."):
            # Legacy formatter directives are preserved as LaTeX comments.
            flush_paragraph()
            body.append(f"% raw formatter directive: {escape_latex(stripped)}")
            continue
        if re.fullmatch(r"CHAPTER\s+\d+", stripped):
            flush_paragraph()
            # Consume the uppercase title lines that follow the marker; the
            # helper returns the resume index, which replaces idx.
            heading_lines, idx = _collect_uppercase_heading_lines(lines, idx)
            title = " ".join(heading_lines) if heading_lines else stripped
            body.append(rf"\chapter{{{escape_latex(title.title())}}}")
            continue
        if re.fullmatch(r"APPENDIX\s+\d+", stripped):
            flush_paragraph()
            if not in_appendix:
                body.append(r"\appendix")
                in_appendix = True
            heading_lines, idx = _collect_uppercase_heading_lines(lines, idx)
            title = " ".join(heading_lines) if heading_lines else stripped
            body.append(rf"\chapter{{{escape_latex(title.title())}}}")
            if (
                "PROGRAM SOURCE LISTING" in title
                or "DATA FILE LISTING" in title
            ):
                # Code/data listings are dropped; point the reader at the
                # repository copies instead.
                appendix_mode = "listing"
                body.append(
                    "This appendix is represented in the repository by the legacy source and data files in \\texttt{THES/}. "
                    "The automated thesis conversion suppresses the full listing here to keep the document manageable."
                )
                body.append("")
            else:
                appendix_mode = "normal"
            continue
        if in_appendix and appendix_mode == "listing":
            # Skip all content of a suppressed listing appendix.
            continue
        if stripped.isupper() and len(stripped.split()) <= 10 and len(stripped) > 4:
            # Short uppercase line inside a chapter: treat as a section title.
            flush_paragraph()
            body.append(rf"\section*{{{escape_latex(stripped.title())}}}")
            continue
        paragraph.append(stripped)

    flush_paragraph()
    return body
def main() -> int:
    """Convert every configured legacy file into LaTeX; return exit status 0."""
    OUT.mkdir(parents=True, exist_ok=True)
    for source_name, config in FILES.items():
        source = THES / source_name
        destination = OUT / f"{config['target']}.tex"
        # "thesis" mode uses the structured converter; everything else is generic.
        converter = convert_thesis_file if config["mode"] == "thesis" else convert_text_file
        converter(source, destination, config["title"])
    return 0
# Script entry point: propagate main()'s return value as the process exit code.
if __name__ == "__main__":
    raise SystemExit(main())