initiative/scripts/extract-great-labors.py

#!/usr/bin/env python3
"""Extract D&D 5.5e stat blocks from The Great Labors PDF.

Usage:
    python3 scripts/extract-great-labors.py <path-to-pdf>

Reads pages 163-199 (Appendix B: Monsters) and emits
data/bestiary/dnd-bundled.json in the Creature[] shape from
packages/domain/src/creature-types.ts.

Requires: PyPDF2 (pip install PyPDF2)
"""

import json
import os
import re
import sys
from pathlib import Path

from PyPDF2 import PdfReader

# --- Constants ---

# Short source tag: stored as each creature's "source" and, lower-cased,
# used as the prefix of its id.
SOURCE_CODE = "TGL"
# Human-readable source name, stored as "sourceDisplayName".
SOURCE_DISPLAY = "The Great Labors"
# Page range to read. Both bounds are 1-indexed page numbers and both pages
# are included in the extraction.
PAGE_START = 163  # 1-indexed
PAGE_END = 199

# Building blocks for the size/type/alignment line that follows the name,
# i.e. "<Size> <type>[ (<tags>)], <alignment>".
SIZE_RE = r"(Tiny|Small|Medium|Large|Huge|Gargantuan)"
TYPE_PIECE = r"[A-Za-z][A-Za-z\- ]*?"
ALIGN_PIECE = r"[A-Za-z][A-Za-z ()]*?"
# Groups: 1 = size, 2 = creature type (with optional parenthesized tags),
# 3 = alignment.
HEADER_RE = re.compile(
    rf"^{SIZE_RE}\s+({TYPE_PIECE}(?:\s+\([^)]+\))?),\s+({ALIGN_PIECE})\s*$"
)

# "AC <n> Initiative <modifier>" on one line. Group 1 = AC, group 2 = the
# initiative modifier; the sign may be "+", "-" or an en dash and may be
# separated from the digits by whitespace.
AC_RE = re.compile(r"^AC\s+(\d+)\s+Initiative\s+([+\-–]\s*\d+|[+\-–]?\d+)")
# "HP <average> (<formula>)". Group 1 = average, group 2 = formula text.
HP_RE = re.compile(r"^HP\s+(\d+)\s*\(([^)]+)\)")
# "Speed <everything else on the line>".
SPEED_RE = re.compile(r"^Speed\s+(.+?)\s*$")
# One row of the ability table: three abilities, each captured as four
# groups (abbreviation, score, and two optionally signed numbers).
# NOTE(review): the two trailing numbers are presumably the modifier and the
# save bonus; only the abbreviation and score are consumed by this script.
ABILITY_ROW_RE = re.compile(
    r"^(Str|Dex|Con|Int|Wis|Cha)\s+(\d+)\s*([+\-–]?\s*\d+)\s+([+\-–]?\s*\d+)\s+"
    r"(Str|Dex|Con|Int|Wis|Cha)\s+(\d+)\s*([+\-–]?\s*\d+)\s+([+\-–]?\s*\d+)\s+"
    r"(Str|Dex|Con|Int|Wis|Cha)\s+(\d+)\s*([+\-–]?\s*\d+)\s+([+\-–]?\s*\d+)\s*$"
)
# "Challenge <cr> (<xp> XP; PB +<n>)". Group 1 = CR (digits and "/"),
# group 2 = XP with thousands separators, group 3 = proficiency bonus.
CR_RE = re.compile(
    r"^Challenge\s+([\d/]+)\s*\(([\d,]+)\s*XP;\s*PB\s+\+(\d+)\)"
)

# Lines that, on their own, open a named section of a stat block.
SECTION_HEADERS = ("Traits", "Actions", "Bonus Actions", "Reactions",
                   "Legendary Actions", "Mythic Actions")

# Page running header: a page number fused directly onto "APPENDIX B"
# (e.g. "166APPENDIX B ..."). It marks the transition from stat-block
# content into prose on the next page.
RUNNING_HEADER_RE = re.compile(r"^\d+APPENDIX B\b")

# Condition / status-word false positives that the title-case entry regex
# would otherwise mistake for a new entry name. These names commonly end a
# sentence inside an entry's body (e.g. "...while it is Bloodied.").
NAME_FALSE_POSITIVES = {
    "Bloodied", "Restrained", "Grappled", "Charmed", "Frightened",
    "Prone", "Incapacitated", "Stunned", "Paralyzed", "Petrified",
    "Poisoned", "Blinded", "Deafened", "Invisible", "Unconscious",
    "Exhaustion", "Surprised", "Furious",
    "Failure", "Success", "Trigger", "Response", "Hit", "Miss",
    "Habitat", "Treasure", "Bonus Actions", "Reactions", "Traits", "Actions",
    "Disadvantage", "Advantage",
}

# --- Helpers ---


def norm_dash(s: str) -> str:
    """Return *s* with en dashes, em dashes and minus signs replaced by "-"."""
    for dash in ("–", "—", "−"):
        s = s.replace(dash, "-")
    return s


def proficiency_bonus(cr_str: str) -> int:
    """Map a challenge rating string to its proficiency bonus.

    Args:
        cr_str: Challenge rating as text, either an integer ("5") or a
            fraction ("1/2").

    Returns:
        2 for CR up to 4, then one more for every further band of four CR
        levels, capped at 9 for anything above 28.
    """
    if "/" in cr_str:
        numerator, denominator = cr_str.split("/")
        rating = int(numerator) / int(denominator)
    else:
        rating = int(cr_str)

    # Upper CR bound of each band; the first band is worth +2.
    band_ceilings = (4, 8, 12, 16, 20, 24, 28)
    for bonus, ceiling in enumerate(band_ceilings, start=2):
        if rating <= ceiling:
            return bonus
    return 9


def make_creature_id(source: str, name: str) -> str:
    """Build a "<source>:<slug>" id from a source code and a creature name.

    The slug is the lower-cased name with every run of characters other
    than a-z / 0-9 collapsed to a single hyphen and no hyphen at either end.
    """
    lowered = name.lower()
    hyphenated = re.sub(r"[^a-z0-9]+", "-", lowered)
    return source.lower() + ":" + hyphenated.strip("-")


def parse_passive_perception(senses_text: str) -> int | None:
    # The PDF sometimes renders multi-digit values with a kerning space
    # (e.g. "Passive Perception 1 1" meaning 11). Collapse those.
    m = re.search(r"Passive Perception\s+(\d(?:\s*\d)*)\s*$", senses_text)
    if not m:
        m = re.search(r"Passive Perception\s+(\d+)", senses_text)
    return int(m.group(1).replace(" ", "")) if m else None


# --- Page extraction ---


def extract_pages(pdf_path: Path) -> str:
    """Return the text of pages PAGE_START..PAGE_END, joined by newlines.

    Args:
        pdf_path: Location of the PDF to read.

    Returns:
        The extracted text of every page in the range (both ends
        included), one page after another, separated by "\\n".
    """
    pages = PdfReader(str(pdf_path)).pages
    # PAGE_START / PAGE_END are 1-indexed; the page list is 0-indexed.
    return "\n".join(
        pages[page_no - 1].extract_text()
        for page_no in range(PAGE_START, PAGE_END + 1)
    )


# --- Block splitting ---


def find_stat_block_starts(lines: list[str]) -> list[int]:
    """Locate the name line of every stat block in *lines*.

    A stat block is recognised by an AC line that has a size/type/alignment
    header within the four lines above it; the non-blank line directly
    above that header is taken as the creature name.

    Returns:
        Indices into *lines* of the name lines, in document order.
    """
    found: list[int] = []
    for ac_idx, raw in enumerate(lines):
        if not AC_RE.match(raw.strip()):
            continue
        # Walk upwards from the line above the AC line, at most four lines,
        # and take the nearest header.
        lowest = max(0, ac_idx - 4)
        header_idx = next(
            (
                k
                for k in range(ac_idx - 1, lowest - 1, -1)
                if HEADER_RE.match(lines[k].strip())
            ),
            None,
        )
        # No header nearby, or the header is the first line and so has no
        # name line above it.
        if header_idx is None or header_idx == 0:
            continue
        if lines[header_idx - 1].strip():
            found.append(header_idx - 1)
    return found


# Matches a line where a section header has been fused onto the end of the
# preceding sentence with no separator (e.g. "...plants.Actions"). `body` is
# the text before the final period, `hdr` is the header name.
# NOTE(review): "Mythic Actions" appears in SECTION_HEADERS but not in this
# alternation -- confirm whether that omission is intentional.
SECTION_HEADER_SMUSH_RE = re.compile(
    r"^(?P<body>.+?)\.(?P<hdr>Actions|Bonus Actions|Reactions|Legendary Actions|Traits)\s*$"
)


def block_for(lines: list[str], start: int, next_start: int | None) -> list[str]:
    """Return the lines that belong to the stat block starting at *start*.

    The block runs up to *next_start* (or the end of *lines* when that is
    None), with three adjustments:

    * lines beginning with "===PAGE" are skipped;
    * the first running-header line ends the block, since it marks the
      transition to a new prose page;
    * a line with a section header fused onto its end (such as
      "...plants.Actions") is emitted as two lines so that section header
      detection works.
    """
    stop = len(lines) if next_start is None else next_start
    kept: list[str] = []
    for raw in lines[start:stop]:
        if raw.startswith("===PAGE"):
            continue
        stripped = raw.strip()
        if RUNNING_HEADER_RE.match(stripped):
            break
        smushed = SECTION_HEADER_SMUSH_RE.match(stripped)
        if smushed is None:
            kept.append(raw)
            continue
        # Put back the period the pattern consumed, then the bare header.
        kept.extend((smushed.group("body") + ".", smushed.group("hdr")))
    return kept


# --- Vitals parsing ---


def parse_header(block: list[str]) -> dict:
    """Read the creature name and the size/type/alignment line of a block.

    Args:
        block: Stat block lines; line 0 is the name and line 1 the header.

    Returns:
        Dict with "name", "size", "type" and "alignment".

    Raises:
        ValueError: If line 1 does not match HEADER_RE.
    """
    title = block[0].strip()
    descriptor = block[1].strip()
    matched = HEADER_RE.match(descriptor)
    if matched is None:
        raise ValueError(f"Bad header for {title!r}: {descriptor!r}")
    size, kind, alignment = matched.groups()
    return {
        "name": title,
        "size": size,
        "type": kind.strip(),
        "alignment": alignment.strip(),
    }


def parse_ac(line: str) -> int:
    """Return the armor class from an "AC <n> Initiative ..." line.

    Raises:
        ValueError: If the line does not match AC_RE.
    """
    found = AC_RE.match(line.strip())
    if found is None:
        raise ValueError(f"Bad AC line: {line!r}")
    armor_class = found.group(1)
    return int(armor_class)


def parse_hp(line: str) -> dict:
    """Split an "HP <average> (<formula>)" line into its two parts.

    Returns:
        Dict with "average" (int) and "formula" (str).

    Raises:
        ValueError: If the line does not match HP_RE.
    """
    found = HP_RE.match(line.strip())
    if found is None:
        raise ValueError(f"Bad HP line: {line!r}")
    average, formula = found.groups()
    return {"average": int(average), "formula": formula.strip()}


def parse_speed(line: str) -> str:
    """Return the speed text of a "Speed ..." line in normalized form.

    Raises:
        ValueError: If the line does not match SPEED_RE.
    """
    found = SPEED_RE.match(line.strip())
    if found is None:
        raise ValueError(f"Bad Speed line: {line!r}")
    # Drop trailing periods first so the rewrite below cannot leave a
    # doubled one at the end.
    raw = found.group(1).rstrip(".").strip()
    # Write every "<n> ft" / "<n> ft." as "<n> ft." to match the 5etools
    # adapter output style.
    return re.sub(r"(\d+)\s+ft\b\.?", r"\1 ft.", raw)


def parse_abilities(row1: str, row2: str) -> dict:
    """Read ability scores from the two rows of the ability table.

    Each row holds three abilities. Only the abbreviation and the score of
    each are used; the remaining numbers on the row are ignored.

    Returns:
        Dict mapping lower-cased ability abbreviation to its int score.

    Raises:
        ValueError: If either row does not match ABILITY_ROW_RE.
    """
    scores: dict = {}
    for row in (row1, row2):
        hit = ABILITY_ROW_RE.match(row.strip())
        if hit is None:
            raise ValueError(f"Bad ability row: {row!r}")
        fields = hit.groups()
        # Four captures per ability: abbreviations sit at 0, 4, 8 and the
        # matching scores right after them at 1, 5, 9.
        for abbreviation, score in zip(fields[0::4], fields[1::4]):
            scores[abbreviation.lower()] = int(score)
    return scores


# --- Meta lines ---


META_KEYS = ("Skills", "Saving Throws", "Resistances", "Immunities",
             "Vulnerabilities", "Senses", "Languages", "Gear")


def is_meta_start(line: str) -> str | None:
    for key in META_KEYS:
        if line.startswith(key + " ") or line.startswith(key + "  "):
            return key
    return None


def parse_meta(lines: list[str], start: int) -> tuple[dict, int]:
    """Collect the labelled meta lines that precede the Challenge line.

    Scanning begins at *start*. A line opening with a meta label starts a
    new value; following lines are treated as wrapped continuations of it.
    Blank lines, and lines seen before any label, are ignored. If a label
    occurs twice, the later value replaces the earlier one.

    Returns:
        ``(meta, index)`` where *meta* maps each label to its value joined
        onto one line, and *index* is the position of the line starting
        with "Challenge " -- or the position where scanning ran off the end
        when there is no such line.
    """
    gathered: dict[str, list[str]] = {}
    active: str | None = None
    idx = start
    while idx < len(lines):
        text = lines[idx].strip()
        if text.startswith("Challenge "):
            break
        if text:
            label = is_meta_start(text)
            if label:
                active = label
                gathered[label] = [text[len(label):].strip()]
            elif active is not None:
                gathered[active].append(text)
        idx += 1

    meta: dict[str, str] = {
        label: " ".join(part.strip() for part in parts).strip()
        for label, parts in gathered.items()
    }
    return meta, idx


# --- Section discovery ---


def find_section_starts(block: list[str], start_idx: int) -> list[tuple[str, int]]:
    """List the section header lines found at or after *start_idx*.

    Returns:
        ``(header, index)`` pairs, in order, for every line whose stripped
        text is exactly one of SECTION_HEADERS.
    """
    return [
        (label, pos)
        for pos in range(start_idx, len(block))
        if (label := block[pos].strip()) in SECTION_HEADERS
    ]


def collect_section_lines(block: list[str], start: int, end: int) -> list[str]:
    """Return the non-blank lines of ``block[start:end]``, right-stripped."""
    return [row.rstrip() for row in block[start:end] if row.strip()]


def join_section_text(lines: list[str]) -> str:
    """Join section lines into a single text blob, repairing wrap hyphens.

    Args:
        lines: Raw lines of one section; blank lines are skipped.

    Returns:
        The lines joined by single spaces, with runs of whitespace
        collapsed and words that were hyphen-split across a line or column
        wrap glued back together.
    """
    text = " ".join(line.strip() for line in lines if line.strip())
    text = re.sub(r"\s+", " ", text)
    # Repair "civi- lization" / "civi - lization" → "civilization" (PDF
    # column-wrap hyphens). Both neighbours must be letters: `[^\W\d_]` is
    # `\w` minus digits and underscore, so arithmetic and ranges such as
    # "1d6 - 1" or "5 - 6" are left intact instead of fusing into "1d61"
    # or "56".
    text = re.sub(r"([^\W\d_])\s*-\s+([^\W\d_])", r"\1\2", text)
    return text.strip()


# --- Entry splitting ---

# Entry name: a title-case phrase. The first word must start with a capital
# letter; each following word, joined by a space or a hyphen, is either
# another capitalized word (which also covers roman numerals) or one of the
# lowercase connectors of/the/and/or/in/at/on/to/with/from. Apostrophes,
# straight or curly, are allowed inside a word. The name may end with a
# parenthesized modifier.
ENTRY_NAME_INNER = (
    r"[A-Z][A-Za-z'’]*"
    r"(?:[ \-](?:[A-Z][A-Za-z'’]*|of|the|and|or|in|at|on|to|with|from))*"
    r"(?:\s*\([^)]+\))?"
)
# An entry boundary occurs at the start of the joined section text, or
# immediately after sentence-ending punctuation (".", "?" or "!"). The PDF
# sometimes drops the space between the period and the new entry name, so
# the whitespace before the name is optional (`\s*`). The name must be
# followed by a period, whitespace, and then a capital letter, an opening
# quote or "(" -- that last character is only looked at, not consumed, so a
# match ends exactly where the entry body begins.
ENTRY_BOUNDARY = re.compile(
    rf"(?:^|(?<=[\.\?\!]))\s*(?P<name>{ENTRY_NAME_INNER})\.\s+(?=[A-Z“\"(])"
)

# Trim attribution quotes / page-header bleed-through from entry bodies.
# Each pattern marks where unwanted trailing text begins; everything from
# the start of a match onward is discarded.
PROSE_TAIL_PATTERNS = (
    # Em-dash attribution: " —Chondrus, Priest of Lutheria"
    re.compile(r"\s+—\s*[A-Z][^—]*$"),
    # Smushed section header at end ("...plants.Actions").
    re.compile(
        r"\.\s*(?:Actions|Bonus\s+Actions|Reactions|Legendary\s+Actions|Traits)\s*$"
    ),
    # Curated prose subheadings / phrase markers that follow stat blocks in
    # this book. PDF reflow often merges prose onto the same logical line
    # as the last action body, so the leading period and whitespace are
    # optional. The odd casing (e.g. "MyTHIC", "GOlDEN") is deliberate: it
    # matches the small-caps headings as they come out of the PDF.
    re.compile(
        r"\.?\s*(?:Random Trapped Creature|Maenad Bacchanal|The Phalanx Formation"
        r"|Reinforced Portal|TRAPPED|HUNGER FOR|PURSUIT OF|RITUAL|MyTHIC|BRON"
        r"|GOlDEN|NyMPH|MARBlE|KElEDONE|SOlDIER|MINOTAUR|SATyRS|GOATlING|EMPUS"
        r"|ANARCH|GyGAN|CERBERUS|WHITE STAG|STORM|FEy|VOlKAN).*",
        re.DOTALL,
    ),
    # Specific prose sentence-starts observed leaking in, fused directly
    # onto the preceding period.
    re.compile(
        r"\.(?:will gleefully|Some report that|Storm Dory|This magic weapon"
        r"|Thylean soldiers|Some claim|These leaders).*",
        re.DOTALL,
    ),
    # Generic small-caps heading: a word that starts with 2+ uppercase
    # letters (preceded by a period or whitespace), then whitespace, then a
    # run of 3+ uppercase letters within the next 12 characters (PDF
    # small-caps section header like "BRON zE STRATEGOS", "MyTHIC BEAST",
    # "GOlDEN RAM").
    re.compile(r"(?<=[\.\s])[A-Z]{2}\w*\s+[\w ]{0,12}[A-Z]{3}[A-Z\w ]*"),
)


def trim_prose_tail(body: str) -> str:
    """Cut trailing non-stat-block text off an entry body.

    Every pattern in PROSE_TAIL_PATTERNS is tried in order against the
    text as trimmed so far. On a match, the text is cut at the start of the
    match and closed with exactly one period.
    """
    trimmed = body
    for pattern in PROSE_TAIL_PATTERNS:
        hit = pattern.search(trimmed)
        if hit is None:
            continue
        kept = trimmed[:hit.start()].rstrip().rstrip(".")
        trimmed = f"{kept}."
    return trimmed.strip()


def is_valid_entry_name(name: str) -> bool:
    """Tell whether *name* is a real entry name rather than a false positive.

    A candidate is rejected when it is listed in NAME_FALSE_POSITIVES,
    either as given or once any parenthesized modifier has been removed --
    such words typically just end a sentence inside an entry's body.
    """
    without_parens = re.sub(r"\s*\([^)]+\)\s*", "", name).strip()
    return not (
        name in NAME_FALSE_POSITIVES or without_parens in NAME_FALSE_POSITIVES
    )


def split_text_into_entries(text: str) -> list[tuple[str, str]]:
    """Break joined section text into ``(name, body)`` pairs.

    Entry names are located with ENTRY_BOUNDARY and kept only when
    is_valid_entry_name accepts them. Each body runs from the end of its
    own boundary match to the start of the next kept one, or to the end of
    the text for the last entry. Returns an empty list when no entry name
    is found.
    """
    candidates = [
        (m.start(), m.end(), m.group("name").strip())
        for m in ENTRY_BOUNDARY.finditer(text)
    ]
    hits = [hit for hit in candidates if is_valid_entry_name(hit[2])]
    # Where each body stops: the next hit's start, or the end of the text.
    cutoffs = [hit[0] for hit in hits[1:]] + [len(text)]
    return [
        (name, text[body_from:cutoff].strip())
        for (_, body_from, name), cutoff in zip(hits, cutoffs)
    ]


def parse_section_traits(lines: list[str]) -> list[dict]:
    """Turn the lines of one section into a list of named entries.

    Returns:
        One dict per entry with its "name" and a single text segment
        holding the body after prose-tail trimming.
    """
    return [
        {"name": title, "segments": [{"type": "text", "value": cleaned}]}
        for title, raw_body in split_text_into_entries(join_section_text(lines))
        if (cleaned := trim_prose_tail(raw_body)) or title
    ]


def parse_legendary(lines: list[str], creature_name: str) -> dict | None:
    """Parse the Legendary Actions section into a preamble plus entries.

    The first entry whose body mentions action vocabulary within its
    opening 100 characters is treated as the first real action; all text
    before it becomes the preamble.

    Args:
        lines: Raw lines of the Legendary Actions section.
        creature_name: Used to word a default preamble when the section
            has no text before the first action.

    Returns:
        Dict with "preamble" and "entries", or None when no entry looks
        like an action.
    """
    text = join_section_text(lines)
    hits = [
        (m.start(), m.end(), name)
        for m in ENTRY_BOUNDARY.finditer(text)
        if is_valid_entry_name(name := m.group("name").strip())
    ]
    # Where each hit's body stops: the next hit's start, or end of text.
    body_ends = [hit[0] for hit in hits[1:]] + [len(text)]

    action_anchors = ("Saving Throw", "Attack Roll", "Trigger", "Recharge",
                      "Melee", "Ranged", "Constitution", "Dexterity",
                      "Strength", "Intelligence", "Wisdom", "Charisma")

    first_action = None
    for position, ((_, body_from, _), body_to) in enumerate(zip(hits, body_ends)):
        opening = text[body_from:min(body_to, body_from + 100)]
        if any(anchor in opening for anchor in action_anchors):
            first_action = position
            break
    if first_action is None:
        return None

    preamble = (
        text[:hits[first_action][0]].strip()
        or f"{creature_name} can take Legendary Actions."
    )
    return {
        "preamble": preamble,
        "entries": [
            {"name": name,
             "segments": [{"type": "text", "value": trim_prose_tail(body)}]}
            for (_, body_from, name), body_to
            in zip(hits[first_action:], body_ends[first_action:])
            if (body := text[body_from:body_to].strip())
        ],
    }


# --- Top-level parse ---


def parse_block(block: list[str]) -> dict:
    """Convert the lines of one stat block into a creature dict.

    The block is read positionally: name, size/type/alignment header, AC,
    HP, Speed, a line starting with "MOD", two ability rows, then meta
    lines up to the "Challenge" line, then the named sections.

    Args:
        block: Lines of a single stat block, as produced by ``block_for``.

    Returns:
        A dict holding the always-present vitals plus whichever optional
        meta fields and sections the block contains. "Gear" and
        "Mythic Actions" are recognised while parsing but not emitted.

    Raises:
        ValueError: If the block is too short, one of the fixed-position
            lines does not parse, or no Challenge line is found.
    """
    # Name, header, AC, HP, Speed, MOD header and two ability rows are
    # indexed directly below; fail with a readable message rather than a
    # bare IndexError when they are not all there.
    if len(block) < 8:
        label = block[0].strip() if block else "<empty>"
        raise ValueError(
            f"Stat block for {label!r} is too short: "
            f"{len(block)} lines, expected at least 8"
        )

    head = parse_header(block)
    ac = parse_ac(block[2])
    hp = parse_hp(block[3])
    speed = parse_speed(block[4])
    if not block[5].strip().startswith("MOD"):
        raise ValueError(f"Expected MOD header, got: {block[5]!r}")
    abilities = parse_abilities(block[6], block[7])

    meta, ch_idx = parse_meta(block, 8)
    # parse_meta returns an index past the end when it never reached a
    # "Challenge " line.
    if ch_idx >= len(block):
        raise ValueError(f"Missing Challenge line for {head['name']!r}")
    cr_match = CR_RE.match(block[ch_idx].strip())
    if not cr_match:
        raise ValueError(f"Bad Challenge line: {block[ch_idx]!r}")
    cr_str = cr_match.group(1)

    section_starts = find_section_starts(block, ch_idx + 1)
    sections: dict[str, list[str]] = {}
    for i, (name, idx) in enumerate(section_starts):
        end = section_starts[i + 1][1] if i + 1 < len(section_starts) else len(block)
        sections[name] = collect_section_lines(block, idx + 1, end)

    # Default to 10 only when no value was found, so a parsed score of 0 is
    # kept as-is.
    passive = parse_passive_perception(meta.get("Senses", ""))

    creature: dict = {
        "id": make_creature_id(SOURCE_CODE, head["name"]),
        "name": head["name"],
        "source": SOURCE_CODE,
        "sourceDisplayName": SOURCE_DISPLAY,
        "size": head["size"],
        "type": head["type"],
        "alignment": head["alignment"],
        "ac": ac,
        "hp": hp,
        "speed": speed,
        "abilities": abilities,
        "cr": cr_str,
        "initiativeProficiency": 0,
        "proficiencyBonus": proficiency_bonus(cr_str),
        "passive": 10 if passive is None else passive,
    }

    # Meta values copied through verbatim (meta label -> output field).
    # The order here fixes the key order of the emitted JSON.
    verbatim_meta = (
        ("Saving Throws", "savingThrows"),
        ("Skills", "skills"),
        ("Resistances", "resist"),
        ("Immunities", "immune"),
        ("Vulnerabilities", "vulnerable"),
    )
    for meta_key, field in verbatim_meta:
        if meta_key in meta:
            creature[field] = meta[meta_key]

    if "Senses" in meta:
        # Passive Perception is stored separately, so strip it from the
        # senses text. The digit pattern mirrors parse_passive_perception:
        # the PDF can split a number with a kerning space ("1 1" for 11).
        senses = re.sub(
            r"[;,]?\s*Passive Perception\s+\d(?:\s*\d)*\s*$", "", meta["Senses"]
        )
        senses = senses.strip().rstrip(";").strip()
        if senses:
            creature["senses"] = senses
    if "Languages" in meta:
        creature["languages"] = meta["Languages"]

    # Sections that are plain lists of named entries.
    entry_sections = (
        ("Traits", "traits"),
        ("Actions", "actions"),
        ("Bonus Actions", "bonusActions"),
        ("Reactions", "reactions"),
    )
    for section_name, field in entry_sections:
        if section_name in sections:
            creature[field] = parse_section_traits(sections[section_name])

    if "Legendary Actions" in sections:
        leg = parse_legendary(sections["Legendary Actions"], head["name"])
        if leg:
            creature["legendaryActions"] = leg

    return creature


def main() -> int:
    """Run the extraction from the command line.

    Expects exactly one argument, the path to the PDF. Every detected stat
    block is parsed; blocks that fail are reported on stderr and skipped,
    and the remaining creatures are still written to
    ``data/bestiary/dnd-bundled.json`` next to the scripts directory.

    Returns:
        Process exit code: 0 on full success, 1 for a usage error or a
        missing PDF, 2 when at least one stat block failed to parse.
    """
    if len(sys.argv) != 2:
        print("Usage: python3 extract-great-labors.py <path-to-pdf>",
              file=sys.stderr)
        return 1
    pdf_path = Path(os.path.expanduser(sys.argv[1]))
    if not pdf_path.exists():
        print(f"PDF not found: {pdf_path}", file=sys.stderr)
        return 1

    text = extract_pages(pdf_path)
    lines = text.split("\n")

    starts = find_stat_block_starts(lines)
    print(f"Detected {len(starts)} stat blocks", file=sys.stderr)

    creatures = []
    failures = []
    for i, s in enumerate(starts):
        next_s = starts[i + 1] if i + 1 < len(starts) else None
        block = block_for(lines, s, next_s)
        try:
            creatures.append(parse_block(block))
        except Exception as e:
            # Deliberately broad: one malformed block must not abort the
            # run. Every failure is listed below and reflected in the exit
            # code.
            failures.append((block[0] if block else "<empty>", str(e)))

    if failures:
        print(f"\n{len(failures)} parse failures:", file=sys.stderr)
        for name, err in failures:
            print(f"  - {name}: {err}", file=sys.stderr)

    out_path = Path(__file__).resolve().parent.parent / "data" / "bestiary" / "dnd-bundled.json"
    out_path.parent.mkdir(parents=True, exist_ok=True)
    # ensure_ascii=False writes non-ASCII characters (dashes, curly quotes)
    # as-is, so the encoding is pinned to UTF-8 instead of depending on the
    # platform's locale default.
    with out_path.open("w", encoding="utf-8") as f:
        json.dump(creatures, f, indent="\t", ensure_ascii=False)
        f.write("\n")
    print(f"Wrote {len(creatures)} creatures to {out_path}", file=sys.stderr)
    return 0 if not failures else 2


if __name__ == "__main__":
    sys.exit(main())