#!/usr/bin/env python3 """Extract D&D 5.5e stat blocks from The Great Labors PDF. Usage: python3 scripts/extract-great-labors.py Reads pages 163-199 (Appendix B: Monsters) and emits data/bestiary/dnd-bundled.json in the Creature[] shape from packages/domain/src/creature-types.ts. Requires: PyPDF2 (pip install PyPDF2) """ import json import os import re import sys from pathlib import Path from PyPDF2 import PdfReader # --- Constants --- SOURCE_CODE = "TGL" SOURCE_DISPLAY = "The Great Labors" PAGE_START = 163 # 1-indexed PAGE_END = 199 SIZE_RE = r"(Tiny|Small|Medium|Large|Huge|Gargantuan)" TYPE_PIECE = r"[A-Za-z][A-Za-z\- ]*?" ALIGN_PIECE = r"[A-Za-z][A-Za-z ()]*?" HEADER_RE = re.compile( rf"^{SIZE_RE}\s+({TYPE_PIECE}(?:\s+$[^)]+$)?),\s+({ALIGN_PIECE})\s*$" ) AC_RE = re.compile(r"^AC\s+(\d+)\s+Initiative\s+([+\-–]\s*\d+|[+\-–]?\d+)") HP_RE = re.compile(r"^HP\s+(\d+)\s*$([^)]+)$") SPEED_RE = re.compile(r"^Speed\s+(.+?)\s*$") ABILITY_ROW_RE = re.compile( r"^(Str|Dex|Con|Int|Wis|Cha)\s+(\d+)\s*([+\-–]?\s*\d+)\s+([+\-–]?\s*\d+)\s+" r"(Str|Dex|Con|Int|Wis|Cha)\s+(\d+)\s*([+\-–]?\s*\d+)\s+([+\-–]?\s*\d+)\s+" r"(Str|Dex|Con|Int|Wis|Cha)\s+(\d+)\s*([+\-–]?\s*\d+)\s+([+\-–]?\s*\d+)\s*$" ) CR_RE = re.compile( r"^Challenge\s+([\d/]+)\s*$([\d,]+)\s*XP;\s*PB\s+\+(\d+)$" ) SECTION_HEADERS = ("Traits", "Actions", "Bonus Actions", "Reactions", "Legendary Actions", "Mythic Actions") # Page running header like "166APPENDIX B � MONSTERS..." -- marks the # transition from stat-block content into prose on the next page. RUNNING_HEADER_RE = re.compile(r"^\d+APPENDIX B\b") # Condition / status-word false positives that the title-case entry regex # would otherwise mistake for a new entry name. These names commonly end a # sentence inside an entry's body (e.g. "...while it is Bloodied."). NAME_FALSE_POSITIVES = { "Bloodied", "Restrained", "Grappled", "Charmed", "Frightened", "Prone", "Incapacitated", "Stunned", "Paralyzed", "Petrified", "Poisoned", "Blinded", "Deafened", "Invisible", "Unconscious", "Exhaustion", "Surprised", "Furious", "Failure", "Success", "Trigger", "Response", "Hit", "Miss", "Habitat", "Treasure", "Bonus Actions", "Reactions", "Traits", "Actions", "Disadvantage", "Advantage", } # --- Helpers --- def norm_dash(s: str) -> str: return s.replace("–", "-").replace("—", "-").replace("−", "-") def proficiency_bonus(cr_str: str) -> int: if "/" in cr_str: n, d = cr_str.split("/") cr = int(n) / int(d) else: cr = int(cr_str) if cr <= 4: return 2 if cr <= 8: return 3 if cr <= 12: return 4 if cr <= 16: return 5 if cr <= 20: return 6 if cr <= 24: return 7 if cr <= 28: return 8 return 9 def make_creature_id(source: str, name: str) -> str: slug = re.sub(r"[^a-z0-9]+", "-", name.lower()).strip("-") return f"{source.lower()}:{slug}" def parse_passive_perception(senses_text: str) -> int | None: # The PDF sometimes renders multi-digit values with a kerning space # (e.g. "Passive Perception 1 1" meaning 11). Collapse those. m = re.search(r"Passive Perception\s+(\d(?:\s*\d)*)\s*$", senses_text) if not m: m = re.search(r"Passive Perception\s+(\d+)", senses_text) return int(m.group(1).replace(" ", "")) if m else None # --- Page extraction --- def extract_pages(pdf_path: Path) -> str: reader = PdfReader(str(pdf_path)) parts = [] for i in range(PAGE_START - 1, PAGE_END): parts.append(reader.pages[i].extract_text()) return "\n".join(parts) # --- Block splitting --- def find_stat_block_starts(lines: list[str]) -> list[int]: starts = [] for i, line in enumerate(lines): if AC_RE.match(line.strip()): header_idx = None for j in range(i - 1, max(-1, i - 5), -1): if HEADER_RE.match(lines[j].strip()): header_idx = j break if header_idx is None: continue name_idx = header_idx - 1 if name_idx >= 0 and lines[name_idx].strip(): starts.append(name_idx) return starts SECTION_HEADER_SMUSH_RE = re.compile( r"^(?P.+?)\.(?PActions|Bonus Actions|Reactions|Legendary Actions|Traits)\s*$" ) def block_for(lines: list[str], start: int, next_start: int | None) -> list[str]: """Build the line list for one stat block. Drops page markers and everything from the first running-header line onward (which marks the transition to a new prose page). Splits PDF smush lines like "...plants.Actions" into two lines so section header detection works. """ end = next_start if next_start is not None else len(lines) out: list[str] = [] for ln in lines[start:end]: if ln.startswith("===PAGE"): continue if RUNNING_HEADER_RE.match(ln.strip()): break m = SECTION_HEADER_SMUSH_RE.match(ln.strip()) if m: out.append(m.group("body") + ".") out.append(m.group("hdr")) else: out.append(ln) return out # --- Vitals parsing --- def parse_header(block: list[str]) -> dict: name = block[0].strip() header = block[1].strip() m = HEADER_RE.match(header) if not m: raise ValueError(f"Bad header for {name!r}: {header!r}") size, ctype, alignment = m.group(1), m.group(2).strip(), m.group(3).strip() return {"name": name, "size": size, "type": ctype, "alignment": alignment} def parse_ac(line: str) -> int: m = AC_RE.match(line.strip()) if not m: raise ValueError(f"Bad AC line: {line!r}") return int(m.group(1)) def parse_hp(line: str) -> dict: m = HP_RE.match(line.strip()) if not m: raise ValueError(f"Bad HP line: {line!r}") return {"average": int(m.group(1)), "formula": m.group(2).strip()} def parse_speed(line: str) -> str: m = SPEED_RE.match(line.strip()) if not m: raise ValueError(f"Bad Speed line: {line!r}") speed = m.group(1).rstrip(".").strip() # Normalize "30 ft" → "30 ft." to match 5etools adapter output style. speed = re.sub(r"(\d+)\s+ft\b\.?", r"\1 ft.", speed) return speed def parse_abilities(row1: str, row2: str) -> dict: out = {} for row in (row1, row2): m = ABILITY_ROW_RE.match(row.strip()) if not m: raise ValueError(f"Bad ability row: {row!r}") for off in (0, 4, 8): ab = m.group(off + 1).lower() score = int(m.group(off + 2)) out[ab] = score return out # --- Meta lines --- META_KEYS = ("Skills", "Saving Throws", "Resistances", "Immunities", "Vulnerabilities", "Senses", "Languages", "Gear") def is_meta_start(line: str) -> str | None: for key in META_KEYS: if line.startswith(key + " ") or line.startswith(key + " "): return key return None def parse_meta(lines: list[str], start: int) -> tuple[dict, int]: meta: dict[str, str] = {} i = start current_key: str | None = None current_val_parts: list[str] = [] def flush() -> None: nonlocal current_key, current_val_parts if current_key is not None: meta[current_key] = " ".join(p.strip() for p in current_val_parts).strip() current_key = None current_val_parts = [] while i < len(lines): line = lines[i].strip() if not line: i += 1 continue if line.startswith("Challenge "): flush() return meta, i key = is_meta_start(line) if key: flush() current_key = key current_val_parts.append(line[len(key):].strip()) elif current_key is not None: current_val_parts.append(line) i += 1 flush() return meta, i # --- Section discovery --- def find_section_starts(block: list[str], start_idx: int) -> list[tuple[str, int]]: starts = [] for i in range(start_idx, len(block)): ln = block[i].strip() if ln in SECTION_HEADERS: starts.append((ln, i)) return starts def collect_section_lines(block: list[str], start: int, end: int) -> list[str]: """Collect the raw lines for one section (between header indices).""" out: list[str] = [] for line in block[start:end]: if not line.strip(): continue out.append(line.rstrip()) return out def join_section_text(lines: list[str]) -> str: """Join section lines into a single text blob, repairing wrap hyphens.""" text = " ".join(line.strip() for line in lines if line.strip()) text = re.sub(r"\s+", " ", text) # Repair "civi -li zation" → "civilization" (PDF column-wrap hyphens). text = re.sub(r"(\w)\s*-\s+(\w)", r"\1\2", text) return text.strip() # --- Entry splitting --- # Entry name: title-case phrase, where each "word" is either a Capitalized # word, a lowercase connector (of/the/and/or/in/at/on/to/with/from), a roman # numeral, etc. Optionally followed by parenthesized modifier. ENTRY_NAME_INNER = ( r"[A-Z][A-Za-z'’]*" r"(?:[ \-](?:[A-Z][A-Za-z'’]*|of|the|and|or|in|at|on|to|with|from))*" r"(?:\s*$[^)]+$)?" ) # An entry boundary occurs at the start of the joined section text, or # immediately after a sentence-ending punctuation. The PDF sometimes drops # the space between the period and the new entry name, so `\s*` is fine. ENTRY_BOUNDARY = re.compile( rf"(?:^|(?<=[\.\?\!]))\s*(?P{ENTRY_NAME_INNER})\.\s+(?=[A-Z“\"(])" ) # Trim attribution quotes / page-header bleed-through from entry bodies. PROSE_TAIL_PATTERNS = ( # Em-dash attribution: " —Chondrus, Priest of Lutheria" re.compile(r"\s+—\s*[A-Z][^—]*$"), # Smushed section header at end ("...plants.Actions"). re.compile( r"\.\s*(?:Actions|Bonus\s+Actions|Reactions|Legendary\s+Actions|Traits)\s*$" ), # Curated prose subheadings / phrase markers that follow stat blocks in # this book. PDF reflow often merges prose onto the same logical line # as the last action body, so the leading whitespace is optional. re.compile( r"\.?\s*(?:Random Trapped Creature|Maenad Bacchanal|The Phalanx Formation" r"|Reinforced Portal|TRAPPED|HUNGER FOR|PURSUIT OF|RITUAL|MyTHIC|BRON" r"|GOlDEN|NyMPH|MARBlE|KElEDONE|SOlDIER|MINOTAUR|SATyRS|GOATlING|EMPUS" r"|ANARCH|GyGAN|CERBERUS|WHITE STAG|STORM|FEy|VOlKAN).*", re.DOTALL, ), # Specific prose sentence-starts observed leaking in. re.compile( r"\.(?:will gleefully|Some report that|Storm Dory|This magic weapon" r"|Thylean soldiers|Some claim|These leaders).*", re.DOTALL, ), # All-caps run of 3+ uppercase letters in a word, then a space, then # another word with 3+ uppercase letters (PDF small-caps section header # like "BRON zE STRATEGOS", "MyTHIC BEAST", "GOlDEN RAM"). re.compile(r"(?<=[\.\s])[A-Z]{2}\w*\s+[\w ]{0,12}[A-Z]{3}[A-Z\w ]*"), ) def trim_prose_tail(body: str) -> str: out = body for pat in PROSE_TAIL_PATTERNS: m = pat.search(out) if m: out = out[:m.start()].rstrip().rstrip(".") + "." return out.strip() def is_valid_entry_name(name: str) -> bool: """Filter false-positive matches that aren't really entry names.""" if name in NAME_FALSE_POSITIVES: return False # Single short capitalized word that's a common condition or noun is # usually a false positive when followed by a period. Real entry names # almost always have either multiple words or a parenthesized modifier. bare = re.sub(r"\s*$[^)]+$\s*", "", name).strip() if bare in NAME_FALSE_POSITIVES: return False return True def split_text_into_entries(text: str) -> list[tuple[str, str]]: """Split section text into (name, body) entries by scanning for entry-name boundaries (start-of-text or after a sentence period).""" matches: list[tuple[int, int, str]] = [] for m in ENTRY_BOUNDARY.finditer(text): name = m.group("name").strip() if is_valid_entry_name(name): matches.append((m.start(), m.end(), name)) if not matches: return [] entries: list[tuple[str, str]] = [] for i, (_, body_start, name) in enumerate(matches): body_end = matches[i + 1][0] if i + 1 < len(matches) else len(text) body = text[body_start:body_end].strip() entries.append((name, body)) return entries def parse_section_traits(lines: list[str]) -> list[dict]: text = join_section_text(lines) entries = split_text_into_entries(text) out = [] for name, body in entries: body = trim_prose_tail(body) if body or name: out.append({"name": name, "segments": [{"type": "text", "value": body}]}) return out def parse_legendary(lines: list[str], creature_name: str) -> dict | None: """Parse the Legendary Actions section. Text before the first entry whose body contains action vocabulary forms the preamble. """ text = join_section_text(lines) all_matches: list[tuple[int, int, str]] = [] for m in ENTRY_BOUNDARY.finditer(text): name = m.group("name").strip() if is_valid_entry_name(name): all_matches.append((m.start(), m.end(), name)) action_anchors = ("Saving Throw", "Attack Roll", "Trigger", "Recharge", "Melee", "Ranged", "Constitution", "Dexterity", "Strength", "Intelligence", "Wisdom", "Charisma") first_action_idx = None for i, (_, body_start, _) in enumerate(all_matches): body_end = all_matches[i + 1][0] if i + 1 < len(all_matches) else len(text) body_head = text[body_start:min(body_end, body_start + 100)] if any(a in body_head for a in action_anchors): first_action_idx = i break if first_action_idx is None: return None preamble = text[:all_matches[first_action_idx][0]].strip() if not preamble: preamble = f"{creature_name} can take Legendary Actions." entries = [] for i in range(first_action_idx, len(all_matches)): _, body_start, name = all_matches[i] body_end = all_matches[i + 1][0] if i + 1 < len(all_matches) else len(text) body = text[body_start:body_end].strip() entries.append((name, body)) if not entries: return None return { "preamble": preamble, "entries": [ {"name": name, "segments": [{"type": "text", "value": trim_prose_tail(body)}]} for name, body in entries if body ], } # --- Top-level parse --- def parse_block(block: list[str]) -> dict: head = parse_header(block) ac = parse_ac(block[2]) hp = parse_hp(block[3]) speed = parse_speed(block[4]) if not block[5].strip().startswith("MOD"): raise ValueError(f"Expected MOD header, got: {block[5]!r}") abilities = parse_abilities(block[6], block[7]) meta, ch_idx = parse_meta(block, 8) cr_match = CR_RE.match(block[ch_idx].strip()) if not cr_match: raise ValueError(f"Bad Challenge line: {block[ch_idx]!r}") cr_str = cr_match.group(1) section_starts = find_section_starts(block, ch_idx + 1) sections: dict[str, list[str]] = {} for i, (name, idx) in enumerate(section_starts): end = section_starts[i + 1][1] if i + 1 < len(section_starts) else len(block) sections[name] = collect_section_lines(block, idx + 1, end) creature: dict = { "id": make_creature_id(SOURCE_CODE, head["name"]), "name": head["name"], "source": SOURCE_CODE, "sourceDisplayName": SOURCE_DISPLAY, "size": head["size"], "type": head["type"], "alignment": head["alignment"], "ac": ac, "hp": hp, "speed": speed, "abilities": abilities, "cr": cr_str, "initiativeProficiency": 0, "proficiencyBonus": proficiency_bonus(cr_str), "passive": parse_passive_perception(meta.get("Senses", "")) or 10, } if "Saving Throws" in meta: creature["savingThrows"] = meta["Saving Throws"] if "Skills" in meta: creature["skills"] = meta["Skills"] if "Resistances" in meta: creature["resist"] = meta["Resistances"] if "Immunities" in meta: creature["immune"] = meta["Immunities"] if "Vulnerabilities" in meta: creature["vulnerable"] = meta["Vulnerabilities"] if "Senses" in meta: senses = re.sub(r"[;,]?\s*Passive Perception\s+\d+\s*$", "", meta["Senses"]) senses = senses.strip().rstrip(";").strip() if senses: creature["senses"] = senses if "Languages" in meta: creature["languages"] = meta["Languages"] if "Traits" in sections: creature["traits"] = parse_section_traits(sections["Traits"]) if "Actions" in sections: creature["actions"] = parse_section_traits(sections["Actions"]) if "Bonus Actions" in sections: creature["bonusActions"] = parse_section_traits(sections["Bonus Actions"]) if "Reactions" in sections: creature["reactions"] = parse_section_traits(sections["Reactions"]) if "Legendary Actions" in sections: leg = parse_legendary(sections["Legendary Actions"], head["name"]) if leg: creature["legendaryActions"] = leg return creature def main() -> int: if len(sys.argv) != 2: print("Usage: python3 extract-great-labors.py ", file=sys.stderr) return 1 pdf_path = Path(os.path.expanduser(sys.argv[1])) if not pdf_path.exists(): print(f"PDF not found: {pdf_path}", file=sys.stderr) return 1 text = extract_pages(pdf_path) lines = text.split("\n") starts = find_stat_block_starts(lines) print(f"Detected {len(starts)} stat blocks", file=sys.stderr) creatures = [] failures = [] for i, s in enumerate(starts): next_s = starts[i + 1] if i + 1 < len(starts) else None block = block_for(lines, s, next_s) try: creatures.append(parse_block(block)) except Exception as e: failures.append((block[0] if block else "", str(e))) if failures: print(f"\n{len(failures)} parse failures:", file=sys.stderr) for name, err in failures: print(f" - {name}: {err}", file=sys.stderr) out_path = Path(__file__).resolve().parent.parent / "data" / "bestiary" / "dnd-bundled.json" out_path.parent.mkdir(parents=True, exist_ok=True) with out_path.open("w") as f: json.dump(creatures, f, indent="\t", ensure_ascii=False) f.write("\n") print(f"Wrote {len(creatures)} creatures to {out_path}", file=sys.stderr) return 0 if not failures else 2 if __name__ == "__main__": sys.exit(main())