Files
initiative/scripts/extract-great-labors.py
T
Lukas c343fd3cd0 Add bundled-bestiary mechanism for shipping creatures with the app
D&D creatures listed in data/bestiary/dnd-bundled.json are now merged into
the search index and pre-loaded into creatureMap, so they appear alongside
5etools creatures with no "Load source" step. Source codes are derived from
the JSON itself (each creature carries source + sourceDisplayName), so adding
a new book is a pure data change. Bundled sources are excluded from
getAllSourceCodes() so bulk-import skips them, and they never appear in the
source manager (which only lists cached sources).

Includes a reference extractor (scripts/extract-great-labors.py) for the
5.5e revised stat-block format and a /bundle-bestiary skill that future
agents can follow to add monsters from other PDF books.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-27 15:49:34 +02:00

562 lines
19 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""Extract D&D 5.5e stat blocks from The Great Labors PDF.
Usage:
python3 scripts/extract-great-labors.py <path-to-pdf>
Reads pages 163-199 (Appendix B: Monsters) and emits
data/bestiary/dnd-bundled.json in the Creature[] shape from
packages/domain/src/creature-types.ts.
Requires: PyPDF2 (pip install PyPDF2)
"""
import json
import os
import re
import sys
from pathlib import Path
from PyPDF2 import PdfReader
# --- Constants ---
# Source code and display name stamped onto every creature built by parse_block.
SOURCE_CODE = "TGL"
SOURCE_DISPLAY = "The Great Labors"
# Inclusive, 1-indexed page range; extract_pages iterates
# range(PAGE_START - 1, PAGE_END) to turn it into 0-based page indices.
PAGE_START = 163 # 1-indexed
PAGE_END = 199
# Building blocks for HEADER_RE. SIZE_RE contributes capture group 1.
SIZE_RE = r"(Tiny|Small|Medium|Large|Huge|Gargantuan)"
TYPE_PIECE = r"[A-Za-z][A-Za-z\- ]*?"
ALIGN_PIECE = r"[A-Za-z][A-Za-z ()]*?"
# "<Size> <type>[ (<tag>)], <alignment>" line.
# Groups: 1 = size, 2 = type (with optional parenthesized tag), 3 = alignment.
HEADER_RE = re.compile(
rf"^{SIZE_RE}\s+({TYPE_PIECE}(?:\s+\([^)]+\))?),\s+({ALIGN_PIECE})\s*$"
)
# "AC <n> Initiative <signed n>". Groups: 1 = AC, 2 = initiative bonus.
AC_RE = re.compile(r"^AC\s+(\d+)\s+Initiative\s+([+\-]\s*\d+|[+\-]?\d+)")
# "HP <n> (<formula>)". Groups: 1 = average, 2 = text inside the parentheses.
HP_RE = re.compile(r"^HP\s+(\d+)\s*\(([^)]+)\)")
# "Speed <text>". Group 1 = everything after "Speed", trailing whitespace removed.
SPEED_RE = re.compile(r"^Speed\s+(.+?)\s*$")
# One ability-table row holding three abilities. Each ability yields four
# groups: name, score, and two optionally signed numbers (presumably the
# modifier and save columns -- parse_abilities only reads name and score).
ABILITY_ROW_RE = re.compile(
r"^(Str|Dex|Con|Int|Wis|Cha)\s+(\d+)\s*([+\-]?\s*\d+)\s+([+\-]?\s*\d+)\s+"
r"(Str|Dex|Con|Int|Wis|Cha)\s+(\d+)\s*([+\-]?\s*\d+)\s+([+\-]?\s*\d+)\s+"
r"(Str|Dex|Con|Int|Wis|Cha)\s+(\d+)\s*([+\-]?\s*\d+)\s+([+\-]?\s*\d+)\s*$"
)
# "Challenge <cr> (<xp> XP; PB +<n>)". Groups: 1 = CR (may be a fraction
# such as "1/2"), 2 = XP with thousands separators, 3 = proficiency bonus.
CR_RE = re.compile(
r"^Challenge\s+([\d/]+)\s*\(([\d,]+)\s*XP;\s*PB\s+\+(\d+)\)"
)
# Lines that, on their own, open a new section of a stat block.
SECTION_HEADERS = ("Traits", "Actions", "Bonus Actions", "Reactions",
"Legendary Actions", "Mythic Actions")
# Page running header like "166APPENDIX B MONSTERS..." -- marks the
# transition from stat-block content into prose on the next page.
RUNNING_HEADER_RE = re.compile(r"^\d+APPENDIX B\b")
# Condition / status-word false positives that the title-case entry regex
# would otherwise mistake for a new entry name. These names commonly end a
# sentence inside an entry's body (e.g. "...while it is Bloodied.").
NAME_FALSE_POSITIVES = {
"Bloodied", "Restrained", "Grappled", "Charmed", "Frightened",
"Prone", "Incapacitated", "Stunned", "Paralyzed", "Petrified",
"Poisoned", "Blinded", "Deafened", "Invisible", "Unconscious",
"Exhaustion", "Surprised", "Furious",
"Failure", "Success", "Trigger", "Response", "Hit", "Miss",
"Habitat", "Treasure", "Bonus Actions", "Reactions", "Traits", "Actions",
"Disadvantage", "Advantage",
}
# --- Helpers ---
# Dash-like code points folded to ASCII "-". Written as escapes so the
# characters cannot be silently lost or confused when the file is copied.
# NOTE(review): the original three literals were not recoverable from this
# copy of the file; this set covers the common Unicode hyphens/dashes plus
# the minus sign -- confirm against the upstream source.
_DASH_TRANSLATION = str.maketrans({
    "\u2010": "-",  # hyphen
    "\u2011": "-",  # non-breaking hyphen
    "\u2012": "-",  # figure dash
    "\u2013": "-",  # en dash
    "\u2014": "-",  # em dash
    "\u2015": "-",  # horizontal bar
    "\u2212": "-",  # minus sign
})


def norm_dash(s: str) -> str:
    """Return *s* with Unicode dash variants replaced by ASCII ``-``.

    Text without any dash variant is returned unchanged. (Replacing an
    empty search string, as the previous body effectively did, would
    insert a hyphen between every character.)
    """
    return s.translate(_DASH_TRANSLATION)
def proficiency_bonus(cr_str: str) -> int:
    """Return the proficiency bonus for a challenge rating string.

    *cr_str* is either a whole number ("5") or a fraction ("1/4").
    Ratings up to 4 give +2, and every further band of four ratings adds
    one, capped at +9 above CR 28.
    """
    numerator, slash, denominator = cr_str.partition("/")
    rating = int(numerator) / int(denominator) if slash else int(numerator)

    # (highest CR in the band, bonus for that band)
    bands = ((4, 2), (8, 3), (12, 4), (16, 5), (20, 6), (24, 7), (28, 8))
    for ceiling, bonus in bands:
        if rating <= ceiling:
            return bonus
    return 9
def make_creature_id(source: str, name: str) -> str:
    """Build a ``<source>:<slug>`` identifier for a creature.

    The slug is the lowercased name with every run of characters outside
    ``a-z0-9`` collapsed to a single hyphen, and with leading/trailing
    hyphens removed.
    """
    lowered = name.lower()
    slug = re.sub(r"[^a-z0-9]+", "-", lowered).strip("-")
    return source.lower() + ":" + slug
def parse_passive_perception(senses_text: str) -> int | None:
# The PDF sometimes renders multi-digit values with a kerning space
# (e.g. "Passive Perception 1 1" meaning 11). Collapse those.
m = re.search(r"Passive Perception\s+(\d(?:\s*\d)*)\s*$", senses_text)
if not m:
m = re.search(r"Passive Perception\s+(\d+)", senses_text)
return int(m.group(1).replace(" ", "")) if m else None
# --- Page extraction ---
def extract_pages(pdf_path: Path) -> str:
    """Return the text of pages PAGE_START..PAGE_END, joined by newlines.

    PAGE_START and PAGE_END are 1-indexed and inclusive, so the 0-based
    page indices visited are ``PAGE_START - 1`` up to ``PAGE_END - 1``.
    """
    pages = PdfReader(str(pdf_path)).pages
    return "\n".join(
        pages[index].extract_text()
        for index in range(PAGE_START - 1, PAGE_END)
    )
# --- Block splitting ---
def find_stat_block_starts(lines: list[str]) -> list[int]:
    """Return the index of the name line of every stat block in *lines*.

    A stat block is recognised by an AC line that has a size/type/alignment
    header within the four lines above it. The creature name is taken to
    be the line directly above that header; blocks whose name line is
    missing or blank are skipped.
    """
    found: list[int] = []
    for ac_idx, raw in enumerate(lines):
        if not AC_RE.match(raw.strip()):
            continue
        # Scan upward from the line above the AC line, at most four lines,
        # never going past the top of the list.
        lowest = max(-1, ac_idx - 5)
        header_at = next(
            (k for k in range(ac_idx - 1, lowest, -1)
             if HEADER_RE.match(lines[k].strip())),
            None,
        )
        if header_at is None:
            continue
        name_at = header_at - 1
        if name_at >= 0 and lines[name_at].strip():
            found.append(name_at)
    return found
# Matches a body line whose final sentence is glued to a section header
# (e.g. "...plants.Actions"). The alternatives are derived from
# SECTION_HEADERS so every recognised header -- including "Mythic Actions",
# which the previous hand-written list omitted -- is split the same way.
SECTION_HEADER_SMUSH_RE = re.compile(
    r"^(?P<body>.+?)\.(?P<hdr>"
    + "|".join(re.escape(header) for header in SECTION_HEADERS)
    + r")\s*$"
)


def block_for(lines: list[str], start: int, next_start: int | None) -> list[str]:
    """Build the line list for one stat block.

    Takes ``lines[start:next_start]`` (to the end of *lines* when
    *next_start* is None), drops "===PAGE" marker lines, and stops at the
    first running-header line, which marks the transition to a new prose
    page. PDF smush lines like "...plants.Actions" are split into the
    sentence and the header on separate lines so section-header detection
    works. Note that split lines are emitted stripped, while all other
    lines are passed through unchanged.
    """
    end = next_start if next_start is not None else len(lines)
    out: list[str] = []
    for ln in lines[start:end]:
        if ln.startswith("===PAGE"):
            continue
        stripped = ln.strip()
        if RUNNING_HEADER_RE.match(stripped):
            break
        m = SECTION_HEADER_SMUSH_RE.match(stripped)
        if m:
            out.append(m.group("body") + ".")
            out.append(m.group("hdr"))
        else:
            out.append(ln)
    return out
# --- Vitals parsing ---
def parse_header(block: list[str]) -> dict:
    """Parse the name line and the size/type/alignment line of a block.

    ``block[0]`` is the creature name and ``block[1]`` the header line.
    Returns a dict with "name", "size", "type" and "alignment".
    Raises ValueError when the header line does not match HEADER_RE.
    """
    name = block[0].strip()
    header = block[1].strip()
    matched = HEADER_RE.match(header)
    if matched is None:
        raise ValueError(f"Bad header for {name!r}: {header!r}")
    return {
        "name": name,
        "size": matched.group(1),
        "type": matched.group(2).strip(),
        "alignment": matched.group(3).strip(),
    }
def parse_ac(line: str) -> int:
    """Return the armor class from an "AC <n> Initiative ..." line.

    Raises ValueError when the line does not match AC_RE.
    """
    found = AC_RE.match(line.strip())
    if found is None:
        raise ValueError(f"Bad AC line: {line!r}")
    armor_class = found.group(1)
    return int(armor_class)
def parse_hp(line: str) -> dict:
    """Parse an "HP <n> (<formula>)" line.

    Returns ``{"average": <int>, "formula": <str>}``.
    Raises ValueError when the line does not match HP_RE.
    """
    found = HP_RE.match(line.strip())
    if found is None:
        raise ValueError(f"Bad HP line: {line!r}")
    average, formula = found.group(1), found.group(2)
    return {"average": int(average), "formula": formula.strip()}
def parse_speed(line: str) -> str:
    """Return the speed text from a "Speed ..." line.

    Trailing periods are removed from the whole value, then every
    "<digits> ft" (with or without its own period) is rewritten as
    "<digits> ft." to match the 5etools adapter output style.
    Raises ValueError when the line does not match SPEED_RE.
    """
    found = SPEED_RE.match(line.strip())
    if found is None:
        raise ValueError(f"Bad Speed line: {line!r}")
    raw = found.group(1).rstrip(".").strip()
    return re.sub(r"(\d+)\s+ft\b\.?", r"\1 ft.", raw)
def parse_abilities(row1: str, row2: str) -> dict:
    """Parse the two ability-table rows into ``{ability: score}``.

    Each row carries three abilities; keys are the lowercased ability
    abbreviations as they appear in the row. Only the score column is
    kept; the two numbers after it are ignored.
    Raises ValueError when a row does not match ABILITY_ROW_RE.
    """
    scores = {}
    for row in (row1, row2):
        matched = ABILITY_ROW_RE.match(row.strip())
        if matched is None:
            raise ValueError(f"Bad ability row: {row!r}")
        fields = matched.groups()
        # Four captured fields per ability: name, score, and two more.
        for base in range(0, 12, 4):
            ability = fields[base].lower()
            scores[ability] = int(fields[base + 1])
    return scores
# --- Meta lines ---
META_KEYS = ("Skills", "Saving Throws", "Resistances", "Immunities",
"Vulnerabilities", "Senses", "Languages", "Gear")
def is_meta_start(line: str) -> str | None:
for key in META_KEYS:
if line.startswith(key + " ") or line.startswith(key + " "):
return key
return None
def parse_meta(lines: list[str], start: int) -> tuple[dict, int]:
    """Collect the metadata lines that precede the Challenge line.

    Scanning begins at index *start*. A line that opens with a META_KEYS
    label starts a new value; following non-blank lines are continuation
    text for that value. Lines seen before any label are ignored.

    Returns ``(meta, index)`` where *meta* maps label -> joined value text
    and *index* is the position of the "Challenge " line, or the position
    scanning stopped at when no such line exists.
    """
    gathered: dict[str, list[str]] = {}
    active: str | None = None
    # Mirrors a cursor that begins at *start* and only ever moves forward.
    stop = max(start, len(lines))

    for idx in range(start, len(lines)):
        text = lines[idx].strip()
        if not text:
            continue
        if text.startswith("Challenge "):
            stop = idx
            break
        label = is_meta_start(text)
        if label:
            active = label
            # A repeated label replaces the earlier value.
            gathered[label] = [text[len(label):].strip()]
        elif active is not None:
            gathered[active].append(text)

    meta = {label: " ".join(parts).strip() for label, parts in gathered.items()}
    return meta, stop
# --- Section discovery ---
def find_section_starts(block: list[str], start_idx: int) -> list[tuple[str, int]]:
    """Locate section header lines at or after *start_idx*.

    Returns ``(header, index)`` pairs, in order, for every line whose
    stripped text is exactly one of SECTION_HEADERS.
    """
    return [
        (text, idx)
        for idx in range(start_idx, len(block))
        if (text := block[idx].strip()) in SECTION_HEADERS
    ]
def collect_section_lines(block: list[str], start: int, end: int) -> list[str]:
    """Return the non-blank lines of ``block[start:end]``.

    Trailing whitespace is removed from each kept line; leading
    whitespace is preserved.
    """
    return [raw.rstrip() for raw in block[start:end] if raw.strip()]
def join_section_text(lines: list[str]) -> str:
    """Join section lines into one text blob, repairing wrap hyphens.

    Blank lines are dropped, the rest are joined with single spaces and
    all whitespace runs are collapsed to one space.
    """
    merged = " ".join(filter(None, map(str.strip, lines)))
    merged = re.sub(r"\s+", " ", merged)
    # A hyphen that is followed by whitespace and sits between two word
    # characters is treated as a PDF column-wrap hyphen and removed
    # together with the surrounding spaces ("civi- lization" ->
    # "civilization"). Hyphens with no whitespace after them are kept.
    merged = re.sub(r"(\w)\s*-\s+(\w)", r"\1\2", merged)
    return merged.strip()
# --- Entry splitting ---
# Entry name: title-case phrase, where each "word" is either a Capitalized
# word, a lowercase connector (of/the/and/or/in/at/on/to/with/from), a roman
# numeral, etc. Optionally followed by parenthesized modifier.
# Words are separated by a single space or hyphen. This is a pattern
# fragment (no capture groups of its own) embedded into ENTRY_BOUNDARY.
ENTRY_NAME_INNER = (
r"[A-Z][A-Za-z']*"
r"(?:[ \-](?:[A-Z][A-Za-z']*|of|the|and|or|in|at|on|to|with|from))*"
r"(?:\s*\([^)]+\))?"
)
# An entry boundary occurs at the start of the joined section text, or
# immediately after a sentence-ending punctuation. The PDF sometimes drops
# the space between the period and the new entry name, so `\s*` is fine.
# The name (named group "name") must be followed by a period, whitespace,
# and then an uppercase letter, an opening quote, or "(" -- the lookahead
# does not consume that first body character, so match.end() is where the
# entry body begins.
ENTRY_BOUNDARY = re.compile(
rf"(?:^|(?<=[\.\?\!]))\s*(?P<name>{ENTRY_NAME_INNER})\.\s+(?=[A-Z“\"(])"
)
# Trim attribution quotes / page-header bleed-through from entry bodies.
# trim_prose_tail applies these in order and cuts the body at the start of
# each pattern's first match, so only the match start matters.
PROSE_TAIL_PATTERNS = (
# Em-dash attribution: " —Chondrus, Priest of Lutheria"
re.compile(r"\s+—\s*[A-Z][^—]*$"),
# Smushed section header at end ("...plants.Actions").
re.compile(
r"\.\s*(?:Actions|Bonus\s+Actions|Reactions|Legendary\s+Actions|Traits)\s*$"
),
# Curated prose subheadings / phrase markers that follow stat blocks in
# this book. PDF reflow often merges prose onto the same logical line
# as the last action body, so the leading whitespace is optional.
# The mixed-case spellings (e.g. "MyTHIC", "GOlDEN") are matched
# literally -- presumably how the PDF's small-caps headings extract.
re.compile(
r"\.?\s*(?:Random Trapped Creature|Maenad Bacchanal|The Phalanx Formation"
r"|Reinforced Portal|TRAPPED|HUNGER FOR|PURSUIT OF|RITUAL|MyTHIC|BRON"
r"|GOlDEN|NyMPH|MARBlE|KElEDONE|SOlDIER|MINOTAUR|SATyRS|GOATlING|EMPUS"
r"|ANARCH|GyGAN|CERBERUS|WHITE STAG|STORM|FEy|VOlKAN).*",
re.DOTALL,
),
# Specific prose sentence-starts observed leaking in.
re.compile(
r"\.(?:will gleefully|Some report that|Storm Dory|This magic weapon"
r"|Thylean soldiers|Some claim|These leaders).*",
re.DOTALL,
),
# Generic small-caps heading: after a period or whitespace, a token
# starting with 2+ uppercase letters, then whitespace and up to 12
# word/space characters, then a run of 3+ uppercase letters (PDF
# small-caps section header like "BRON zE STRATEGOS", "MyTHIC BEAST",
# "GOlDEN RAM").
re.compile(r"(?<=[\.\s])[A-Z]{2}\w*\s+[\w ]{0,12}[A-Z]{3}[A-Z\w ]*"),
)
def trim_prose_tail(body: str) -> str:
    """Cut trailing prose bleed-through off an entry body.

    Each pattern in PROSE_TAIL_PATTERNS is tried in order against the
    progressively trimmed text. On a hit, everything from the match start
    onward is discarded and the remainder is re-terminated with exactly
    one period.
    """
    trimmed = body
    for pattern in PROSE_TAIL_PATTERNS:
        hit = pattern.search(trimmed)
        if hit is None:
            continue
        kept = trimmed[:hit.start()].rstrip().rstrip(".")
        trimmed = f"{kept}."
    return trimmed.strip()
def is_valid_entry_name(name: str) -> bool:
    """Return False for matches that are not really entry names.

    A candidate is rejected when it is listed in NAME_FALSE_POSITIVES,
    either as given or after removing any parenthesized modifier.
    """
    # Drop "(...)" modifiers so e.g. "Word (modifier)" is checked as "Word".
    without_modifier = re.sub(r"\s*\([^)]+\)\s*", "", name).strip()
    rejected = (
        name in NAME_FALSE_POSITIVES
        or without_modifier in NAME_FALSE_POSITIVES
    )
    return not rejected
def split_text_into_entries(text: str) -> list[tuple[str, str]]:
    """Split section text into ``(name, body)`` entries.

    Entry names are located with ENTRY_BOUNDARY (start of text or right
    after sentence punctuation) and filtered through is_valid_entry_name.
    Each body runs from the end of its boundary match to the start of the
    next accepted boundary, or to the end of the text. Returns an empty
    list when no entry name is found.
    """
    hits = [
        (m.start(), m.end(), m.group("name").strip())
        for m in ENTRY_BOUNDARY.finditer(text)
    ]
    hits = [hit for hit in hits if is_valid_entry_name(hit[2])]

    # Body i ends where boundary i + 1 begins; the last one ends at EOT.
    body_ends = [hit[0] for hit in hits[1:]] + [len(text)]
    return [
        (name, text[body_from:body_to].strip())
        for (_, body_from, name), body_to in zip(hits, body_ends)
    ]
def parse_section_traits(lines: list[str]) -> list[dict]:
    """Turn the raw lines of one section into a list of entry dicts.

    Each entry has the shape
    ``{"name": ..., "segments": [{"type": "text", "value": ...}]}``,
    with the body passed through trim_prose_tail first.
    """
    joined = join_section_text(lines)
    result = []
    for name, raw_body in split_text_into_entries(joined):
        cleaned = trim_prose_tail(raw_body)
        if not (cleaned or name):
            continue
        segment = {"type": "text", "value": cleaned}
        result.append({"name": name, "segments": [segment]})
    return result
def parse_legendary(lines: list[str], creature_name: str) -> dict | None:
    """Parse the Legendary Actions section.

    Candidate entries are located the same way as in other sections. The
    first candidate whose body mentions action vocabulary within its first
    100 characters is the first real action; all text before it becomes
    the preamble (a generic sentence naming *creature_name* is used when
    that text is empty).

    Returns ``{"preamble": str, "entries": [...]}``, where entries with an
    empty body are left out, or None when no candidate looks like an
    action.
    """
    text = join_section_text(lines)

    candidates: list[tuple[int, int, str]] = []
    for m in ENTRY_BOUNDARY.finditer(text):
        candidate_name = m.group("name").strip()
        if is_valid_entry_name(candidate_name):
            candidates.append((m.start(), m.end(), candidate_name))

    # Body i ends where candidate i + 1 begins; the last ends at EOT.
    body_ends = [c[0] for c in candidates[1:]] + [len(text)]

    action_anchors = ("Saving Throw", "Attack Roll", "Trigger", "Recharge",
                      "Melee", "Ranged", "Constitution", "Dexterity",
                      "Strength", "Intelligence", "Wisdom", "Charisma")

    first_action = None
    for pos, ((_, body_from, _), body_to) in enumerate(zip(candidates, body_ends)):
        head = text[body_from:min(body_to, body_from + 100)]
        if any(anchor in head for anchor in action_anchors):
            first_action = pos
            break
    if first_action is None:
        return None

    preamble = text[:candidates[first_action][0]].strip()
    if not preamble:
        preamble = f"{creature_name} can take Legendary Actions."

    entries = []
    for (_, body_from, name), body_to in zip(candidates[first_action:],
                                             body_ends[first_action:]):
        body = text[body_from:body_to].strip()
        if not body:
            continue
        entries.append({
            "name": name,
            "segments": [{"type": "text", "value": trim_prose_tail(body)}],
        })
    return {"preamble": preamble, "entries": entries}
# --- Top-level parse ---
def parse_block(block: list[str]) -> dict:
    """Parse one stat block (as produced by block_for) into a creature dict.

    Expected layout: name, size/type/alignment header, AC, HP, Speed, a
    line starting with "MOD", two ability rows, metadata lines, a
    Challenge line, then the sections.

    Returns the creature dict. Optional keys (savingThrows, skills,
    resist, immune, vulnerable, senses, languages, traits, actions,
    bonusActions, reactions, legendaryActions) are only present when the
    corresponding text was found.

    Raises ValueError when the block is too short, a fixed line does not
    have the expected shape, or no Challenge line is found.
    """
    # Indices 0-7 are accessed unconditionally below; report a short block
    # as a parse error instead of a bare IndexError.
    if len(block) < 8:
        raise ValueError(
            f"Stat block too short ({len(block)} lines): {block[:2]!r}"
        )

    head = parse_header(block)
    ac = parse_ac(block[2])
    hp = parse_hp(block[3])
    speed = parse_speed(block[4])
    if not block[5].strip().startswith("MOD"):
        raise ValueError(f"Expected MOD header, got: {block[5]!r}")
    abilities = parse_abilities(block[6], block[7])

    meta, ch_idx = parse_meta(block, 8)
    # parse_meta returns an index past the end when it never saw a
    # "Challenge " line.
    if ch_idx >= len(block):
        raise ValueError(f"Missing Challenge line for {head['name']!r}")
    cr_match = CR_RE.match(block[ch_idx].strip())
    if not cr_match:
        raise ValueError(f"Bad Challenge line: {block[ch_idx]!r}")
    cr_str = cr_match.group(1)

    section_starts = find_section_starts(block, ch_idx + 1)
    sections: dict[str, list[str]] = {}
    for i, (name, idx) in enumerate(section_starts):
        end = section_starts[i + 1][1] if i + 1 < len(section_starts) else len(block)
        sections[name] = collect_section_lines(block, idx + 1, end)

    creature: dict = {
        "id": make_creature_id(SOURCE_CODE, head["name"]),
        "name": head["name"],
        "source": SOURCE_CODE,
        "sourceDisplayName": SOURCE_DISPLAY,
        "size": head["size"],
        "type": head["type"],
        "alignment": head["alignment"],
        "ac": ac,
        "hp": hp,
        "speed": speed,
        "abilities": abilities,
        "cr": cr_str,
        "initiativeProficiency": 0,
        "proficiencyBonus": proficiency_bonus(cr_str),
        # Falls back to 10 when no Passive Perception value is present.
        "passive": parse_passive_perception(meta.get("Senses", "")) or 10,
    }

    # Metadata copied verbatim; order fixes the key order of the output.
    for meta_key, field in (
        ("Saving Throws", "savingThrows"),
        ("Skills", "skills"),
        ("Resistances", "resist"),
        ("Immunities", "immune"),
        ("Vulnerabilities", "vulnerable"),
    ):
        if meta_key in meta:
            creature[field] = meta[meta_key]

    if "Senses" in meta:
        # Strip the trailing Passive Perception clause, including the
        # kerned "1 1" digit form that parse_passive_perception accepts,
        # so it does not linger in the senses text.
        senses = re.sub(
            r"[;,]?\s*Passive Perception\s+\d(?:\s*\d)*\s*$", "", meta["Senses"]
        )
        senses = senses.strip().rstrip(";").strip()
        if senses:
            creature["senses"] = senses
    if "Languages" in meta:
        creature["languages"] = meta["Languages"]

    for section_name, field in (
        ("Traits", "traits"),
        ("Actions", "actions"),
        ("Bonus Actions", "bonusActions"),
        ("Reactions", "reactions"),
    ):
        if section_name in sections:
            creature[field] = parse_section_traits(sections[section_name])

    if "Legendary Actions" in sections:
        leg = parse_legendary(sections["Legendary Actions"], head["name"])
        if leg:
            creature["legendaryActions"] = leg
    return creature
def main() -> int:
    """Command-line entry point.

    Reads the PDF named by the single command-line argument, parses every
    detected stat block, and writes the successfully parsed creatures to
    ``data/bestiary/dnd-bundled.json`` (relative to this script's parent
    directory). Progress and failures are reported on stderr.

    Returns the process exit status: 0 on success, 1 for a usage error or
    a missing PDF, 2 when at least one block failed to parse (the output
    file is still written with the blocks that did parse).
    """
    if len(sys.argv) != 2:
        print("Usage: python3 extract-great-labors.py <path-to-pdf>",
              file=sys.stderr)
        return 1
    pdf_path = Path(os.path.expanduser(sys.argv[1]))
    if not pdf_path.exists():
        print(f"PDF not found: {pdf_path}", file=sys.stderr)
        return 1

    text = extract_pages(pdf_path)
    lines = text.split("\n")
    starts = find_stat_block_starts(lines)
    print(f"Detected {len(starts)} stat blocks", file=sys.stderr)

    creatures = []
    failures = []
    for i, s in enumerate(starts):
        next_s = starts[i + 1] if i + 1 < len(starts) else None
        block = block_for(lines, s, next_s)
        try:
            creatures.append(parse_block(block))
        except Exception as e:
            # Deliberate catch-all at the top-level boundary: one bad
            # block is recorded and reported, not allowed to abort the run.
            failures.append((block[0] if block else "<empty>", str(e)))

    if failures:
        print(f"\n{len(failures)} parse failures:", file=sys.stderr)
        for name, err in failures:
            print(f" - {name}: {err}", file=sys.stderr)

    out_path = Path(__file__).resolve().parent.parent / "data" / "bestiary" / "dnd-bundled.json"
    out_path.parent.mkdir(parents=True, exist_ok=True)
    # ensure_ascii=False emits non-ASCII characters as-is, so the encoding
    # must be pinned; the platform default may be unable to encode them.
    with out_path.open("w", encoding="utf-8") as f:
        json.dump(creatures, f, indent="\t", ensure_ascii=False)
        f.write("\n")
    print(f"Wrote {len(creatures)} creatures to {out_path}", file=sys.stderr)
    return 0 if not failures else 2


if __name__ == "__main__":
    sys.exit(main())