-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathsplit_chapters.py
More file actions
executable file
·94 lines (74 loc) · 3.57 KB
/
split_chapters.py
File metadata and controls
executable file
·94 lines (74 loc) · 3.57 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
#!/usr/bin/env python3
"""Split a single bible/master document into per-chapter files.
Splits on top-level headings matching `# Chapter N` (or a custom prefix),
writes each chapter to `<output-dir>/chapter_NN_<slug>.md`, and skips writes
where the destination is byte-identical to what would be written.
Usage:
python split_chapters.py bible.md --output-dir docs/chapters
python split_chapters.py bible.md --output-dir docs/chapters --prefix "# Chapter "
python split_chapters.py bible.md --output-dir docs/chapters --dry-run
"""
import argparse
import re
import sys
from pathlib import Path
# Regex template for one chapter heading line; `{prefix}` is replaced with the
# regex-escaped heading prefix. Inline flags: `m` makes `^`/`$` anchor at line
# boundaries, `i` makes the match case-insensitive. Group 1 captures the
# chapter number, which may contain dots (e.g. "18.05").
# The gap between the prefix and the number is `[^\S\n]*` (whitespace other
# than a newline) rather than `\s*`: `\s` also matches "\n", which would let a
# bare prefix line plus a following line that starts with a digit be read as
# a single two-line heading.
CHAPTER_RE_TEMPLATE = r"(?mi)^{prefix}[^\S\n]*([\d.]+)[^\n]*$"
def split(text: str, prefix: str) -> list[tuple[str, str, str]]:
    """Cut `text` into chapters at every heading line that matches `prefix`.

    Each result entry is `(source_number, heading_line, body)`:
    the number exactly as written in the heading (possibly a decimal such as
    "18.05"), the stripped heading line, and the chapter text starting at its
    heading, stripped and terminated with a single newline.

    Text that precedes the first heading is not returned. An empty list means
    no heading matched.
    """
    heading_re = re.compile(CHAPTER_RE_TEMPLATE.format(prefix=re.escape(prefix)))
    headings = list(heading_re.finditer(text))

    # A chapter runs up to the start of the next heading; the last chapter
    # runs to the end of the text.
    stops = [hit.start() for hit in headings[1:]] + [len(text)]

    return [
        (hit.group(1), hit.group(0).strip(), text[hit.start():stop].strip() + "\n")
        for hit, stop in zip(headings, stops)
    ]
def slugify(heading: str, prefix: str) -> str:
    """Build a filename-safe slug from a chapter heading.

    `# Chapter 3: Stealing Weapons` becomes `stealing_weapons`. The leading
    prefix and chapter number (decimals such as 18.05 included), plus any
    colon/dash/space separators after it, are removed; what remains is
    lower-cased and every run of characters outside a-z / 0-9 collapses to a
    single underscore. A heading with no usable title yields "untitled".
    """
    lead = re.compile(rf"^{re.escape(prefix)}\s*[\d.]+[:\-\s]*", re.IGNORECASE)
    found = lead.match(heading)
    title = heading[found.end():] if found else heading

    # Joining the alphanumeric runs is the same as replacing every other run
    # with "_" and trimming underscores from both ends.
    words = re.findall(r"[a-z0-9]+", title.lower())
    if not words:
        return "untitled"
    return "_".join(words)
def main() -> int:
    """Command-line entry point: split the input file into chapter files.

    Parses the command line, splits the input document with `split()`, and
    writes each chapter to `<output-dir>/chapter_NN_<slug>.md`, where `NN` is
    the chapter's 1-based position in the document. A destination whose
    current text already equals the chapter body is left alone and counted
    as unchanged. With `--dry-run` nothing is created or written; the planned
    writes are only printed.

    Returns:
        0 on success; 1 if the input file does not exist or contains no
        heading matching the prefix.
    """
    parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument("input", type=Path, help="Source markdown file")
    parser.add_argument("--output-dir", type=Path, required=True, help="Destination directory for chapter files")
    parser.add_argument("--prefix", default="# Chapter ", help="Heading prefix to split on (default: '# Chapter ')")
    parser.add_argument("--dry-run", action="store_true", help="Print what would be written without touching disk")
    args = parser.parse_args()

    if not args.input.is_file():
        print(f"error: input file not found: {args.input}", file=sys.stderr)
        return 1

    text = args.input.read_text(encoding="utf-8")
    chunks = split(text, args.prefix)
    if not chunks:
        print(f"error: no headings matching {args.prefix!r} found in {args.input}", file=sys.stderr)
        return 1

    # A dry run must not touch the disk, so the output directory is only
    # created when files are really going to be written.
    if not args.dry_run:
        args.output_dir.mkdir(parents=True, exist_ok=True)

    written = skipped = 0
    # The number written in the heading is ignored here: files are numbered
    # by the order in which chapters appear in the document.
    for index, (_source_number, heading, body) in enumerate(chunks, start=1):
        slug = slugify(heading, args.prefix)
        dest = args.output_dir / f"chapter_{index:02d}_{slug}.md"

        # Skip destinations that already hold exactly this chapter text.
        if dest.exists() and dest.read_text(encoding="utf-8") == body:
            skipped += 1
            continue

        if args.dry_run:
            print(f"would write {dest} ({len(body):,} chars)")
        else:
            dest.write_text(body, encoding="utf-8")
            print(f"wrote {dest} ({len(body):,} chars)")
        written += 1

    verb = "would write" if args.dry_run else "wrote"
    print(f"\n{verb} {written} file(s); {skipped} unchanged")
    return 0
# Run the CLI only when executed as a script; main()'s return value becomes
# the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())