Initial commit: GnommoEditor video pipeline

A code-first, declarative video editing system that compiles text documents into rendered video via FFmpeg. Uses a compiler-style ETL pipeline: Extract (parse inputs) → Validate → Transform (build timeline) → Render (FFmpeg).

Features:
- Text-based project definition (manuscript, transcript, JSON configs)
- Slide markers [S1], [S2] in transcript map to timed overlays
- Strict validation with fail-fast error reporting
- FFmpeg filter_complex generation with time-based enables
- CLI with validate/render/dry-run modes

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-12 11:19:38 +01:00
commit d5a8d38c9c
15 changed files with 967 additions and 0 deletions
@@ -0,0 +1,197 @@
+"""Load stage: generate and execute FFmpeg commands."""
+
+import subprocess
+from pathlib import Path
+
+from .errors import RenderError
+from .models import RenderPlan, SlideEvent, SLIDE_LAYOUTS
+
+
+def render(plan: RenderPlan, output_path: Path, verbose: bool = False) -> None:
+    """
+    Render the final video by running the FFmpeg command built from ``plan``.
+
+    The command comes from ``build_ffmpeg_command``. FFmpeg's output is
+    captured rather than streamed, so this call blocks until FFmpeg exits.
+
+    Args:
+        plan: Render plan describing the inputs, slide events and output
+            settings.
+        output_path: Destination video file; missing parent directories are
+            created.
+        verbose: When true, print the shell-quoted FFmpeg command first.
+
+    Raises:
+        RenderError: If FFmpeg cannot be started (for example the binary is
+            not installed) or exits with a non-zero status.
+    """
+    import shlex  # local import keeps this block self-contained
+
+    # Ensure output directory exists
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+
+    cmd = build_ffmpeg_command(plan, output_path)
+
+    # The filter graph contains ';', brackets and quotes, and paths may contain
+    # spaces, so quote every argument to keep the displayed command usable.
+    printable_cmd = shlex.join(cmd)
+
+    if verbose:
+        print("FFmpeg command:")
+        print(printable_cmd)
+        print()
+
+    try:
+        result = subprocess.run(
+            cmd,
+            capture_output=True,
+            text=True,
+            errors="replace",  # captured output may hold undecodable bytes
+        )
+    except OSError as err:
+        # Launch failures (missing or non-executable ffmpeg) used to escape as
+        # a bare OSError; report them through the pipeline's own error type.
+        raise RenderError(
+            "FFmpeg could not be started",
+            command=printable_cmd,
+            stderr=str(err),
+        ) from err
+
+    if result.returncode != 0:
+        raise RenderError(
+            "FFmpeg rendering failed",
+            command=printable_cmd,
+            stderr=result.stderr,
+        )
+
+
+def build_ffmpeg_command(plan: RenderPlan, output_path: Path) -> list[str]:
+    """
+    Assemble the FFmpeg argument list for ``plan``.
+
+    The order of the ``-i`` inputs fixes the stream indices that the filter
+    graph refers to: the talking head comes first, then the background video
+    (when configured), then one input per distinct slide id in order of first
+    appearance. All paths are made absolute before being added.
+    """
+    root = plan.project_path.resolve()
+    destination = output_path.resolve()
+
+    # "-y" overwrites an existing output; the talking head is always input 0.
+    args = ["ffmpeg", "-y", "-i", str(root / plan.talking_head.file)]
+
+    # Optional background video becomes input 1.
+    has_background = bool(plan.config.background_video)
+    if has_background:
+        args += ["-i", str(root / plan.config.background_video)]
+
+    # Slide images: a slide shown by several events is only added once.
+    slide_dir = root / "media" / "slides"
+    slide_inputs: list[str] = []
+    for event in plan.slide_events:
+        if event.slide_id in slide_inputs:
+            continue
+        args += ["-i", str(slide_dir / event.slide_def.image)]
+        slide_inputs.append(event.slide_id)
+
+    graph = build_filter_complex(plan, has_background, slide_inputs)
+
+    args += [
+        "-filter_complex", graph,
+        "-map", "[vout]",  # composited video from the filter graph
+        "-map", "0:a",  # audio from the talking head
+        "-t", str(plan.total_duration),  # limit output duration
+        "-c:v", "libx264",
+        "-preset", "fast",
+        "-crf", "23",
+        "-c:a", "aac",
+        "-b:a", "192k",
+        "-r", str(plan.config.fps),
+        str(destination),
+    ]
+    return args
+
+
+def build_filter_complex(
+    plan: RenderPlan,
+    has_background: bool,
+    slide_inputs: list[str],
+) -> str:
+    """
+    Build the filter_complex string for FFmpeg.
+
+    The graph stacks three layers, bottom to top:
+    - the background (scaled and cropped video, or a solid black frame),
+    - the talking head, scaled to its target height,
+    - one overlay per slide event, enabled only during its time window.
+
+    ``slide_inputs`` lists the slide ids in the order their images were added
+    as FFmpeg inputs; it is used to find each event's input index. The final
+    video stream is always labelled ``[vout]``.
+    """
+    width, height = plan.config.resolution
+
+    # FFmpeg input numbering: 0 = talking head, 1 = background (if present),
+    # slides follow directly after.
+    first_slide_input = 2 if has_background else 1
+
+    if has_background:
+        # Fill the frame, then crop whatever overflows.
+        background = (
+            f"[1:v]scale={width}:{height}:force_original_aspect_ratio=increase,"
+            f"crop={width}:{height}[bg]"
+        )
+    else:
+        # No video supplied: synthesize a solid black background.
+        background = f"color=c=black:s={width}x{height}:r={plan.config.fps}[bg]"
+
+    head_cfg = plan.config.talking_head
+    # A non-positive target height means "use the full frame height".
+    head_height = head_cfg.target_height if head_cfg.target_height > 0 else height
+
+    chain: list[str] = [
+        background,
+        f"[0:v]scale=-1:{head_height}[head]",
+        f"[bg][head]overlay=x={head_cfg.x}:y={head_cfg.y}[base]",
+    ]
+
+    events = plan.slide_events
+    previous = "base"
+
+    for n, event in enumerate(events):
+        input_idx = first_slide_input + slide_inputs.index(event.slide_id)
+        layout = SLIDE_LAYOUTS.get(event.slide_def.type, SLIDE_LAYOUTS["square"])
+
+        # Fit the slide inside its layout box without distorting it.
+        scaled = f"s{n}"
+        chain.append(
+            f"[{input_idx}:v]scale={layout['width']}:{layout['height']}:"
+            f"force_original_aspect_ratio=decrease[{scaled}]"
+        )
+
+        # The last overlay in the chain produces the final output label.
+        is_last = n == len(events) - 1
+        output = "vout" if is_last else f"v{n}"
+        window = f"between(t,{event.start_time:.3f},{event.end_time:.3f})"
+
+        chain.append(
+            f"[{previous}][{scaled}]overlay="
+            f"x={layout['x']}:y={layout['y']}:"
+            f"enable='{window}'[{output}]"
+        )
+        previous = output
+
+    # Without slides nothing has produced [vout] yet, so pass the base through.
+    if not events:
+        chain.append(f"[{previous}]copy[vout]")
+
+    return ";".join(chain)
+
+
+def generate_ffmpeg_command_string(plan: RenderPlan, output_path: Path) -> str:
+    """
+    Generate a human-readable FFmpeg command string (for debugging).
+
+    The command from ``build_ffmpeg_command`` is laid out as backslash-continued
+    lines: the executable first, then each option together with its value,
+    with the filter graph expanded to one filter per line. The text is meant
+    for reading; arguments are not shell-quoted.
+
+    Args:
+        plan: Render plan the command is built from.
+        output_path: Destination video file passed to the command builder.
+
+    Returns:
+        The formatted multi-line command.
+    """
+    # Split off the executable so it appears exactly once. Previously it was
+    # emitted both by the loop and by the hard-coded "ffmpeg" prefix.
+    executable, *args = build_ffmpeg_command(plan, output_path)
+
+    lines: list[str] = []
+    i = 0
+    while i < len(args):
+        arg = args[i]
+        value = args[i + 1] if i + 1 < len(args) else None
+
+        if arg == "-filter_complex" and value is not None:
+            # One filter per line, indented inside the quoted graph.
+            graph = value.replace(";", ";\n    ")
+            lines.append(f'  -filter_complex "\n    {graph}\n  "')
+            i += 2
+        elif arg.startswith("-") and value is not None and not value.startswith("-"):
+            # Option followed by its value: keep the pair on one line.
+            lines.append(f"  {arg} {value}")
+            i += 2
+        else:
+            # Bare flag (e.g. -y) or positional argument (the output path).
+            lines.append(f"  {arg}")
+            i += 1
+
+    return f"{executable} \\\n" + " \\\n".join(lines)