Fixing gnommo

2026-03-26 10:46:05 +01:00
parent 0e22fcfbb3
commit 7c75610fce
15 changed files with 2028 additions and 410 deletions
@@ -14,6 +14,7 @@
    "audioonly": [
      {
        "type": "audio_normalize",
+        "compress": false,
        "normalize": true,
        "target_lufs": -14,
        "target_lra": 11,
@@ -27,7 +27,6 @@ from gnommo.parser import _read_json


 def write_manuscript(data: Path, out_path: Path):
-    
    data = _read_json(data.read_text(encoding="utf-8"))
    lines = []
    i = 0
@@ -30,11 +30,15 @@ from pathlib import Path
 try:
    import requests
 except ImportError:
-    print("Error: 'requests' package is required. Run: pip install requests", file=sys.stderr)
+    print(
+        "Error: 'requests' package is required. Run: pip install requests",
+        file=sys.stderr,
+    )
    sys.exit(1)

 SYNC_FILE_LOCAL = ".gnommo_sync.json"
-SYNC_FILE_PROD  = ".gnommo_sync.prod.json"
+SYNC_FILE_PROD = ".gnommo_sync.prod.json"
+

def _sync_file(prod: bool) -> str:
    return SYNC_FILE_PROD if prod else SYNC_FILE_LOCAL  # sync-state filename: the production file when prod is true, the local file otherwise
@@ -69,19 +73,33 @@ def _write_sync(project_path: Path, data: dict, prod: bool = False):
        json.dump(data, f, indent=2)


-def cmd_handoff(project_path: Path, verbose: bool = False, file_override: str | None = None, prod: bool = False, res: str = "full") -> int:
+def cmd_handoff(
+    project_path: Path,
+    verbose: bool = False,
+    file_override: str | None = None,
+    prod: bool = False,
+    res: str = "full",
+) -> int:
    _load_env_file()

    if prod:
        api_url = os.environ.get("GNOMMOWEB_PROD_URL", "").rstrip("/")
        api_key = os.environ.get("GNOMMOWEB_PROD_API_KEY", "")
-        if not api_url: print("Error: GNOMMOWEB_PROD_URL is not set.", file=sys.stderr); return 1
-        if not api_key: print("Error: GNOMMOWEB_PROD_API_KEY is not set.", file=sys.stderr); return 1
+        if not api_url:
+            print("Error: GNOMMOWEB_PROD_URL is not set.", file=sys.stderr)
+            return 1
+        if not api_key:
+            print("Error: GNOMMOWEB_PROD_API_KEY is not set.", file=sys.stderr)
+            return 1
    else:
        api_url = os.environ.get("GNOMMOWEB_URL", "").rstrip("/")
        api_key = os.environ.get("GNOMMOWEB_API_KEY", "")
-        if not api_url: print("Error: GNOMMOWEB_URL is not set.", file=sys.stderr); return 1
-        if not api_key: print("Error: GNOMMOWEB_API_KEY is not set.", file=sys.stderr); return 1
+        if not api_url:
+            print("Error: GNOMMOWEB_URL is not set.", file=sys.stderr)
+            return 1
+        if not api_key:
+            print("Error: GNOMMOWEB_API_KEY is not set.", file=sys.stderr)
+            return 1

    if verbose:
        target = "production" if prod else "local"
@@ -104,7 +122,9 @@ def cmd_handoff(project_path: Path, verbose: bool = False, file_override: str |
    if file_override:
        video_path = Path(file_override)
    else:
-        output_filename = project.get("output") or Path(project.get("output_video", "")).name
+        output_filename = (
+            project.get("output") or Path(project.get("output_video", "")).name
+        )
        if not output_filename:
            print(
                "Error: no 'output' field in project.json and no --file provided.",
@@ -148,17 +168,23 @@ def cmd_handoff(project_path: Path, verbose: bool = False, file_override: str |

    result = r.json()
    video_version = result.get("video_version", "?")
-    video_url     = result.get("video_url", "")
+    video_url = result.get("video_url", "")

    # ── Write sync state ───────────────────────────────────────────────────────
    now_iso = datetime.now(tz=timezone.utc).isoformat(timespec="seconds")
    existing_sync = _read_sync(project_path, prod)
-    _write_sync(project_path, {
-        **existing_sync,
-        "last_handoff_at":  now_iso,
-        "video_version":    video_version,
-        "server_updated_at": result.get("asset", {}).get("updated_at", existing_sync.get("server_updated_at")),
-    }, prod)
+    _write_sync(
+        project_path,
+        {
+            **existing_sync,
+            "last_handoff_at": now_iso,
+            "video_version": video_version,
+            "server_updated_at": result.get("asset", {}).get(
+                "updated_at", existing_sync.get("server_updated_at")
+            ),
+        },
+        prod,
+    )

    print(f"✓ {project_id} → v{video_version} [processed]")
    if video_url:
@@ -170,8 +196,8 @@ def cmd_handoff(project_path: Path, verbose: bool = False, file_override: str |
 def _mime_type(path: Path) -> str:  # map a file's extension to a video MIME type string
    ext = path.suffix.lower()  # lower-cased so the lookup ignores extension case
    return {
-        ".mp4":  "video/mp4",
-        ".mov":  "video/quicktime",
+        ".mp4": "video/mp4",
+        ".mov": "video/quicktime",
        ".webm": "video/webm",
-        ".mkv":  "video/x-matroska",
+        ".mkv": "video/x-matroska",
    }.get(ext, "application/octet-stream")  # unlisted extensions fall back to the generic binary type
@@ -65,7 +65,9 @@ class ProjectConfig:
    # YouTube description fields
    description: str = ""  # Video description text for YouTube
    footer: str = ""  # Footer text (social links, subscribe CTA, etc.)
-    output_video: str = ""  # Output filename (e.g. "DISC_INT3.mp4"); placed in out/ or out/<res>/
+    output_video: str = (
+        ""  # Output filename (e.g. "DISC_INT3.mp4"); placed in out/ or out/<res>/
+    )


@dataclass
@@ -295,6 +297,10 @@ class VideoSource:
        False  # If True, skip loudnorm during preprocessing (apply after concatenation)
    )
    volume: float = 1.0  # Volume multiplier (1.0=full, >1.0=boost, <1.0=reduce)
+    layer: str = "above"  # "above" = renders on top of slides; "below" = behind slides
+    duration: Optional[float] = None  # Pre-probed file duration in seconds (set by import)
+    has_audio: Optional[bool] = None  # Pre-detected audio presence (set by import)
+    end_on: Optional[str] = None  # When video event ends: "next_slide" | "end" | "take" (None = marker-type default)


@dataclass
@@ -334,6 +340,7 @@ class AudioDefinition:
    ignore_pauses: bool = (
        False  # If True, audio continues playing during narration pauses
    )
+    duration: Optional[float] = None  # Pre-probed duration in seconds (set by import)


@dataclass
@@ -364,6 +371,8 @@ class VideoEvent:
    end_time: float
    video_source: "VideoSource"
    cutout: "CutoutDefinition"
+    cutout_name: str = ""  # resolved cutout name (e.g. "fullscreen"), for display
+    layer: str = "above"  # "above" = on top of slides; "below" = behind slides


@dataclass
@@ -508,7 +517,9 @@ class RenderPlan:
    cached_files: set = field(
        default_factory=set
    )  # Video IDs loaded from external cache (show 📁 indicator)
-    output_path: Optional[Path] = None  # Final output file path (set after plan is built)
+    output_path: Optional[
+        Path
+    ] = None  # Final output file path (set after plan is built)


 # Slide layout configurations (hardcoded for POC)
@@ -161,8 +161,35 @@ def parse_project_config(project_path: Path) -> ProjectConfig:
    except json.JSONDecodeError as e:
        raise ParseError(f"Invalid JSON: {e}", config_path)

-    # Parse cutouts (named zones for video placement)
-    cutouts: dict[str, CutoutDefinition] = {}
+    # Built-in cutouts — used by vft/vfb/vst/vsb marker shorthand.
+    # Projects can override these by defining cutouts with the same names.
+    cutouts: dict[str, CutoutDefinition] = {
+        # 100 % × 100 % at origin — for fullscreen video (vf* markers)
+        "fullscreen": CutoutDefinition(
+            x=-1,
+            y=-1,
+            height=-1,
+            width=-1,
+            x_percent=0.0,
+            y_percent=0.0,
+            height_percent=1.0,
+            width_percent=1.0,
+        ),
+        # 50 % height, square aspect, centred — for square video (vs* markers)
+        "square": CutoutDefinition(
+            x=-1,
+            y=-1,
+            height=-1,
+            width=-1,
+            x_percent=0.25,
+            y_percent=0.25,
+            height_percent=0.5,
+            width_percent=0.0,
+        ),
+    }
+
+    # Parse cutouts (named zones for video placement) — project definitions
+    # override the built-ins above.
    cutouts_data = data.get("cutouts", {})
    for cutout_name, cutout_data in cutouts_data.items():
        x, x_pct = _parse_dimension(cutout_data.get("x", 0))
@@ -243,7 +270,9 @@ def parse_slides(
    # Try cache fallback for reading JSON
    slides_path, _ = resolve_with_cache(local_slides_path, project_path)
    if not slides_path.exists():
-        raise ParseError(f"slides file not found: {local_slides_path}", local_slides_path)
+        raise ParseError(
+            f"slides file not found: {local_slides_path}", local_slides_path
+        )

    try:
        data = _read_json(slides_path)
@@ -305,12 +334,14 @@ def parse_audio(
        if "overlap" in audio_data and audio_data["overlap"]:
            overlap = parse_timestamp(audio_data["overlap"])

+        raw_duration = audio_data.get("duration")
        audio[audio_id] = AudioDefinition(
            file=audio_data["file"],
            volume=float(audio_data.get("volume", 1.0)),
            loop=bool(audio_data.get("loop", False)),
            overlap=overlap,
            ignore_pauses=bool(audio_data.get("ignore_pauses", False)),
+            duration=float(raw_duration) if raw_duration is not None else None,
        )

    return audio, audio_dir
@@ -386,7 +417,9 @@ def parse_videos(
    # Try cache fallback for reading JSON
    videos_path, _ = resolve_with_cache(local_videos_path, project_path)
    if not videos_path.exists():
-        raise ParseError(f"videos.json not found: {local_videos_path}", local_videos_path)
+        raise ParseError(
+            f"videos.json not found: {local_videos_path}", local_videos_path
+        )

    try:
        data = _read_json(videos_path)
@@ -440,6 +473,8 @@ def parse_videos(
            # take = end - begin (duration from begin to end)
            take = end_time - skip

+        raw_duration = video_data.get("duration")
+        raw_has_audio = video_data.get("has_audio")
        videos[video_id] = VideoSource(
            source_file=video_data["source_file"],
            filter=filter_list,
@@ -455,6 +490,10 @@ def parse_videos(
            use_audio_channels=video_data.get("use_audio_channels", "both"),
            defer_loudnorm=video_data.get("defer_loudnorm", False),
            volume=float(video_data.get("volume", 1.0)),
+            layer=video_data.get("layer", "above"),
+            duration=float(raw_duration) if raw_duration is not None else None,
+            has_audio=bool(raw_has_audio) if raw_has_audio is not None else None,
+            end_on=video_data.get("end_on"),
        )

    return videos, videos_dir
@@ -27,9 +27,9 @@ CHUNK_DURATION = 60
 # Resolution presets for preview/proxy workflow
 # Each entry: (width, height, subdir_name)
 RES_CONFIGS: dict[str, tuple[int, int, str] | None] = {
-    "full":  None,              # no downscale, no subdir
-    "low":   (490, 270, "low"),
-    "tiny":  (320, 180, "proxy"),  # "proxy" subdir kept for backward compat
+    "full": None,  # no downscale, no subdir
+    "low": (490, 270, "low"),
+    "tiny": (320, 180, "proxy"),  # "proxy" subdir kept for backward compat
 }

 # Keep legacy constants pointing at "tiny" values
@@ -61,10 +61,14 @@ def _video_has_alpha(video_path: Path) -> bool:
    """Check if a video file has an alpha channel."""
    cmd = [
        "ffprobe",
-        "-v", "error",
-        "-select_streams", "v:0",
-        "-show_entries", "stream=pix_fmt",
-        "-of", "default=noprint_wrappers=1:nokey=1",
+        "-v",
+        "error",
+        "-select_streams",
+        "v:0",
+        "-show_entries",
+        "stream=pix_fmt",
+        "-of",
+        "default=noprint_wrappers=1:nokey=1",
        str(video_path),
    ]
    result = subprocess.run(cmd, capture_output=True, text=True)
@@ -104,13 +108,20 @@ def create_downscaled_video(
        return out_path

    cmd = [
-        "ffmpeg", "-y",
-        "-i", str(source_path),
-        "-vf", f"scale={width}:{height}",
-        "-c:v", "libx264",
-        "-preset", "ultrafast",
-        "-crf", "28",
-        "-c:a", "copy",
+        "ffmpeg",
+        "-y",
+        "-i",
+        str(source_path),
+        "-vf",
+        f"scale={width}:{height}",
+        "-c:v",
+        "libx264",
+        "-preset",
+        "ultrafast",
+        "-crf",
+        "28",
+        "-c:a",
+        "copy",
        str(out_path),
    ]
    result = subprocess.run(cmd, capture_output=True, text=True)
@@ -204,7 +215,8 @@ def ensure_downscaled_files_exist(
    out_dir.mkdir(parents=True, exist_ok=True)

    video_files = [
-        f for f in source_dir.iterdir()
+        f
+        for f in source_dir.iterdir()
        if f.is_file()
        and f.suffix.lower() in video_extensions
        and "_processed" not in f.stem
@@ -247,6 +259,7 @@ import selectors, time, sys, subprocess

 def run_ffmpeg_with_progress(cmd, duration, description="Processing"):
    cmd = cmd.copy()
+
    insert_pos = cmd.index("-y") + 1 if "-y" in cmd else 1
    cmd[insert_pos:insert_pos] = [
        "-progress",
@@ -269,9 +282,11 @@ def run_ffmpeg_with_progress(cmd, duration, description="Processing"):
    sel.register(p.stdout, selectors.EVENT_READ)

    bar_width = 30
+    start_time = time.time()
    last_update = time.time()
    last_percent = 0
    seen_any_progress = False
+    last_log_line = ""
    logs = []

    def draw(percent, suffix=""):
@@ -287,6 +302,7 @@ def run_ffmpeg_with_progress(cmd, duration, description="Processing"):
    while True:
        # If process ended and no more output, break
        if p.poll() is not None:
+
            # drain any remaining output quickly
            while True:
                line = p.stdout.readline()
@@ -297,8 +313,12 @@ def run_ffmpeg_with_progress(cmd, duration, description="Processing"):

        events = sel.select(timeout=0.2)
        if not events:
-            # No output right now; show finalizing if we're near end
-            if (
+            if not seen_any_progress:
+                # Show elapsed time and last FFmpeg output line during init
+                elapsed = time.time() - start_time
+                hint = f" | {last_log_line[:50]}" if last_log_line else ""
+                draw(0, f"Initializing... ({elapsed:.0f}s){hint}")
+            elif (
                seen_any_progress
                and last_percent >= 99
                and (time.time() - last_update) > 1.0
@@ -311,6 +331,10 @@ def run_ffmpeg_with_progress(cmd, duration, description="Processing"):
            if not line:
                continue
            logs.append(line)
+            # Track last non-empty, non-progress-key line for init diagnostics
+            stripped = line.strip()
+            if stripped and "=" not in stripped:
+                last_log_line = stripped

            if line.startswith("out_time_ms="):
                val = line.split("=", 1)[1].strip()
@@ -332,7 +356,10 @@ def run_ffmpeg_with_progress(cmd, duration, description="Processing"):
    if p.returncode == 0:
        draw(100, "Done\n")
    else:
-        sys.stdout.write("\n")
+        code = p.returncode
+        # On macOS/Linux, -9 means SIGKILL (OOM kill by OS), -6 = SIGABRT
+        signal_hint = " (OOM kill)" if code == -9 else (" (abort)" if code == -6 else "")
+        sys.stdout.write(f"\n          FFmpeg exited with code {code}{signal_hint}\n")
        sys.stdout.flush()

    return subprocess.CompletedProcess(
@@ -340,7 +367,33 @@ def run_ffmpeg_with_progress(cmd, duration, description="Processing"):
    )


-def check_audio_channel_silent(input_path: Path, channel: str, threshold_db: float = -60.0) -> tuple[bool, float]:
+def _has_audio_stream(video_path: Path) -> bool:
+    """Return True if the file has a real (non-ghost) audio stream."""
+    result = subprocess.run(
+        [
+            "ffprobe", "-v", "error",
+            "-analyzeduration", "0",
+            "-probesize", "1000000",
+            "-select_streams", "a:0",  # only the first audio stream is inspected
+            "-show_entries", "stream=index,nb_frames",
+            "-of", "csv=p=0",  # one "index,nb_frames" CSV row, without the section-name prefix
+            str(video_path),
+        ],
+        capture_output=True,
+        text=True,
+    )
+    output = result.stdout.strip()
+    if not output:  # empty stdout: no a:0 row was printed (a failed ffprobe run also lands here; the return code is not checked)
+        return False
+    parts = output.split(",")
+    if len(parts) >= 2 and parts[1].strip() == "0":
+        return False  # Ghost audio track — header present but no sample data
+    return True  # any other nb_frames value, including a non-numeric one, is treated as real audio
+
+
+def check_audio_channel_silent(
+    input_path: Path, channel: str, threshold_db: float = -60.0
+) -> tuple[bool, float]:
    """
    Quick check whether the specified audio channel is silent.
    Uses ffmpeg volumedetect (audio-only pass, much faster than full processing).
@@ -349,9 +402,14 @@ def check_audio_channel_silent(input_path: Path, channel: str, threshold_db: flo
    """
    pan = "pan=mono|c0=c0" if channel == "left" else "pan=mono|c0=c1"
    cmd = [
-        "ffmpeg", "-i", str(input_path),
-        "-af", f"{pan},volumedetect",
-        "-f", "null", "/dev/null",
+        "ffmpeg",
+        "-i",
+        str(input_path),
+        "-af",
+        f"{pan},volumedetect",
+        "-f",
+        "null",
+        "/dev/null",
    ]
    result = subprocess.run(cmd, capture_output=True, text=True)
    for line in result.stderr.splitlines():
@@ -416,10 +474,14 @@ def detect_silence_bounds(
    total_duration = get_video_duration(input_path)

    cmd = [
-        "ffmpeg", "-i", str(input_path),
+        "ffmpeg",
+        "-i",
+        str(input_path),
        "-af",
        f"silencedetect=noise={noise_threshold_db}dB:duration={min_silence_duration}",
-        "-f", "null", "/dev/null",
+        "-f",
+        "null",
+        "/dev/null",
    ]
    result = subprocess.run(cmd, capture_output=True, text=True)

@@ -591,6 +653,14 @@ def preprocess_video(
            # Audio normalization: denoise, compress, and normalize loudness
            # Note: skip/take are NOT applied here - they're only used during concatenation
            print("        Filter: audio_normalize")
+            if not _has_audio_stream(current_input):
+                raise PreprocessError(
+                    f"audio_normalize requires an audio stream, but '{current_input.name}' has none.\n"
+                    f"    Check that the source file has audio, or remove audio_normalize from the filter list.",
+                    filter_type="audio_normalize",
+                    command="",
+                    stderr="",
+                )
            step_output = gnommo_scratch / f"{video_id}_batch{batch_num}_audio.mov"
            intermediate_files.append(step_output)
            apply_audio_normalize(
@@ -1122,9 +1192,7 @@ def apply_combined_video_filters_chunked(

    num_chunks = int(duration / CHUNK_DURATION) + 1
    chunk_files: list[Path] = []
-    chunk_tasks: list[
-        tuple
-    ] = []  # (index, chunk_path, start_time, chunk_duration)
+    chunk_tasks: list[tuple] = []  # (index, chunk_path, start_time, chunk_duration)

    # Build list of chunk tasks
    for i in range(num_chunks):
@@ -1179,11 +1247,16 @@ def apply_combined_video_filters_chunked(
        print(f"          Concatenating {len(chunk_files)} chunks → {output_path.name}")

    concat_cmd = [
-        "ffmpeg", "-y",
-        "-f", "concat",
-        "-safe", "0",
-        "-i", str(concat_list),
-        "-c", "copy",
+        "ffmpeg",
+        "-y",
+        "-f",
+        "concat",
+        "-safe",
+        "0",
+        "-i",
+        str(concat_list),
+        "-c",
+        "copy",
        str(output_path),
    ]
    concat_result = run_ffmpeg_with_progress(concat_cmd, duration, "Concatenating")
@@ -1953,12 +2026,14 @@ def parse_audio_normalize_config(config: dict[str, Any]) -> AudioNormalizeConfig
    # Parse EQ bands
    eq_bands = []
    for band in config.get("eq_bands", []):
-        eq_bands.append(EQBand(
-            freq=float(band.get("freq", 1000)),
-            gain=float(band.get("gain", 0)),
-            q=float(band.get("q", 1.0)),
-            type=str(band.get("type", "peak")),
-        ))
+        eq_bands.append(
+            EQBand(
+                freq=float(band.get("freq", 1000)),
+                gain=float(band.get("gain", 0)),
+                q=float(band.get("q", 1.0)),
+                type=str(band.get("type", "peak")),
+            )
+        )

    return AudioNormalizeConfig(
        # Parametric EQ
@@ -2163,12 +2238,18 @@ def stitch_narration_segments(
            # Preserve alpha with ProRes 4444
            cmd.extend(
                [
-                    "-vf", "fps=30,format=yuva444p10le",
-                    "-c:v", "prores_ks",
-                    "-profile:v", "4",
-                    "-pix_fmt", "yuva444p10le",
-                    "-c:a", "pcm_s16le",
-                    "-avoid_negative_ts", "make_zero",
+                    "-vf",
+                    "fps=30,format=yuva444p10le",
+                    "-c:v",
+                    "prores_ks",
+                    "-profile:v",
+                    "4",
+                    "-pix_fmt",
+                    "yuva444p10le",
+                    "-c:a",
+                    "pcm_s16le",
+                    "-avoid_negative_ts",
+                    "make_zero",
                    str(trimmed_path),
                ]
            )
@@ -2176,14 +2257,22 @@ def stitch_narration_segments(
            # No alpha - use fast h264 encoding
            cmd.extend(
                [
-                    "-vf", "fps=30",
-                    "-c:v", "libx264",
-                    "-preset", "fast",
-                    "-crf", "18",
-                    "-c:a", "aac",
-                    "-b:a", "192k",
-                    "-avoid_negative_ts", "make_zero",
-                    "-movflags", "+faststart",
+                    "-vf",
+                    "fps=30",
+                    "-c:v",
+                    "libx264",
+                    "-preset",
+                    "fast",
+                    "-crf",
+                    "18",
+                    "-c:a",
+                    "aac",
+                    "-b:a",
+                    "192k",
+                    "-avoid_negative_ts",
+                    "make_zero",
+                    "-movflags",
+                    "+faststart",
                    str(trimmed_path),
                ]
            )
@@ -2211,12 +2300,18 @@ def stitch_narration_segments(
    cmd = [
        "ffmpeg",
        "-y",
-        "-f", "concat",
-        "-safe", "0",
-        "-i", str(concat_list),
-        "-c:v", "copy",
-        "-c:a", "copy",
-        "-movflags", "+faststart",
+        "-f",
+        "concat",
+        "-safe",
+        "0",
+        "-i",
+        str(concat_list),
+        "-c:v",
+        "copy",
+        "-c:a",
+        "copy",
+        "-movflags",
+        "+faststart",
        str(output_path),
    ]

@@ -2235,16 +2330,26 @@ def stitch_narration_segments(
    )
    if needs_loudnorm:
        print("    Applying loudness normalization to stitched output...")
-        normalized_path = output_path.parent / f"{output_path.stem}_normalized{output_path.suffix}"
+        normalized_path = (
+            output_path.parent / f"{output_path.stem}_normalized{output_path.suffix}"
+        )

        # Use EBU R128 loudnorm targeting YouTube's recommended levels
        loudnorm_cmd = [
-            "ffmpeg", "-y",
-            "-i", str(output_path),
-            "-c:v", "copy",
-            "-af", "loudnorm=I=-14:LRA=11:TP=-1.5",
-            "-c:a", "aac", "-b:a", "192k",
-            "-movflags", "+faststart",
+            "ffmpeg",
+            "-y",
+            "-i",
+            str(output_path),
+            "-c:v",
+            "copy",
+            "-af",
+            "loudnorm=I=-14:LRA=11:TP=-1.5",
+            "-c:a",
+            "aac",
+            "-b:a",
+            "192k",
+            "-movflags",
+            "+faststart",
            str(normalized_path),
        ]

@@ -29,11 +29,15 @@ from pathlib import Path
 try:
    import requests
 except ImportError:
-    print("Error: 'requests' package is required. Run: pip install requests", file=sys.stderr)
+    print(
+        "Error: 'requests' package is required. Run: pip install requests",
+        file=sys.stderr,
+    )
    sys.exit(1)

 SYNC_FILE_LOCAL = ".gnommo_sync.json"
-SYNC_FILE_PROD  = ".gnommo_sync.prod.json"
+SYNC_FILE_PROD = ".gnommo_sync.prod.json"
+

def _sync_file(prod: bool) -> str:
    return SYNC_FILE_PROD if prod else SYNC_FILE_LOCAL  # sync-state filename: the production file when prod is true, the local file otherwise
@@ -77,19 +81,29 @@ def _parse_ts(ts_str) -> datetime | None:
        return None


-def cmd_pull(project_path: Path, verbose: bool = False, force: bool = False, prod: bool = False) -> int:
+def cmd_pull(
+    project_path: Path, verbose: bool = False, force: bool = False, prod: bool = False
+) -> int:
    _load_env_file()

    if prod:
        api_url = os.environ.get("GNOMMOWEB_PROD_URL", "").rstrip("/")
        api_key = os.environ.get("GNOMMOWEB_PROD_API_KEY", "")
-        if not api_url: print("Error: GNOMMOWEB_PROD_URL is not set.", file=sys.stderr); return 1
-        if not api_key: print("Error: GNOMMOWEB_PROD_API_KEY is not set.", file=sys.stderr); return 1
+        if not api_url:
+            print("Error: GNOMMOWEB_PROD_URL is not set.", file=sys.stderr)
+            return 1
+        if not api_key:
+            print("Error: GNOMMOWEB_PROD_API_KEY is not set.", file=sys.stderr)
+            return 1
    else:
        api_url = os.environ.get("GNOMMOWEB_URL", "").rstrip("/")
        api_key = os.environ.get("GNOMMOWEB_API_KEY", "")
-        if not api_url: print("Error: GNOMMOWEB_URL is not set.", file=sys.stderr); return 1
-        if not api_key: print("Error: GNOMMOWEB_API_KEY is not set.", file=sys.stderr); return 1
+        if not api_url:
+            print("Error: GNOMMOWEB_URL is not set.", file=sys.stderr)
+            return 1
+        if not api_key:
+            print("Error: GNOMMOWEB_API_KEY is not set.", file=sys.stderr)
+            return 1

    if verbose:
        target = "production" if prod else "local"
@@ -176,19 +190,23 @@ def cmd_pull(project_path: Path, verbose: bool = False, force: bool = False, pro

    now_iso = datetime.now(tz=timezone.utc).isoformat(timespec="seconds")
    existing_sync = _read_sync(project_path, prod)
-    _write_sync(project_path, {
-        **existing_sync,
-        "last_pulled_at":    now_iso,
-        "server_updated_at": server_updated_at,
-        "last_pushed_at":    existing_sync.get("last_pushed_at"),
-    }, prod)
+    _write_sync(
+        project_path,
+        {
+            **existing_sync,
+            "last_pulled_at": now_iso,
+            "server_updated_at": server_updated_at,
+            "last_pushed_at": existing_sync.get("last_pushed_at"),
+        },
+        prod,
+    )

    return 0


 def _merge_parent(local: dict, server: dict, verbose: bool):
    """Update parent project.json: name, description, shorts index (slugs)."""
-    local["name"]        = server.get("title", local.get("name"))
+    local["name"] = server.get("title", local.get("name"))
    local["description"] = server.get("description") or local.get("description")
    # shorts is a list of slugs — update from server's shorts list
    server_shorts = server.get("shorts", [])
@@ -42,11 +42,15 @@ from pathlib import Path
 try:
    import requests
 except ImportError:
-    print("Error: 'requests' package is required. Run: pip install requests", file=sys.stderr)
+    print(
+        "Error: 'requests' package is required. Run: pip install requests",
+        file=sys.stderr,
+    )
    sys.exit(1)

 SYNC_FILE_LOCAL = ".gnommo_sync.json"
-SYNC_FILE_PROD  = ".gnommo_sync.prod.json"
+SYNC_FILE_PROD = ".gnommo_sync.prod.json"
+

def _sync_file(prod: bool) -> str:
    return SYNC_FILE_PROD if prod else SYNC_FILE_LOCAL  # sync-state filename: the production file when prod is true, the local file otherwise
@@ -90,19 +94,29 @@ def _parse_ts(ts_str) -> datetime | None:
        return None


-def cmd_push(project_path: Path, verbose: bool = False, force: bool = False, prod: bool = False) -> int:
+def cmd_push(
+    project_path: Path, verbose: bool = False, force: bool = False, prod: bool = False
+) -> int:
    _load_env_file()

    if prod:
        api_url = os.environ.get("GNOMMOWEB_PROD_URL", "").rstrip("/")
        api_key = os.environ.get("GNOMMOWEB_PROD_API_KEY", "")
-        if not api_url: print("Error: GNOMMOWEB_PROD_URL is not set.", file=sys.stderr); return 1
-        if not api_key: print("Error: GNOMMOWEB_PROD_API_KEY is not set.", file=sys.stderr); return 1
+        if not api_url:
+            print("Error: GNOMMOWEB_PROD_URL is not set.", file=sys.stderr)
+            return 1
+        if not api_key:
+            print("Error: GNOMMOWEB_PROD_API_KEY is not set.", file=sys.stderr)
+            return 1
    else:
        api_url = os.environ.get("GNOMMOWEB_URL", "").rstrip("/")
        api_key = os.environ.get("GNOMMOWEB_API_KEY", "")
-        if not api_url: print("Error: GNOMMOWEB_URL is not set.", file=sys.stderr); return 1
-        if not api_key: print("Error: GNOMMOWEB_API_KEY is not set.", file=sys.stderr); return 1
+        if not api_url:
+            print("Error: GNOMMOWEB_URL is not set.", file=sys.stderr)
+            return 1
+        if not api_key:
+            print("Error: GNOMMOWEB_API_KEY is not set.", file=sys.stderr)
+            return 1

    if verbose:
        target = "production" if prod else "local"
@@ -160,11 +174,15 @@ def cmd_push(project_path: Path, verbose: bool = False, force: bool = False, pro
    # ── Write sync state ──────────────────────────────────────────────────────
    now_iso = datetime.now(tz=timezone.utc).isoformat(timespec="seconds")
    existing_sync = _read_sync(project_path, prod)
-    _write_sync(project_path, {
-        **existing_sync,
-        "last_pushed_at":    now_iso,
-        "server_updated_at": server_updated_at,
-    }, prod)
+    _write_sync(
+        project_path,
+        {
+            **existing_sync,
+            "last_pushed_at": now_iso,
+            "server_updated_at": server_updated_at,
+        },
+        prod,
+    )

    # ── Print summary ─────────────────────────────────────────────────────────
    asset = result.get("asset", {})
@@ -176,7 +194,9 @@ def cmd_push(project_path: Path, verbose: bool = False, force: bool = False, pro
        print(f"✓ {project_id} → gn_asset #{asset.get('id')} ({asset.get('name')})")
        if verbose:
            script_len = len(asset.get("script") or "")
-            print(f"  server.script: {script_len} chars | fps={asset.get('fps')} res={asset.get('resolution')}")
+            print(
+                f"  server.script: {script_len} chars | fps={asset.get('fps')} res={asset.get('resolution')}"
+            )

    return 0

@@ -201,19 +221,19 @@ def _build_parent_payload(project: dict, project_path: Path, verbose: bool) -> d
            print(f"  no manuscript field in project.json")

    return {
-        "project_id":       project["id"],
-        "name":             project["name"],
-        "description":      project.get("description"),
-        "coursecode":       project.get("coursecode"),
-        "script_content":   script_content,
-        "resolution":       project.get("resolution"),
-        "fps":              project.get("fps"),
+        "project_id": project["id"],
+        "name": project["name"],
+        "description": project.get("description"),
+        "coursecode": project.get("coursecode"),
+        "script_content": script_content,
+        "resolution": project.get("resolution"),
+        "fps": project.get("fps"),
        "duration_seconds": project.get("duration_seconds"),
-        "hook":             project.get("hook"),
+        "hook": project.get("hook"),
        "platform_targets": project.get("platform_targets"),
-        "status":           project.get("status"),
-        "youtube_url":      project.get("youtube_url"),
-        "shorts":           project.get("shorts", []),
+        "status": project.get("status"),
+        "youtube_url": project.get("youtube_url"),
+        "shorts": project.get("shorts", []),
    }


@@ -231,14 +251,14 @@ def _build_short_payload(project: dict, project_path: Path, verbose: bool) -> di
            print(f"  Warning: script file not found: {script_path}", file=sys.stderr)

    return {
-        "project_id":       project["id"],
-        "name":             project["name"],
-        "description":      project.get("description"),
-        "parent_project":   project["parent_project"],
-        "hook":             project.get("hook"),
-        "script_content":   script_content,
+        "project_id": project["id"],
+        "name": project["name"],
+        "description": project.get("description"),
+        "parent_project": project["parent_project"],
+        "hook": project.get("hook"),
+        "script_content": script_content,
        "platform_targets": project.get("platform_targets", ["youtube"]),
-        "resolution":       project.get("resolution"),
-        "fps":              project.get("fps"),
+        "resolution": project.get("resolution"),
+        "fps": project.get("fps"),
        "duration_seconds": project.get("duration_seconds"),
    }
@@ -22,12 +22,46 @@ from .preprocessor import run_ffmpeg_with_progress


 def _get_audio_duration(audio_path: Path) -> float:
-    """Get duration of an audio file using ffprobe."""
+    """Get duration of an audio file using ffprobe.
+
+    For MP3 files, counts packets directly to get an accurate duration regardless
+    of whether the file has a Xing/VBRI header. Falls back to format duration for
+    other formats.
+    """
+    if audio_path.suffix.lower() == ".mp3":
+        # Count actual packets rather than trusting the header estimate.
+        # This is slower but accurate for headerless VBR/CBR MP3s.
+        cmd = [
+            "ffprobe",
+            "-v",
+            "error",
+            "-count_packets",
+            "-show_entries",
+            "stream=nb_read_packets,duration",
+            "-select_streams",
+            "a:0",
+            "-of",
+            "default=noprint_wrappers=1",
+            str(audio_path),
+        ]
+        result = subprocess.run(cmd, capture_output=True, text=True)
+        if result.returncode == 0:
+            # Keys are kept (key=value) so nb_read_packets is never returned as seconds.
+            for line in result.stdout.splitlines():
+                key, _, value = line.strip().partition("=")
+                try:
+                    if key == "duration" and float(value) > 0:
+                        return float(value)
+                except ValueError:
+                    continue  # e.g. duration=N/A: fall through to the format probe
    cmd = [
        "ffprobe",
-        "-v", "error",
-        "-show_entries", "format=duration",
-        "-of", "default=noprint_wrappers=1:nokey=1",
+        "-v",
+        "error",
+        "-show_entries",
+        "format=duration",
+        "-of",
+        "default=noprint_wrappers=1:nokey=1",
        str(audio_path),
    ]
    result = subprocess.run(cmd, capture_output=True, text=True)
@@ -208,16 +242,28 @@ def _resolve_video_path(


 def _has_audio_stream(video_path: Path) -> bool:
-    """Check if a video file contains an audio stream using ffprobe."""
+    """Check if a video file contains a non-empty audio stream.
+
+    Uses -analyzeduration 0 to avoid the slow avformat_find_stream_info() scan
+    that happens when an MP4 has a declared audio track with no actual frames —
+    ffprobe would otherwise scan the entire file looking for audio packets.
+
+    Also checks nb_frames to reject ghost audio tracks (stream header exists in
+    the moov atom but no sample data in stsc/stsz).
+    """
    result = subprocess.run(
        [
            "ffprobe",
            "-v",
            "error",
+            "-analyzeduration",
+            "0",
+            "-probesize",
+            "1000000",
            "-select_streams",
-            "a",
+            "a:0",
            "-show_entries",
-            "stream=index",
+            "stream=index,nb_frames",
            "-of",
            "csv=p=0",
            str(video_path),
@@ -225,7 +271,16 @@ def _has_audio_stream(video_path: Path) -> bool:
        capture_output=True,
        text=True,
    )
-    return bool(result.stdout.strip())
+    output = result.stdout.strip()
+    if not output:
+        return False
+    # output is "index" or "index,nb_frames"
+    parts = output.split(",")
+    if len(parts) >= 2:
+        nb_frames = parts[1].strip()
+        if nb_frames == "0":
+            return False  # Ghost audio track — declared but no sample data
+    return True


 def _build_audio_channel_filter(use_audio_channels: str) -> str:
@@ -263,11 +318,18 @@ def build_ffmpeg_command(plan: RenderPlan, output_path: Path) -> list[str]:
    # Add -ss seek BEFORE -i for skip parameter and/or partial rendering
    always_visible_inputs: list[int] = []
    for video_id, video_source, cutout in plan.narration_videos:
-        video_path = _resolve_video_path(videos_dir, video_source, shared_assets_dir, project_path)
+        video_path = _resolve_video_path(
+            videos_dir, video_source, shared_assets_dir, project_path
+        )
        # Combine video skip setting with partial render offset
        total_seek = video_source.skip + plan.input_seek_time
        if total_seek > 0:
            cmd.extend(["-ss", f"{total_seek:.3f}"])
+        # Skip stream analysis — codec params are in the container header, and
+        # duration is already known by gnommo via ffprobe (plan.total_duration).
+        # Without this, FFmpeg reads 100MB+ of compressed data per input at 4K
+        # bitrates before encoding starts ("Estimating duration from bitrate").
+        cmd.extend(["-analyzeduration", "0", "-probesize", "1000"])
        cmd.extend(["-i", str(video_path)])
        always_visible_inputs.append(input_idx)
        input_idx += 1
@@ -283,18 +345,26 @@ def build_ffmpeg_command(plan: RenderPlan, output_path: Path) -> list[str]:
        shared_assets_dir = project_path.parent / "shared_assets"
        videos_json_bg = shared_assets_dir / "videos.json"
        if not videos_json_bg.exists():
-            raise RenderError(f"shared_assets/videos.json not found (needed for background handle '{bg_handle}')")
+            raise RenderError(
+                f"shared_assets/videos.json not found (needed for background handle '{bg_handle}')"
+            )
        bg_videos = _read_json(videos_json_bg)
        if bg_handle not in bg_videos:
-            raise RenderError(f"Background handle '{bg_handle}' not found in shared_assets/videos.json")
+            raise RenderError(
+                f"Background handle '{bg_handle}' not found in shared_assets/videos.json"
+            )
        bg_path = shared_assets_dir / bg_videos[bg_handle]["source_file"]
        if not bg_path.exists():
-            raise RenderError(f"Background file not found: {bg_path} (from handle '{bg_handle}')")
+            raise RenderError(
+                f"Background file not found: {bg_path} (from handle '{bg_handle}')"
+            )
        image_extensions = {".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tiff", ".webp"}
        bg_is_image = bg_path.suffix.lower() in image_extensions
        # Loop background videos infinitely
        if not bg_is_image:
            cmd.extend(["-stream_loop", "-1"])
+        # Duration of background video is irrelevant (looped or image) — skip analysis
+        cmd.extend(["-analyzeduration", "0", "-probesize", "1000"])
        cmd.extend(["-i", str(bg_path)])
        bg_idx = input_idx
        input_idx += 1
@@ -325,14 +395,24 @@ def build_ffmpeg_command(plan: RenderPlan, output_path: Path) -> list[str]:
        video_path = _resolve_video_path(
            videos_dir, event.video_source, shared_assets_dir, project_path
        )
-        # Seek to skip point before loading input
        skip = event.video_source.skip
        if skip > 0:
            cmd.extend(["-ss", f"{skip:.3f}"])
+        cmd.extend(["-analyzeduration", "0", "-probesize", "1000"])
+        # Use pre-probed duration to tell FFmpeg exactly how much to read,
+        # preventing scans of ghost audio tracks on empty MP4 audio streams.
+        if event.video_source.duration is not None:
+            remaining = event.video_source.duration - skip
+            if remaining > 0:
+                cmd.extend(["-t", f"{remaining:.3f}"])
        cmd.extend(["-i", str(video_path)])
        video_inputs[i] = input_idx
        input_idx += 1
-        if _has_audio_stream(video_path):
+        has_audio = event.video_source.has_audio
+        if has_audio is None:
+            print(f"  Warning: no cached metadata for '{event.video_source.source_file}' — run 'gnommo import' to avoid slow probing")
+            has_audio = _has_audio_stream(video_path)
+        if has_audio:
            video_events_with_audio.add(i)

    # Input: outro videos (play after narration ends)
@@ -343,14 +423,22 @@ def build_ffmpeg_command(plan: RenderPlan, output_path: Path) -> list[str]:
        video_path = _resolve_video_path(
            videos_dir, event.video_source, shared_assets_dir, project_path
        )
-        # Seek to skip point before loading input
        skip = event.video_source.skip
        if skip > 0:
            cmd.extend(["-ss", f"{skip:.3f}"])
+        cmd.extend(["-analyzeduration", "0", "-probesize", "1000"])
+        if event.video_source.duration is not None:
+            remaining = event.video_source.duration - skip
+            if remaining > 0:
+                cmd.extend(["-t", f"{remaining:.3f}"])
        cmd.extend(["-i", str(video_path)])
        outro_inputs[i] = input_idx
        input_idx += 1
-        if _has_audio_stream(video_path):
+        has_audio = event.video_source.has_audio
+        if has_audio is None:
+            print(f"  Warning: no cached metadata for '{event.video_source.source_file}' — run 'gnommo import' to avoid slow probing")
+            has_audio = _has_audio_stream(video_path)
+        if has_audio:
            outro_events_with_audio.add(i)

    # Track where audio inputs start
@@ -365,12 +453,24 @@ def build_ffmpeg_command(plan: RenderPlan, output_path: Path) -> list[str]:
        if event.audio_id not in audio_inputs:
            audio_path = audio_dir / event.audio_def.file
            audio_path, _ = resolve_with_cache(audio_path, project_path)
+            # Use pre-probed duration from audio.json if available (set by import).
+            # For MP3 without Xing/VBRI headers this is critical — FFmpeg otherwise
+            # scans the whole file to estimate duration (100s+ for large files).
+            # Fall back to live probe only for MP3 when duration wasn't pre-cached.
+            file_duration = event.audio_def.duration
+            if file_duration is None and audio_path.suffix.lower() == ".mp3":
+                file_duration = _get_audio_duration(audio_path)
+            if file_duration is not None:
+                cmd.extend(["-t", str(file_duration)])
            cmd.extend(["-i", str(audio_path)])
            audio_inputs[event.audio_id] = input_idx
            input_idx += 1
-            # Cache duration if this audio uses crossfade looping
+            # Cache duration for crossfade loop filter
            if event.audio_def.loop and event.audio_def.overlap:
-                audio_durations[event.audio_id] = _get_audio_duration(audio_path)
+                audio_durations[event.audio_id] = (
+                    file_duration if file_duration is not None
+                    else _get_audio_duration(audio_path)
+                )

    # Build filter_complex
    filter_complex = build_filter_complex(
@@ -418,7 +518,7 @@ def build_ffmpeg_command(plan: RenderPlan, output_path: Path) -> list[str]:
            "-preset",
            "fast",
            "-crf",
-            "23",
+            "20",
            "-c:a",
            "aac",
            "-b:a",
@@ -793,6 +893,43 @@ def build_filter_complex(
                )
                current_label = next_label

+    # Add "below-slides" triggered video overlays (vfb/vsb or layer="below")
+    for i, event in enumerate(plan.video_events):
+        if event.layer != "below":
+            continue
+        video_idx = video_inputs[i]
+        cut_x, cut_y, cut_width, cut_height = _calculate_cutout_position(
+            event.cutout, width, height
+        )
+        # Clamp the visible window to `take` when the source defines one.
+        duration = event.end_time - event.start_time
+        if event.video_source.take is not None:
+            duration = min(duration, event.video_source.take)
+        effective_end = event.start_time + duration
+
+        zoom = event.video_source.zoom
+        zoomed_width = int(cut_width * zoom)
+        zoomed_height = int(cut_height * zoom)
+        # Convert to rgba FIRST (before scale/crop) so alpha survives the chain.
+        video_label = f"tvb{i}"
+        start_pts = event.start_time
+        filters.append(
+            f"[{video_idx}:v]format=rgba,"
+            f"setpts=PTS-STARTPTS+{start_pts:.3f}/TB,"
+            f"scale={zoomed_width}:{zoomed_height}:force_original_aspect_ratio=increase,"
+            f"crop={cut_width}:{cut_height}:(iw-{cut_width})/2:(ih-{cut_height})/2"
+            f"[{video_label}]"
+        )
+        # format=auto lets FFmpeg pick a compositing format that respects alpha.
+        next_label = f"tvbbase{i}"
+        enable_expr = f"between(t\\,{event.start_time:.3f}\\,{effective_end:.3f})"
+        filters.append(
+            f"[{current_label}][{video_label}]overlay="
+            f"x={cut_x}:y={cut_y}:enable={enable_expr}:format=auto"
+            f"[{next_label}]"
+        )
+        current_label = next_label
+
    # Add slide overlays with time-based enable
    for i, event in enumerate(plan.slide_events):
        slide_idx = slide_inputs[event.slide_id]
@@ -815,8 +952,10 @@ def build_filter_complex(

        current_label = next_label

-    # Add triggered video overlays with time-based enable
+    # Add "above-slides" triggered video overlays (vft/vst or layer="above")
    for i, event in enumerate(plan.video_events):
+        if event.layer != "above":
+            continue
        video_idx = video_inputs[i]
        cut_x, cut_y, cut_width, cut_height = _calculate_cutout_position(
            event.cutout, width, height
@@ -836,22 +975,25 @@ def build_filter_complex(
        # Scale to cover the zoomed area (like CSS object-fit: cover)
        # Then crop to cutout dimensions (centered)
        # Use setpts to sync video start with overlay enable time
+        # IMPORTANT: convert to rgba FIRST (before scale/crop) so the alpha channel
+        # is preserved throughout. scale in yuva444p10le can silently strip alpha.
        video_label = f"tv{i}"
        start_pts = event.start_time
        filters.append(
-            f"[{video_idx}:v]format=yuva444p10le,"
+            f"[{video_idx}:v]format=rgba,"
            f"setpts=PTS-STARTPTS+{start_pts:.3f}/TB,"
            f"scale={zoomed_width}:{zoomed_height}:force_original_aspect_ratio=increase,"
-            f"crop={cut_width}:{cut_height}:(iw-{cut_width})/2:(ih-{cut_height})/2,"
-            f"format=rgba[{video_label}]"
+            f"crop={cut_width}:{cut_height}:(iw-{cut_width})/2:(ih-{cut_height})/2"
+            f"[{video_label}]"
        )

-        # Overlay with time-based enable
+        # Overlay with time-based enable; format=auto lets FFmpeg pick the right
+        # compositing format so the RGBA alpha channel is respected.
        next_label = f"tvbase{i}"
        enable_expr = f"between(t\\,{event.start_time:.3f}\\,{effective_end:.3f})"
        filters.append(
            f"[{current_label}][{video_label}]overlay="
-            f"x={cut_x}:y={cut_y}:enable={enable_expr}"
+            f"x={cut_x}:y={cut_y}:enable={enable_expr}:format=auto"
            f"[{next_label}]"
        )

@@ -950,13 +1092,17 @@ def build_filter_complex(
            _, first_video_source, _ = plan.narration_videos[0]
            use_channels = first_video_source.use_audio_channels
            if use_channels == "auto":
-                narration_path = _resolve_video_path(videos_dir, first_video_source, shared_assets_dir, project_path)
+                narration_path = _resolve_video_path(
+                    videos_dir, first_video_source, shared_assets_dir, project_path
+                )
                use_channels = _resolve_auto_channel(narration_path)
            channel_filter = _build_audio_channel_filter(use_channels)
            narration_volume = first_video_source.volume

        # Build volume filter if not 1.0
-        volume_filter = f"volume={narration_volume:.2f}" if narration_volume != 1.0 else ""
+        volume_filter = (
+            f"volume={narration_volume:.2f}" if narration_volume != 1.0 else ""
+        )

        # Use narration_end_time to stop audio before outro (if outro exists)
        audio_end_time = (
@@ -980,7 +1126,9 @@ def build_filter_complex(
                )
                audio_labels_to_mix.append("[main_aud]")
            elif filter_parts:
-                filters.append(f"[{main_audio_idx}:a]{','.join(filter_parts)}[main_aud]")
+                filters.append(
+                    f"[{main_audio_idx}:a]{','.join(filter_parts)}[main_aud]"
+                )
                audio_labels_to_mix.append("[main_aud]")
            else:
                audio_labels_to_mix.append(f"[{main_audio_idx}:a]")
@@ -1066,7 +1214,10 @@ def build_filter_complex(
                        label = f"aud{i}"
                        delay_ms = int(event.start_time * 1000)

-                        if event.audio_def.overlap and event.audio_id in audio_durations:
+                        if (
+                            event.audio_def.overlap
+                            and event.audio_id in audio_durations
+                        ):
                            # Crossfade loop: overlap copies with fade in/out
                            audio_dur = audio_durations[event.audio_id]
                            crossfade_filters = _build_crossfade_loop_filter(
@@ -180,7 +180,9 @@ def words_to_srt(
    srt_lines = []
    for idx, (start, end, text) in enumerate(segments, 1):
        srt_lines.append(str(idx))
-        srt_lines.append(f"{_format_srt_timestamp(start)} --> {_format_srt_timestamp(end)}")
+        srt_lines.append(
+            f"{_format_srt_timestamp(start)} --> {_format_srt_timestamp(end)}"
+        )
        srt_lines.append(text)
        srt_lines.append("")  # Blank line between entries

@@ -1,6 +1,7 @@
 """Transform stage: resolve timings and build render plan."""

 import re
+import string
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Optional
@@ -99,6 +100,16 @@ def _normalize_text(text: str) -> str:
    return text.strip()


+def _normalize_token(word: str) -> str:
+    """Normalize a single word token for comparison.
+
+    Lowercases the word and strips leading/trailing punctuation. Both ASCII
+    punctuation (``string.punctuation``) and Unicode punctuation (general
+    category ``P*``: curly quotes, ellipsis, em dash, ...) are stripped, so
+    typographic characters attached to a word do not block a match.
+    Interior characters (e.g. apostrophes in contractions) are preserved so
+    "don't" stays "don't".
+    Applied to both transcript tokens and phrase words at comparison time.
+
+    Args:
+        word: A single whitespace-delimited token.
+
+    Returns:
+        The lowercased token without surrounding punctuation; may be empty.
+    """
+    import unicodedata  # function-scope import keeps this block self-contained
+
+    def _is_punct(ch: str) -> bool:
+        return ch in string.punctuation or unicodedata.category(ch).startswith("P")
+
+    lowered = word.lower()
+    start, end = 0, len(lowered)
+    while start < end and _is_punct(lowered[start]):
+        start += 1
+    while end > start and _is_punct(lowered[end - 1]):
+        end -= 1
+    return lowered[start:end]
+
+
 def _is_known_marker(
    marker_id: str, slides: dict = None, videos: dict = None, audio: dict = None
 ) -> bool:
@@ -122,8 +133,9 @@ def _is_known_marker(
    if marker_id in slides:
        return True

-    # Video/narration triggers
-    if marker_id.startswith("video:") or marker_id.startswith("narration:"):
+    # Video/narration triggers (all supported prefixes)
+    _VIDEO_PREFIXES = ("video:", "narration:", "vft:", "vfb:", "vst:", "vsb:", "vftp:", "vfbp:", "vstp:", "vsbp:")
+    if any(marker_id.startswith(p) for p in _VIDEO_PREFIXES):
        return True

    # Camera presets
@@ -143,20 +155,11 @@ def _strip_unknown_markers(
    text: str, slides: dict = None, videos: dict = None, audio: dict = None
 ) -> str:
    """
-    Remove unknown markers from text.
+    Remove all [...] markers from context text — none are pronounced aloud.

-    Unknown markers aren't pronounced, so they should be stripped
-    before fuzzy matching. Note: [cite:...] markers are already
-    stripped at parse time by parse_manuscript().
+    Note: [cite:...] markers are already stripped at parse time by parse_manuscript().
    """
-
-    def replace_marker(match):
-        marker_id = match.group(1)
-        if _is_known_marker(marker_id, slides, videos, audio):
-            return match.group(0)  # Keep known markers
-        return ""  # Strip unknown markers
-
-    return re.sub(r"\[([A-Za-z0-9_:]+)\]", replace_marker, text)
+    return re.sub(r"\[([^\]]+)\]", "", text)


 def _extract_marker_contexts(
@@ -177,8 +180,9 @@ def _extract_marker_contexts(
    videos = videos or {}
    audio = audio or {}

-    # Split by markers, keeping the markers
-    parts = re.split(r"\[([A-Za-z0-9_:]+)\]", manuscript_text)
+    # Split by markers, keeping the markers — broad pattern handles any content
+    # including paths with / and - (e.g. [vfb:pexels/7670835-uhd_3840_2160_30fps])
+    parts = re.split(r"\[([^\]]+)\]", manuscript_text)

    # parts: [text_before, marker1, text_after1, marker2, text_after2, ...]
    raw_contexts = []
@@ -189,16 +193,27 @@ def _extract_marker_contexts(
        if not _is_known_marker(marker_id, slides, videos, audio):
            continue

-        if i + 1 < len(parts):
-            following_text = parts[i + 1].strip()
-            # Clean up: remove newlines, collapse whitespace
-            following_text = " ".join(following_text.split())
-            # Strip unknown markers from following text (they're not pronounced)
-            following_text = _strip_unknown_markers(
-                following_text, slides, videos, audio
-            )
-            following_text = " ".join(following_text.split())  # Clean up extra spaces
-            raw_contexts.append((marker_id, following_text))
+        # Collect all following text, looking past unknown markers until the
+        # next known marker. This handles [S1][segment:1] text... where the
+        # text lives two parts ahead rather than immediately after S1.
+        text_pieces = []
+        j = i + 1
+        while j < len(parts):
+            chunk = parts[j].strip()
+            if chunk:
+                text_pieces.append(chunk)
+            j += 1  # advance to the marker after this text chunk
+            if j >= len(parts):
+                break
+            if _is_known_marker(parts[j], slides, videos, audio):
+                break  # stop at the next known marker
+            j += 1  # skip the unknown marker; its following text is next
+
+        following_text = " ".join(text_pieces)
+        following_text = " ".join(following_text.split())  # collapse whitespace
+        following_text = _strip_unknown_markers(following_text, slides, videos, audio)
+        following_text = " ".join(following_text.split())
+        raw_contexts.append((marker_id, following_text))

    # For markers with no following text (consecutive markers), look ahead
    # Return (marker_id, following_text, is_borrowed) - is_borrowed=True means text came from look-ahead
@@ -209,13 +224,20 @@ def _extract_marker_contexts(
            words = following_text.split()[:10]
            contexts.append((marker_id, " ".join(words), False))
        else:
-            # Look ahead for next marker with text
+            # Look ahead for next marker with text, but never borrow from another
+            # slide marker — slides must align independently to avoid two consecutive
+            # slides matching the same transcription position simultaneously.
+            borrowed = False
            for j in range(i + 1, len(raw_contexts)):
-                if raw_contexts[j][1]:
-                    words = raw_contexts[j][1].split()[:10]
+                next_marker_id, next_text = raw_contexts[j]
+                if next_text:
+                    if next_marker_id in (slides or {}):
+                        break  # Slide owns this text; give up borrowing
+                    words = next_text.split()[:10]
                    contexts.append((marker_id, " ".join(words), True))  # Borrowed
+                    borrowed = True
                    break
-            else:
+            if not borrowed:
                contexts.append((marker_id, "", False))

    return contexts
@@ -250,7 +272,8 @@ def _fuzzy_match_ratio(
        return 0.0, 0, 0

    transcript_words = [
-        _normalize_text(transcription[j].word) for j in range(start_idx, transcript_end)
+        _normalize_token(transcription[j].word)
+        for j in range(start_idx, transcript_end)
    ]

    # Match phrase words sequentially against transcript window
@@ -261,7 +284,7 @@ def _fuzzy_match_ratio(
    last_match_end_offset = 0

    for phrase_word in phrase_words[:words_to_check]:
-        normalized = _normalize_text(phrase_word)
+        normalized = _normalize_token(phrase_word)
        if len(normalized) < 2:
            continue  # skip very short words (a, I, etc.) - don't count them
        words_checked += 1
@@ -303,8 +326,12 @@ def _find_phrase_timestamp(
    (-1, -1.0, 0.0, -1) if not found. word_index points to the first
    matched word. match_end_idx points past the last matched word.
    """
-    phrase_normalized = _normalize_text(phrase)
-    phrase_words = phrase_normalized.split()
+    # Normalize each word individually — same method as transcript tokens.
+    # This keeps contractions as single tokens ("haven't" stays "haven't") so
+    # phrase and transcript word counts stay in sync. Using _normalize_text on
+    # the whole phrase would expand "haven't" → "have not" (2 words), creating
+    # a phantom "not" that fails to match the transcript and corrupts the window.
+    phrase_words = [tok for tok in (_normalize_token(w) for w in phrase.split()) if tok]

    if not phrase_words:
        return -1, -1.0, 0.0, -1
@@ -504,7 +531,9 @@ def build_render_plan(
    cached_files: set[str] = set()

    narration_videos: list[tuple[str, VideoSource, CutoutDefinition]] = []
-    video_path, is_cached = _resolve_video_path(videos_dir, narration_video, shared_assets_dir, project_path)
+    video_path, is_cached = _resolve_video_path(
+        videos_dir, narration_video, shared_assets_dir, project_path
+    )
    if is_cached:
        cached_files.add(narration_video_id)
    full_duration = get_video_duration(video_path)
@@ -798,40 +827,127 @@ def _extract_video_events(
        ]
    )

-    # Collect video markers
-    video_markers: list[tuple[float, str, str]] = []  # (time, video_id, type)
+    # Shorthand marker prefix → (implied_cutout_name, implied_layer[, flag, ...]).
+    # The marker wins over videos.json; "p" variants append a "pause_narration" flag.
+    _SHORTHAND: dict[str, tuple[str, ...]] = {
+        "vft:": ("fullscreen", "above"),
+        "vfb:": ("fullscreen", "below"),
+        "vst:": ("square", "above"),
+        "vsb:": ("square", "below"),
+        "vftp:": ("fullscreen", "above", "pause_narration"),
+        "vfbp:": ("fullscreen", "below", "pause_narration"),
+        "vstp:": ("square", "above", "pause_narration"),
+        "vsbp:": ("square", "below", "pause_narration"),
+    }
+
+    # Collect video markers: (time, video_id, event_type, cutout_name_override, layer_override)
+    # event_type is "video" (ends at next slide) or "narration" (runs to end)
+    video_markers: list[tuple[float, str, str, str | None, str | None]] = []
+
     for timing in marker_timings:
         if timing.timestamp < 0:
             continue
 
-        if timing.marker_id.startswith("video:"):
-            video_id = timing.marker_id[6:]
-            if video_id in videos:
-                video_source = videos[video_id]
-                if video_source.cutout and video_source.cutout in cutouts:
-                    video_markers.append((timing.timestamp, video_id, "video"))
+        mid = timing.marker_id
 
-        elif timing.marker_id.startswith("narration:"):
-            video_id = timing.marker_id[10:]
-            if video_id in videos:
-                video_source = videos[video_id]
-                if video_source.cutout and video_source.cutout in cutouts:
-                    video_markers.append((timing.timestamp, video_id, "narration"))
+        # --- shorthand markers: vft/vfb/vst/vsb (and their "p" variants) ---
+        shorthand_match = next((p for p in _SHORTHAND if mid.startswith(p)), None)
+        if shorthand_match:
+            video_id = mid[len(shorthand_match) :]
+            if video_id not in videos:
+                raise ValueError(
+                    f"Marker [{mid}] references unknown video '{video_id}'. "
+                    f"Add it to videos.json or remove the marker."
+                )
+            implied_cutout, implied_layer, *_ = _SHORTHAND[shorthand_match]  # trailing flags are not consumed here
+            if implied_cutout not in cutouts:
+                raise ValueError(
+                    f"Marker [{mid}] uses shorthand '{shorthand_match}' which requires "
+                    f"cutout '{implied_cutout}' but it is not defined in project config. "
+                    f"Available cutouts: {list(cutouts.keys())}"
+                )
+            video_markers.append(
+                (timing.timestamp, video_id, "video", implied_cutout, implied_layer)
+            )
+            continue
+
+        # --- legacy [video:xxx] ---
+        if mid.startswith("video:"):
+            video_id = mid[6:]
+            if video_id not in videos:
+                raise ValueError(
+                    f"Marker [video:{video_id}] references unknown video '{video_id}'. "
+                    f"Add it to videos.json or remove the marker."
+                )
+            video_source = videos[video_id]
+            if not video_source.cutout:
+                raise ValueError(
+                    f"Marker [video:{video_id}] — video '{video_id}' has no 'cutout' set in videos.json."
+                )
+            if video_source.cutout not in cutouts:
+                raise ValueError(
+                    f"Marker [video:{video_id}] — cutout '{video_source.cutout}' is not defined in project config. "
+                    f"Available: {list(cutouts.keys())}"
+                )
+            video_markers.append(
+                (timing.timestamp, video_id, "video", None, None)
+            )
+            continue
+
+        # --- [narration:xxx] ---
+        if mid.startswith("narration:"):
+            video_id = mid[10:]
+            if video_id not in videos:
+                raise ValueError(
+                    f"Marker [narration:{video_id}] references unknown video '{video_id}'. "
+                    f"Add it to videos.json or remove the marker."
+                )
+            video_source = videos[video_id]
+            if not video_source.cutout:
+                raise ValueError(
+                    f"Marker [narration:{video_id}] — video '{video_id}' has no 'cutout' set in videos.json."
+                )
+            if video_source.cutout not in cutouts:
+                raise ValueError(
+                    f"Marker [narration:{video_id}] — cutout '{video_source.cutout}' is not defined in project config. "
+                    f"Available: {list(cutouts.keys())}"
+                )
+            video_markers.append(
+                (timing.timestamp, video_id, "narration", None, None)
+            )

    events: list[VideoEvent] = []
-    for start_time, video_id, marker_type in video_markers:
+    for (
+        start_time,
+        video_id,
+        marker_type,
+        cutout_override,
+        layer_override,
+    ) in video_markers:
        video_source = videos[video_id]
-        cutout = cutouts[video_source.cutout]

-        if marker_type == "video":
-            # End at next slide
+        # Resolve cutout: marker override > videos.json cutout
+        # (validation already ensured cutout exists — this is a safety assertion)
+        cutout_name = cutout_override or video_source.cutout
+        cutout = cutouts[cutout_name]
+
+        # Resolve layer: marker override > videos.json layer
+        layer = layer_override if layer_override is not None else video_source.layer
+
+        end_on = video_source.end_on
+        if end_on == "take" and video_source.take is not None:
+            end_time = start_time + video_source.take
+        elif end_on == "end":
+            end_time = total_duration
+        elif end_on == "next_slide" or (end_on is None and marker_type == "video"):
+            # End at next slide marker
            end_time = total_duration
            for slide_time in slide_times:
                if slide_time > start_time:
                    end_time = slide_time
                    break
        else:
-            # narration: runs to end
+            # end_on is None and marker_type == "narration": runs to end
            end_time = total_duration

        # Filter by time range
@@ -846,6 +962,8 @@ def _extract_video_events(
                end_time=end_time,
                video_source=video_source,
                cutout=cutout,
+                cutout_name=cutout_name,
+                layer=layer,
            )
        )

@@ -992,7 +1110,9 @@ def _extract_outro_events(
        video_source = videos[video_id]

        # Get the video duration
-        video_path, is_cached = _resolve_video_path(videos_dir, video_source, shared_assets_dir, project_path)
+        video_path, is_cached = _resolve_video_path(
+            videos_dir, video_source, shared_assets_dir, project_path
+        )
        if is_cached and cached_files is not None:
            cached_files.add(video_id)
        if video_path.exists():
@@ -57,16 +57,26 @@ def validate_project(
        # Skip audio markers (start with 'A' followed by audio id, e.g., Awoosh)
        if marker.startswith("A") and len(marker) > 1 and marker[1:].isalnum():
            continue
-        # Validate video trigger markers (video:xxx) - slide-like videos
-        if marker.startswith("video:"):
-            video_id = marker[6:]  # Remove 'video:' prefix
+        # Validate video trigger markers — both legacy [video:xxx] and
+        # shorthand [vft:xxx] / [vfb:xxx] / [vst:xxx] / [vsb:xxx].
+        _VIDEO_PREFIXES = {
+            "video:": 6,
+            "vft:": 4,
+            "vfb:": 4,
+            "vst:": 4,
+            "vsb:": 4,
+        }
+        matched_prefix = next(
+            (p for p in _VIDEO_PREFIXES if marker.startswith(p)), None
+        )
+        if matched_prefix is not None:
+            video_id = marker[_VIDEO_PREFIXES[matched_prefix] :]
            if video_id not in videos:
-                # Check if it's a file extension mismatch
                hint = ""
                if "." in video_id:
                    base_name = video_id.rsplit(".", 1)[0]
                    if base_name in videos:
-                        hint = f" (Did you mean [video:{base_name}]? Don't include file extensions in markers)"
+                        hint = f" (Did you mean [{matched_prefix}{base_name}]? Don't include file extensions in markers)"
                warnings.append(
                    ValidationIssue(
                        f"Video marker [{marker}] referenced in manuscript but '{video_id}' not defined in videos.json{hint} — using PlaceholderVideo instead",
@@ -214,11 +224,12 @@ def validate_project(
                        )
                    )

-    # Check we have at least one video source
-    if not videos:
+    # Check videos.json exists (empty is fine — project may not need triggered videos)
+    if not (project_path / config.videos_path).exists():
        issues.append(
            ValidationIssue(
-                "No video sources defined in videos.json", project_path / "videos.json"
+                "videos.json not found — run 'gnommo import' to create it",
+                project_path / "videos.json",
            )
        )

@@ -8,13 +8,13 @@
 # Options:
 #   --replace    Delete original files after successful transcoding
 #   --dry-run    Show what would be transcoded without doing it
-#   --crf <N>    Quality level (default: 23, lower=better quality, 18-28 typical)
+#   --crf <N>    Quality level (default: 18, lower=better quality, 18-28 typical)
 #

 set -e

 # Configuration
-DEFAULT_CRF=23
+DEFAULT_CRF=18
 EXTENSIONS=("mov" "mp4" "m4v" "avi" "mkv" "mxf")

 usage() {
@@ -44,7 +44,7 @@ Examples:
  $(basename "$0") ./media/videos                    # Transcode folder (smallest first)
  $(basename "$0") ./media/videos --dry-run          # Preview only
  $(basename "$0") ./media/videos --replace          # Transcode and delete originals
-  $(basename "$0") ./media/videos --crf 20           # Higher quality
+  $(basename "$0") ./media/videos --crf 18           # Higher quality

 EOF
    exit 0