Fixing gnommo

2026-03-26 10:46:05 +01:00
parent 0e22fcfbb3
commit 7c75610fce
15 changed files with 2028 additions and 410 deletions
@@ -22,12 +22,46 @@ from .preprocessor import run_ffmpeg_with_progress


 def _get_audio_duration(audio_path: Path) -> float:
-    """Get duration of an audio file using ffprobe."""
+    """Get duration of an audio file using ffprobe.
+
+    For MP3 files, counts packets directly to get an accurate duration regardless
+    of whether the file has a Xing/VBRI header. Falls back to format duration for
+    other formats.
+    """
+    if audio_path.suffix.lower() == ".mp3":
+        # Count actual packets rather than trusting the header estimate.
+        # This is slower but accurate for headerless VBR/CBR MP3s.
+        cmd = [
+            "ffprobe",
+            "-v",
+            "error",
+            "-count_packets",
+            "-show_entries",
+            "stream=nb_read_packets,duration",
+            "-select_streams",
+            "a:0",
+            "-of",
+            "default=noprint_wrappers=1:nokey=1",
+            str(audio_path),
+        ]
+        result = subprocess.run(cmd, capture_output=True, text=True)
+        if result.returncode == 0:
+            # Output is "duration" then "nb_read_packets", one value per line — return the first line that parses as a positive number (N/A and zero are skipped)
+            for line in result.stdout.strip().splitlines():
+                try:
+                    val = float(line)
+                    if val > 0:
+                        return val
+                except ValueError:
+                    continue
    cmd = [
        "ffprobe",
-        "-v", "error",
-        "-show_entries", "format=duration",
-        "-of", "default=noprint_wrappers=1:nokey=1",
+        "-v",
+        "error",
+        "-show_entries",
+        "format=duration",
+        "-of",
+        "default=noprint_wrappers=1:nokey=1",
        str(audio_path),
    ]
    result = subprocess.run(cmd, capture_output=True, text=True)
@@ -208,16 +242,28 @@ def _resolve_video_path(


 def _has_audio_stream(video_path: Path) -> bool:
-    """Check if a video file contains an audio stream using ffprobe."""
+    """Check if a video file contains a non-empty audio stream.
+
+    Uses -analyzeduration 0 to avoid the slow avformat_find_stream_info() scan
+    that happens when an MP4 has a declared audio track with no actual frames —
+    ffprobe would otherwise scan the entire file looking for audio packets.
+
+    Also checks nb_frames to reject ghost audio tracks (stream header exists in
+    the moov atom but no sample data in stsc/stsz).
+    """
    result = subprocess.run(
        [
            "ffprobe",
            "-v",
            "error",
+            "-analyzeduration",
+            "0",
+            "-probesize",
+            "1000000",
            "-select_streams",
-            "a",
+            "a:0",
            "-show_entries",
-            "stream=index",
+            "stream=index,nb_frames",
            "-of",
            "csv=p=0",
            str(video_path),
@@ -225,7 +271,16 @@ def _has_audio_stream(video_path: Path) -> bool:
        capture_output=True,
        text=True,
    )
-    return bool(result.stdout.strip())
+    output = result.stdout.strip()
+    if not output:
+        return False
+    # output is "index" or "index,nb_frames"
+    parts = output.split(",")
+    if len(parts) >= 2:
+        nb_frames = parts[1].strip()
+        if nb_frames == "0":
+            return False  # Ghost audio track — declared but no sample data
+    return True


 def _build_audio_channel_filter(use_audio_channels: str) -> str:
@@ -263,11 +318,18 @@ def build_ffmpeg_command(plan: RenderPlan, output_path: Path) -> list[str]:
    # Add -ss seek BEFORE -i for skip parameter and/or partial rendering
    always_visible_inputs: list[int] = []
    for video_id, video_source, cutout in plan.narration_videos:
-        video_path = _resolve_video_path(videos_dir, video_source, shared_assets_dir, project_path)
+        video_path = _resolve_video_path(
+            videos_dir, video_source, shared_assets_dir, project_path
+        )
        # Combine video skip setting with partial render offset
        total_seek = video_source.skip + plan.input_seek_time
        if total_seek > 0:
            cmd.extend(["-ss", f"{total_seek:.3f}"])
+        # Skip stream analysis — codec params are in the container header, and
+        # duration is already known by gnommo via ffprobe (plan.total_duration).
+        # Without this, FFmpeg reads 100MB+ of compressed data per input at 4K
+        # bitrates before encoding starts ("Estimating duration from bitrate").
+        cmd.extend(["-analyzeduration", "0", "-probesize", "1000"])
        cmd.extend(["-i", str(video_path)])
        always_visible_inputs.append(input_idx)
        input_idx += 1
@@ -283,18 +345,26 @@ def build_ffmpeg_command(plan: RenderPlan, output_path: Path) -> list[str]:
        shared_assets_dir = project_path.parent / "shared_assets"
        videos_json_bg = shared_assets_dir / "videos.json"
        if not videos_json_bg.exists():
-            raise RenderError(f"shared_assets/videos.json not found (needed for background handle '{bg_handle}')")
+            raise RenderError(
+                f"shared_assets/videos.json not found (needed for background handle '{bg_handle}')"
+            )
        bg_videos = _read_json(videos_json_bg)
        if bg_handle not in bg_videos:
-            raise RenderError(f"Background handle '{bg_handle}' not found in shared_assets/videos.json")
+            raise RenderError(
+                f"Background handle '{bg_handle}' not found in shared_assets/videos.json"
+            )
        bg_path = shared_assets_dir / bg_videos[bg_handle]["source_file"]
        if not bg_path.exists():
-            raise RenderError(f"Background file not found: {bg_path} (from handle '{bg_handle}')")
+            raise RenderError(
+                f"Background file not found: {bg_path} (from handle '{bg_handle}')"
+            )
        image_extensions = {".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tiff", ".webp"}
        bg_is_image = bg_path.suffix.lower() in image_extensions
        # Loop background videos infinitely
        if not bg_is_image:
            cmd.extend(["-stream_loop", "-1"])
+        # Duration of background video is irrelevant (looped or image) — skip analysis
+        cmd.extend(["-analyzeduration", "0", "-probesize", "1000"])
        cmd.extend(["-i", str(bg_path)])
        bg_idx = input_idx
        input_idx += 1
@@ -325,14 +395,24 @@ def build_ffmpeg_command(plan: RenderPlan, output_path: Path) -> list[str]:
        video_path = _resolve_video_path(
            videos_dir, event.video_source, shared_assets_dir, project_path
        )
-        # Seek to skip point before loading input
        skip = event.video_source.skip
        if skip > 0:
            cmd.extend(["-ss", f"{skip:.3f}"])
+        cmd.extend(["-analyzeduration", "0", "-probesize", "1000"])
+        # Use pre-probed duration to tell FFmpeg exactly how much to read,
+        # preventing scans of ghost audio tracks on empty MP4 audio streams.
+        if event.video_source.duration is not None:
+            remaining = event.video_source.duration - skip
+            if remaining > 0:
+                cmd.extend(["-t", f"{remaining:.3f}"])
        cmd.extend(["-i", str(video_path)])
        video_inputs[i] = input_idx
        input_idx += 1
-        if _has_audio_stream(video_path):
+        has_audio = event.video_source.has_audio
+        if has_audio is None:
+            print(f"  Warning: no cached metadata for '{event.video_source.source_file}' — run 'gnommo import' to avoid slow probing")
+            has_audio = _has_audio_stream(video_path)
+        if has_audio:
            video_events_with_audio.add(i)

    # Input: outro videos (play after narration ends)
@@ -343,14 +423,22 @@ def build_ffmpeg_command(plan: RenderPlan, output_path: Path) -> list[str]:
        video_path = _resolve_video_path(
            videos_dir, event.video_source, shared_assets_dir, project_path
        )
-        # Seek to skip point before loading input
        skip = event.video_source.skip
        if skip > 0:
            cmd.extend(["-ss", f"{skip:.3f}"])
+        cmd.extend(["-analyzeduration", "0", "-probesize", "1000"])
+        if event.video_source.duration is not None:
+            remaining = event.video_source.duration - skip
+            if remaining > 0:
+                cmd.extend(["-t", f"{remaining:.3f}"])
        cmd.extend(["-i", str(video_path)])
        outro_inputs[i] = input_idx
        input_idx += 1
-        if _has_audio_stream(video_path):
+        has_audio = event.video_source.has_audio
+        if has_audio is None:
+            print(f"  Warning: no cached metadata for '{event.video_source.source_file}' — run 'gnommo import' to avoid slow probing")
+            has_audio = _has_audio_stream(video_path)
+        if has_audio:
            outro_events_with_audio.add(i)

    # Track where audio inputs start
@@ -365,12 +453,24 @@ def build_ffmpeg_command(plan: RenderPlan, output_path: Path) -> list[str]:
        if event.audio_id not in audio_inputs:
            audio_path = audio_dir / event.audio_def.file
            audio_path, _ = resolve_with_cache(audio_path, project_path)
+            # Use pre-probed duration from audio.json if available (set by import).
+            # For MP3 without Xing/VBRI headers this is critical — FFmpeg otherwise
+            # scans the whole file to estimate duration (100s+ for large files).
+            # Fall back to live probe only for MP3 when duration wasn't pre-cached.
+            file_duration = event.audio_def.duration
+            if file_duration is None and audio_path.suffix.lower() == ".mp3":
+                file_duration = _get_audio_duration(audio_path)
+            if file_duration is not None:
+                cmd.extend(["-t", str(file_duration)])
            cmd.extend(["-i", str(audio_path)])
            audio_inputs[event.audio_id] = input_idx
            input_idx += 1
-            # Cache duration if this audio uses crossfade looping
+            # Cache duration for crossfade loop filter
            if event.audio_def.loop and event.audio_def.overlap:
-                audio_durations[event.audio_id] = _get_audio_duration(audio_path)
+                audio_durations[event.audio_id] = (
+                    file_duration if file_duration is not None
+                    else _get_audio_duration(audio_path)
+                )

    # Build filter_complex
    filter_complex = build_filter_complex(
@@ -418,7 +518,7 @@ def build_ffmpeg_command(plan: RenderPlan, output_path: Path) -> list[str]:
            "-preset",
            "fast",
            "-crf",
-            "23",
+            "20",
            "-c:a",
            "aac",
            "-b:a",
@@ -793,6 +893,43 @@ def build_filter_complex(
                )
                current_label = next_label

+    # Add "below-slides" triggered video overlays (vfb/vsb or layer="below")
+    for i, event in enumerate(plan.video_events):
+        if event.layer != "below":
+            continue
+        video_idx = video_inputs[i]
+        cut_x, cut_y, cut_width, cut_height = _calculate_cutout_position(
+            event.cutout, width, height
+        )
+
+        duration = event.end_time - event.start_time
+        if event.video_source.take is not None:
+            duration = min(duration, event.video_source.take)
+        effective_end = event.start_time + duration
+
+        zoom = event.video_source.zoom
+        zoomed_width = int(cut_width * zoom)
+        zoomed_height = int(cut_height * zoom)
+
+        video_label = f"tvb{i}"
+        start_pts = event.start_time
+        filters.append(
+            f"[{video_idx}:v]format=yuva444p10le,"
+            f"setpts=PTS-STARTPTS+{start_pts:.3f}/TB,"
+            f"scale={zoomed_width}:{zoomed_height}:force_original_aspect_ratio=increase,"
+            f"crop={cut_width}:{cut_height}:(iw-{cut_width})/2:(ih-{cut_height})/2,"
+            f"format=rgba[{video_label}]"
+        )
+
+        next_label = f"tvbbase{i}"
+        enable_expr = f"between(t\\,{event.start_time:.3f}\\,{effective_end:.3f})"
+        filters.append(
+            f"[{current_label}][{video_label}]overlay="
+            f"x={cut_x}:y={cut_y}:enable={enable_expr}"
+            f"[{next_label}]"
+        )
+        current_label = next_label
+
    # Add slide overlays with time-based enable
    for i, event in enumerate(plan.slide_events):
        slide_idx = slide_inputs[event.slide_id]
@@ -815,8 +952,10 @@ def build_filter_complex(

        current_label = next_label

-    # Add triggered video overlays with time-based enable
+    # Add "above-slides" triggered video overlays (vft/vst or layer="above")
    for i, event in enumerate(plan.video_events):
+        if event.layer != "above":
+            continue
        video_idx = video_inputs[i]
        cut_x, cut_y, cut_width, cut_height = _calculate_cutout_position(
            event.cutout, width, height
@@ -836,22 +975,25 @@ def build_filter_complex(
        # Scale to cover the zoomed area (like CSS object-fit: cover)
        # Then crop to cutout dimensions (centered)
        # Use setpts to sync video start with overlay enable time
+        # IMPORTANT: convert to rgba FIRST (before scale/crop) so the alpha channel
+        # is preserved throughout. scale in yuva444p10le can silently strip alpha.
        video_label = f"tv{i}"
        start_pts = event.start_time
        filters.append(
-            f"[{video_idx}:v]format=yuva444p10le,"
+            f"[{video_idx}:v]format=rgba,"
            f"setpts=PTS-STARTPTS+{start_pts:.3f}/TB,"
            f"scale={zoomed_width}:{zoomed_height}:force_original_aspect_ratio=increase,"
-            f"crop={cut_width}:{cut_height}:(iw-{cut_width})/2:(ih-{cut_height})/2,"
-            f"format=rgba[{video_label}]"
+            f"crop={cut_width}:{cut_height}:(iw-{cut_width})/2:(ih-{cut_height})/2"
+            f"[{video_label}]"
        )

-        # Overlay with time-based enable
+        # Overlay with time-based enable; format=auto lets FFmpeg pick the right
+        # compositing format so the RGBA alpha channel is respected.
        next_label = f"tvbase{i}"
        enable_expr = f"between(t\\,{event.start_time:.3f}\\,{effective_end:.3f})"
        filters.append(
            f"[{current_label}][{video_label}]overlay="
-            f"x={cut_x}:y={cut_y}:enable={enable_expr}"
+            f"x={cut_x}:y={cut_y}:enable={enable_expr}:format=auto"
            f"[{next_label}]"
        )

@@ -950,13 +1092,17 @@ def build_filter_complex(
            _, first_video_source, _ = plan.narration_videos[0]
            use_channels = first_video_source.use_audio_channels
            if use_channels == "auto":
-                narration_path = _resolve_video_path(videos_dir, first_video_source, shared_assets_dir, project_path)
+                narration_path = _resolve_video_path(
+                    videos_dir, first_video_source, shared_assets_dir, project_path
+                )
                use_channels = _resolve_auto_channel(narration_path)
            channel_filter = _build_audio_channel_filter(use_channels)
            narration_volume = first_video_source.volume

        # Build volume filter if not 1.0
-        volume_filter = f"volume={narration_volume:.2f}" if narration_volume != 1.0 else ""
+        volume_filter = (
+            f"volume={narration_volume:.2f}" if narration_volume != 1.0 else ""
+        )

        # Use narration_end_time to stop audio before outro (if outro exists)
        audio_end_time = (
@@ -980,7 +1126,9 @@ def build_filter_complex(
                )
                audio_labels_to_mix.append("[main_aud]")
            elif filter_parts:
-                filters.append(f"[{main_audio_idx}:a]{','.join(filter_parts)}[main_aud]")
+                filters.append(
+                    f"[{main_audio_idx}:a]{','.join(filter_parts)}[main_aud]"
+                )
                audio_labels_to_mix.append("[main_aud]")
            else:
                audio_labels_to_mix.append(f"[{main_audio_idx}:a]")
@@ -1066,7 +1214,10 @@ def build_filter_complex(
                        label = f"aud{i}"
                        delay_ms = int(event.start_time * 1000)

-                        if event.audio_def.overlap and event.audio_id in audio_durations:
+                        if (
+                            event.audio_def.overlap
+                            and event.audio_id in audio_durations
+                        ):
                            # Crossfade loop: overlap copies with fade in/out
                            audio_dur = audio_durations[event.audio_id]
                            crossfade_filters = _build_crossfade_loop_filter(