Fixing gnommo

This commit is contained in:
2026-03-26 10:46:05 +01:00
parent 0e22fcfbb3
commit 7c75610fce
15 changed files with 2028 additions and 410 deletions
+180 -29
View File
@@ -22,12 +22,46 @@ from .preprocessor import run_ffmpeg_with_progress
def _get_audio_duration(audio_path: Path) -> float:
"""Get duration of an audio file using ffprobe."""
"""Get duration of an audio file using ffprobe.
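For MP3 files, probes the first audio stream with -count_packets, intending an
accurate duration regardless of whether the file has a Xing/VBRI header, and
returns the first positive numeric value ffprobe prints. Falls back to format
duration for other formats, or when the MP3 probe yields no usable value.
NOTE(review): the MP3 probe requests both stream duration and nb_read_packets;
if duration is N/A, the packet count would be returned as if it were seconds —
confirm this is intended.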
"""
if audio_path.suffix.lower() == ".mp3":
# Count actual packets rather than trusting the header estimate.
# This is slower but accurate for headerless VBR/CBR MP3s.
cmd = [
"ffprobe",
"-v",
"error",
"-count_packets",
"-show_entries",
"stream=nb_read_packets,duration",
"-select_streams",
"a:0",
"-of",
"default=noprint_wrappers=1:nokey=1",
str(audio_path),
]
result = subprocess.run(cmd, capture_output=True, text=True)
if result.returncode == 0:
# Output: duration\nnb_read_packets — take the first non-N/A line
for line in result.stdout.strip().splitlines():
try:
val = float(line)
if val > 0:
return val
except ValueError:
continue
cmd = [
"ffprobe",
"-v", "error",
"-show_entries", "format=duration",
"-of", "default=noprint_wrappers=1:nokey=1",
"-v",
"error",
"-show_entries",
"format=duration",
"-of",
"default=noprint_wrappers=1:nokey=1",
str(audio_path),
]
result = subprocess.run(cmd, capture_output=True, text=True)
@@ -208,16 +242,28 @@ def _resolve_video_path(
def _has_audio_stream(video_path: Path) -> bool:
"""Check if a video file contains an audio stream using ffprobe."""
"""Check if a video file contains a non-empty audio stream.
Uses -analyzeduration 0 to avoid the slow avformat_find_stream_info() scan
that happens when an MP4 has a declared audio track with no actual frames —
ffprobe would otherwise scan the entire file looking for audio packets.
Also checks nb_frames to reject ghost audio tracks (stream header exists in
the moov atom but no sample data in stsc/stsz).
"""
result = subprocess.run(
[
"ffprobe",
"-v",
"error",
"-analyzeduration",
"0",
"-probesize",
"1000000",
"-select_streams",
"a",
"a:0",
"-show_entries",
"stream=index",
"stream=index,nb_frames",
"-of",
"csv=p=0",
str(video_path),
@@ -225,7 +271,16 @@ def _has_audio_stream(video_path: Path) -> bool:
capture_output=True,
text=True,
)
return bool(result.stdout.strip())
output = result.stdout.strip()
if not output:
return False
# output is "index" or "index,nb_frames"
parts = output.split(",")
if len(parts) >= 2:
nb_frames = parts[1].strip()
if nb_frames == "0":
return False # Ghost audio track — declared but no sample data
return True
def _build_audio_channel_filter(use_audio_channels: str) -> str:
@@ -263,11 +318,18 @@ def build_ffmpeg_command(plan: RenderPlan, output_path: Path) -> list[str]:
# Add -ss seek BEFORE -i for skip parameter and/or partial rendering
always_visible_inputs: list[int] = []
for video_id, video_source, cutout in plan.narration_videos:
video_path = _resolve_video_path(videos_dir, video_source, shared_assets_dir, project_path)
video_path = _resolve_video_path(
videos_dir, video_source, shared_assets_dir, project_path
)
# Combine video skip setting with partial render offset
total_seek = video_source.skip + plan.input_seek_time
if total_seek > 0:
cmd.extend(["-ss", f"{total_seek:.3f}"])
# Skip stream analysis — codec params are in the container header, and
# duration is already known by gnommo via ffprobe (plan.total_duration).
# Without this, FFmpeg reads 100MB+ of compressed data per input at 4K
# bitrates before encoding starts ("Estimating duration from bitrate").
cmd.extend(["-analyzeduration", "0", "-probesize", "1000"])
cmd.extend(["-i", str(video_path)])
always_visible_inputs.append(input_idx)
input_idx += 1
@@ -283,18 +345,26 @@ def build_ffmpeg_command(plan: RenderPlan, output_path: Path) -> list[str]:
shared_assets_dir = project_path.parent / "shared_assets"
videos_json_bg = shared_assets_dir / "videos.json"
if not videos_json_bg.exists():
raise RenderError(f"shared_assets/videos.json not found (needed for background handle '{bg_handle}')")
raise RenderError(
f"shared_assets/videos.json not found (needed for background handle '{bg_handle}')"
)
bg_videos = _read_json(videos_json_bg)
if bg_handle not in bg_videos:
raise RenderError(f"Background handle '{bg_handle}' not found in shared_assets/videos.json")
raise RenderError(
f"Background handle '{bg_handle}' not found in shared_assets/videos.json"
)
bg_path = shared_assets_dir / bg_videos[bg_handle]["source_file"]
if not bg_path.exists():
raise RenderError(f"Background file not found: {bg_path} (from handle '{bg_handle}')")
raise RenderError(
f"Background file not found: {bg_path} (from handle '{bg_handle}')"
)
image_extensions = {".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tiff", ".webp"}
bg_is_image = bg_path.suffix.lower() in image_extensions
# Loop background videos infinitely
if not bg_is_image:
cmd.extend(["-stream_loop", "-1"])
# Duration of background video is irrelevant (looped or image) — skip analysis
cmd.extend(["-analyzeduration", "0", "-probesize", "1000"])
cmd.extend(["-i", str(bg_path)])
bg_idx = input_idx
input_idx += 1
@@ -325,14 +395,24 @@ def build_ffmpeg_command(plan: RenderPlan, output_path: Path) -> list[str]:
video_path = _resolve_video_path(
videos_dir, event.video_source, shared_assets_dir, project_path
)
# Seek to skip point before loading input
skip = event.video_source.skip
if skip > 0:
cmd.extend(["-ss", f"{skip:.3f}"])
cmd.extend(["-analyzeduration", "0", "-probesize", "1000"])
# Use the pre-probed duration to tell FFmpeg exactly how much to read, so it
# does not scan the file looking for packets of a ghost audio track (an MP4
# audio stream that is declared but has no sample data).
if event.video_source.duration is not None:
remaining = event.video_source.duration - skip
if remaining > 0:
cmd.extend(["-t", f"{remaining:.3f}"])
cmd.extend(["-i", str(video_path)])
video_inputs[i] = input_idx
input_idx += 1
if _has_audio_stream(video_path):
has_audio = event.video_source.has_audio
if has_audio is None:
print(f" Warning: no cached metadata for '{event.video_source.source_file}' — run 'gnommo import' to avoid slow probing")
has_audio = _has_audio_stream(video_path)
if has_audio:
video_events_with_audio.add(i)
# Input: outro videos (play after narration ends)
@@ -343,14 +423,22 @@ def build_ffmpeg_command(plan: RenderPlan, output_path: Path) -> list[str]:
video_path = _resolve_video_path(
videos_dir, event.video_source, shared_assets_dir, project_path
)
# Seek to skip point before loading input
skip = event.video_source.skip
if skip > 0:
cmd.extend(["-ss", f"{skip:.3f}"])
cmd.extend(["-analyzeduration", "0", "-probesize", "1000"])
if event.video_source.duration is not None:
remaining = event.video_source.duration - skip
if remaining > 0:
cmd.extend(["-t", f"{remaining:.3f}"])
cmd.extend(["-i", str(video_path)])
outro_inputs[i] = input_idx
input_idx += 1
if _has_audio_stream(video_path):
has_audio = event.video_source.has_audio
if has_audio is None:
print(f" Warning: no cached metadata for '{event.video_source.source_file}' — run 'gnommo import' to avoid slow probing")
has_audio = _has_audio_stream(video_path)
if has_audio:
outro_events_with_audio.add(i)
# Track where audio inputs start
@@ -365,12 +453,24 @@ def build_ffmpeg_command(plan: RenderPlan, output_path: Path) -> list[str]:
if event.audio_id not in audio_inputs:
audio_path = audio_dir / event.audio_def.file
audio_path, _ = resolve_with_cache(audio_path, project_path)
# Use pre-probed duration from audio.json if available (set by import).
# For MP3 without Xing/VBRI headers this is critical — FFmpeg otherwise
# scans the whole file to estimate duration (100s+ for large files).
# Fall back to live probe only for MP3 when duration wasn't pre-cached.
file_duration = event.audio_def.duration
if file_duration is None and audio_path.suffix.lower() == ".mp3":
file_duration = _get_audio_duration(audio_path)
if file_duration is not None:
cmd.extend(["-t", str(file_duration)])
cmd.extend(["-i", str(audio_path)])
audio_inputs[event.audio_id] = input_idx
input_idx += 1
# Cache duration if this audio uses crossfade looping
# Cache duration for crossfade loop filter
if event.audio_def.loop and event.audio_def.overlap:
audio_durations[event.audio_id] = _get_audio_duration(audio_path)
audio_durations[event.audio_id] = (
file_duration if file_duration is not None
else _get_audio_duration(audio_path)
)
# Build filter_complex
filter_complex = build_filter_complex(
@@ -418,7 +518,7 @@ def build_ffmpeg_command(plan: RenderPlan, output_path: Path) -> list[str]:
"-preset",
"fast",
"-crf",
"23",
"20",
"-c:a",
"aac",
"-b:a",
@@ -793,6 +893,43 @@ def build_filter_complex(
)
current_label = next_label
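# Add "below-slides" triggered video overlays (vfb/vsb or layer="below")
# NOTE(review): unlike the "above-slides" chain, this one still applies
# format=yuva444p10le before scale and omits overlay format=auto — confirm
# whether alpha must be preserved here too.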
for i, event in enumerate(plan.video_events):
if event.layer != "below":
continue
video_idx = video_inputs[i]
cut_x, cut_y, cut_width, cut_height = _calculate_cutout_position(
event.cutout, width, height
)
duration = event.end_time - event.start_time
if event.video_source.take is not None:
duration = min(duration, event.video_source.take)
effective_end = event.start_time + duration
zoom = event.video_source.zoom
zoomed_width = int(cut_width * zoom)
zoomed_height = int(cut_height * zoom)
video_label = f"tvb{i}"
start_pts = event.start_time
filters.append(
f"[{video_idx}:v]format=yuva444p10le,"
f"setpts=PTS-STARTPTS+{start_pts:.3f}/TB,"
f"scale={zoomed_width}:{zoomed_height}:force_original_aspect_ratio=increase,"
f"crop={cut_width}:{cut_height}:(iw-{cut_width})/2:(ih-{cut_height})/2,"
f"format=rgba[{video_label}]"
)
next_label = f"tvbbase{i}"
enable_expr = f"between(t\\,{event.start_time:.3f}\\,{effective_end:.3f})"
filters.append(
f"[{current_label}][{video_label}]overlay="
f"x={cut_x}:y={cut_y}:enable={enable_expr}"
f"[{next_label}]"
)
current_label = next_label
# Add slide overlays with time-based enable
for i, event in enumerate(plan.slide_events):
slide_idx = slide_inputs[event.slide_id]
@@ -815,8 +952,10 @@ def build_filter_complex(
current_label = next_label
# Add triggered video overlays with time-based enable
# Add "above-slides" triggered video overlays (vft/vst or layer="above")
for i, event in enumerate(plan.video_events):
if event.layer != "above":
continue
video_idx = video_inputs[i]
cut_x, cut_y, cut_width, cut_height = _calculate_cutout_position(
event.cutout, width, height
@@ -836,22 +975,25 @@ def build_filter_complex(
# Scale to cover the zoomed area (like CSS object-fit: cover)
# Then crop to cutout dimensions (centered)
# Use setpts to sync video start with overlay enable time
# IMPORTANT: convert to rgba FIRST (before scale/crop) so the alpha channel
# is preserved throughout. scale in yuva444p10le can silently strip alpha.
video_label = f"tv{i}"
start_pts = event.start_time
filters.append(
f"[{video_idx}:v]format=yuva444p10le,"
f"[{video_idx}:v]format=rgba,"
f"setpts=PTS-STARTPTS+{start_pts:.3f}/TB,"
f"scale={zoomed_width}:{zoomed_height}:force_original_aspect_ratio=increase,"
f"crop={cut_width}:{cut_height}:(iw-{cut_width})/2:(ih-{cut_height})/2,"
f"format=rgba[{video_label}]"
f"crop={cut_width}:{cut_height}:(iw-{cut_width})/2:(ih-{cut_height})/2"
f"[{video_label}]"
)
# Overlay with time-based enable
# Overlay with time-based enable; format=auto lets FFmpeg pick the right
# compositing format so the RGBA alpha channel is respected.
next_label = f"tvbase{i}"
enable_expr = f"between(t\\,{event.start_time:.3f}\\,{effective_end:.3f})"
filters.append(
f"[{current_label}][{video_label}]overlay="
f"x={cut_x}:y={cut_y}:enable={enable_expr}"
f"x={cut_x}:y={cut_y}:enable={enable_expr}:format=auto"
f"[{next_label}]"
)
@@ -950,13 +1092,17 @@ def build_filter_complex(
_, first_video_source, _ = plan.narration_videos[0]
use_channels = first_video_source.use_audio_channels
if use_channels == "auto":
narration_path = _resolve_video_path(videos_dir, first_video_source, shared_assets_dir, project_path)
narration_path = _resolve_video_path(
videos_dir, first_video_source, shared_assets_dir, project_path
)
use_channels = _resolve_auto_channel(narration_path)
channel_filter = _build_audio_channel_filter(use_channels)
narration_volume = first_video_source.volume
# Build volume filter if not 1.0
volume_filter = f"volume={narration_volume:.2f}" if narration_volume != 1.0 else ""
volume_filter = (
f"volume={narration_volume:.2f}" if narration_volume != 1.0 else ""
)
# Use narration_end_time to stop audio before outro (if outro exists)
audio_end_time = (
@@ -980,7 +1126,9 @@ def build_filter_complex(
)
audio_labels_to_mix.append("[main_aud]")
elif filter_parts:
filters.append(f"[{main_audio_idx}:a]{','.join(filter_parts)}[main_aud]")
filters.append(
f"[{main_audio_idx}:a]{','.join(filter_parts)}[main_aud]"
)
audio_labels_to_mix.append("[main_aud]")
else:
audio_labels_to_mix.append(f"[{main_audio_idx}:a]")
@@ -1066,7 +1214,10 @@ def build_filter_complex(
label = f"aud{i}"
delay_ms = int(event.start_time * 1000)
if event.audio_def.overlap and event.audio_id in audio_durations:
if (
event.audio_def.overlap
and event.audio_id in audio_durations
):
# Crossfade loop: overlap copies with fade in/out
audio_dur = audio_durations[event.audio_id]
crossfade_filters = _build_crossfade_loop_filter(