diff --git a/gnommo/renderer.py b/gnommo/renderer.py index f67ac34..8c5bc9f 100644 --- a/gnommo/renderer.py +++ b/gnommo/renderer.py @@ -814,10 +814,10 @@ def build_filter_complex( Layer structure (bottom to top): - Layer 1: Background (solid color, image, or video) - - Layer 2: "below" triggered videos (vfb/vsb) — behind talking head - - Layer 3: Always visible videos (like talking head) in cutouts - - Layer 4: Slides (with time-based enable) - - Layer 5: "above" triggered videos (vft/vst) — in front of slides + - Layer 2: "below" triggered videos (vfb/vf2b/vsb) — behind slides, use with slide on top to mask + - Layer 3: Slides (transparent in talking-head cutout area) + - Layer 4: Always visible videos (talking head) — above slides, visible through cutout + - Layer 5: "above" triggered videos (vft/vf2t/vst) — topmost, covers everything including talking head - Layer 6: Camera transform - Layer 7: Outro videos (fullscreen, after narration ends) - Audio: Main audio mixed with triggered sound effects and outro audio @@ -846,8 +846,7 @@ def build_filter_complex( current_label = "bg" - # Add "below" triggered video overlays (vfb/vsb) BEFORE the talking head - # so they sit behind it in the composite stack. + # Layer 2: "below" triggered video overlays (vfb/vsb) — behind slides and talking head for i, event in enumerate(plan.video_events): if event.layer != "below": continue @@ -884,23 +883,37 @@ def build_filter_complex( ) current_label = next_label - # Overlay always_visible videos (like talking head) - # If there are narration pauses, we need to segment the video + # Layer 3: Slides (transparent in the talking-head cutout area) + for i, event in enumerate(plan.slide_events): + slide_idx = slide_inputs[event.slide_id] + + slide_label = f"s{i}" + filters.append( + f"[{slide_idx}:v]scale={width}:{height}:" + f"force_original_aspect_ratio=decrease,pad={width}:{height}:(ow-iw)/2:(oh-ih)/2:color=0x00000000[{slide_label}]" + ) + + next_label = f"sbase{i}" + enable_expr = f"between(t\\,{event.start_time:.3f}\\,{event.end_time:.3f})" + filters.append( + f"[{current_label}][{slide_label}]overlay=" + f"x=0:y=0:enable={enable_expr}" + f"[{next_label}]" + ) + current_label = next_label + + # Layer 4: Always-visible videos (talking head) — above slides, visible through cutout for i, (video_id, video_source, cutout) in enumerate(plan.narration_videos): input_idx = always_visible_inputs[i] cut_x, cut_y, cut_width, cut_height = _calculate_cutout_position( cutout, width, height ) - # Apply zoom factor to cutout dimensions zoom = video_source.zoom zoomed_width = int(cut_width * zoom) zoomed_height = int(cut_height * zoom) if not plan.narration_pauses: - # Simple case: no pauses, continuous overlay - # fps+setpts normalise the source to a constant frame rate and reset - # the timeline to 0 so the video stays locked to the audio track. video_label = f"av{i}" filters.append( f"[{input_idx}:v]fps={plan.config.fps},setpts=PTS-STARTPTS," @@ -916,18 +929,12 @@ def build_filter_complex( ) current_label = next_label else: - # Complex case: narration pauses - segment the video - # Each segment is trimmed from source and positioned in output timeline segments = _build_narration_segments( plan.narration_pauses, plan.total_duration ) - for seg_idx, (src_start, src_end, out_start, out_end) in enumerate( - segments - ): + for seg_idx, (src_start, src_end, out_start, out_end) in enumerate(segments): seg_label = f"av{i}_seg{seg_idx}" - # Trim to source range, then shift PTS to output position - # setpts=PTS-STARTPTS puts segment at 0, then +offset/TB shifts to output time pts_offset = out_start filters.append( f"[{input_idx}:v]trim={src_start:.3f}:{src_end:.3f}," @@ -938,7 +945,6 @@ def build_filter_complex( f"format=rgba[{seg_label}]" ) - # Overlay with enable for this segment's output time range next_label = f"avbase{i}_seg{seg_idx}" enable_expr = f"between(t\\,{out_start:.3f}\\,{out_end:.3f})" filters.append( @@ -947,29 +953,8 @@ def build_filter_complex( ) current_label = next_label - # Add slide overlays with time-based enable - for i, event in enumerate(plan.slide_events): - slide_idx = slide_inputs[event.slide_id] - - # Scale slide to full frame size (transparent areas show through) - slide_label = f"s{i}" - filters.append( - f"[{slide_idx}:v]scale={width}:{height}:" - f"force_original_aspect_ratio=decrease,pad={width}:{height}:(ow-iw)/2:(oh-ih)/2:color=0x00000000[{slide_label}]" - ) - - # Overlay at 0,0 (full frame) with time-based enable - next_label = f"sbase{i}" - enable_expr = f"between(t\\,{event.start_time:.3f}\\,{event.end_time:.3f})" - filters.append( - f"[{current_label}][{slide_label}]overlay=" - f"x=0:y=0:enable={enable_expr}" - f"[{next_label}]" - ) - - current_label = next_label - - # Add "above-slides" triggered video overlays (vft/vst or layer="above") + # Layer 5: "above" triggered videos (vft/vf2t/vst) — topmost, covers slides and talking head + # Use case: fullscreen video that intentionally masks the narrator for i, event in enumerate(plan.video_events): if event.layer != "above": continue @@ -978,22 +963,15 @@ def build_filter_complex( event.cutout, width, height ) - # Calculate effective end time (respecting 'take' parameter) duration = event.end_time - event.start_time if event.video_source.take is not None: duration = min(duration, event.video_source.take) effective_end = event.start_time + duration - # Apply zoom factor to cutout dimensions zoom = event.video_source.zoom zoomed_width = int(cut_width * zoom) zoomed_height = int(cut_height * zoom) - # Scale to cover the zoomed area (like CSS object-fit: cover) - # Then crop to cutout dimensions (centered) - # Use setpts to sync video start with overlay enable time - # IMPORTANT: convert to rgba FIRST (before scale/crop) so the alpha channel - # is preserved throughout. scale in yuva444p10le can silently strip alpha. video_label = f"tv{i}" start_pts = event.start_time filters.append( @@ -1004,8 +982,6 @@ def build_filter_complex( f"[{video_label}]" ) - # Overlay with time-based enable; format=auto lets FFmpeg pick the right - # compositing format so the RGBA alpha channel is respected. next_label = f"tvbase{i}" enable_expr = f"between(t\\,{event.start_time:.3f}\\,{effective_end:.3f})" filters.append( @@ -1013,7 +989,6 @@ def build_filter_complex( f"x={cut_x}:y={cut_y}:enable={enable_expr}:format=auto" f"[{next_label}]" ) - current_label = next_label # Scene composition complete - now apply camera transform